# COMP 472 Assignment 1
## Team: RoboCops
### Team Members:
- Rongxi Meng (40045067)
- Chen Qian (27867808)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score, f1_score

## Section 2.1: Load the Dataset in Python
In this section, we will load the dataset and prepare it for the machine learning models.

### 2.1(a) Preprocessing the Penguin Dataset
The Penguin dataset contains string features 'island' and 'sex' that need to be converted to a numerical format suitable for MLP models. We will explore two methods for this conversion:
- **Method i**: Convert these features into 1-hot vectors (also known as dummy-coded data).
- **Method ii**: Manually convert these features into numerical categories.

### 2.1(b) Assessing the Abalone Dataset
We need to determine if the Abalone dataset can be used in its current form. If it contains features similar to the Penguin dataset that need conversion, we will apply the two methods mentioned above to transform any string features into a numerical format.

In [None]:
# Load the datasets
# Both paths are relative to the working directory the notebook is run from.
penguins_data = pd.read_csv("./datasets/penguins.csv")
abalone_data = pd.read_csv("./datasets/abalone.csv")

In [None]:
# Display the first few rows of the penguin dataset to verify that it loaded
penguins_data.head()

In [None]:
# Display the first few rows of the abalone dataset to verify that it loaded
abalone_data.head()

In [None]:
# For penguins_data: one-hot encode the two string features (Method i).
penguins_data_encoded = pd.get_dummies(penguins_data, columns=['island', 'sex'], drop_first=False)

# Convert boolean values to integers (0 and 1) for the one-hot encoded columns.
# get_dummies names the new columns '<original column>_<value>', so they are
# selected by the 'island_' / 'sex_' prefixes.
encoded_columns = [col for col in penguins_data_encoded.columns if col.startswith(('island_', 'sex_'))]
penguins_data_encoded[encoded_columns] = penguins_data_encoded[encoded_columns].astype(int)

In [None]:
# 'Type' is the original column with 'M', 'F', 'I' values for sex
# This column will be the target variable, so we don't include it in the one-hot encoding process

# Separate the features and the target variable
x_abalone = abalone_data.drop('Type', axis=1)  # Features
y_abalone = abalone_data['Type']  # Target variable

# Encode the target variable 'Type'
# NOTE(review): in the cells visible here this one-hot frame is only displayed;
# the classifiers are fitted on the raw labels in y_abalone.
abalone_data_encoded = pd.get_dummies(y_abalone, drop_first=False)

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the encoded penguin data.
# DataFrame.info() writes to stdout itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line).
penguins_data_encoded.info()

In [None]:
# Display the first few rows of the encoded penguin data to verify the new columns
print(penguins_data_encoded.head())

In [None]:
# Display the first few rows of the abalone features (the 'Type' column removed)
x_abalone.head()

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the abalone features
x_abalone.info()

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the one-hot encoded abalone target.
# DataFrame.info() writes to stdout itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line).
abalone_data_encoded.info()

In [None]:
# Display the first few rows of the one-hot encoded abalone target to verify
abalone_data_encoded.head()

## 2.2 Plot the percentage of instances in each output class
- Store the graphic in a file called `penguin-classes.gif` / `abalone-classes.gif`.
- This analysis of the dataset will allow you to determine if the classes are balanced.
- Decide which metric is more appropriate to evaluate the performance.
- Be prepared to discuss this at the demo.

In [None]:
# Plotting the distribution for 'species' in penguins_data
# Then use bash to convert the PNG to GIF in folder "result": magick convert penguin-classes.png penguin-classes.gif
species_counts = penguins_data['species'].value_counts(normalize=True) * 100  # share of each class, in percent
species_counts.plot(kind='bar', color='skyblue', figsize=(8, 6))
plt.title('Distribution of Species in Penguins Dataset')
plt.ylabel('Percentage (%)')
plt.xlabel('Species')
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.tight_layout()
# NOTE(review): enable the savefig call below to regenerate the PNG. It has to
# run before plt.show(); saving after show() usually produces an empty image.
#plt.savefig('./result/penguin-classes.png')
plt.show()
plt.close()

In [None]:
# Plotting the distribution for 'Type' in abalone_data
# Then use bash to convert the PNG to GIF in folder "result": magick convert abalone-classes.png abalone-classes.gif
type_counts = abalone_data['Type'].value_counts(normalize=True) * 100  # share of each class, in percent
type_counts.plot(kind='bar', color='coral', figsize=(8, 6))
plt.title('Distribution of Types in Abalone Dataset')
plt.ylabel('Percentage (%)')
plt.xlabel('Type')
plt.grid(axis='y')
plt.xticks(rotation=0)
plt.tight_layout()
# NOTE(review): enable the savefig call below to regenerate the PNG. It has to
# run before plt.show(); saving after show() usually produces an empty image.
#plt.savefig('./result/abalone-classes.png')
plt.show()
plt.close()

## 2.3 Dataset Splitting
- Utilize the `train_test_split` function with default parameters to divide the dataset into training and testing subsets.

In [None]:
# Separate the encoded penguin data into features and target.
# NOTE(review): no train/test split is performed in this cell; train_test_split
# is called later, inside the evaluation helpers.
x_penguins = penguins_data_encoded.drop('species', axis=1)  # Features (excluding the target column 'species')
y_penguins = penguins_data_encoded['species']  # Target column

## 2.4 Training and Testing Classifiers
This section involves the training and evaluation of four distinct classifiers. Each classifier will be assessed based on its performance metrics, and the results will be documented accordingly.

In [None]:
def evaluate_model_performance(model, x_test, y_test):
    """
    Score a fitted classifier on a held-out set.

    Parameters:
    - model: Fitted estimator exposing predict()
    - x_test: Features to predict on
    - y_test: True labels for x_test

    Returns a 7-tuple, in this order:
    - cm: Confusion matrix
    - precision, recall, f1: Per-class precision, recall, and F1-measure
    - accuracy: Overall accuracy
    - macro_f1: Macro-average F1
    - weighted_f1: Weighted-average F1
    """
    predictions = model.predict(x_test)

    # The fourth element (per-class support) is not reported
    per_class_precision, per_class_recall, per_class_f1, _support = precision_recall_fscore_support(y_test, predictions)

    return (
        confusion_matrix(y_test, predictions),
        per_class_precision,
        per_class_recall,
        per_class_f1,
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='macro'),
        f1_score(y_test, predictions, average='weighted'),
    )


def append_performance_to_file(filename, model_name, best_params, cm, precision, recall, f1, accuracy, macro_f1, weighted_f1):
    """
    Append one model's evaluation report to a text file.

    Parameters:
    - filename: Path of the report file (opened in append mode)
    - model_name: Label written in the section header
    - best_params: Hyper-parameters to record; the line is skipped when falsy
    - cm: Confusion matrix (written via str())
    - precision, recall, f1: Per-class scores, iterated in parallel
    - accuracy, macro_f1, weighted_f1: Overall scores
    """
    with open(filename, 'a') as report:
        report.write(f"***** {model_name} *****\n")

        if best_params:
            report.write(f"Best Parameters: {str(best_params)}\n")

        report.write("(B) Confusion Matrix:\n")
        report.write(str(cm) + "\n")

        report.write("(C) Precision, Recall, F1-measure for each class:\n")
        # Classes are labelled by their position in the score sequences
        per_class = zip(precision, recall, f1)
        for class_index, (prec, rec, f1_value) in enumerate(per_class):
            report.write(f"Class {class_index}: Precision={prec:.2f}, Recall={rec:.2f}, F1={f1_value:.2f}\n")

        report.write(f"(D) Accuracy: {accuracy:.2f}, Macro-average F1: {macro_f1:.2f}, Weighted-average F1: {weighted_f1:.2f}\n")
        report.write("*************************\n\n")


def run_model_multiple_times(model, x, y, num_runs=5):
    """
    Repeatedly split, fit and score a model, then summarise the scores.

    Each run draws a fresh train/test split, re-fits `model` in place on the
    training part, and prints the run's metrics.

    Parameters:
    - model: Estimator with fit() and predict()
    - x: Features of the dataset
    - y: Target labels of the dataset
    - num_runs: Number of split/fit/score repetitions

    Returns:
    - results: dict with 'avg_<metric>' and 'var_<metric>' entries for
      accuracy, macro_f1 and weighted_f1
    """
    scores = {'accuracy': [], 'macro_f1': [], 'weighted_f1': []}

    for run_index in range(num_runs):
        # A new random split for every run
        x_train, x_test, y_train, y_test = train_test_split(x, y)

        # Show the first 5 test labels so the different splits are visible
        print(f"Run {run_index+1}:")
        print(y_test.head())

        model.fit(x_train, y_train)
        predictions = model.predict(x_test)

        run_accuracy = accuracy_score(y_test, predictions)
        scores['accuracy'].append(run_accuracy)
        run_macro_f1 = f1_score(y_test, predictions, average='macro')
        scores['macro_f1'].append(run_macro_f1)
        run_weighted_f1 = f1_score(y_test, predictions, average='weighted')
        scores['weighted_f1'].append(run_weighted_f1)

        print(f"Accuracy: {run_accuracy:.2f}")
        print(f"Macro F1: {run_macro_f1:.2f}")
        print(f"Weighted F1: {run_weighted_f1:.2f}")
        print("----------------------------")

    # Mean and variance of every metric across the runs
    results = {}
    for metric, values in scores.items():
        results[f'avg_{metric}'] = np.mean(values)
        results[f'var_{metric}'] = np.var(values)

    return results


def evaluate_and_save_results(model, x, y, filename, model_name, best_params=None, num_runs=5):
    """
    Evaluate the model on a held-out split, save the results to a file, and run the model
    multiple times to compute average and variance of metrics.

    The model is re-fitted in place on the training part of a fresh split before it is
    scored. Scoring a model that was fitted on all of x/y against a subset of the same
    rows would report results on data it has already seen.

    Parameters:
    - model: The estimator to evaluate (re-fitted in place; its hyper-parameters are kept)
    - x: Features of the dataset
    - y: Target labels of the dataset
    - filename: Name of the file to save results (appended to)
    - model_name: Name of the model (e.g., "Base-DT")
    - best_params: Best parameters (if any) for the model
    - num_runs: Number of times to run the model for computing average and variance

    Returns:
    - None
    """

    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    # Fit on the training split only, so the test split below is unseen data
    model.fit(x_train, y_train)

    # Evaluate the performance of the model on the test data
    cm, precision, recall, f1, accuracy, macro_f1, weighted_f1 = evaluate_model_performance(model, x_test, y_test)

    # Append the performance metrics to the file
    append_performance_to_file(filename, model_name, best_params, cm, precision, recall, f1, accuracy, macro_f1, weighted_f1)

    # Run the model multiple times and compute average and variance of metrics
    results = run_model_multiple_times(model, x, y, num_runs)

    # Append the average and variance results to the file
    with open(filename, 'a') as f:
        f.write(f"***** {model_name} Multiple Runs ({num_runs} times) *****\n")
        f.write(f"(A) Average Accuracy: {results['avg_accuracy']:.2f}, Variance: {results['var_accuracy']:.2f}\n")
        f.write(f"(B) Average Macro F1: {results['avg_macro_f1']:.2f}, Variance: {results['var_macro_f1']:.2f}\n")
        f.write(f"(C) Average Weighted F1: {results['avg_weighted_f1']:.2f}, Variance: {results['var_weighted_f1']:.2f}\n")
        f.write("*************************\n\n")


### (a) Base-DT: Decision Tree with Default Parameters
- Illustrate the decision tree graphically.
- For the abalone dataset, you may limit the tree depth for visualization purposes.

In [None]:
def train_and_visualize_base_dt(x_train, y_train, feature_names, title, save_path, max_depth=None):
    """
    Train a base Decision Tree classifier with an optional maximum depth and visualize the tree.

    Parameters:
    - x_train: Training data features
    - y_train: Training data labels
    - feature_names: Names of the features in x_train (any iterable, or None)
    - title: Title for the visualization
    - save_path: Path to save the visualization
    - max_depth: Optional maximum depth of the tree (limits the fitted tree itself)

    Returns:
    - base_dt: Trained Decision Tree classifier
    """

    # Initialize the Decision Tree classifier with optional max_depth
    base_dt = DecisionTreeClassifier(max_depth=max_depth)

    # Train the classifier
    base_dt.fit(x_train, y_train)

    # plot_tree validates its label arguments and some scikit-learn releases only
    # accept plain lists, so a pandas Index / numpy array is converted first.
    feature_labels = None if feature_names is None else [str(name) for name in feature_names]
    class_labels = [str(label) for label in base_dt.classes_]

    # Visualize the Decision Tree
    plt.figure(figsize=(15, 10))
    plot_tree(base_dt, filled=True, feature_names=feature_labels, class_names=class_labels, rounded=True)
    plt.title(title)
    plt.savefig(save_path)  # saved before show() so the file contains the drawn figure
    plt.show()
    plt.close()  # release the figure, as the other plotting cells do

    return base_dt

In [None]:
# Use the function: fit and draw the Base-DT for the penguin data.
# NOTE(review): max_depth=5 is passed to the classifier, so it caps the fitted
# tree itself, not only the drawing.
base_dt_penguins = train_and_visualize_base_dt(x_penguins, y_penguins, x_penguins.columns, "Base Decision Tree for Penguins Data", './result/penguins_base_decision_tree.png',max_depth=5)

In [None]:
# Use the function: fit and draw the Base-DT for the abalone data.
# NOTE(review): max_depth=3 is passed to the classifier, so it caps the fitted
# tree itself, not only the drawing.
base_dt_abalone = train_and_visualize_base_dt(x_abalone, y_abalone, x_abalone.columns, "Base Decision Tree for Abalone Data", './result/abalone_base_decision_tree.png',max_depth=3)

In [None]:
# save result & txt for penguin model
# Appends the Base-DT report (single evaluation plus the multi-run summary) to the penguin file.
evaluate_and_save_results(base_dt_penguins, x_penguins, y_penguins, "./result/penguin-performance.txt", "Base-DT")

In [None]:
# save result & txt for abalone model
# Appends the Base-DT report (single evaluation plus the multi-run summary) to the abalone file.
evaluate_and_save_results(base_dt_abalone, x_abalone, y_abalone, "./result/abalone-performance.txt", "Base-DT")

### (b) Top-DT: Optimized Decision Tree via Grid Search
- Utilize grid search to find the best-performing Decision Tree based on the evaluation function established in step (3).
- Experiment with the following hyper-parameters:
  - Criterion: `gini` or `entropy`
  - Max depth: Choose two specific values or `None`
  - Min samples split: Select three specific values
- Graphically represent the decision tree.
- For the abalone dataset, consider limiting the tree depth for visualization purposes.

In [None]:
def train_and_visualize_top_dt(x_train, y_train, feature_names, param_grid, title, save_path):
    """
    Perform a grid search to find the best Decision Tree classifier, train it, and visualize the tree.

    Parameters:
    - x_train: Training data features
    - y_train: Training data labels
    - feature_names: Names of the features in x_train (any iterable, or None)
    - param_grid: Hyperparameters and their possible values for grid search
    - title: Title for the visualization
    - save_path: Path to save the visualization

    Returns:
    - best_dt: Best Decision Tree classifier from the grid search
    """

    # Initialize the Decision Tree classifier
    dt = DecisionTreeClassifier()

    # Set up the grid search (5-fold cross-validation, ranked by accuracy)
    grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', return_train_score=True)

    # Train the grid search on the training data
    grid_search.fit(x_train, y_train)

    # Extract the best Decision Tree model
    best_dt = grid_search.best_estimator_

    # plot_tree validates its label arguments and some scikit-learn releases only
    # accept plain lists, so a pandas Index / numpy array is converted first.
    # Class labels are also turned into strings, matching the Base-DT helper.
    feature_labels = None if feature_names is None else [str(name) for name in feature_names]
    class_labels = [str(label) for label in best_dt.classes_]

    # Visualize the best Decision Tree
    plt.figure(figsize=(15, 10))
    plot_tree(best_dt, filled=True, feature_names=feature_labels, class_names=class_labels, rounded=True)
    plt.title(title)
    plt.savefig(save_path)  # saved before show() so the file contains the drawn figure
    plt.show()
    plt.close()  # release the figure, as the other plotting cells do

    return best_dt

In [None]:
# Hyper-parameter grid for the Top-DT search: 2 criteria x 3 depths x 3 split
# thresholds = 18 candidate trees.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],  # None leaves the depth unlimited
    'min_samples_split': [2, 5, 10]
}

In [None]:
# Use the function: grid-search, fit and draw the Top-DT for the penguin data
top_dt_penguins = train_and_visualize_top_dt(x_penguins, y_penguins, x_penguins.columns, param_grid, "Top Decision Tree for Penguins Data", './result/penguins_top_decision_tree.png')

In [None]:
# Use the function: grid-search, fit and draw the Top-DT for the abalone data
top_dt_abalone = train_and_visualize_top_dt(x_abalone, y_abalone, x_abalone.columns, param_grid, "Top Decision Tree for Abalone Data", './result/abalone_top_decision_tree.png')

In [None]:
# Evaluate the grid-searched penguin tree. The winning values of the searched
# hyper-parameters are read back from the estimator so they are written to the
# report, as is done for Top-MLP.
top_dt_penguins_params = {name: top_dt_penguins.get_params()[name] for name in param_grid}
evaluate_and_save_results(top_dt_penguins, x_penguins, y_penguins, "./result/penguin-performance.txt", "Top-DT", best_params=top_dt_penguins_params)

In [None]:
# Evaluate the grid-searched abalone tree. The winning values of the searched
# hyper-parameters are read back from the estimator so they are written to the
# report, as is done for Top-MLP.
top_dt_abalone_params = {name: top_dt_abalone.get_params()[name] for name in param_grid}
evaluate_and_save_results(top_dt_abalone, x_abalone, y_abalone, "./result/abalone-performance.txt", "Top-DT", best_params=top_dt_abalone_params)

### (c) Base-MLP: Basic Multi-Layered Perceptron
- Construct a Multi-Layered Perceptron (MLP) with the following specifications:
  - Two hidden layers, each with 100 neurons.
  - Activation function: Sigmoid/Logistic.
  - Solver: Stochastic Gradient Descent (SGD).
  - Default values for all other parameters.


In [None]:
def train_base_mlp(x_train, y_train):
    """
    Build and fit the Base-MLP: two hidden layers of 100 neurons each,
    logistic (sigmoid) activation, SGD solver, fixed random seed.

    Parameters:
    - x_train: Training data features
    - y_train: Training data labels

    Returns:
    - base_mlp: Trained MLP classifier
    """
    mlp_settings = {
        'hidden_layer_sizes': (100, 100),
        'activation': 'logistic',
        'solver': 'sgd',
        'random_state': 42,
    }

    base_mlp = MLPClassifier(**mlp_settings)
    base_mlp.fit(x_train, y_train)

    return base_mlp

In [None]:
# Fit the Base-MLP on the full penguin feature/label set
base_mlp_penguins = train_base_mlp(x_penguins, y_penguins)

In [None]:
# Evaluate the Base-MLP model's performance on the penguin data, append the
# results to the penguin report file, and run the model multiple times to
# compute average and variance of metrics
evaluate_and_save_results(base_mlp_penguins, x_penguins, y_penguins, "./result/penguin-performance.txt", "Base-MLP")

In [None]:
# Fit the Base-MLP on the full abalone feature/label set
base_mlp_abalone = train_base_mlp(x_abalone, y_abalone)

In [None]:
# Evaluate the Base-MLP model's performance on the abalone data, append the
# results to the abalone report file, and run the model multiple times to
# compute average and variance of metrics
evaluate_and_save_results(base_mlp_abalone, x_abalone, y_abalone, "./result/abalone-performance.txt", "Base-MLP")

### (d) Top-MLP: Enhanced Multi-Layered Perceptron via Grid Search
- Implement a grid search to find a high-performing Multi-Layered Perceptron (MLP) model. The grid search should explore the following hyperparameters:
  - Activation functions to consider: `sigmoid` (named `logistic` in scikit-learn), `tanh`, and `relu`.
  - Network architectures: Choose two configurations, such as:
    - Two hidden layers with 30 and 50 nodes respectively.
    - Three hidden layers with 10 nodes each.
  - Solvers: `adam` and `stochastic gradient descent`.


In [None]:
# Define the hyperparameters and their possible values
# (3 activations x 2 architectures x 2 solvers = 12 candidates)
param_grid_mlp = {
    'activation': ['logistic', 'tanh', 'relu'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

# Initialize the MLP classifier
mlp = MLPClassifier(max_iter=1000)  # Setting max_iter to a higher value for convergence

# Initialize GridSearchCV (5-fold cross-validation, ranked by accuracy, all CPU cores)
grid_search_mlp = GridSearchCV(mlp, param_grid_mlp, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model on the penguin data
grid_search_mlp.fit(x_penguins, y_penguins)

# Get the best parameters
# NOTE: best_params_mlp is overwritten later when the search is re-run on the abalone data.
best_params_mlp = grid_search_mlp.best_params_
print("Best Parameters for Top-MLP:", best_params_mlp)

In [None]:
# Train the MLP with the best hyperparameters found for the penguin data
# (a fresh classifier is built from best_params_mlp and fitted on the full penguin set)
top_mlp = MLPClassifier(**best_params_mlp, max_iter=1000)
top_mlp.fit(x_penguins, y_penguins)

In [None]:
# Evaluate the penguin Top-MLP and record its best hyper-parameters in the report
evaluate_and_save_results(top_mlp, x_penguins, y_penguins, "./result/penguin-performance.txt", "Top-MLP", best_params=best_params_mlp)

In [None]:
# Fit the model: re-run the same grid search object on the abalone data
grid_search_mlp.fit(x_abalone, y_abalone)

# Get the best parameters (replaces the penguin values held in best_params_mlp)
best_params_mlp = grid_search_mlp.best_params_
print("Best Parameters for Top-MLP:", best_params_mlp)

In [None]:
# Train the MLP with the best hyperparameters found for the abalone data
# (top_mlp is rebound; the penguin Top-MLP object is no longer referenced by this name)
top_mlp = MLPClassifier(**best_params_mlp, max_iter=1000)
top_mlp.fit(x_abalone, y_abalone)

In [None]:
# Evaluate the abalone Top-MLP and record its best hyper-parameters in the report
evaluate_and_save_results(top_mlp, x_abalone, y_abalone, "./result/abalone-performance.txt", "Top-MLP", best_params=best_params_mlp)