# Assignment 2 - Multi-label classification

## Check for GPU

In [13]:
"""
import torch

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print("CUDA available: ", cuda_available)

# Set device to CUDA if available, otherwise CPU
device = torch.device("cuda" if cuda_available else "cpu")
print("Device set to: ", device) 
""" 

'\nimport torch\n\n# Check if CUDA is available\ncuda_available = torch.cuda.is_available()\nprint("CUDA available: ", cuda_available)\n\n# Set device to CUDA if available, otherwise CPU\ndevice = torch.device("cuda" if cuda_available else "cpu")\nprint("Device set to: ", device) \n'

## Load Dataset
Load the dataset from 'assignment-2/yeast.csv' using pandas.

In [14]:
import pandas as pd

dataset_path = 'yeast.csv'

# Load the dataset
data = pd.read_csv(dataset_path)

# Print the shape of the dataset
print("Shape of the dataset: ", data.shape)

# Check how many missing values are in the dataset
print("Number of missing values in the dataset: ",data.isnull().sum().sum())

# Check for any categorical columns in the dataset
print("Categorical columns in the dataset: ",data.select_dtypes(include=['object']).columns)

# Print the head of the dataset
data.head()


Shape of the dataset:  (2417, 117)
Number of missing values in the dataset:  0
Categorical columns in the dataset:  Index([], dtype='object')


Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.04185,0.066938,-0.056617,...,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.00797,0.049113,-0.03058,-0.077933,-0.080529,-0.016267,...,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.00767,0.079438,0.062184,...,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,1,1,0,0,0,0,0,0,0,0


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Data Preprocessing

# Split the dataset into features and multi-label targets
X = data.iloc[:, :103].values  # Features (first 103 columns)
y = data.iloc[:, 103:].values  # Multi-label targets (last 14 columns)

# Standardize the features before splitting the dataset
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



## 1 Problem transformation 

### Binary Relavance vs Classifier Chains 

**Binary Relevance Approach**
- *Independent Models*: Treats each label as an independent binary classification problem.
- *Simplicity*: Simple to implement and understand.
- *Scalability*: Scales well with a large number of labels since each label is treated separately.
- *Label Dependency*: Ignores any potential dependencies or correlations between labels.
- *Training*: Trains one binary classifier per label.

**Classifier Chains Approach**
- *Dependent Models*: Models the dependencies between labels by chaining classifiers.
- *Complexity*: More complex to implement and understand compared to binary relevance.
- *Scalability*: Can be computationally expensive with a large number of labels due to the chaining process.
- *Label Dependency*: Takes into account the dependencies and correlations between labels, potentially leading to better performance.
- *Training*: Trains classifiers sequentially, where each classifier considers the predictions of previous classifiers in the chain.

In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

# Define the Neural Network base classifier
# We add early stopping to prevent overfitting and reduce training time when the validation score stops improving and attempts to converge for a large number of iterations
fixed_nn= MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42, early_stopping=True, n_iter_no_change=100)
# At 10 n_iter_no_change, the classification report gave more labels with zero division error, so we increased it significantly to 100

# Binary Relevance Classifier
binary_relevance_clf = MultiOutputClassifier(fixed_nn)
binary_relevance_clf.fit(X_train, y_train)

# Evaluate performance
y_pred_br = binary_relevance_clf.predict(X_test)
print("Binary Relevance Classification Report: ")
print(classification_report(y_test, y_pred_br, zero_division=0))


Binary Relevance Classification Report: 
              precision    recall  f1-score   support

           0       0.65      0.53      0.59       167
           1       0.60      0.59      0.60       211
           2       0.70      0.66      0.68       196
           3       0.62      0.53      0.57       171
           4       0.62      0.50      0.55       144
           5       0.47      0.36      0.41       127
           6       0.37      0.25      0.30        76
           7       0.33      0.06      0.10        83
           8       0.00      0.00      0.00        30
           9       0.40      0.04      0.07        55
          10       0.00      0.00      0.00        62
          11       0.76      0.95      0.84       366
          12       0.76      0.95      0.85       365
          13       0.00      0.00      0.00         7

   micro avg       0.68      0.62      0.65      2060
   macro avg       0.45      0.39      0.40      2060
weighted avg       0.61      0.62      

In [17]:
from sklearn.multioutput import ClassifierChain

# Classifier Chains
chain_clf = ClassifierChain(fixed_nn)

# Train the model using Classifier Chains
chain_clf.fit(X_train, y_train)

# Predict using the Classifier Chains model
y_pred_cc = chain_clf.predict(X_test)

# Evaluate performance
print("Classifier Chains Classification Report:")
print(classification_report(y_test, y_pred_cc, zero_division=0))

Classifier Chains Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.53      0.59       167
           1       0.60      0.52      0.56       211
           2       0.71      0.65      0.68       196
           3       0.65      0.60      0.63       171
           4       0.63      0.51      0.56       144
           5       0.52      0.35      0.42       127
           6       0.38      0.32      0.34        76
           7       0.35      0.29      0.32        83
           8       0.50      0.07      0.12        30
           9       0.35      0.13      0.19        55
          10       0.30      0.11      0.16        62
          11       0.77      0.92      0.84       366
          12       0.77      0.92      0.84       365
          13       0.00      0.00      0.00         7

   micro avg       0.67      0.62      0.65      2060
   macro avg       0.51      0.42      0.45      2060
weighted avg       0.64      0.62      

## 2 Adapted algorithm

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import make_scorer, f1_score

# Redefine the Neural Network base classifier making sure 'relu' is used as the activation function since it allows each label to be predicted independently
neural_network = MLPClassifier(max_iter=1000, random_state=42, activation='relu', early_stopping=True, n_iter_no_change=100)

# Through the use of the sklaern multioutput classifier, we can train a single model to predict multiple labels. Ensuring each lable is threated independently for cross-entropy loss calculations and wrapper methods can be used to train a single model to predict multiple labels.
model = MultiOutputClassifier(fixed_nn)

# Define a grid of hyperparameters to search over for hyperparameter optimization (HPO)
param_grid = {
    # The number of neurons in the hidden layer is important to vary as it can affect the model's capacity to learn and may differ for each label
    'estimator__hidden_layer_sizes': [(50,), (100,), (150,)],
    # Certian activation functions may be baised towards certain labels so it is important to vary this hyperparameter to give a balanced model
    'estimator__activation': ['relu', 'tanh', 'logistic'],
    # The learning rate is important feature in training a model and different labels may converge at different rates so it is important to vary this hyperparameter
    'estimator__learning_rate_init': [1e-3, 1e-4, 1e-5]
}

# GridSearchCV from the sklearn library is used due to its systematic evaluation of hyperparameters and cross-validation. Especially useful for multi-label classification problems. 
# A KFold cross-validation with 5 splits is used to ensure the model is evaluated on different subsets of the data to ensure the model is generalizing well. So that is it trains on 4 folds and evaluates on the 5th fold.
grid_search = GridSearchCV(model, param_grid, scoring=make_scorer(f1_score, average='macro'), cv=5, n_jobs=-1)
# n_jobs = -1 uses all available cores to speed up the grid search (parallel processing for CPUs)

# Fit the defined model with the grid of hyperparameters to the training data
grid_search.fit(X_train, y_train)

# Best parameters from the grid search
print("Best parameters found:", grid_search.best_params_)

# Extract the best model from the grid search
best_model = grid_search.best_estimator_

# Predict and evaluate from the best model
y_pred = best_model.predict(X_test)
print("Best Model - Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))




Best parameters found: {'estimator__activation': 'tanh', 'estimator__hidden_layer_sizes': (50,), 'estimator__learning_rate_init': 0.0001}
Best Model - Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.54      0.63       167
           1       0.52      0.46      0.49       211
           2       0.66      0.69      0.67       196
           3       0.64      0.65      0.64       171
           4       0.53      0.43      0.48       144
           5       0.45      0.34      0.39       127
           6       0.37      0.20      0.26        76
           7       0.41      0.14      0.21        83
           8       0.23      0.10      0.14        30
           9       0.31      0.09      0.14        55
          10       0.25      0.05      0.08        62
          11       0.77      0.89      0.83       366
          12       0.77      0.88      0.82       365
          13       0.00      0.00      0.00         7

   micro avg  