# Assignment 2 - Multi-label classification

## Load Dataset
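Load the dataset with pandas. The file is `assignment-2/yeast.csv` in the repository; the code below opens it as `yeast.csv`, so the notebook must be run with `assignment-2/` as the working directory.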

In [36]:
import pandas as pd

# Location of the yeast CSV, relative to the notebook's working directory
dataset_path = 'yeast.csv'

# Read the whole file into a DataFrame
data = pd.read_csv(filepath_or_buffer=dataset_path)

# Show the first five rows as the cell's output
data.head(n=5)


Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.04185,0.066938,-0.056617,...,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.00797,0.049113,-0.03058,-0.077933,-0.080529,-0.016267,...,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.00767,0.079438,0.062184,...,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,1,1,0,0,0,0,0,0,0,0


In [37]:
# Dimensions of the frame as (rows, columns)
dataset_shape = data.shape
print("Shape of the dataset: ", dataset_shape)

# Total number of missing cells, summed over every column
missing_total = data.isna().sum().sum()
print("Number of missing values in the dataset: ", missing_total)

# Names of any columns stored with the object dtype
categorical_columns = data.select_dtypes(include=['object']).columns
print("Categorical columns in the dataset: ", categorical_columns)


Shape of the dataset:  (2417, 117)
Number of missing values in the dataset:  0
Categorical columns in the dataset:  Index([], dtype='object')


In [38]:

# Data Preprocessing
# The first 103 columns are the input features; the remaining columns
# (14 for the 117-column frame loaded above) are the multi-label targets.
n_feature_columns = 103
feature_frame = data.iloc[:, :n_feature_columns]
label_frame = data.iloc[:, n_feature_columns:]

X = feature_frame.to_numpy()  # feature matrix
y = label_frame.to_numpy()    # multi-label target matrix



## 1 Problem transformation 

### Binary Relevance vs Classifier Chains

**Binary Relevance Approach**
- *Independent Models*: Treats each label as an independent binary classification problem.
- *Simplicity*: Simple to implement and understand.
- *Scalability*: Scales well with a large number of labels since each label is treated separately.
- *Label Dependency*: Ignores any potential dependencies or correlations between labels.
- *Training*: Trains one binary classifier per label.

**Classifier Chains Approach**
- *Dependent Models*: Models the dependencies between labels by chaining classifiers.
- *Complexity*: More complex to implement and understand compared to binary relevance.
- *Scalability*: Can be computationally expensive with a large number of labels due to the chaining process.
- *Label Dependency*: Takes into account the dependencies and correlations between labels, potentially leading to better performance.
- *Training*: Trains classifiers sequentially, where each classifier considers the predictions of previous classifiers in the chain.

In [39]:
import numpy as np

from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import set_random_seed

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features using statistics learned from the training portion
# only, then apply that same transformation to both portions
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Define a Neural Network Architecture model 

def NeuralNetworkModel(input_dim, hidden_units=(64, 32), learning_rate=0.001):
    """Build and compile a feed-forward binary classifier.

    The network is a stack of ReLU ``Dense`` layers followed by a single
    sigmoid unit. It is compiled with the Adam optimizer, binary
    cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    hidden_units : sequence of int, optional
        Width of each hidden layer, in order. Defaults to ``(64, 32)``,
        the architecture this function previously hard-coded.
    learning_rate : float, optional
        Learning rate passed to Adam. Defaults to ``0.001``.

    Returns
    -------
    Model
        The compiled Keras model.
    """
    input_layer = Input(shape=(input_dim,))

    # Chain the hidden layers; with the defaults this is Dense(64) -> Dense(32)
    x = input_layer
    for units in hidden_units:
        x = Dense(units, activation='relu')(x)

    # Single sigmoid unit: one probability for the one label being modelled
    output_layer = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model


# Train Binary Classifiers for Each Label (binary relevance: one network per label)
# NOTE(review): the output saved with this cell is a CUDA driver/runtime
# version mismatch error, so no metrics were recorded. Re-run in a working
# TensorFlow environment before relying on any results.
SEED = 42         # same value as the random_state used for the train/test split
EPOCHS = 50
BATCH_SIZE = 32
THRESHOLD = 0.5   # a label is predicted present when its probability exceeds this

# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are as repeatable between runs as the backend allows.
set_random_seed(SEED)

n_labels = y_train.shape[1]

models = []
for i in range(n_labels):
    model = NeuralNetworkModel(X_train.shape[1])
    model.fit(X_train, y_train[:, i], epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0)
    models.append(model)

# Evaluate the Model
# Keep the raw probabilities under their own name so `y_pred` only ever holds
# the thresholded 0/1 labels.
y_prob = np.zeros(y_test.shape)
for i, model in enumerate(models):
    # verbose=0 stops Keras printing a progress bar for each of the models
    y_prob[:, i] = model.predict(X_test, verbose=0).flatten()

# Convert predictions to binary
y_pred = (y_prob > THRESHOLD).astype(int)

# Calculate accuracy and F1 score for each label
accuracies = [accuracy_score(y_test[:, i], y_pred[:, i]) for i in range(n_labels)]
# zero_division=0 keeps the score at 0 for a label with no positives in either
# the truth or the predictions, without emitting UndefinedMetricWarning
f1_scores = [
    f1_score(y_test[:, i], y_pred[:, i], zero_division=0) for i in range(n_labels)
]

print("Accuracies: ", accuracies)
print("F1 Scores: ", f1_scores)

InternalError: cudaGetDevice() failed. Status: CUDA driver version is insufficient for CUDA runtime version