## Lab 06: Deep Neural Nets Applied to a Classification Problem

### Case Study 04: Breast Cancer Classification

### 1. Metadata  

* **data**: This is a 2D array (matrix) containing the features of the dataset. Each row corresponds to a sample (tumor), and each column corresponds to a feature (measurement).  
* **target**: This is a 1D array containing the labels for each sample. The labels indicate whether the tumor is malignant (0) or benign (1), matching the order of `target_names`.   
* **feature_names**: This is a list of strings representing the names of the features in the dataset.   
* **target_names**: This is a list of strings representing the names of the target classes. In this case, it indicates the two classes: malignant and benign.  
* **DESCR**: This is a string containing a detailed description of the dataset, including information about how it was collected, its purpose, and any relevant notes.  
* **filename**: This is a string that indicates the name of the dataset file.  
* **data_module**: This is a string naming the Python module (package) in which the dataset file is bundled.

In [41]:
# import packages
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import seaborn as sbn
import matplotlib.pyplot as plt
import time

### 2. Load dataset

In [None]:
# load the breast cancer dataset from sklearn.datasets
data = load_breast_cancer()
# bare expression: the notebook shows the loaded object as the cell output
data

In [None]:
# get column names: the feature names, one per column of data.data
metadata = data.feature_names
print(metadata)

### 3. Data Preprocessing

In [7]:
# separate the feature matrix (independent variables) from the labels (dependent variable)
x, y = data.data, data.target

In [8]:
# hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# report the shape of every subset produced by the split, one line per subset
print(
    *(
        f"{label} {subset.shape}"
        for label, subset in (
            ("Dimensions in x-train: ", x_train),
            ("Dimensions in y-train: ", y_train),
            ("Dimensions in x-test: ", x_test),
            ("Dimensions in y-test: ", y_test),
        )
    ),
    sep="\n",
)

In [None]:
# the scaler's statistics are learned from the training split only
transformer = StandardScaler().fit(x_train)
# bare expression: the notebook shows the fitted scaler as the cell output
transformer

In [18]:
# apply the already-fitted scaler to both the train and the test features
x_train, x_test = (transformer.transform(split) for split in (x_train, x_test))

In [19]:
# convert the numpy arrays to float32 PyTorch tensors
# features keep their (samples, features) layout
x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
# labels become column vectors of shape (samples, 1)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

In [None]:
# show the shapes of the training tensors, one line per tensor
print(
    *(
        f"{label} {tensor.shape}"
        for label, tensor in (
            ("dim x-train tensor: ", x_train_tensor),
            ("y-train tensor: ", y_train_tensor),
        )
    ),
    sep="\n",
)

In [24]:
# Wrap each split in a TensorDataset and serve it in mini-batches of 16
# training batches are reshuffled on every pass over the data
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
# test batches keep the original sample order
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

### 4. Build Model Architecture

In [25]:
# Define the neural network architecture
class ClassifierNeuralNet(nn.Module):
    """Fully connected classifier with a configurable stack of hidden layers.

    Every hidden ``nn.Linear`` layer is followed by the selected activation, and
    the output ``nn.Linear`` layer is followed by a sigmoid, so ``forward``
    returns values between 0 and 1.

    Args:
        input_neurons: number of input features.
        output_neurons: number of output units.
        num_hidden_layers: how many hidden layers to build; the first
            ``num_hidden_layers`` entries of ``hidden_neurons_x_layer`` are used.
        hidden_neurons_x_layer: sequence with the width of each hidden layer.
        activation: name of the hidden activation: 'relu', 'tanh' or 'sigmoid'.

    Raises:
        ValueError: if ``activation`` is not a supported name, or if
            ``num_hidden_layers`` is smaller than 1 or larger than
            ``len(hidden_neurons_x_layer)``.
    """

    # supported hidden activations, keyed by the name used in the experiment configs
    _ACTIVATIONS = {'relu': nn.ReLU, 'tanh': nn.Tanh, 'sigmoid': nn.Sigmoid}

    def __init__(self, input_neurons, output_neurons, num_hidden_layers, hidden_neurons_x_layer, activation):
        super().__init__()

        # fail loudly instead of silently building a network without nonlinearities
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; expected one of {sorted(self._ACTIVATIONS)}"
            )
        if not 1 <= num_hidden_layers <= len(hidden_neurons_x_layer):
            raise ValueError(
                f"num_hidden_layers={num_hidden_layers} must be between 1 and "
                f"len(hidden_neurons_x_layer)={len(hidden_neurons_x_layer)}"
            )
        activation_cls = self._ACTIVATIONS[activation]

        # hidden layers definition: each Linear is followed by its activation,
        # including the first one (two stacked Linear layers with nothing in
        # between are equivalent to a single Linear layer)
        layers = []
        in_features = input_neurons
        for width in hidden_neurons_x_layer[:num_hidden_layers]:
            layers.append(nn.Linear(in_features, width))
            layers.append(activation_cls())
            in_features = width

        # output layer definition: wired to the last hidden layer actually built
        layers.append(nn.Linear(in_features, output_neurons))
        layers.append(nn.Sigmoid())

        # model architecture definition
        self.model = nn.Sequential(*layers)

    # forward step
    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid outputs."""
        return self.model(x)

In [26]:
# define list of experiments: every activation is paired with both the
# two-layer and the three-layer architecture (each entry gets its own list)
experiments = [
    {'num_layers': len(widths), 'neurons_per_layer': list(widths), 'activation': activation_name}
    for activation_name in ('relu', 'tanh', 'sigmoid')
    for widths in ((16, 8), (32, 16, 8))
]

### 5. Train/test process

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# bare expression: the notebook shows the selected device as the cell output
device

In [34]:
# Define the training function for classification
def train_model(model, train_loader, device, num_epochs, lr=0.001):
    """Train ``model`` in place with binary cross-entropy loss and the Adam optimizer.

    Args:
        model: module to train. ``nn.BCELoss`` is applied directly to its
            outputs, so they must be values between 0 and 1. The model is not
            moved to ``device`` here; only the batches are.
        train_loader: iterable yielding ``(features, targets)`` batches.
        device: device every batch is moved to before the forward pass.
        num_epochs: number of full passes over ``train_loader``.
        lr: learning rate handed to Adam (defaults to the previously
            hard-coded 0.001).

    Returns:
        The same ``model`` object, after training.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # make sure the model is in training mode at the start of every epoch
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            # gradients accumulate by default, so clear them before each step
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

    return model

In [36]:
# Define the function to run the training and evaluation process
def run_training(input_size, experiments, train_loader, x_test_tensor, y_test, device, num_epochs = 100):
    """Train and evaluate one ``ClassifierNeuralNet`` per experiment configuration.

    Args:
        input_size: number of input features of the network.
        experiments: iterable of dicts with the keys 'num_layers',
            'neurons_per_layer' and 'activation'.
        train_loader: loader of training batches, forwarded to ``train_model``.
        x_test_tensor: test features; moved to ``device`` for prediction.
        y_test: true test labels, compared against the thresholded predictions.
        device: device the model and the test features are placed on.
        num_epochs: number of training epochs per experiment.

    Returns:
        A list with one dict per experiment holding the keys 'experiment'
        (the configuration dict itself), 'accuracy', 'precision', 'recall',
        'f1_score' and 'training_time' (seconds).
    """
    experiment_results = []

    for experiment in experiments:
        print(f"Run experiment with hidden layers: {experiment['neurons_per_layer']}, activation: {experiment['activation']}")

        model = ClassifierNeuralNet(input_neurons=input_size,
                                     output_neurons=1,
                                     num_hidden_layers=experiment['num_layers'],
                                     hidden_neurons_x_layer=experiment['neurons_per_layer'],
                                     activation=experiment['activation']).to(device)

        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # intervals; time.time() can jump if the system clock is adjusted
        start_time = time.perf_counter()
        trained_model = train_model(model, train_loader, device, num_epochs)
        training_time = time.perf_counter() - start_time

        # Testing
        trained_model.eval()
        with torch.no_grad():
            y_pred_probs = trained_model(x_test_tensor.to(device)).cpu().numpy()
            y_pred = (y_pred_probs > 0.5).astype(int).flatten()  # Convert probabilities to binary predictions

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print(f"Training time: {training_time:.2f} seconds")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        experiment_results.append({
            'experiment': experiment,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'training_time': training_time
        })

    return experiment_results

In [None]:
# execute training experiment
# the input width is read from the training tensor so it always matches the
# number of features actually fed to the network
exp_results = run_training(input_size=x_train_tensor.shape[1], experiments=experiments, train_loader=train_loader,
                           x_test_tensor=x_test_tensor, y_test=y_test, device=device)

### 6. Monitoring the results

In [64]:
# Extracting data for plotting
def monitoring_results(exp_results):
    """Draw one annotated bar chart per metric, comparing all experiments.

    Args:
        exp_results: sequence of result dicts, each holding the keys
            'accuracy', 'precision', 'recall', 'f1_score' and 'training_time'.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'training_time']

    # flatten the results into long format: one row per (experiment, metric) pair
    rows = [
        {'experiment': f"Experiment {run_number}", 'metric': name, 'value': outcome[name]}
        for run_number, outcome in enumerate(exp_results, start=1)
        for name in metrics
    ]
    df = pd.DataFrame(rows, columns=['experiment', 'metric', 'value'])

    # one figure holding a 3x2 grid, one panel per metric
    plt.figure(figsize=(10, 14))

    for panel, name in enumerate(metrics, start=1):
        ax = plt.subplot(3, 2, panel)
        panel_label = name.capitalize()
        metric_rows = df[df['metric'] == name]

        bar_plot = sbn.barplot(data=metric_rows, x='experiment', y='value',
                               hue='experiment', palette='viridis', legend=False)
        plt.title(panel_label)
        plt.xticks(rotation=45)
        plt.ylabel(panel_label)

        # scores share a fixed 0-1.1 range; training time keeps an automatic upper limit
        if name == 'training_time':
            upper_limit = None
        else:
            upper_limit = 1.1
        plt.ylim(0, upper_limit)

        # write each bar's value just above the bar
        for bar in bar_plot.patches:
            bar_height = bar.get_height()
            bar_center = bar.get_x() + bar.get_width() / 2.
            ax.annotate(f'{bar_height:.3f}',
                        (bar_center, bar_height),
                        ha='center', va='bottom', fontsize=10, color='black',
                        xytext=(0, 5), textcoords='offset points')

    plt.tight_layout()
    plt.show()


In [None]:
# plot the per-metric comparison of all experiments collected in exp_results
monitoring_results(exp_results)