In [2]:
%cd /kaggle/input/chemcancer-v2/src/
%mkdir /kaggle/working/CV_CNN_cm

/kaggle/input/chemcancer-v2/src


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from tensorflow.keras.optimizers import Adam
from data import *
from machine_learning_models import *
from deep_learning_models import *
from vision_transformer import *
from utils_dl_model import *
from utils_ml_model import print_ml_results
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint



In [4]:
# Load the dataset
def load_data(file_name):
    """Read the CSV file at ``file_name`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_name)

# Extract the X and y data from the dataset.
def extract_data(data):
    """Split a loaded table into a feature matrix and integer-encoded labels.

    Parameters:
    - data: DataFrame whose columns from position 4 onward hold the features
      and whose 'Cell type' column holds the class label of each row.

    Returns:
    - X: float array built from every column at position 4 and beyond.
    - y_encoded: the 'Cell type' values transformed by a freshly fitted
      LabelEncoder.
    - encoder: that fitted LabelEncoder, so callers can recover the labels.
    """
    # Everything from the fifth column onward is treated as a feature.
    feature_columns = data.iloc[:, 4:]
    X = np.array(feature_columns, dtype=float)

    # The class label of each row lives in the 'Cell type' column.
    raw_labels = np.array(data['Cell type'])

    # Fit the encoder on the labels and encode them in one step.
    encoder = LabelEncoder()
    return X, encoder.fit_transform(raw_labels), encoder

def display_label_mapping(encoder):
    """Print one ``index -> label`` line for each entry of ``encoder.classes_``."""
    index = 0
    for label in encoder.classes_:
        print(f"{index} -> {label}")
        index += 1

# If you run this function with your encoder in your environment, it will print the mapping.
# display_label_mapping(encoder)

In [6]:
def compute_basic_metrics_with_labels(confusion_matrix, mapping):
    """
    Derive one-vs-rest TP, FP, FN and TN counts for every class.

    For class ``i`` the diagonal cell ``[i, i]`` is TP, the rest of column
    ``i`` is FP, the rest of row ``i`` is FN, and every cell outside both
    row ``i`` and column ``i`` is TN.

    Parameters:
    - confusion_matrix: 2-D array indexed as ``[row, column]``; the number of
      classes is taken from its first dimension
    - mapping: lookup from class index to the original label

    Returns:
    - A dictionary keyed by original label, each value being a dictionary
      with the keys 'TP', 'FP', 'FN' and 'TN'.
    """
    class_count = confusion_matrix.shape[0]
    per_class = {}

    for cls in range(class_count):
        label = mapping[cls]
        # Indices of all the other classes, in ascending order.
        others = [idx for idx in range(class_count) if idx != cls]

        true_pos = confusion_matrix[cls, cls]
        false_pos = sum(confusion_matrix[row, cls] for row in others)
        false_neg = sum(confusion_matrix[cls, col] for col in others)
        true_neg = sum(confusion_matrix[row, col] for row in others for col in others)

        per_class[label] = {
            'TP': true_pos,
            'FP': false_pos,
            'FN': false_neg,
            'TN': true_neg
        }

    return per_class

def compute_metrics_with_labels(confusion_matrix, mapping):
    """
    Derive one-vs-rest Sensitivity, Specificity and Precision for every class.

    Sensitivity is TP / (TP + FN), Specificity is TN / (TN + FP) and
    Precision is TP / (TP + FP). A ratio whose denominator is zero is
    reported as 0.

    Parameters:
    - confusion_matrix: 2-D array indexed as ``[row, column]``; the number of
      classes is taken from its first dimension
    - mapping: lookup from class index to the original label

    Returns:
    - A dictionary keyed by original label, each value being a dictionary
      with the keys 'Sensitivity', 'Specificity' and 'Precision'.
    """
    def _safe_ratio(numerator, denominator):
        # A zero denominator yields 0 instead of a division error.
        if denominator != 0:
            return numerator / denominator
        return 0

    class_count = confusion_matrix.shape[0]
    per_class = {}

    for cls in range(class_count):
        label = mapping[cls]
        # Indices of all the other classes, in ascending order.
        others = [idx for idx in range(class_count) if idx != cls]

        true_pos = confusion_matrix[cls, cls]
        false_pos = sum(confusion_matrix[row, cls] for row in others)
        false_neg = sum(confusion_matrix[cls, col] for col in others)
        true_neg = sum(confusion_matrix[row, col] for row in others for col in others)

        per_class[label] = {
            'Sensitivity': _safe_ratio(true_pos, true_pos + false_neg),
            'Specificity': _safe_ratio(true_neg, true_neg + false_pos),
            'Precision': _safe_ratio(true_pos, true_pos + false_pos)
        }

    return per_class

# Both functions above key their result dictionaries by the original class labels.

In [7]:
import pandas as pd

def metrics_to_dataframe(metrics_list, model_name):
    """
    Build a per-fold metrics table with a trailing average row.

    Parameters:
    - metrics_list: one nested dictionary per fold, shaped as
      ``{label: {metric_name: value}}``
    - model_name: name of the model (e.g., "CNN" or "VIT")

    Returns:
    - A pandas DataFrame with a leading 'Model' column followed by one
      "<label> <metric_name>" column per metric. There is one row per fold
      tagged with ``model_name``, plus a final row indexed 'Average' that
      holds the column means and is tagged ``model_name + ' Avg'``.
    """
    # Collapse each fold's nested dictionary into a single flat record.
    fold_records = [
        {
            f"{key} {metric_name}": metric_value
            for key, value in metrics.items()
            for metric_name, metric_value in value.items()
        }
        for metrics in metrics_list
    ]
    per_fold = pd.DataFrame(fold_records)

    # Take the column means before the text 'Model' column is added.
    average_row = per_fold.mean().to_frame(name='Average').transpose()

    per_fold['Model'] = model_name
    combined = pd.concat([per_fold, average_row])
    # The appended average row has no 'Model' value yet; tag it explicitly.
    model_position = combined.columns.get_loc('Model')
    combined.iloc[-1, model_position] = model_name + ' Avg'

    # 'Model' was added last, so move it to the front.
    leading_model = ['Model'] + list(combined.columns[:-1])
    return combined[leading_model]

In [8]:
# Load the dataset CSV into a DataFrame.
data_file = "/kaggle/input/chemcancer-v2/Data/HC05_HC07.csv"
data = load_data(data_file)

# Split into the feature matrix, the encoded 'Cell type' labels and the fitted
# encoder, then print the index -> label mapping the encoder produced.
X_raw, y_encoded, encoder = extract_data(data)
display_label_mapping(encoder)

0 -> Cancer cell line
1 -> Monocyte
2 -> T-cells


## 1D-ori-BLS-Non-Filtered

In [9]:
import pickle

num_folds = 5  # fold files are numbered 1..num_folds
confusion_matrices = []

# Load one pickled confusion matrix per fold.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only use
# it on files from a trusted source.
for fold in range(1, num_folds + 1):
    file_path = f"/kaggle/input/cmatrix-1d-ori-aug-bls/1D-ori-BLS-Non-Filtered/CV_BLS_cm/BLS_cm_fold_{fold}.pkl"
    with open(file_path, 'rb') as cm_file:
        cm = pickle.load(cm_file)
        confusion_matrices.append(cm)

# confusion_matrices now holds the loaded matrices in fold order.

# Class index -> original label, passed to the metric helpers below.
mapping = {
    0: 'Cancer cell line',
    1: 'Monocyte',
    2: 'T-cells'
}

# Per-fold counts (TP/FP/FN/TN) and rates (Sensitivity/Specificity/Precision),
# each turned into a table with a trailing average row.
# NOTE(review): the tables are tagged 'CNN-500' although the matrices come
# from a BLS directory - confirm the intended model label. The 'Model' column
# is dropped by the per-label filters below, so the printed tables are unaffected.
cnn_basic_metrics_list = [compute_basic_metrics_with_labels(cm, mapping) for cm in confusion_matrices]
cnn_advanced_metrics_list = [compute_metrics_with_labels(cm, mapping) for cm in confusion_matrices]
cnn_basic_df = metrics_to_dataframe(cnn_basic_metrics_list, 'CNN-500')
cnn_advanced_df = metrics_to_dataframe(cnn_advanced_metrics_list, 'CNN-500')

# Keep only the columns whose name contains the given class label.
cancer_basic_df = cnn_basic_df.filter(like='Cancer cell line', axis=1)
monocyte_basic_df = cnn_basic_df.filter(like='Monocyte', axis=1)
t_cells_basic_df = cnn_basic_df.filter(like='T-cells', axis=1)

# Render each per-label table as LaTeX with two decimals. index=False drops
# the row labels, so the final average row is not labelled in the output.
cancer_basic_latex = cancer_basic_df.to_latex(index=False, float_format="%.2f")
monocyte_basic_latex = monocyte_basic_df.to_latex(index=False, float_format="%.2f")
t_cells_basic_latex = t_cells_basic_df.to_latex(index=False, float_format="%.2f")

# Same filtering and LaTeX rendering for cnn_advanced_df.

cancer_advanced_df = cnn_advanced_df.filter(like='Cancer cell line', axis=1)
monocyte_advanced_df = cnn_advanced_df.filter(like='Monocyte', axis=1)
t_cells_advanced_df = cnn_advanced_df.filter(like='T-cells', axis=1)

cancer_advanced_latex = cancer_advanced_df.to_latex(index=False, float_format="%.2f")
monocyte_advanced_latex = monocyte_advanced_df.to_latex(index=False, float_format="%.2f")
t_cells_advanced_latex = t_cells_advanced_df.to_latex(index=False, float_format="%.2f")
# Print the counts table followed by the rates table, one class at a time.
print(cancer_basic_latex)
print(cancer_advanced_latex)

print(monocyte_basic_latex)
print(monocyte_advanced_latex)

print(t_cells_basic_latex)
print(t_cells_advanced_latex)

\begin{tabular}{rrrr}
\toprule
Cancer cell line TP & Cancer cell line FP & Cancer cell line FN & Cancer cell line TN \\
\midrule
167.00 & 81.00 & 16.00 & 211.00 \\
162.00 & 77.00 & 21.00 & 215.00 \\
170.00 & 92.00 & 13.00 & 200.00 \\
163.00 & 78.00 & 20.00 & 213.00 \\
167.00 & 74.00 & 16.00 & 217.00 \\
165.80 & 80.40 & 17.20 & 211.20 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrr}
\toprule
Cancer cell line Sensitivity & Cancer cell line Specificity & Cancer cell line Precision \\
\midrule
0.91 & 0.72 & 0.67 \\
0.89 & 0.74 & 0.68 \\
0.93 & 0.68 & 0.65 \\
0.89 & 0.73 & 0.68 \\
0.91 & 0.75 & 0.69 \\
0.91 & 0.72 & 0.67 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrrr}
\toprule
Monocyte TP & Monocyte FP & Monocyte FN & Monocyte TN \\
\midrule
88.00 & 32.00 & 54.00 & 301.00 \\
86.00 & 28.00 & 56.00 & 305.00 \\
88.00 & 29.00 & 54.00 & 304.00 \\
74.00 & 41.00 & 67.00 & 292.00 \\
84.00 & 33.00 & 57.00 & 300.00 \\
84.00 & 32.60 & 57.60 & 300.40 \\
\bottomrule
\end{tabular}

\begin{tabular}{

## 1D-ori-BLS-Filtered

In [11]:
import pickle

num_folds = 5  # fold files are numbered 1..num_folds
confusion_matrices = []

# Load one pickled confusion matrix per fold.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only use
# it on files from a trusted source.
for fold in range(1, num_folds + 1):
    file_path = f"/kaggle/input/cmatrix-1d-ori-aug-bls/1D-ori-BLS-Filtered/CV_BLS_cm/BLS_cm_fold_{fold}.pkl"
    with open(file_path, 'rb') as cm_file:
        cm = pickle.load(cm_file)
        confusion_matrices.append(cm)

# confusion_matrices now holds the loaded matrices in fold order.

# Class index -> original label, passed to the metric helpers below.
mapping = {
    0: 'Cancer cell line',
    1: 'Monocyte',
    2: 'T-cells'
}

# Per-fold counts (TP/FP/FN/TN) and rates (Sensitivity/Specificity/Precision),
# each turned into a table with a trailing average row.
# NOTE(review): the tables are tagged 'CNN-500' although the matrices come
# from a BLS directory - confirm the intended model label. The 'Model' column
# is dropped by the per-label filters below, so the printed tables are unaffected.
cnn_basic_metrics_list = [compute_basic_metrics_with_labels(cm, mapping) for cm in confusion_matrices]
cnn_advanced_metrics_list = [compute_metrics_with_labels(cm, mapping) for cm in confusion_matrices]
cnn_basic_df = metrics_to_dataframe(cnn_basic_metrics_list, 'CNN-500')
cnn_advanced_df = metrics_to_dataframe(cnn_advanced_metrics_list, 'CNN-500')

# Keep only the columns whose name contains the given class label.
cancer_basic_df = cnn_basic_df.filter(like='Cancer cell line', axis=1)
monocyte_basic_df = cnn_basic_df.filter(like='Monocyte', axis=1)
t_cells_basic_df = cnn_basic_df.filter(like='T-cells', axis=1)

# Render each per-label table as LaTeX with two decimals. index=False drops
# the row labels, so the final average row is not labelled in the output.
cancer_basic_latex = cancer_basic_df.to_latex(index=False, float_format="%.2f")
monocyte_basic_latex = monocyte_basic_df.to_latex(index=False, float_format="%.2f")
t_cells_basic_latex = t_cells_basic_df.to_latex(index=False, float_format="%.2f")

# Same filtering and LaTeX rendering for cnn_advanced_df.

cancer_advanced_df = cnn_advanced_df.filter(like='Cancer cell line', axis=1)
monocyte_advanced_df = cnn_advanced_df.filter(like='Monocyte', axis=1)
t_cells_advanced_df = cnn_advanced_df.filter(like='T-cells', axis=1)

cancer_advanced_latex = cancer_advanced_df.to_latex(index=False, float_format="%.2f")
monocyte_advanced_latex = monocyte_advanced_df.to_latex(index=False, float_format="%.2f")
t_cells_advanced_latex = t_cells_advanced_df.to_latex(index=False, float_format="%.2f")
# Print the counts table followed by the rates table, one class at a time.
print(cancer_basic_latex)
print(cancer_advanced_latex)

print(monocyte_basic_latex)
print(monocyte_advanced_latex)

print(t_cells_basic_latex)
print(t_cells_advanced_latex)

\begin{tabular}{rrrr}
\toprule
Cancer cell line TP & Cancer cell line FP & Cancer cell line FN & Cancer cell line TN \\
\midrule
162.00 & 57.00 & 21.00 & 235.00 \\
161.00 & 57.00 & 22.00 & 235.00 \\
164.00 & 53.00 & 19.00 & 239.00 \\
157.00 & 57.00 & 26.00 & 234.00 \\
169.00 & 58.00 & 14.00 & 233.00 \\
162.60 & 56.40 & 20.40 & 235.20 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrr}
\toprule
Cancer cell line Sensitivity & Cancer cell line Specificity & Cancer cell line Precision \\
\midrule
0.89 & 0.80 & 0.74 \\
0.88 & 0.80 & 0.74 \\
0.90 & 0.82 & 0.76 \\
0.86 & 0.80 & 0.73 \\
0.92 & 0.80 & 0.74 \\
0.89 & 0.81 & 0.74 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrrr}
\toprule
Monocyte TP & Monocyte FP & Monocyte FN & Monocyte TN \\
\midrule
105.00 & 33.00 & 37.00 & 300.00 \\
108.00 & 23.00 & 34.00 & 310.00 \\
115.00 & 23.00 & 27.00 & 310.00 \\
115.00 & 21.00 & 26.00 & 312.00 \\
111.00 & 20.00 & 30.00 & 313.00 \\
110.80 & 24.00 & 30.80 & 309.00 \\
\bottomrule
\end{tabular}

\begin{tab

## 1D-Aug-BLS-Non-Filtered

In [12]:
import pickle

num_folds = 5  # fold files are numbered 1..num_folds
confusion_matrices = []

# Load one pickled confusion matrix per fold.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only use
# it on files from a trusted source.
for fold in range(1, num_folds + 1):
    file_path = f"/kaggle/input/cmatrix-1d-ori-aug-bls/1D-Aug-BLS-Non-Filtered/CV_BLS_cm/BLS_cm_fold_{fold}.pkl"
    with open(file_path, 'rb') as cm_file:
        cm = pickle.load(cm_file)
        confusion_matrices.append(cm)

# confusion_matrices now holds the loaded matrices in fold order.

# Class index -> original label, passed to the metric helpers below.
mapping = {
    0: 'Cancer cell line',
    1: 'Monocyte',
    2: 'T-cells'
}

# Per-fold counts (TP/FP/FN/TN) and rates (Sensitivity/Specificity/Precision),
# each turned into a table with a trailing average row.
# NOTE(review): the tables are tagged 'CNN-500' although the matrices come
# from a BLS directory - confirm the intended model label. The 'Model' column
# is dropped by the per-label filters below, so the printed tables are unaffected.
cnn_basic_metrics_list = [compute_basic_metrics_with_labels(cm, mapping) for cm in confusion_matrices]
cnn_advanced_metrics_list = [compute_metrics_with_labels(cm, mapping) for cm in confusion_matrices]
cnn_basic_df = metrics_to_dataframe(cnn_basic_metrics_list, 'CNN-500')
cnn_advanced_df = metrics_to_dataframe(cnn_advanced_metrics_list, 'CNN-500')

# Keep only the columns whose name contains the given class label.
cancer_basic_df = cnn_basic_df.filter(like='Cancer cell line', axis=1)
monocyte_basic_df = cnn_basic_df.filter(like='Monocyte', axis=1)
t_cells_basic_df = cnn_basic_df.filter(like='T-cells', axis=1)

# Render each per-label table as LaTeX with two decimals. index=False drops
# the row labels, so the final average row is not labelled in the output.
cancer_basic_latex = cancer_basic_df.to_latex(index=False, float_format="%.2f")
monocyte_basic_latex = monocyte_basic_df.to_latex(index=False, float_format="%.2f")
t_cells_basic_latex = t_cells_basic_df.to_latex(index=False, float_format="%.2f")

# Same filtering and LaTeX rendering for cnn_advanced_df.

cancer_advanced_df = cnn_advanced_df.filter(like='Cancer cell line', axis=1)
monocyte_advanced_df = cnn_advanced_df.filter(like='Monocyte', axis=1)
t_cells_advanced_df = cnn_advanced_df.filter(like='T-cells', axis=1)

cancer_advanced_latex = cancer_advanced_df.to_latex(index=False, float_format="%.2f")
monocyte_advanced_latex = monocyte_advanced_df.to_latex(index=False, float_format="%.2f")
t_cells_advanced_latex = t_cells_advanced_df.to_latex(index=False, float_format="%.2f")
# Print the counts table followed by the rates table, one class at a time.
print(cancer_basic_latex)
print(cancer_advanced_latex)

print(monocyte_basic_latex)
print(monocyte_advanced_latex)

print(t_cells_basic_latex)
print(t_cells_advanced_latex)

\begin{tabular}{rrrr}
\toprule
Cancer cell line TP & Cancer cell line FP & Cancer cell line FN & Cancer cell line TN \\
\midrule
181.00 & 67.00 & 12.00 & 215.00 \\
177.00 & 69.00 & 16.00 & 213.00 \\
178.00 & 72.00 & 15.00 & 210.00 \\
180.00 & 78.00 & 13.00 & 204.00 \\
179.00 & 68.00 & 14.00 & 214.00 \\
179.00 & 70.80 & 14.00 & 211.20 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrr}
\toprule
Cancer cell line Sensitivity & Cancer cell line Specificity & Cancer cell line Precision \\
\midrule
0.94 & 0.76 & 0.73 \\
0.92 & 0.76 & 0.72 \\
0.92 & 0.74 & 0.71 \\
0.93 & 0.72 & 0.70 \\
0.93 & 0.76 & 0.72 \\
0.93 & 0.75 & 0.72 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrrr}
\toprule
Monocyte TP & Monocyte FP & Monocyte FN & Monocyte TN \\
\midrule
85.00 & 39.00 & 53.00 & 298.00 \\
84.00 & 36.00 & 54.00 & 301.00 \\
89.00 & 38.00 & 49.00 & 299.00 \\
77.00 & 28.00 & 61.00 & 309.00 \\
86.00 & 31.00 & 52.00 & 306.00 \\
84.20 & 34.40 & 53.80 & 302.60 \\
\bottomrule
\end{tabular}

\begin{tabular}{

## 1D-Aug-BLS-Filtered

In [13]:
import pickle

num_folds = 5  # fold files are numbered 1..num_folds
confusion_matrices = []

# Load one pickled confusion matrix per fold.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only use
# it on files from a trusted source.
for fold in range(1, num_folds + 1):
    file_path = f"/kaggle/input/cmatrix-1d-ori-aug-bls/1D-Aug-BLS-Filtered/CV_BLS_cm/BLS_cm_fold_{fold}.pkl"
    with open(file_path, 'rb') as cm_file:
        cm = pickle.load(cm_file)
        confusion_matrices.append(cm)

# confusion_matrices now holds the loaded matrices in fold order.

# Class index -> original label, passed to the metric helpers below.
mapping = {
    0: 'Cancer cell line',
    1: 'Monocyte',
    2: 'T-cells'
}

# Per-fold counts (TP/FP/FN/TN) and rates (Sensitivity/Specificity/Precision),
# each turned into a table with a trailing average row.
# NOTE(review): the tables are tagged 'CNN-500' although the matrices come
# from a BLS directory - confirm the intended model label. The 'Model' column
# is dropped by the per-label filters below, so the printed tables are unaffected.
cnn_basic_metrics_list = [compute_basic_metrics_with_labels(cm, mapping) for cm in confusion_matrices]
cnn_advanced_metrics_list = [compute_metrics_with_labels(cm, mapping) for cm in confusion_matrices]
cnn_basic_df = metrics_to_dataframe(cnn_basic_metrics_list, 'CNN-500')
cnn_advanced_df = metrics_to_dataframe(cnn_advanced_metrics_list, 'CNN-500')

# Keep only the columns whose name contains the given class label.
cancer_basic_df = cnn_basic_df.filter(like='Cancer cell line', axis=1)
monocyte_basic_df = cnn_basic_df.filter(like='Monocyte', axis=1)
t_cells_basic_df = cnn_basic_df.filter(like='T-cells', axis=1)

# Render each per-label table as LaTeX with two decimals. index=False drops
# the row labels, so the final average row is not labelled in the output.
cancer_basic_latex = cancer_basic_df.to_latex(index=False, float_format="%.2f")
monocyte_basic_latex = monocyte_basic_df.to_latex(index=False, float_format="%.2f")
t_cells_basic_latex = t_cells_basic_df.to_latex(index=False, float_format="%.2f")

# Same filtering and LaTeX rendering for cnn_advanced_df.

cancer_advanced_df = cnn_advanced_df.filter(like='Cancer cell line', axis=1)
monocyte_advanced_df = cnn_advanced_df.filter(like='Monocyte', axis=1)
t_cells_advanced_df = cnn_advanced_df.filter(like='T-cells', axis=1)

cancer_advanced_latex = cancer_advanced_df.to_latex(index=False, float_format="%.2f")
monocyte_advanced_latex = monocyte_advanced_df.to_latex(index=False, float_format="%.2f")
t_cells_advanced_latex = t_cells_advanced_df.to_latex(index=False, float_format="%.2f")
# Print the counts table followed by the rates table, one class at a time.
print(cancer_basic_latex)
print(cancer_advanced_latex)

print(monocyte_basic_latex)
print(monocyte_advanced_latex)

print(t_cells_basic_latex)
print(t_cells_advanced_latex)

\begin{tabular}{rrrr}
\toprule
Cancer cell line TP & Cancer cell line FP & Cancer cell line FN & Cancer cell line TN \\
\midrule
182.00 & 68.00 & 11.00 & 214.00 \\
177.00 & 90.00 & 16.00 & 192.00 \\
174.00 & 85.00 & 19.00 & 197.00 \\
177.00 & 70.00 & 16.00 & 212.00 \\
180.00 & 83.00 & 13.00 & 199.00 \\
178.00 & 79.20 & 15.00 & 202.80 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrr}
\toprule
Cancer cell line Sensitivity & Cancer cell line Specificity & Cancer cell line Precision \\
\midrule
0.94 & 0.76 & 0.73 \\
0.92 & 0.68 & 0.66 \\
0.90 & 0.70 & 0.67 \\
0.92 & 0.75 & 0.72 \\
0.93 & 0.71 & 0.68 \\
0.92 & 0.72 & 0.69 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrrr}
\toprule
Monocyte TP & Monocyte FP & Monocyte FN & Monocyte TN \\
\midrule
91.00 & 27.00 & 47.00 & 310.00 \\
78.00 & 24.00 & 60.00 & 313.00 \\
85.00 & 29.00 & 53.00 & 308.00 \\
83.00 & 31.00 & 55.00 & 306.00 \\
82.00 & 24.00 & 56.00 & 313.00 \\
83.80 & 27.00 & 54.20 & 310.00 \\
\bottomrule
\end{tabular}

\begin{tabular}{