# Training and testing 

We train several classification models and test each of them on a held-out test set.

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import scipy.stats
#%matplotlib widget
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [4]:
# Location of the result tables and the files expected there.
base_dir = '.'
files = ['Part1_Results.csv', 'Part2_Results.csv', 'Part3_Results.csv']

# Loaded tables, keyed by the file name without its '.csv' extension.
dataframes = {}

for file in files:
    file_path = os.path.join(base_dir, file)

    # A missing file is reported and skipped, so it gets no entry in `dataframes`.
    if not os.path.exists(file_path):
        print(f"Warning: {file} not found!")
        continue

    name = file.replace('.csv', '')
    # The files are semicolon-separated; the first column is used as the index.
    dataframes[name] = pd.read_csv(file_path, sep=';', index_col=0)
    print(f"Loaded {name}: {dataframes[name].shape}")

Loaded Part1_Results: (232, 9)
Loaded Part2_Results: (232, 16)
Loaded Part3_Results: (134, 16)


In [5]:
# --- Part 1: feature matrix and encoded labels ---------------------------
part1 = dataframes['Part1_Results']

# drop() returns a new frame, so the loaded table in `dataframes` is untouched.
X_1 = part1.drop(columns=['cell_id', 'cell_type'])

# A single encoder is fitted on the Part 1 labels and reused for Part 2, so the
# integer codes refer to the same cell types in both datasets and the global
# `label_encoder` always matches the encoded targets.
label_encoder = LabelEncoder()
y_1 = label_encoder.fit_transform(part1['cell_type'])

# Fill missing action-potential durations with the column median.
# NOTE: the median is taken over all rows, i.e. before the train/test split.
if X_1['ap_duration'].isna().any():
    median_val = X_1['ap_duration'].median()
    X_1['ap_duration'] = X_1['ap_duration'].fillna(median_val)

# --- Part 2: feature matrix and encoded labels ---------------------------
part2 = dataframes['Part2_Results']
X_2 = part2.drop(columns=['cell_id', 'cell_type'])

# Labels come from the Part 2 table itself so they stay row-aligned with X_2
# (they were previously read from Part1_Results). transform() raises a
# ValueError if Part 2 contains a cell type that Part 1 does not.
y_2 = label_encoder.transform(part2['cell_type'])

In [6]:
# Both datasets are split the same way: 20% held out, fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42}

# Stratifying on the labels ensures percentages of cell types are preserved
# in the train and test parts.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_1, y_1, stratify=y_1, **split_options
)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_2, y_2, stratify=y_2, **split_options
)

We define a helper function that fits a model on the training data and evaluates it on the test data.

In [7]:
def evaluate_model(model, X_train, X_test, y_train, y_test, dataset_name="", scale=False,
                   target_names=None):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place; the same object is returned.
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Integer-encoded class labels for the two splits.
    dataset_name : str, optional
        Free-form tag stored under ``'dataset'`` in the results.
    scale : bool, optional
        If True, standardise the features with a ``StandardScaler`` fitted on
        the training data only and applied to both splits.
    target_names : sequence of str, optional
        Class names, ordered so that position ``i`` names encoded label ``i``.
        Defaults to ``label_encoder.classes_`` of the notebook-level encoder.

    Returns
    -------
    results : dict
        Keys ``'dataset'``, ``'model'``, ``'accuracy'``, ``'f1_macro'``,
        ``'f1_weighted'``, ``'n_features'``, ``'report'`` (classification
        report as a dict) and ``'confusion_matrix'``.
    model : estimator
        The fitted model that was passed in.
    """
    if target_names is None:
        # Backward-compatible default: use the notebook-level encoder.
        target_names = label_encoder.classes_

    # Pin the full label set so the report and the confusion matrix always
    # cover every class, even one that is absent from y_test and y_pred.
    # Without this, classification_report raises when the number of
    # target names differs from the number of labels actually present.
    labels = list(range(len(target_names)))

    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Train
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    f1_weighted = f1_score(y_test, y_pred, average='weighted')

    # Classification report
    report = classification_report(y_test, y_pred,
                                   labels=labels,
                                   target_names=target_names,
                                   output_dict=True)

    # Confusion matrix (rows: true class, columns: predicted class)
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    results = {
        'dataset': dataset_name,
        'model': model.__class__.__name__,
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'n_features': X_train.shape[1],
        'report': report,
        'confusion_matrix': cm
    }

    return results, model

## Model training and testing on Part 1 Data

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [9]:
# Models to train and test, as (display name, unfitted estimator) pairs.
# Keyword arguments shared by several estimators are factored out below.
seeded = {'random_state': 42}
balanced = {'class_weight': 'balanced', **seeded}

models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, **balanced)),
    ('SVM Linear', SVC(kernel='linear', **balanced)),
    ('SVM RBF', SVC(kernel='rbf', **balanced)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=5, **balanced)),
    # Gradient boosting is only seeded; no class weighting is passed to it.
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, max_depth=3, **seeded)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
]

In [14]:
# Evaluate every candidate model on the Part 1 split (features standardised).
all_results = []

for model_name, model in models:
    print(f"-------Testing {model_name}-------")

    # The fitted estimator is also returned, but only the metrics are kept here.
    res1, _ = evaluate_model(
        model,
        X_train1, X_test1,
        y_train1, y_test1,
        dataset_name="Part1",
        scale=True,
    )
    all_results.append(res1)

    print(f"Accuracy: {res1['accuracy']:.3f}, F1-macro: {res1['f1_macro']:.3f}")
    print(f"confusion matrix:")
    print(res1['confusion_matrix'])

# One row per evaluated model, for side-by-side comparison.
results_df = pd.DataFrame(all_results)

print("SUMMARY OF ALL RESULTS:")

summary_columns = ['dataset', 'model', 'accuracy', 'f1_macro', 'n_features']
print(results_df[summary_columns])

-------Testing Logistic Regression-------
Accuracy: 0.830, F1-macro: 0.789
confusion matrix:
[[16  0  0  1]
 [ 0  8  2  0]
 [ 2  0 12  1]
 [ 0  1  1  3]]
-------Testing SVM Linear-------
Accuracy: 0.851, F1-macro: 0.805
confusion matrix:
[[16  0  0  1]
 [ 0  8  2  0]
 [ 1  0 13  1]
 [ 0  1  1  3]]
-------Testing SVM RBF-------
Accuracy: 0.830, F1-macro: 0.765
confusion matrix:
[[17  0  0  0]
 [ 0  8  2  0]
 [ 2  0 12  1]
 [ 0  1  2  2]]
-------Testing Random Forest-------
Accuracy: 0.851, F1-macro: 0.782
confusion matrix:
[[17  0  0  0]
 [ 0  8  2  0]
 [ 1  0 13  1]
 [ 0  1  2  2]]
-------Testing Gradient Boosting-------
Accuracy: 0.851, F1-macro: 0.781
confusion matrix:
[[17  0  0  0]
 [ 0  8  2  0]
 [ 1  0 13  1]
 [ 1  1  1  2]]
-------Testing KNN-------
Accuracy: 0.830, F1-macro: 0.727
confusion matrix:
[[17  0  0  0]
 [ 0  8  2  0]
 [ 2  0 13  0]
 [ 0  1  3  1]]
SUMMARY OF ALL RESULTS:
  dataset                       model  accuracy  f1_macro  n_features
0   Part1          Logistic