In [232]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error, r2_score, classification_report, confusion_matrix, f1_score,root_mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
from xgboost import XGBClassifier

In [233]:
features_path = 'D:/perg/outputs/features/perg_features.csv'
features_df = pd.read_csv(features_path)
print('Features shape:', features_df.shape)
features_df.head()

Features shape: (1354, 26)


Unnamed: 0,record_id,eye,N35_amp,N35_ms,P50_amp,P50_ms,N95_amp,N95_ms,N95P50_ratio,mean,...,kurt,skew,zcr,power_0_30Hz,dom_freq,dom_power,peak_freq,total_power,bp_1_30,bp_8_16
0,1,RE,-0.408717,28.9,2.199887,52.5,-1.716555,91.5,0.780293,5.572884000000001e-17,...,-0.217159,0.453907,0.015686,1.542985,13.333333,0.10725,13.333333,0.241531,1.20793,0.0
1,1,LE,-0.712719,29.5,2.394248,53.7,-1.588476,92.1,0.663455,-1.114577e-16,...,0.084987,0.844791,0.011765,1.501042,13.333333,0.115341,13.333333,0.237099,1.250359,0.0
2,2,RE,-0.045393,31.9,1.063206,46.6,-1.188255,111.6,1.117615,-5.572884000000001e-17,...,-1.371636,-0.063129,0.011765,0.543183,6.666667,0.054079,6.666667,0.083373,0.3609,0.0
3,2,LE,,,,,-1.327168,92.1,,2.716781e-16,...,-0.869275,0.661798,0.007843,0.557313,6.666667,0.052181,6.666667,0.105886,0.24558,0.0
4,2,RE,-1.622818,42.5,,,-1.36347,102.1,,2.786442e-16,...,0.445414,0.855903,0.031373,0.361835,13.333333,0.02502,13.333333,0.08628,0.320347,0.0


In [234]:
metadata_path = 'D:/perg/data/raw/participants_info.csv'
metadata = pd.read_csv(metadata_path)
print('Metadata shape:', metadata.shape)

Metadata shape: (336, 12)


In [235]:
# Merge features with labels
# Example: merge on 'record_id'. Map diagnosis or visual acuity as label.
data = features_df.merge(metadata, how='inner', left_on='record_id', right_on='id_record')
print('Merged data shape:', data.shape)

Merged data shape: (1354, 38)


In [236]:
# Classification target (Normal vs Abnormal)
data['label'] = data['diagnosis1'].apply(lambda x: 0 if x.lower() == 'normal' else 1)

In [237]:
# Regression target (LogMAR visual acuity average for eyes)
data['logmar'] = data[['va_re_logMar', 'va_le_logMar']].mean(axis=1)

In [238]:
# Cell 5: Select features & labels for classification
feature_columns = [
    'N35_amp', 'P50_amp', 'N95_amp',
    'N35_ms', 'P50_ms', 'N95_ms',
    'N95P50_ratio',
    'mean', 'std', 'min', 'max', 'median',
    'ptp', 'rms', 'skew', 'kurt', 'zcr',
    'power_0_30Hz', 'dom_freq', 'dom_power',    
    'bp_1_30', 'bp_8_16',
    'peak_freq', 'total_power'
]
X = data[feature_columns].fillna(0).values
y_cls = data['label'].values
y_reg = data['logmar'].values

In [239]:
# Standardize features for classical models
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [240]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_cls, test_size=0.3, random_state=42, stratify=y_cls)

In [252]:
def evaluate_classification_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)
    print(f"Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred_test))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))


In [242]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)  # keep 10 best components
X_pca = pca.fit_transform(X)


In [243]:
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l2"],
    "solver": ["lbfgs", "saga"],
    "max_iter": [300, 500]
}
lr = LogisticRegression()
grid_lr = GridSearchCV(lr, param_grid_lr, cv=5, refit=True, n_jobs=-1, verbose=2)
grid_lr.fit(X_train, y_train)
print("Best Logistic Regression Params:", grid_lr.best_params_)

# Evaluate on test set
y_pred = grid_lr.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Logistic Regression Params: {'C': 0.1, 'max_iter': 300, 'penalty': 'l2', 'solver': 'lbfgs'}
Test Accuracy: 0.6928746928746928


In [244]:
# Logistic Regression
print("Logistic Regression")
logreg = LogisticRegression(C=0.01,max_iter=300,penalty='l2',solver='lbfgs')
evaluate_classification_model(logreg, X_train, y_train, X_test, y_test)

Logistic Regression
Train Acc: 0.7043, Test Acc: 0.6978, Test F1: 0.8150

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.10      0.17       127
           1       0.70      0.97      0.82       280

    accuracy                           0.70       407
   macro avg       0.65      0.54      0.49       407
weighted avg       0.67      0.70      0.62       407

Confusion Matrix:
 [[ 13 114]
 [  9 271]]


In [None]:
param_grid_rf = {
    "n_estimators": [100, 300],
    "max_depth": [3, 6, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 5],
    "max_features": ["sqrt", "log2", None],
    "class_weight": ["balanced", None]
}
rf = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5,scoring="f1_macro", n_jobs=-1, verbose=0)
grid_rf.fit(X_train, y_train)
print("Best Random Forest Params:", grid_rf.best_params_)
# Evaluate on test set
y_pred = grid_rf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Best CV Score:", grid_rf.best_score_)

Best Random Forest Params: {'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Test Accuracy: 0.7100737100737101
Best CV Score: 0.6744566811028212


In [254]:
# Random Forest
print("Random Forest")
rf = RandomForestClassifier(
    n_estimators=300, max_depth=6, min_samples_split=10,max_features='sqrt',
    min_samples_leaf=2, class_weight='balanced', random_state=42
)

evaluate_classification_model(rf, X_train, y_train, X_test, y_test) 


Random Forest
Train Acc: 0.8078, Test Acc: 0.6757, Test F1: 0.7519

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.59      0.53       127
           1       0.79      0.71      0.75       280

    accuracy                           0.68       407
   macro avg       0.64      0.65      0.64       407
weighted avg       0.70      0.68      0.68       407

Confusion Matrix:
 [[ 75  52]
 [ 80 200]]


In [247]:
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['rbf']
}

svm = SVC(probability=True)

# Grid search with 5-fold CV
grid = GridSearchCV(svm, param_grid, refit=True, cv=5, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid.best_params_)

# Evaluate on test set
y_pred = grid.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))


Fitting 5 folds for each of 25 candidates, totalling 125 fits


Best Parameters: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Test Accuracy: 0.7051597051597052


In [248]:

# SVM
print("SVM")
svm = SVC(C=1, gamma=0.1, kernel='rbf', probability=True, random_state=42)
evaluate_classification_model(svm, X_train, y_train, X_test, y_test)

SVM
Train Acc: 0.7624, Test Acc: 0.7052, Test F1: 0.8007

Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.36      0.43       127
           1       0.75      0.86      0.80       280

    accuracy                           0.71       407
   macro avg       0.64      0.61      0.62       407
weighted avg       0.68      0.71      0.69       407

Confusion Matrix:
 [[ 46  81]
 [ 39 241]]


In [249]:
param_grid_xgb = {
    "n_estimators": [100, 200],
    "max_depth": [3, 6],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.7, 1.0],
    "colsample_bytree": [0.7, 1.0]
}
xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
grid_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring="roc_auc", n_jobs=-1, verbose=2)
grid_xgb.fit(X_train, y_train)
print("Best XGBoost Params:", grid_xgb.best_params_)
# Evaluate on test set
y_pred = grid_xgb.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best XGBoost Params: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.7}
Test Accuracy: 0.7051597051597052


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [250]:
# XGBoost
print("XGBoost")
xgb = XGBClassifier(
    n_estimators=200,      # number of boosting rounds (trees)
    
    learning_rate=0.01,     # step size shrinkage
    max_depth=3,           # maximum depth of trees
    subsample=0.7,         # subsample ratio for training instances
    colsample_bytree=1,  # subsample ratio for features
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

evaluate_classification_model(xgb, X_train, y_train, X_test, y_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
Train Acc: 0.7751, Test Acc: 0.7052, Test F1: 0.8107

Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.24      0.33       127
           1       0.73      0.92      0.81       280

    accuracy                           0.71       407
   macro avg       0.65      0.58      0.57       407
weighted avg       0.68      0.71      0.66       407

Confusion Matrix:
 [[ 30  97]
 [ 23 257]]


CNN