In [83]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
from xgboost import XGBClassifier

In [84]:
features_path = 'D:/perg/outputs/features/perg_features.csv'
features_df = pd.read_csv(features_path)
print('Features shape:', features_df.shape)
features_df.head()

Features shape: (1354, 23)


Unnamed: 0,record_id,eye,N35_amp,N35_ms,P50_amp,P50_ms,N95_amp,N95_ms,N95P50_ratio,mean,...,median,ptp,rms,kurt,skew,zcr,peak_freq,total_power,bp_1_30,bp_8_16
0,1,RE,-0.408717,28.9,2.199887,52.5,-1.716555,91.5,0.780293,5.572884000000001e-17,...,-0.032117,3.916442,1.0,-0.217159,0.453907,0.015686,13.333333,0.241531,1.20793,0.0
1,1,LE,-0.712719,29.5,2.394248,53.7,-1.588476,92.1,0.663455,-1.114577e-16,...,-0.236019,3.982724,1.0,0.084987,0.844791,0.011765,13.333333,0.237099,1.250359,0.0
2,2,RE,-0.045393,31.9,1.063206,46.6,-1.188255,111.6,1.117615,-5.572884000000001e-17,...,0.088699,3.294507,1.0,-1.371636,-0.063129,0.011765,6.666667,0.083373,0.3609,0.0
3,2,LE,,,,,-1.327168,92.1,,2.716781e-16,...,-0.414977,3.674116,1.0,-0.869275,0.661798,0.007843,6.666667,0.105886,0.24558,0.0
4,2,RE,-1.622818,42.5,,,-1.36347,102.1,,2.786442e-16,...,-0.16862,4.387108,1.0,0.445414,0.855903,0.031373,13.333333,0.08628,0.320347,0.0


In [85]:
metadata_path = 'D:/perg/data/raw/participants_info.csv'
metadata = pd.read_csv(metadata_path)
print('Metadata shape:', metadata.shape)

Metadata shape: (336, 12)


In [86]:
# Merge features with labels
# Example: merge on 'record_id'. Map diagnosis or visual acuity as label.
data = features_df.merge(metadata, how='inner', left_on='record_id', right_on='id_record')
print('Merged data shape:', data.shape)

Merged data shape: (1354, 35)


In [87]:
# Classification target (Normal vs Abnormal)
data['label'] = data['diagnosis1'].apply(lambda x: 0 if x.lower() == 'normal' else 1)

In [88]:
# Regression target (LogMAR visual acuity average for eyes)
data['logmar'] = data[['va_re_logMar', 'va_le_logMar']].mean(axis=1)

In [89]:
# Cell 5: Select features & labels for classification
feature_columns = [
    'N35_amp', 'P50_amp', 'N95_amp',
    'N35_ms', 'P50_ms', 'N95_ms',
    'N95P50_ratio',
    'mean', 'std', 'min', 'max', 'median',
    'ptp', 'rms', 'skew', 'kurt', 'zcr',
    'bp_1_30', 'bp_8_16',
    'peak_freq', 'total_power'
]
X = data[feature_columns].fillna(0).values
y_cls = data['label'].values
y_reg = data['logmar'].values

In [90]:
# Standardize features for classical models
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [91]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_cls, test_size=0.4, random_state=42, stratify=y_cls)

In [92]:
def evaluate_classification_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)
    print(f"Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")

In [93]:
# Logistic Regression
print("Logistic Regression")
logreg = LogisticRegression(max_iter=500)
evaluate_classification_model(logreg, X_train, y_train, X_test, y_test)

Logistic Regression
Train Acc: 0.7143, Test Acc: 0.6937, Test F1: 0.8042


In [94]:
# Random Forest
print("Random Forest")
rf = RandomForestClassifier(n_estimators=500, max_depth=None, random_state=42)
evaluate_classification_model(rf, X_train, y_train, X_test, y_test) 

Random Forest
Train Acc: 0.9975, Test Acc: 0.7011, Test F1: 0.8034


In [96]:
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['rbf']
}

svm = SVC(probability=True)

# Grid search with 5-fold CV
grid = GridSearchCV(svm, param_grid, refit=True, cv=5, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid.best_params_)

# Evaluate on test set
y_pred = grid.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Parameters: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
Test Accuracy: 0.6974169741697417


In [99]:

# SVM
print("SVM")
svm = SVC(C=10, gamma=0.01, kernel='rbf', probability=True, random_state=42)
evaluate_classification_model(svm, X_train, y_train, X_test, y_test)

SVM
Train Acc: 0.7229, Test Acc: 0.7085, Test F1: 0.8196


In [None]:
# XGBoost
print("XGBoost")
xgb = XGBClassifier(
    n_estimators=100,      # number of boosting rounds (trees)
    learning_rate=0.1,     # step size shrinkage
    max_depth=6,           # maximum depth of trees
    subsample=0.8,         # subsample ratio for training instances
    colsample_bytree=0.8,  # subsample ratio for features
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

evaluate_classification_model(xgb, X_train, y_train, X_test, y_test)

XGBoost


Train Acc: 0.9975, Test Acc: 0.7066, Test F1: 0.8030


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
