In [1]:
import pandas as pd
import numpy as np
import nibabel as nib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also silences scikit-learn's own warnings
# (e.g. non-convergence or failed fits inside the grid searches below), so
# problems in those cells will not be visible -- consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the ground-truth table. Later cells use its first column as the
# NIfTI file stem (./data/<stem>.nii.gz) and its second column as the class label.
files_csv = pd.read_csv("ATR_GT_Training.csv")


In [4]:
# Split the table by completeness: rows without any missing value are used
# for training; every remaining row (at least one missing value) is set aside.
is_complete_row = files_csv.notna().all(axis=1)
training_files = files_csv.loc[is_complete_row]
testing_files = files_csv.loc[~is_complete_row]


In [5]:
# Read the voxel array of every training volume, in table order.
# The first column of `training_files` supplies the file stem.
images = [
    nib.load(f'./data/{file_stem}.nii.gz').get_fdata()
    for file_stem in training_files.iloc[:, 0]
]


In [6]:
# Describe each volume by a normalized intensity histogram: `bins` equal-width
# bins over [min_intensity, max_intensity]; voxels outside that range are ignored.
hist_features = []
bins = 100
min_intensity, max_intensity = 300, 3000
for img in images:
    # np.histogram already works on the flattened input, so no explicit copy is needed.
    hist, _ = np.histogram(img, bins=bins, range=(min_intensity, max_intensity))
    in_range_count = hist.sum()
    if in_range_count > 0:
        hist_norm = hist / in_range_count
    else:
        # No voxel fell inside the range: dividing would give a row of NaNs
        # that the estimators cannot fit, so use an all-zero row instead.
        hist_norm = np.zeros(bins, dtype=float)
    hist_features.append(hist_norm)

# Feature matrix: one row per volume, one column per histogram bin.
X = np.array(hist_features)


In [7]:
# Hold out 20% of the labelled volumes for the final evaluation; the fixed
# seed keeps the split reproducible. Labels come from the table's second column.
labels = training_files.iloc[:, 1]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter optimization

## SVM

### SVM without PCA

In [8]:
# SVM on the raw histogram features. The classifier sits in a one-step
# pipeline, hence the `svm__` prefix on the grid keys.
svm = SVC(random_state=42)
svm_pipeline = Pipeline(steps=[('svm', svm)])

param_grid = {
    'svm__C': [1.0, 5.0, 7.0, 10.0],
    'svm__kernel': ['sigmoid', 'rbf', 'poly'],
}

# 10-fold cross-validated search over the grid, then refit on all of X_train.
clf = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
clf.fit(X_train, y_train)

# Score the best model on the held-out split.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=1: a class that is never predicted contributes precision 1.0.
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

for metric_name, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(metric_name, metric_value)
print('Best parameters:', clf.best_params_)

Accuracy: 0.835820895522388
Precision: 0.7566994069681244
Recall: 0.746016081871345
F1 Score: 0.7483626710364386
Best parameters: {'svm__C': 10.0, 'svm__kernel': 'rbf'}


### SVM with PCA

In [9]:
# Same SVM search, but with a PCA projection in front; the number of retained
# components is tuned jointly with the SVM hyperparameters.
pca = PCA(random_state=42)
svm = SVC(random_state=42)
pca_svm_pipeline = Pipeline(steps=[('pca', pca), ('svm', svm)])

param_grid = {
    'pca__n_components': [2, 5, 10, 20, 50],
    'svm__C': [1.0, 5.0, 7.0, 10.0],
    'svm__kernel': ['sigmoid', 'rbf', 'poly'],
}

# 10-fold cross-validated search; PCA is re-fit inside every fold because it
# is part of the pipeline.
clf = GridSearchCV(
    estimator=pca_svm_pipeline,
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
clf.fit(X_train, y_train)

# Score the best model on the held-out split.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=1: a class that is never predicted contributes precision 1.0.
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

for metric_name, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(metric_name, metric_value)
print('Best parameters:', clf.best_params_)

Accuracy: 0.8208955223880597
Precision: 0.732496158278909
Recall: 0.7527621136173768
F1 Score: 0.7363018778160244
Best parameters: {'pca__n_components': 20, 'svm__C': 10.0, 'svm__kernel': 'rbf'}


## KNN

In [10]:
# k-nearest-neighbours on the raw histogram features: tune the neighbourhood
# size, the vote weighting and the distance metric.
knn = KNeighborsClassifier()
knn_pipeline = Pipeline(steps=[('knn', knn)])

param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan'],
}

# 10-fold cross-validated search, parallelised over all available cores.
clf = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

# Score the best model on the held-out split.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=1: a class that is never predicted contributes precision 1.0.
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

for metric_name, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(metric_name, metric_value)
print('Best parameters:', clf.best_params_)

Accuracy: 0.8706467661691543
Precision: 0.8099104011984919
Recall: 0.8241906850459482
F1 Score: 0.812307983976583
Best parameters: {'knn__metric': 'manhattan', 'knn__n_neighbors': 9, 'knn__weights': 'distance'}


# Logistic Regression

In [11]:
# Tune a logistic-regression classifier on standardized histogram features.
#
# Only solver/penalty pairs that LogisticRegression supports are searched:
#   * 'l2' works with all four solvers listed below;
#   * 'l1' is supported only by 'saga' among them;
#   * 'elasticnet' additionally requires an explicit l1_ratio, which this
#     search does not tune, so it is not included.
# A plain cross-product of solver x penalty would also submit the unsupported
# pairs; those fits fail, are scored NaN by GridSearchCV and (with warnings
# globally ignored in this notebook) waste most of the search silently.
param_grid = [
    {
        'lr__penalty': ['l2'],
        'lr__solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
        'lr__C': [0.01, 0.1, 1, 10],
    },
    {
        'lr__penalty': ['l1'],
        'lr__solver': ['saga'],
        'lr__C': [0.01, 0.1, 1, 10],
    },
]

lr = LogisticRegression(random_state=42)
clf = GridSearchCV(estimator=Pipeline(steps=[('lr', lr)]), param_grid=param_grid, cv=10, verbose=0)

# Standardize with statistics learned from the training split only, then
# apply the same transform to the held-out split.
# NOTE(review): the scaler is fit once on all of X_train, outside the
# pipeline, so each CV validation fold has influenced the scaling statistics.
# Moving StandardScaler into the pipeline would avoid that.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


clf.fit(X_train_scaled, y_train)

# Score the best model on the held-out split.
y_pred = clf.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=1: a class that is never predicted contributes precision 1.0.
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('Best parameters:', clf.best_params_)

Accuracy: 0.8109452736318408
Precision: 0.7250201653559185
Recall: 0.6998224728487887
F1 Score: 0.7104413164694683
Best parameters: {'lr__C': 1, 'lr__penalty': 'l2', 'lr__solver': 'newton-cg'}


# Decision Tree with AdaBoost

## Decision Tree

In [12]:
# Single decision tree: tune depth, minimum split size and impurity criterion.
dt = DecisionTreeClassifier(random_state=42)
dt_pipeline = Pipeline(steps=[('dt', dt)])

param_grid = {
    'dt__max_depth': [6, 7, 8],
    'dt__min_samples_split': [2, 3, 4, 5],
    'dt__criterion': ['gini', 'entropy'],
}

clf = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid,
    cv=10,
    verbose=0,
)

# Standardize using statistics from the training split only, and reuse them
# for the held-out split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf.fit(X_train_scaled, y_train)

# Score the best model on the held-out split.
y_pred = clf.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=1: a class that is never predicted contributes precision 1.0.
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

for metric_name, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(metric_name, metric_value)

print('Best parameters:', clf.best_params_)

Accuracy: 0.8407960199004975
Precision: 0.8439657652474108
Recall: 0.6801169590643275
F1 Score: 0.7419967269481271
Best parameters: {'dt__criterion': 'gini', 'dt__max_depth': 7, 'dt__min_samples_split': 5}


## Decision Tree with AdaBoost 

In [13]:
# AdaBoost over a fixed depth-8 gini tree; only the number of boosting rounds
# is tuned. Scaling lives inside the pipeline, so the raw X_train / X_test
# are passed in directly.
base_tree = DecisionTreeClassifier(
    max_depth=8,
    criterion='gini',
    min_samples_split=2,
    random_state=42,
)
booster = AdaBoostClassifier(estimator=base_tree, random_state=42)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ada', booster),
])

param_grid = {'ada__n_estimators': [50, 100]}

# 10-fold cross-validated search over the number of estimators.
clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=10)
clf.fit(X_train, y_train)

# Score the best model on the held-out split.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=1: a class that is never predicted contributes precision 1.0.
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

for metric_name, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(metric_name, metric_value)

print('Best parameters:', clf.best_params_)


Accuracy: 0.8855721393034826
Precision: 0.8729092643768918
Recall: 0.8009137426900585
F1 Score: 0.8278913166112176
Best parameters: {'ada__n_estimators': 100}
