## Imports


In [1]:
import sklearn as skl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import shap

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn import tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
%matplotlib inline

### Load datasets

In [2]:
test = pd.read_csv('datasets/test_clean.csv')
train = pd.read_csv('datasets/train_clean.csv')

#train = pd.read_csv('datasets/controlo_clean.csv')

### Normalizar dados

In [3]:
train_normalized = train.copy()
test_normalized = test.copy()

X_scale = train_normalized.drop(columns=['Transition'])

scaler_X = MinMaxScaler(feature_range=(0,1)).fit(X_scale)
scaler_y = MinMaxScaler(feature_range=(0,1)).fit(test_normalized)
X_scale = pd.DataFrame(scaler_X.transform(X_scale[X_scale.columns]), columns=X_scale.columns)
test_normalized = pd.DataFrame(scaler_y.transform(test_normalized[test_normalized.columns]), columns=test_normalized.columns)

train_normalized = pd.concat([X_scale, train_normalized['Transition']], axis=1)

#test_normalized.head()

### Split Dados Treino

In [4]:
X = train.drop(columns=['Transition'])
y = train['Transition']


#### Split Dados Treino normalizados

In [5]:
X_nm = train_normalized.drop(columns=['Transition'])
y_nm = train_normalized['Transition']


### Feature Selection

#### RFECV

In [6]:

rf_model = RandomForestClassifier(random_state=2022, class_weight='balanced', n_estimators=500)
cv = StratifiedKFold(5)

rfecv = RFECV(estimator=rf_model, step=25, cv=cv, scoring='f1_macro',min_features_to_select=50, n_jobs=-1)

rfecv.fit(X_nm, y_nm)

print(f"Optimal number of features: {rfecv.n_features_}")

X = X.iloc[:, rfecv.support_]
X_nm = X_nm.iloc[:, rfecv.support_]

test = test.iloc[:, rfecv.support_]
test_normalized = test_normalized.iloc[:, rfecv.support_]


Optimal number of features: 126


#### PCA

In [7]:

pca = PCA(n_components=100)
X_nm = pca.fit_transform(X_nm)
test_normalized= pca.transform(test_normalized)


#### SMOTE(podes ou não fazer)

In [10]:

sampling_strategy = {0.0: 60, 1.0: 96, 2.0: 68, 3.0: 71, 4.0: 40}#só aumenta para 4.0

smote = SMOTE(random_state=42,sampling_strategy=sampling_strategy, k_neighbors = 3)
X_nm, y_nm = smote.fit_resample(X_nm, y_nm)

### split train_test

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2022
)

X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(
    X_nm, y_nm, test_size=0.2, random_state=2022, stratify=y_nm
)


### GridSearch

In [8]:

estimator_mlp = MLPClassifier(random_state=2023)

param_grid = {
    'hidden_layer_sizes' : [(100,), (150,), (100, 50), (150, 100)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01],
    'max_iter': [1000],
    'early_stopping': [True],
}
grid_mlp = GridSearchCV(
    estimator_mlp, 
    param_grid, 
    refit=True, 
    verbose = 1,
    scoring= 'f1_macro',
    cv=cv
    )

grid_mlp.fit(X_train_nm, y_train_nm)

predictions = grid_mlp.predict(X_test_nm)

print("Best estimator:", grid_mlp.best_estimator_)


print(classification_report(y_test_nm, predictions))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best estimator: MLPClassifier(activation='tanh', alpha=0.01, early_stopping=True,
              hidden_layer_sizes=(100, 50), max_iter=1000, random_state=2023,
              solver='lbfgs')
              precision    recall  f1-score   support

         0.0       0.46      0.50      0.48        12
         1.0       0.58      0.74      0.65        19
         2.0       0.22      0.14      0.17        14
         3.0       0.33      0.36      0.34        14
         4.0       0.00      0.00      0.00         2

    accuracy                           0.44        61
   macro avg       0.32      0.35      0.33        61
weighted avg       0.40      0.44      0.42        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Modelo final(usar hiperparametros do gridsearch)

In [9]:

mlp_model = grid_mlp.best_estimator_
mlp_model.fit(X_nm, y_nm)
predictionsMLP = mlp_model.predict(test_normalized)

### Save the result

In [10]:
label_mapping = {
    0: 'AD-AD',
    1: 'CN-CN',
    2: 'MCI-AD',
    3: 'MCI-MCI',
    4: 'CN-MCI'
}

result_labels = [label_mapping[pred] for pred in predictionsMLP]
result_df = pd.DataFrame({
    'RowId': range(1, len(predictionsMLP) + 1),
    'Result': result_labels
})

result_df.to_csv('datasets/MLP.csv', index=False)