## Imports


In [1]:
import sklearn as skl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import shap

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn import tree

from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
%matplotlib inline

### Load datasets

In [2]:
# Load the pre-cleaned training and test tables from the local datasets folder.
train = pd.read_csv('datasets/train_clean.csv')
test = pd.read_csv('datasets/test_clean.csv')

# Alternative training file kept for reference; to use it, replace the line above with:
#   train = pd.read_csv('datasets/controlo_clean.csv')

### Normalizar dados

In [3]:
# Work on copies so the raw `train` / `test` frames stay available unscaled.
train_normalized = train.copy()
test_normalized = test.copy()

# Only the feature columns are rescaled; the 'Transition' target is left as-is.
X_scale = train_normalized.drop(columns=['Transition'])

# Fit the min-max scaler on the TRAINING features only. The test set must be
# transformed with these same statistics: fitting a second scaler on the test
# set would map train and test onto different scales, so a model fitted on one
# would see shifted inputs on the other.
scaler_X = MinMaxScaler(feature_range=(0, 1)).fit(X_scale)
# Backward-compatible alias: this name used to hold a scaler fitted on the test set.
scaler_y = scaler_X

X_scale = pd.DataFrame(
    scaler_X.transform(X_scale),
    columns=X_scale.columns,
    index=X_scale.index,
)

# Select the test columns by name so they line up with the fitted scaler.
# Test values may fall slightly outside [0, 1]; that is expected when the
# training min/max are reused.
test_normalized = pd.DataFrame(
    scaler_X.transform(test_normalized[X_scale.columns]),
    columns=X_scale.columns,
    index=test_normalized.index,
)

# Re-attach the untouched target column to the scaled features.
train_normalized = pd.concat([X_scale, train_normalized['Transition']], axis=1)

### Split Dados Treino

In [4]:
# Unscaled training data: 'Transition' is the target, every other column is a feature.
y = train['Transition']
X = train.drop('Transition', axis=1)


#### Split Dados Treino normalizados

In [5]:
# Min-max scaled training data: same target column, scaled feature columns.
y_nm = train_normalized['Transition']
X_nm = train_normalized.drop('Transition', axis=1)


### Feature Selection

#### RFECV

In [None]:

# Recursive feature elimination with cross-validation, ranked by a
# class-weighted random forest and scored with macro F1.
rf_model = RandomForestClassifier(random_state=2022, class_weight='balanced', n_estimators=500)
cv = StratifiedKFold(5)

# step=25 drops 25 features per elimination round; at least 50 features are kept.
rfecv = RFECV(
    estimator=rf_model,
    step=25,
    cv=cv,
    scoring='f1_macro',
    min_features_to_select=50,
    n_jobs=-1,
)

rfecv.fit(X_nm, y_nm)

print(f"Optimal number of features: {rfecv.n_features_}")

# Resolve the boolean support mask to feature NAMES once, then select by name.
# A positional mask would silently pick the wrong columns if any of the four
# frames had a different column order than the matrix RFECV was fitted on;
# name-based selection raises a KeyError instead of misaligning.
selected_features = X_nm.columns[rfecv.support_]

X = X.loc[:, selected_features]
X_nm = X_nm.loc[:, selected_features]

test = test.loc[:, selected_features]
test_normalized = test_normalized.loc[:, selected_features]


### PCA

In [None]:

# PCA cannot extract more components than min(n_samples, n_features). The
# feature-selection step may keep fewer than 100 columns, so cap the requested
# number of components at what the data can actually support.
n_components = min(100, *X_nm.shape)

# Seeded so the projection is reproducible if a randomized SVD solver is selected.
pca = PCA(n_components=n_components, random_state=2022)

# Fit the projection on the training features only, then apply it to the test set.
X_nm = pca.fit_transform(X_nm)
test_normalized = pca.transform(test_normalized)




#### SMOTE (opcional)

In [7]:

# Oversample only the minority class (label 4.0) up to 40 samples; every other
# class keeps its current size. The per-class targets are derived from the data
# instead of being hard-coded, so the cell keeps working if the training set
# changes (SMOTE rejects a target that is lower than a class's current count).
minority_label = 4.0
minority_target = 40

class_counts = pd.Series(y_nm).value_counts()
sampling_strategy = {label: int(count) for label, count in class_counts.items()}
sampling_strategy[minority_label] = max(
    minority_target, sampling_strategy.get(minority_label, 0)
)

# NOTE(review): this resamples the full training set before the train/validation
# split performed later in the notebook, so synthetic samples can end up in the
# validation split and inflate its scores. Consider resampling only the
# training portion.
smote = SMOTE(random_state=42, sampling_strategy=sampling_strategy, k_neighbors=5)
X_nm, y_nm = smote.fit_resample(X_nm, y_nm)

### split train_test

In [7]:
# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the class proportions. The same settings are applied to the raw
# and to the normalized data.
split_options = {'test_size': 0.2, 'random_state': 2022}

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_options)

X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(
    X_nm, y_nm, stratify=y_nm, **split_options
)


### GridSearch

In [None]:

# Base XGBoost classifier for the 5-class problem; the grid below tunes the rest.
estimator_xgb = XGBClassifier(random_state=2023, objective='multi:softprob', num_class=5)

# 2 * 3 * 1 * 1 * 2 * 2 = 24 hyper-parameter combinations.
param_grid_xgb = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [300, 500, 800],
    'max_depth': [5],
    'gamma': [0.1],
    'min_child_weight': [2, 5],
    'colsample_bytree': [0.6, 1.0],
}

# Define the splitter here instead of relying on the `cv` created in the
# feature-selection cell: that cell may not have been run, which would leave
# `cv` undefined on a fresh kernel.
cv = StratifiedKFold(n_splits=5)

grid_xgb = GridSearchCV(
    estimator_xgb,
    param_grid_xgb,
    refit=True,  # refit the best configuration on the whole training split
    verbose=3,
    scoring='f1_macro',
    cv=cv,
)

grid_xgb.fit(X_train_nm, y_train_nm)

# Evaluate the refitted best model on the held-out validation split.
predictions = grid_xgb.predict(X_test_nm)

print("Best estimator:", grid_xgb.best_estimator_)


print(classification_report(y_test_nm, predictions))

### Modelo final (usar hiperparâmetros do GridSearch)

In [10]:

# Train the final model on ALL (normalized) training data using the best
# hyper-parameters found by the grid search. A clone is fitted rather than
# `grid_xgb.best_estimator_` itself: fitting that object in place would
# overwrite the model held by `grid_xgb`, so re-running the validation report
# afterwards would score a model that has already seen the validation rows.
xgb_model = skl.clone(grid_xgb.best_estimator_)
xgb_model.fit(X_nm, y_nm)

# Predict the class of every row in the (normalized) test set.
predictionsXGB = xgb_model.predict(test_normalized)

### Save the result

In [11]:
# Class index -> transition label used in the output file (list position == class index).
label_mapping = dict(enumerate(['AD-AD', 'CN-CN', 'MCI-AD', 'MCI-MCI', 'CN-MCI']))

# Translate each predicted class index into its label; an unknown index raises KeyError.
result_labels = [label_mapping[class_index] for class_index in predictionsXGB]

# One row per prediction, with RowId numbered from 1.
row_ids = range(1, len(predictionsXGB) + 1)
result_df = pd.DataFrame({'RowId': row_ids, 'Result': result_labels})

result_df.to_csv('datasets/XGB.csv', index=False)