## Max voting

In [1]:

import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import sklearn as skl

from imblearn.over_sampling import SMOTE
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFECV
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

#### Load datasets

In [2]:
# Read the pre-cleaned training and test tables from the local datasets folder.
train = pd.read_csv('datasets/train_clean.csv')
test = pd.read_csv('datasets/test_clean.csv')

### Normalizar dados

In [3]:
# Work on copies so the raw `train` / `test` frames are left untouched.
train_normalized = train.copy()
test_normalized = test.copy()

# Only the features are scaled; the 'Transition' target is re-attached as-is below.
X_scale = train_normalized.drop(columns=['Transition'])

# Fit the min-max scaler on the TRAINING features only and reuse that same
# scaler for the test set. Fitting a second scaler on the test set would map
# train and test through different min/max values, so a model trained on one
# would see inconsistently scaled inputs on the other.
scaler_X = MinMaxScaler(feature_range=(0, 1)).fit(X_scale)

# Select the test columns by the training feature names so both frames share
# the same columns in the same order. Test values outside the training range
# will fall outside [0, 1]; that is expected.
test_normalized = pd.DataFrame(
    scaler_X.transform(test_normalized[X_scale.columns]),
    columns=X_scale.columns,
    index=test_normalized.index,
)

# Keep the original row index so the concat below lines features up with
# their targets.
X_scale = pd.DataFrame(
    scaler_X.transform(X_scale),
    columns=X_scale.columns,
    index=X_scale.index,
)

train_normalized = pd.concat([X_scale, train_normalized['Transition']], axis=1)

#test_normalized.head()

### Split dados treino

In [4]:
# Separate the raw (unscaled) training frame into target and feature matrix.
y = train['Transition']
X = train.drop(columns=['Transition'])

#### Split Dados Treino normalizados

In [5]:
# Same target/feature separation, this time on the normalized training frame.
y_nm = train_normalized['Transition']
X_nm = train_normalized.drop(columns=['Transition'])


### Feature Selection

#### RFECV

In [6]:
# Recursive feature elimination with cross-validation, driven by a
# class-weighted random forest and scored with macro-F1 over 5 stratified folds.
# NOTE(review): the selector is fitted on the whole normalized training frame,
# before the hold-out split made later in the notebook, so hold-out scores may
# be optimistic.
cv = StratifiedKFold(5)
rf_model = RandomForestClassifier(
    n_estimators=500,
    class_weight='balanced',
    random_state=2022,
)

rfecv = RFECV(
    estimator=rf_model,
    step=25,
    min_features_to_select=50,
    cv=cv,
    scoring='f1_macro',
    n_jobs=-1,
)
rfecv.fit(X_nm, y_nm)

print(f"Optimal number of features: {rfecv.n_features_}")

# Apply the selector's boolean column mask to every feature frame (raw and
# normalized, train and test). The frames are overwritten in place, so this
# cell is not safe to re-run without re-running the cells above it.
X_nm = X_nm.iloc[:, rfecv.support_]
X = X.iloc[:, rfecv.support_]

test_normalized = test_normalized.iloc[:, rfecv.support_]
test = test.iloc[:, rfecv.support_]

Optimal number of features: 126


#### PCA

In [52]:
# Reduce the current normalized feature matrix to 80 principal components.
# The PCA is fitted on the training features (X_nm) and the same fitted
# projection is then applied to the test features.
# NOTE(review): X_nm and test_normalized are overwritten in place, so running
# this cell twice would apply PCA to already-projected data.
# NOTE(review): the saved execution counter of this cell is out of sequence
# with its neighbours — confirm whether PCA was part of the run that produced
# the scores reported further down.
pca = PCA(n_components=80)
X_nm = pca.fit_transform(X_nm)
test_normalized= pca.transform(test_normalized)

#### SMOTE (podes ou não fazer)

In [None]:

# Optional oversampling of the normalized training data with SMOTE.
# NOTE(review): the per-class target dict below is never passed to SMOTE — the
# call uses sampling_strategy='auto' instead. Confirm which one is intended.
# NOTE(review): resampling happens before the train/test split that follows,
# so synthetic samples can end up in the evaluation split — confirm this is
# acceptable.
sampling_strategy = {0.0: 60, 1.0: 96, 2.0: 68, 3.0: 71, 4.0: 40}# only increases class 4.0

smote = SMOTE(random_state=42,sampling_strategy='auto', k_neighbors = 5)
X_nm, y_nm = smote.fit_resample(X_nm, y_nm)

### Split train/test

In [7]:
# Hold out 20% of the rows for evaluation, stratified on the target so class
# proportions are kept. The same seed is used for both splits.

# Raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2022,
)

# Normalized features — this is the split the models below are trained on.
X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(
    X_nm,
    y_nm,
    stratify=y_nm,
    test_size=0.2,
    random_state=2022,
)

### Modelos


#### Decision Tree

In [8]:
# Baseline: a single decision tree (only the seed is set) trained and
# evaluated on the normalized hold-out split.
dt_model = DecisionTreeClassifier(random_state=2023)

dt_model.fit(X_train_nm, y_train_nm)

dt_score = dt_model.score(X_test_nm, y_test_nm)

print("Accuracy: %.2f%%" % (dt_score * 100))

dt_predictions = dt_model.predict(X_test_nm)

# zero_division=0: when a class is never predicted its precision is undefined;
# report it as 0.0 explicitly instead of emitting UndefinedMetricWarning.
print(classification_report(y_test_nm, dt_predictions, zero_division=0))


Accuracy: 29.51%
              precision    recall  f1-score   support

         0.0       0.10      0.08      0.09        12
         1.0       0.53      0.53      0.53        19
         2.0       0.20      0.29      0.24        14
         3.0       0.25      0.21      0.23        14
         4.0       0.00      0.00      0.00         2

    accuracy                           0.30        61
   macro avg       0.22      0.22      0.22        61
weighted avg       0.29      0.30      0.29        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Random Forest

In [9]:
# Class-weighted random forest with 500 trees, trained and evaluated on the
# normalized hold-out split. Note that this rebinds `rf_model`.
rf_model = RandomForestClassifier(random_state=2023, class_weight='balanced', n_estimators=500)

rf_model.fit(X_train_nm, y_train_nm)

rf_score = rf_model.score(X_test_nm, y_test_nm)

print("Accuracy: %.2f%%" % (rf_score * 100))

rf_predictions = rf_model.predict(X_test_nm)

# zero_division=0: when a class is never predicted its precision is undefined;
# report it as 0.0 explicitly instead of emitting UndefinedMetricWarning.
print(classification_report(y_test_nm, rf_predictions, zero_division=0))

Accuracy: 47.54%
              precision    recall  f1-score   support

         0.0       0.50      0.50      0.50        12
         1.0       0.58      0.95      0.72        19
         2.0       0.20      0.14      0.17        14
         3.0       0.38      0.21      0.27        14
         4.0       0.00      0.00      0.00         2

    accuracy                           0.48        61
   macro avg       0.33      0.36      0.33        61
weighted avg       0.41      0.48      0.42        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Support Vector

In [10]:
# Linear-kernel SVM with balanced class weights, trained and evaluated on the
# normalized hold-out split.
svm_model = SVC(C=0.1, class_weight='balanced', kernel='linear', random_state=2022)
svm_model.fit(X_train_nm, y_train_nm)

svm_score = svm_model.score(X_test_nm, y_test_nm)

print("Accuracy: %.2f%%" % (svm_score * 100))

svm_predictions = svm_model.predict(X_test_nm)

# zero_division=0 keeps the report warning-free if a class is never predicted,
# matching the other model cells.
print(classification_report(y_test_nm, svm_predictions, zero_division=0))


Accuracy: 47.54%
              precision    recall  f1-score   support

         0.0       0.36      0.33      0.35        12
         1.0       0.62      0.84      0.71        19
         2.0       0.31      0.29      0.30        14
         3.0       0.44      0.29      0.35        14
         4.0       0.50      0.50      0.50         2

    accuracy                           0.48        61
   macro avg       0.45      0.45      0.44        61
weighted avg       0.45      0.48      0.45        61



#### MLP

In [11]:
# Multi-layer perceptron with two hidden layers (100 and 50 units), tanh
# activation, trained with the L-BFGS solver.
# `early_stopping=True` was removed: scikit-learn only honours it for the
# 'sgd' and 'adam' solvers, so with solver='lbfgs' it had no effect and only
# suggested a validation-based stop that never happened.
mlp_model = MLPClassifier(activation='tanh', alpha=0.01,
              hidden_layer_sizes=(100, 50), max_iter=1000, random_state=2023,
              solver='lbfgs')

mlp_model.fit(X_train_nm, y_train_nm)

mlp_score = mlp_model.score(X_test_nm, y_test_nm)

print("Accuracy: %.2f%%" % (mlp_score * 100))

mlp_predictions = mlp_model.predict(X_test_nm)

# zero_division=0: when a class is never predicted its precision is undefined;
# report it as 0.0 explicitly instead of emitting UndefinedMetricWarning.
print(classification_report(y_test_nm, mlp_predictions, zero_division=0))


Accuracy: 44.26%
              precision    recall  f1-score   support

         0.0       0.46      0.50      0.48        12
         1.0       0.58      0.74      0.65        19
         2.0       0.22      0.14      0.17        14
         3.0       0.33      0.36      0.34        14
         4.0       0.00      0.00      0.00         2

    accuracy                           0.44        61
   macro avg       0.32      0.35      0.33        61
weighted avg       0.40      0.44      0.42        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Gradient Boosting

In [12]:
# Gradient boosting with a small learning rate and 500 boosting stages,
# trained and evaluated on the normalized hold-out split.
# GradientBoostingClassifier is imported in the notebook's top import cell.
gbc_model = GradientBoostingClassifier(learning_rate=0.01, min_samples_leaf=2,
                           n_estimators=500, random_state=2023)

gbc_model.fit(X_train_nm, y_train_nm)

gbc_score = gbc_model.score(X_test_nm, y_test_nm)

print("Accuracy: %.2f%%" % (gbc_score * 100))

gbc_predictions = gbc_model.predict(X_test_nm)

# zero_division=0: when a class is never predicted its precision is undefined;
# report it as 0.0 explicitly instead of emitting UndefinedMetricWarning.
print(classification_report(y_test_nm, gbc_predictions, zero_division=0))

Accuracy: 59.02%
              precision    recall  f1-score   support

         0.0       0.55      0.50      0.52        12
         1.0       0.61      0.89      0.72        19
         2.0       0.50      0.43      0.46        14
         3.0       0.70      0.50      0.58        14
         4.0       0.00      0.00      0.00         2

    accuracy                           0.59        61
   macro avg       0.47      0.46      0.46        61
weighted avg       0.57      0.59      0.57        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Stacking

In [13]:
# Stacked ensemble: a gradient-boosting base learner whose cross-validated
# predictions feed a linear SVM. passthrough=True also hands the original
# features to the final estimator. The splitter `cv` is the stratified
# 5-fold object created in the feature-selection cell.
# StackingClassifier is imported in the notebook's top import cell.
st_model = StackingClassifier(cv=cv,
                   estimators=[('gbc',
                                GradientBoostingClassifier(learning_rate=0.01,
                                                           min_samples_leaf=2,
                                                           n_estimators=500,
                                                           random_state=2023))],
                   final_estimator=SVC(C=0.1, class_weight='balanced',
                                       kernel='linear', random_state=2022),
                   passthrough=True)

st_model.fit(X_train_nm, y_train_nm)

#### Max Voting 

In [14]:
# Hard (majority) voting over the SVM, the stacked model and gradient boosting.
# Weights follow the order of `estimators`: svm=2, st=1, gbc=2.
# The decision tree, random forest and MLP fitted above are deliberately left
# out of the vote; add them to the list to experiment with a wider ensemble.
estimators = [("svm", svm_model), ("st", st_model), ("gbc", gbc_model)]

vt_model = VotingClassifier(estimators=estimators, voting='hard', weights=[2, 1, 2])

vt_model.fit(X_train_nm, y_train_nm)

vt_predictions = vt_model.predict(X_test_nm)

# zero_division=0 keeps the report warning-free if a class is never predicted,
# matching the other model cells.
print(classification_report(y_test_nm, vt_predictions, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.36      0.33      0.35        12
         1.0       0.65      0.89      0.76        19
         2.0       0.36      0.36      0.36        14
         3.0       0.67      0.43      0.52        14
         4.0       1.00      0.50      0.67         2

    accuracy                           0.54        61
   macro avg       0.61      0.50      0.53        61
weighted avg       0.54      0.54      0.53        61



#### Predictions

In [15]:
# Refit the voting ensemble on the full normalized training set (no hold-out),
# then predict the classes of the normalized test set.
predictionVT = vt_model.fit(X_nm, y_nm).predict(test_normalized)

#### Save the result

In [16]:
# Class code -> transition label; the list position is the numeric class code.
label_mapping = dict(enumerate(['AD-AD', 'CN-CN', 'MCI-AD', 'MCI-MCI', 'CN-MCI']))

# Translate each predicted class code into its label (an unknown code raises KeyError).
result_labels = list(map(label_mapping.__getitem__, predictionVT))

# One row per prediction, with 1-based row ids.
result_df = pd.DataFrame(
    {
        'RowId': range(1, len(predictionVT) + 1),
        'Result': result_labels,
    }
)

result_df.to_csv('datasets/MaxVoting.csv', index=False)