# Decision Tree

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
import seaborn as sns
%matplotlib inline

### Load csv

In [5]:
test = pd.read_csv('datasets/test_clean.csv')
train = pd.read_csv('datasets/train_clean.csv')

### Normalizar dados

In [6]:

train_normalized = train.copy()
test_normalized = test.copy()

X_scale = train_normalized.drop(columns=['Transition'])

scaler_X = MinMaxScaler(feature_range=(0,1)).fit(X_scale)
scaler_y = MinMaxScaler(feature_range=(0,1)).fit(test_normalized)
X_scale = pd.DataFrame(scaler_X.transform(X_scale[X_scale.columns]), columns=X_scale.columns)
test_normalized = pd.DataFrame(scaler_y.transform(test_normalized[test_normalized.columns]), columns=test_normalized.columns)

train_normalized = pd.concat([X_scale, train_normalized['Transition']], axis=1)

#test_normalized.head()

### Split dados treino

In [7]:
X = train.drop(columns=['Transition'])
y = train['Transition']

#### Split dados treino normalizados

In [8]:
X_nm = train_normalized.drop(columns=['Transition'])
y_nm = train_normalized['Transition']

### Feature Selection(tentar com pca??)

#### RFECV

In [9]:

from sklearn.ensemble import RandomForestClassifier


dt_model = RandomForestClassifier(random_state=2022, n_estimators= 500, class_weight= 'balanced')
cv = StratifiedKFold(5)

rfecv = RFECV(estimator=dt_model, step=25, cv=cv, scoring='f1_macro',min_features_to_select=50, n_jobs=-1)

rfecv.fit(X_nm, y_nm)

print(f"Optimal number of features: {rfecv.n_features_}")

X_nm = X_nm.iloc[:, rfecv.support_]

test_normalized = test_normalized.iloc[:, rfecv.support_]


Optimal number of features: 126


#### PCA

In [None]:
from sklearn.decomposition import PCA


pca = PCA(n_components=100)
X_nm = pca.fit_transform(X_nm)
test_normalized= pca.transform(test_normalized)

#### SMOTE

In [None]:

sampling_strategy = {0.0: 60, 1.0: 96, 2.0: 68, 3.0: 71, 4.0: 40}#só aumenta para 4.0

smote = SMOTE(random_state=42,sampling_strategy=sampling_strategy,k_neighbors = 3)
X_nm, y_nm = smote.fit_resample(X_nm, y_nm)

### Split train_test

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2022
)

X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(
    X_nm, y_nm, test_size=0.2, random_state=2022, stratify=y_nm
)

### GridSearch

In [11]:


estimator_rf = DecisionTreeClassifier(random_state=2022)


param_grid_rf = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 20],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2', 0.1, 0.2],  # Fração dos 1500 recursos
    'class_weight': [None, 'balanced']
}

grid_rf = GridSearchCV(
    estimator_rf, 
    param_grid_rf, 
    refit=True, 
    verbose = 1,
    cv=cv
    )

grid_rf.fit(X_train_nm, y_train_nm)

predictions = grid_rf.predict(X_test_nm)

print("Best estimator:", grid_rf.best_estimator_)


print(classification_report(y_test_nm, predictions))

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best estimator: DecisionTreeClassifier(max_depth=5, max_features='sqrt', min_samples_leaf=5,
                       min_samples_split=5, random_state=2022)
              precision    recall  f1-score   support

         0.0       0.25      0.17      0.20        12
         1.0       0.48      0.58      0.52        19
         2.0       0.32      0.43      0.36        14
         3.0       0.18      0.14      0.16        14
         4.0       0.00      0.00      0.00         2

    accuracy                           0.34        61
   macro avg       0.25      0.26      0.25        61
weighted avg       0.31      0.34      0.32        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Modelo

In [None]:
dt_model = grid_rf.best_estimator_
dt_model.fit(X_nm, y_nm)
predictionsDT = dt_model.predict(test_normalized)

### Saving the result

In [None]:
label_mapping = {
    0: 'AD-AD',
    1: 'CN-CN',
    2: 'MCI-AD',
    3: 'MCI-MCI',
    4: 'CN-MCI'
}

result_labels = [label_mapping[pred] for pred in predictionsDT]
result_df = pd.DataFrame({
    'RowId': range(1, len(predictionsDT) + 1),
    'Result': result_labels
})

result_df.to_csv('datasets/decision_tree.csv', index=False)