# Bibliotecas

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


# Preparação dos Dados

In [2]:
# Read the raw dataset from disk.
data = pd.read_csv('dados.csv')

# Keep only the columns of interest, discard rows with any missing value and
# renumber the index from zero -- done as one chain instead of in-place edits.
# NOTE(review): `dados_pre` is not referenced by any later cell shown here;
# the modelling cells below read a different CSV into `df`.
dados_pre = (
    data[['M/F', 'Age', 'Educ', 'SES', 'MMSE', 'eTIV', 'nWBV', 'CDR']]
    .dropna()
    .reset_index(drop=True)
)
dados_pre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   M/F     216 non-null    object 
 1   Age     216 non-null    int64  
 2   Educ    216 non-null    float64
 3   SES     216 non-null    float64
 4   MMSE    216 non-null    float64
 5   eTIV    216 non-null    int64  
 6   nWBV    216 non-null    float64
 7   CDR     216 non-null    float64
dtypes: float64(5), int64(2), object(1)
memory usage: 13.6+ KB


In [11]:
# Load the dataset used by every modelling cell below.
# NOTE(review): judging by the file name this is an already-imputed copy of the
# OASIS cross-sectional data -- confirm provenance and how it was imputed.
df = pd.read_csv('arquivos/oasis_cross-sectional_imputado.csv')

In [12]:
# Summary statistics of the numeric columns, as a quick sanity check of counts and ranges.
df.describe()

Unnamed: 0,Age,Educ,SES,MMSE,eTIV,nWBV,CDR
count,436.0,436.0,436.0,436.0,436.0,436.0,436.0
mean,51.357798,3.580275,2.394495,28.233945,1481.919725,0.79167,0.15367
std,25.269862,0.937336,0.711396,3.024854,158.740866,0.059937,0.31515
min,18.0,2.0,2.0,14.0,1123.0,0.644,0.0
25%,23.0,3.0,2.0,28.0,1367.75,0.74275,0.0
50%,54.0,4.0,2.0,29.0,1475.5,0.809,0.0
75%,74.0,4.0,3.0,30.0,1579.25,0.842,0.0
max,96.0,5.0,5.0,30.0,1992.0,0.893,2.0


In [17]:
# Class distribution of the target column (CDR) before splitting.
df['CDR'].value_counts()

CDR
0.0    336
0.5     70
1.0     28
2.0      2
Name: count, dtype: int64

In [19]:
# Features: every column except the target.
X = df.drop(columns='CDR')

# Target: CDR scores cast to strings, so the labels become '0.0', '0.5', ...
y = df['CDR'].astype(str)

# Hold out 20% of the rows for testing, stratified on the target so each CDR
# level keeps roughly its original proportion in both splits.
# NOTE(review): the rarest class has only 2 rows (see the value counts above),
# so it cannot be represented in both splits in a meaningful way.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [20]:
# Column groups by feature type
categorical = ['M/F']
ordinal = ['Educ', 'SES']
numeric = ['Age', 'MMSE', 'eTIV', 'nWBV']

# Preprocessors
#
# Both encoders map category levels that were NOT present when the encoder was
# fitted to the sentinel code -1 instead of raising ValueError at transform
# time. Without this, a rare Educ/SES level that ends up only in the test
# split -- or that shows up later when the persisted model is used for
# inference -- would crash `transform`/`predict`.
categorical_transformer = Pipeline(steps=[
    # Fill missing values with the most frequent category.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Integer-encode the categories (or OneHotEncoder, if preferred).
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                               unknown_value=-1)),
])

ordinal_transformer = Pipeline(steps=[
    # Fill missing values with the most frequent level.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Integer-encode the levels; with numeric inputs the codes follow the
    # sorted order of the values seen during fit.
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                                       unknown_value=-1)),
])

numeric_transformer = Pipeline(steps=[
    # Fill missing values with the column mean, then standardise.
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Full preprocessing pipeline: each column group goes through its own
# transformer; columns not listed above are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical),
        ('ord', ordinal_transformer, ordinal),
        ('num', numeric_transformer, numeric),
    ])


In [21]:
# Class distribution of the training target, checked before any resampling.
y_train.value_counts()

CDR
0.0    268
0.5     56
1.0     22
2.0      2
Name: count, dtype: int64

In [23]:
# Cell-local imports for the resampling step.
from collections import Counter

from imblearn.over_sampling import SMOTE

# Fit the preprocessing on the training split only and transform it.
X_train_transformed = preprocessor.fit_transform(X_train)

# Oversample the minority classes with SMOTE on the transformed features.
# NOTE(review): k_neighbors=1 is presumably required because the smallest class
# has only 2 training rows (see y_train.value_counts() above); every synthetic
# row of that class is then built from the same pair of real samples.
# NOTE(review): SMOTE also interpolates the integer-encoded M/F, Educ and SES
# columns, which can yield non-integer codes -- confirm this is acceptable.
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_res, y_train_res = smote.fit_resample(X_train_transformed, y_train)

# Report the class distribution after resampling.
print("Distribuição das classes após SMOTE:", Counter(y_train_res))

Distribuição das classes após SMOTE: Counter({'0.0': 268, '0.5': 268, '1.0': 268, '2.0': 268})


In [24]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree (4 * 3 * 3 * 2 * 3 = 216 candidates).
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    criterion=['gini', 'entropy'],
    max_features=[None, 'sqrt', 'log2'],
)

# Exhaustive search with 5-fold cross-validation, scored by macro-averaged F1
# so that every class weighs the same regardless of its size.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
)

# NOTE(review): the search is fitted on data that was already resampled with
# SMOTE, so each validation fold contains synthetic rows derived from rows in
# the training folds; the cross-validated score is therefore likely optimistic.
grid_search.fit(X_train_res, y_train_res)

print("Melhores parâmetros encontrados:", grid_search.best_params_, sep="\n")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Melhores parâmetros encontrados:
{'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [25]:
# Tuned tree selected by the grid search.
best_tree = grid_search.best_estimator_

# Apply the preprocessing fitted on the training split to the held-out rows
# and predict on the transformed (encoded / standardised) features.
X_test_transformed = preprocessor.transform(X_test)
y_pred = best_tree.predict(X_test_transformed)

# Confusion matrix followed by the per-class precision / recall / F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)


[[63  4  1]
 [ 6  6  2]
 [ 0  3  3]]
              precision    recall  f1-score   support

         0.0       0.91      0.93      0.92        68
         0.5       0.46      0.43      0.44        14
         1.0       0.50      0.50      0.50         6

    accuracy                           0.82        88
   macro avg       0.62      0.62      0.62        88
weighted avg       0.81      0.82      0.82        88



In [26]:
# Metric helpers (re-imported here; they are also imported in the first cell).
from sklearn.metrics import classification_report, confusion_matrix

# Baseline: decision tree with default hyper-parameters, trained on the
# SMOTE-balanced training data.
clf = DecisionTreeClassifier(random_state=42).fit(X_train_res, y_train_res)

# Transform the held-out rows with the already-fitted preprocessing.
X_test_transformed = preprocessor.transform(X_test)

# Evaluate on the untouched test split.
y_pred = clf.predict(X_test_transformed)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

[[63  5  0]
 [ 4  7  3]
 [ 1  2  3]]
              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93        68
         0.5       0.50      0.50      0.50        14
         1.0       0.50      0.50      0.50         6

    accuracy                           0.83        88
   macro avg       0.64      0.64      0.64        88
weighted avg       0.83      0.83      0.83        88



In [5]:
# End-to-end pipeline: preprocessing followed by a default decision tree.
# NOTE(review): this trains on the original (unbalanced) training split, with
# neither the SMOTE step nor the tuned hyper-parameters used in other cells.
# NOTE(review): the pipeline holds the same `preprocessor` object used by the
# other cells, so fitting it here refits that shared instance.
clf_tree = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

clf_tree.fit(X_train, y_train)


In [6]:
# Evaluate the end-to-end pipeline on the raw (untransformed) test features.
# NOTE(review): the output recorded below reports 44 test rows, whereas the
# split evaluated in the other cells has 88; the execution counts are also out
# of order, so re-run with Restart & Run All before comparing the models.
y_pred = clf_tree.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)


[[19  8  0]
 [ 1  6  5]
 [ 0  3  2]]
              precision    recall  f1-score   support

         0.0       0.95      0.70      0.81        27
         0.5       0.35      0.50      0.41        12
         1.0       0.29      0.40      0.33         5

    accuracy                           0.61        44
   macro avg       0.53      0.53      0.52        44
weighted avg       0.71      0.61      0.65        44



In [7]:
import joblib

# Persist the fitted end-to-end pipeline (preprocessing + decision tree) to disk.
# NOTE(review): the saved model is `clf_tree`, not the grid-searched `best_tree`
# nor the SMOTE-trained `clf` -- confirm this is the intended model to ship.
# joblib files are pickle-based: only load this file from a trusted source.
joblib.dump(clf_tree, 'decision_tree_model.pkl')

['decision_tree_model.pkl']