# Imports


In [40]:
!pip -q install yellowbrick
!pip -q install imbalanced-learn
import functools
import itertools
import math
import operator
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from scipy.stats import ttest_rel
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.impute import KNNImputer
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.classifier import ConfusionMatrix


# Base de Dados

#### Abrindo dataset

In [41]:
# Load the raw dataset; the relative path means '2015.csv' must sit in the
# kernel's working directory.
base = pd.read_csv('2015.csv')

In [None]:
# Show the raw frame as the cell output.
base

Binarização do atributo DIABETE3: o atributo é categórico nominal, mas aqui interessa apenas distinguir entre diabético e não diabético.

In [43]:
# Keep only the rows whose DIABETE3 value is 1 or 3, then recode 3 -> 0 so the
# column becomes a binary 1 / 0 target.
# .copy() detaches the subset from `base`, so the in-place recode below never
# writes into the raw frame (a separate full copy of `base` is not needed).
teste = base[base['DIABETE3'].isin([1, 3])].copy()
teste.loc[teste['DIABETE3'] == 3, 'DIABETE3'] = 0

Atributos selecionados com base na literatura

In [44]:
# Columns kept for the study; DIABETE3 (last entry) is the column that later
# cells use as the prediction target.
atributos_selecionados = [
    "BPMEDS", "_BMI5CAT", "TOLDHI2", "_SMOKER3",
    "CVDSTRK3", "CVDCRHD4", "EXERANY2", "_FRTLT1",
    "_VEGLT1", "_RFDRHV5", "CHCKIDNY", "PHYSHLTH",
    "GENHLTH", "_EDUCAG", "SEX", "_AGE_G",
    "INCOME2", "_PA150R2", "CHECKUP1", "DIABETE3",
]

In [45]:
# Restrict the frame to the selected attributes (the name `teste` is rebound
# to the narrower frame).
teste = teste[atributos_selecionados]

In [None]:
# Show the selected-attribute frame as the cell output.
teste

# Pré-Processamento

#### Remoção de instâncias e atributos com muitos dados nulos!

Processo de tratamento de valores fora do range de utilização

In [None]:
# Work on an independent copy: the clean-up below rewrites columns, and without
# the copy those writes would go through to `teste` (same object) and can
# trigger pandas' SettingWithCopyWarning.
base_pre = teste.copy()

# Columns in which every code >= 7 is treated as a missing answer.
above_seven_nullable = ['BPMEDS','TOLDHI2','EXERANY2','_PA150R2','_FRTLT1','_VEGLT1','CVDSTRK3','CVDCRHD4','CHCKIDNY','GENHLTH','_RFDRHV5','_SMOKER3','_EDUCAG']
# Columns that use 88 / 77 / 99 as special codes.
above_tirty_nullabe = ['PHYSHLTH','INCOME2']
# 88 becomes 0; 77 and 99 become missing; every other value is kept.
conditional = lambda x: 0 if x == 88 else (np.nan if x == 77 or x == 99 else x)

# Vectorised: blank out codes >= 7 in all listed columns at once
# (NaN >= 7 is False, so existing gaps are left untouched).
base_pre[above_seven_nullable] = base_pre[above_seven_nullable].mask(
    base_pre[above_seven_nullable] >= 7
)

for column in above_tirty_nullabe:
    base_pre[column] = base_pre[column].map(conditional)

# NOTE(review): only codes >= 9 are blanked for CHECKUP1, so codes 7 and 8 stay
# as ordinary values — confirm against the survey codebook that this is intended.
base_pre['CHECKUP1'] = base_pre['CHECKUP1'].mask(base_pre['CHECKUP1'] >= 9)
base_pre


Remoção de instâncias com base em uma taxa máxima de atributos nulos

In [None]:
# Drop every row whose fraction of missing attributes exceeds max_null_rate.
max_null_rate = 0.2
taxaNullPorLinha = base_pre.isna().mean(axis=1)

linhas_validas = taxaNullPorLinha.le(max_null_rate)
base_filtrada = base_pre.loc[linhas_validas]

# Row counts before and after the filter.
print("DataFrame original:")
print(len(base_pre))
print("\nDataFrame filtrado:")
print(len(base_filtrada))

In [None]:
# Show the row-filtered frame as the cell output.
base_filtrada

Remoção de atributos com muitos dados nulos

In [None]:
# Print the number of missing values of every column, in column order.
nulos_por_coluna = base_filtrada.isna().sum()
for col_name, num_nulls in nulos_por_coluna.items():
    print(f'Coluna: {col_name}, Número de valores nulos: {num_nulls}')

In [55]:
# Remove every attribute whose number of missing values exceeds taxaMaxNull of
# the rows. The columns to drop are computed once and removed in a single call,
# instead of rebinding `base_filtrada` while iterating over its own columns.
taxaMaxNull = 0.2
maxNulls = base_filtrada.shape[0] * taxaMaxNull

nulos_por_coluna = base_filtrada.isnull().sum()
colunas_removidas = nulos_por_coluna[nulos_por_coluna > maxNulls].index.tolist()

print("Atributos removidos:")
for col_name in colunas_removidas:
    print(col_name)

base_filtrada = base_filtrada.drop(columns=colunas_removidas)

BPMEDS


#### Visualização dos dados na matriz de correlação

In [None]:
# Pairwise correlation of the remaining attributes, drawn as an annotated
# heatmap on a fixed [-1, 1] colour scale.
matriz_correlation = base_filtrada.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    matriz_correlation,
    annot=True,
    fmt=".1f",
    annot_kws={"size": 11},
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Matriz de Correlação')
plt.show()



Com base na análise e visualização da matriz de correlação, o atributo EXERANY2 foi removido por ser menos significativo do que _PA150R2

In [57]:
# Drop EXERANY2; the result is a new frame, base_filtrada is left untouched.
base_selecao = base_filtrada.drop(columns=['EXERANY2'])

#### Balanceamento

In [None]:

# Separate the target from the predictors.
y = base_selecao['DIABETE3']
X = base_selecao.drop(columns=['DIABETE3'])

# Random undersampling with a fixed seed so the drawn subset is repeatable.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Rebuild a single frame: resampled predictors plus the resampled target.
base_balanceamento = pd.DataFrame(X_resampled, columns=X.columns)
base_balanceamento['DIABETE3'] = y_resampled

# Class counts before and after resampling.
for titulo, serie in (
    ("Original class distribution:", y),
    ("\nResampled class distribution:", y_resampled),
):
    print(titulo)
    print(serie.value_counts())



#### Dados Ausentes

In [59]:
# Fill the missing predictor values with a 5-nearest-neighbour imputer.
# The target column is kept out of the imputer's input: if it were included,
# neighbour distances — and therefore the imputed predictor values — would
# depend on the very label the models are later asked to predict.
nomeColunas = base_balanceamento.columns
colunas_preditoras = nomeColunas.drop('DIABETE3')

imputer = KNNImputer(n_neighbors =5)
X_imputed = imputer.fit_transform(base_balanceamento[colunas_preditoras])

base_notAusente = pd.DataFrame(X_imputed, columns=colunas_preditoras)
# Re-attach the untouched target positionally (the new frame has a fresh
# 0..n-1 index) and restore the original column order.
base_notAusente['DIABETE3'] = base_balanceamento['DIABETE3'].to_numpy(dtype=float)
base_notAusente = base_notAusente[nomeColunas]

Pelo fato de o KNNImputer poder atribuir valores em ponto flutuante a atributos categóricos, foi aplicado o processo de arredondamento

In [None]:
# Categorical columns whose imputed values are rounded back to whole codes
# (PHYSHLTH and DIABETE3 are not in the list).
colunas_arredondar = [
    "_BMI5CAT", "TOLDHI2", "_SMOKER3", "CVDSTRK3",
    "CVDCRHD4", "_FRTLT1", "_VEGLT1", "_RFDRHV5",
    "CHCKIDNY", "GENHLTH", "_EDUCAG", "SEX",
    "_AGE_G", "INCOME2", "_PA150R2", "CHECKUP1",
]
casas_decimais = 0

# base_round is bound to the SAME object as base_notAusente (no copy), so the
# rounding below is also visible through the name base_notAusente.
base_round = base_notAusente
valores_arredondados = base_round[colunas_arredondar].round(casas_decimais)
base_round[colunas_arredondar] = valores_arredondados
base_round

#### Remoção de Outliers

In [None]:
# Flag about 10% of the rows (contamination=0.1) as outliers with an Isolation
# Forest and keep the remaining rows.
# random_state is fixed so the same rows are removed on every run; without it
# each execution drops a different subset and nothing downstream is repeatable.
# base_round is read explicitly: it is the rounded frame produced by the
# previous step (the same object as base_notAusente in this notebook).
out_IF = IsolationForest(contamination=0.1, random_state=42).fit_predict(base_round)
out_IF_series = pd.Series(out_IF, index=base_round.index)

outliers = base_round[out_IF_series == -1]
# reset_index returns a new frame, which avoids an in-place change on a slice.
inliers = base_round[out_IF_series == 1].reset_index(drop=True)
base_notOutliers = inliers.copy()

print('\nDados com Outliers')
print('Nº Amostras:', outliers.shape[0])
print('\nDados sem Outliers')
print('Nº Amostras:', base_notOutliers.shape[0])


In [None]:
# Show the outlier-free frame as the cell output.
base_notOutliers

#### Redundância

In [63]:
# Remove fully duplicated rows; the result is a new frame and its index is not
# reset here.
base_notDuplicatas = base_notOutliers.drop_duplicates()

In [None]:
# Show the de-duplicated frame as the cell output.
base_notDuplicatas

#### Normalização

In [None]:
# Min-max scale PHYSHLTH, the only column listed for normalisation.
# The scaling is written into a copy so base_notDuplicatas (already displayed
# by an earlier cell) keeps its original values and pandas does not raise a
# SettingWithCopyWarning.
scaler = MinMaxScaler()
colunas_normalizar = ['PHYSHLTH']
base_normalizada = base_notDuplicatas.copy()
base_normalizada[colunas_normalizar] = scaler.fit_transform(base_normalizada[colunas_normalizar])

In [None]:
# Show the normalised frame as the cell output.
base_normalizada

Transformação de atributos categóricos dividindo em colunas binarizadas

In [67]:
# One-hot encode every categorical attribute; PHYSHLTH (numeric) and DIABETE3
# (target) are passed through unchanged as 'remainder__<name>' columns.
base_categorica = base_normalizada
nome_colunas = base_categorica.columns

# Columns are selected by name rather than by hard-coded position, so the
# encoding keeps working if an earlier step drops a different set of attributes.
colunas_passthrough = ['PHYSHLTH', 'DIABETE3']
colunas_categoricas = [c for c in nome_colunas if c not in colunas_passthrough]

onehotencoder = ColumnTransformer(
    transformers=[('OneHot', OneHotEncoder(), colunas_categoricas)],
    remainder='passthrough',
    # Always return a dense array: pd.DataFrame below cannot be built from a
    # SciPy sparse matrix, which is what low-density output would otherwise be.
    sparse_threshold=0,
)
X_sparse = onehotencoder.fit_transform(base_categorica)
novas_colunas = onehotencoder.get_feature_names_out()
X_prev = pd.DataFrame(X_sparse, columns=novas_colunas)
base_final = X_prev

In [None]:
# Show the fully encoded frame as the cell output.
base_final

#### Análise de redução de dimensionalidade

In [None]:
# Visual check of class separability on a stratified 10% sample:
# PCA down to 2 components, then t-SNE on that projection, then a scatter plot.
X = base_final.drop(columns=['remainder__DIABETE3'])
y = base_final['remainder__DIABETE3']
print(y.value_counts())

sample_size = 0.1
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, stratify=y, random_state=42)
print(y_sample.value_counts())

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_sample)

# The iteration count is left at the library default (1000, the value that was
# passed explicitly before): the `n_iter` keyword is deprecated in newer
# scikit-learn releases, so omitting it keeps the cell working across versions.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_pca)

# NOTE(review): the two plotted axes are t-SNE dimensions although the labels
# say 'Componente Principal' — consider renaming them.
df = pd.DataFrame(data=X_tsne, columns=['Componente Principal 1', 'Componente Principal 2'])
df['Classe'] = y_sample.reset_index(drop=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='Componente Principal 1',
                y='Componente Principal 2',
                hue='Classe',
                palette=sns.color_palette("hls", len(df['Classe'].unique())),
                data=df,
                legend="full",
                ax=ax)
ax.set_title('t-SNE - Conjunto de Dados Diabetes')
ax.set_xlabel('Componente Principal 1')
ax.set_ylabel('Componente Principal 2')
ax.legend(title='Classe')
plt.show()

# Treinamento

In [71]:
# Target vector and predictor matrix used by every model below.
coluna_alvo = 'remainder__DIABETE3'
y = base_final[coluna_alvo]
X = base_final.drop(columns=[coluna_alvo]).copy()

#### Cálculo de Hiperparâmetros com GridSearchCV

In [83]:
# Base estimators and the hyper-parameter grids explored by the grid searches.
# Every estimator gets a fixed random_state: all three are stochastic, and
# without a seed the selected parameters and scores change from run to run.
rf = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

dt = DecisionTreeClassifier(random_state=42)
dt_param_grid = {
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

mlp = MLPClassifier(max_iter=1000, random_state=42)
mlp_param_grid = {
    # One hidden layer of 3, 5 or 10 units, written as tuples to match the
    # (3,) form used when the final MLP is built.
    'hidden_layer_sizes': [(3,), (5,), (10,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive']
}

In [None]:
# Seeded, shuffled, stratified 5-fold splitter shared by the grid searches and
# score computations that follow.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive accuracy-driven search over the Random Forest grid.
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=cv,
    verbose=1,
)
rf_grid_search.fit(X, y)


In [None]:
# Exhaustive accuracy-driven search over the Decision Tree grid, reusing the
# `cv` splitter defined earlier.
dt_grid_search = GridSearchCV(
    estimator=dt,
    param_grid=dt_param_grid,
    scoring='accuracy',
    cv=cv,
    verbose=1,
)
dt_grid_search.fit(X, y)


In [None]:
# Exhaustive accuracy-driven search over the MLP grid, reusing the `cv`
# splitter defined earlier.
mlp_grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=mlp_param_grid,
    scoring='accuracy',
    cv=cv,
    verbose=1,
)
mlp_grid_search.fit(X, y)

In [None]:
def _acuracia_cv(estimator):
    """Return the per-fold accuracy of `estimator` on X, y using the shared `cv` splitter."""
    return cross_val_score(estimator, X, y, cv=cv, scoring='accuracy')


# For each search: print the winning parameters, take the refitted best
# estimator and report mean / standard deviation of its fold accuracies.
print("Melhores Parâmetros do Random Forest:")
print(rf_grid_search.best_params_)
rf_best = rf_grid_search.best_estimator_
rf_scores = _acuracia_cv(rf_best)
print(f"Acurácia do Cross-Validation do Random Forest : {np.mean(rf_scores):.4f} (+/- {np.std(rf_scores):.4f})")

print("\nMelhores Parâmetros da Decision Tree:")
print(dt_grid_search.best_params_)
dt_best = dt_grid_search.best_estimator_
dt_scores = _acuracia_cv(dt_best)
print(f"Acurácia do Cross-Validation do Decision Tree: {np.mean(dt_scores):.4f} (+/- {np.std(dt_scores):.4f})")

print("\nMelhores Parâmetros da Neural Network:")
print(mlp_grid_search.best_params_)
mlp_best = mlp_grid_search.best_estimator_
mlp_scores = _acuracia_cv(mlp_best)
print(f"Acurácia do Cross-Validation da Neural Network: {np.mean(mlp_scores):.4f} (+/- {np.std(mlp_scores):.4f})")


#### Treinamento dos Modelos baseado nos melhores parâmetros 

In [None]:
# Final models with hard-coded hyper-parameters. These rebind rf_best / dt_best
# / mlp_best, replacing the estimators taken from the grid searches.
# NOTE(review): the values are presumably the parameters a previous grid-search
# run selected — confirm they still match `best_params_` after a fresh run.
# random_state is fixed on every model so the predictions, and the reports
# built from them, are the same on every execution.
rf_best = RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=200, random_state=42)

dt_best = DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_split=5, random_state=42)

mlp_best = MLPClassifier(max_iter=1000, activation='tanh', alpha=0.05, hidden_layer_sizes=(3,), learning_rate='constant', solver='adam', random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions: each row is predicted by a model that was fitted on
# the other folds.
y_pred_rf = cross_val_predict(rf_best, X, y, cv=cv)
y_pred_dt = cross_val_predict(dt_best, X, y, cv=cv)
y_pred_mlp = cross_val_predict(mlp_best, X, y, cv=cv)

# Resultados

In [None]:

# One entry per model: (report header, out-of-fold predictions, plot title,
# x-axis label). The same report is then produced for each entry in turn.
relatorios = [
    ("Cálculo das métricas do Random Forest :", y_pred_rf,
     'Matrix de Confusão do Random Forest:', 'Previsado'),
    ("Cálculo das métricas da Decision Tree:", y_pred_dt,
     'Matrix de Confusão do Decision Tree', 'Previsado'),
    ("Cálculo das métricas da Neural Network (MLP):", y_pred_mlp,
     'Matrix de Confusão do Neural Network (MLP)', 'Previsão'),
]

for cabecalho, y_pred, titulo, rotulo_x in relatorios:
    # Text report: per-class metrics followed by the raw confusion matrix.
    print(cabecalho)
    print(classification_report(y, y_pred))
    print("Matrix de Confusão:")
    matriz = confusion_matrix(y, y_pred)
    print(matriz)

    # Same confusion matrix drawn as an annotated heatmap.
    plt.figure(figsize=(8, 6))
    sns.heatmap(matriz, annot=True, fmt='d', cmap='Blues')
    plt.title(titulo)
    plt.xlabel(rotulo_x)
    plt.ylabel('True')
    plt.show()

#### Teste T

In [None]:
# Paired t-tests on the per-fold accuracies of the three models. The score
# arrays were all computed with the same seeded `cv` splitter, so position i of
# each array refers to the same fold. `ttest_rel` is imported with the other
# imports at the top of the notebook.

# Random Forest vs. Decision Tree
t_stat_rf_dt, p_value_rf_dt = ttest_rel(rf_scores, dt_scores)
print(f"T-test between Random Forest and Decision Tree: t_stat={t_stat_rf_dt:.4f}, p_value={p_value_rf_dt:.4f}")

# Random Forest vs. MLP
t_stat_rf_mlp, p_value_rf_mlp = ttest_rel(rf_scores, mlp_scores)
print(f"T-test between Random Forest and MLP: t_stat={t_stat_rf_mlp:.4f}, p_value={p_value_rf_mlp:.4f}")

# Decision Tree vs. MLP
t_stat_dt_mlp, p_value_dt_mlp = ttest_rel(dt_scores, mlp_scores)
print(f"T-test between Decision Tree and MLP: t_stat={t_stat_dt_mlp:.4f}, p_value={p_value_dt_mlp:.4f}")