# Projeto Integrador 6

## Predição de diabetes com modelos de IA

## Documentação do dataset: https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators

# DADOS BRUTOS

## Import das Bibliotecas e Pacotes

In [None]:
!pip install optuna
!pip install ucimlrepo

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Mak

In [None]:
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from plotly.io import show
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, make_scorer, confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, FunctionTransformer, RobustScaler
from ucimlrepo import fetch_ucirepo

## Import do Dataset

In [None]:
# Download the CDC Diabetes Health Indicators dataset (UCI repository id 891).
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

aux = cdc_diabetes_health_indicators.data.targets

# `assign` returns a new DataFrame, so the target column is added without writing into the
# frame held by the fetched dataset object. Assigning the column in place on
# `data.features` is what triggered pandas' SettingWithCopyWarning.
df = cdc_diabetes_health_indicators.data.features.assign(diabetes=aux["Diabetes_binary"])

print(cdc_diabetes_health_indicators.metadata)

print(cdc_diabetes_health_indicators.variables)

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Seeded so the preview shows the same rows on every run.
df.sample(20, random_state=10)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,diabetes
94130,1,1,1,27,0,0,0,1,1,1,...,1,2,0,0,0,0,6,6,5,0
249505,1,1,1,24,1,0,0,0,0,0,...,1,5,15,15,0,1,7,2,3,0
28956,0,0,1,42,0,0,0,0,1,0,...,0,3,0,5,1,0,9,6,7,0
854,0,1,1,35,0,0,0,1,1,1,...,0,2,0,0,0,0,10,6,8,0
55356,1,0,1,28,0,0,0,1,1,1,...,0,1,3,0,0,0,9,6,5,0
7522,1,1,1,38,0,0,0,0,0,1,...,0,3,5,2,0,0,9,4,6,0
113198,1,1,1,31,1,0,1,1,1,1,...,0,4,0,0,1,0,10,5,6,0
61097,1,0,1,32,1,0,0,0,0,1,...,0,3,0,2,0,1,8,4,6,0
29138,1,0,1,21,1,0,0,0,1,1,...,0,3,0,8,1,0,13,4,1,0
204706,1,0,1,32,0,0,0,0,0,0,...,0,2,0,0,0,1,5,6,8,0


# Separação de conjuntos estratificados de aprendizado, teste e validação

In [None]:
# Feature groups, named after the preprocessing each group receives later on.
categorical_columns = ['GenHlth', 'Age', 'Education', 'Income']
numerical_columns = ['BMI', 'MentHlth', 'PhysHlth']
binary_columns = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex']

# Features / target.
y = df["diabetes"]
X = df.drop(columns=["diabetes"])

# First split: hold out 20% as the test set, stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)
for labels in (y_train, y_test):
    print(labels.value_counts())

# Second split: carve a validation set (20% of the training data) out of the training set,
# again stratified on the target.
x_train_split, x_validation, y_train_split, y_validation = train_test_split(
    x_train, y_train,
    test_size=0.2,
    random_state=10,
    stratify=y_train,
)
for labels in (y_train_split, y_validation):
    print(labels.value_counts())

diabetes
0    174667
1     28277
Name: count, dtype: int64
diabetes
0    43667
1     7069
Name: count, dtype: int64
diabetes
0    139733
1     22622
Name: count, dtype: int64
diabetes
0    34934
1     5655
Name: count, dtype: int64


# PIPELINE

## KNN

### Pré-processamento + Seleção de Hiperparâmetros

In [None]:
def remove_outliers_iqr(x, y, columns):
    """Drop the rows whose value in any of `columns` lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table; must contain every name listed in `columns`.
    y : pandas.Series
        Labels, filtered with the same boolean mask as `x`.
    columns : iterable of str
        Columns of `x` to test for outliers.

    Returns
    -------
    tuple
        `(x_filtered, y_filtered)` holding only the rows that are inside
        [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (bounds inclusive) for every listed column.
    """
    keep = pd.Series(True, index=x.index)

    for name in columns:
        values = x[name]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        whisker = 1.5 * (q3 - q1)
        # `between` is inclusive on both ends by default.
        keep &= values.between(q1 - whisker, q3 + whisker)

    return x[keep], y[keep]



In [None]:
# Preview the first rows of the training split that is passed to the preprocessing step.
x_train_split.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
146728,0,1,1,27,1,0,0,1,1,1,...,1,0,2,0,0,0,0,8,5,8
77325,0,0,1,32,0,0,0,1,1,1,...,1,0,1,3,0,0,0,4,6,7
24237,0,1,1,32,0,0,0,1,0,1,...,1,0,3,0,0,0,1,9,5,7
29668,0,1,1,34,1,0,0,1,1,1,...,1,0,3,1,2,0,0,11,6,7
122763,1,1,1,24,0,0,1,1,1,1,...,1,0,4,0,30,0,0,13,4,3


In [None]:
# Optional IQR outlier removal on the training split (currently disabled):
# x_train_split, y_train_split = remove_outliers_iqr(x_train_split, y_train_split, columns=['BMI', 'PhysHlth', 'MentHlth'])

# Column-wise preprocessing for the KNN model:
#   num -> min-max scaling
#   cat -> one-hot encoding; handle_unknown='ignore' encodes a category that was not
#          present at fit time as all zeros instead of raising during transform/predict
#   bin -> passed through unchanged
preprocessor_knn = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('bin', 'passthrough', binary_columns)
    ])



In [None]:
# Both scipy sparse matrices and NumPy arrays expose `.shape`, so there is no need to
# materialise a dense copy of the whole matrix just to inspect its dimensions.
preprocessor_knn.fit_transform(x_train_split).shape

In [None]:
# The receiver was missing (a bare `.columns` is a syntax error). The feature columns
# listed in the cell output are those of the training split.
x_train_split.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [None]:
# Fit the preprocessor in this cell so `named_transformers_` is available without relying
# on a previously executed cell.
transformed_values = preprocessor_knn.fit_transform(x_train_split)

# ColumnTransformer can return a scipy sparse matrix; convert it to a dense array before
# building the DataFrame.
if hasattr(transformed_values, "toarray"):
    transformed_values = transformed_values.toarray()

# Names of the categorical columns after one-hot encoding.
cat_column_names = preprocessor_knn.named_transformers_['cat'].get_feature_names_out(categorical_columns)

# The output columns follow the order in which the transformers are declared in
# `preprocessor_knn` (num, cat, bin), so the names must be concatenated in that same order.
all_column_names = np.concatenate([numerical_columns, cat_column_names, binary_columns])

# Build the labelled DataFrame, keeping the row index of the training split.
transformed_data = pd.DataFrame(transformed_values, columns=all_column_names, index=x_train_split.index)

# Show the resulting dimensions.
print(transformed_data.shape)

(162355, 49)


In [None]:
def objective(trial):
    """Optuna objective: validation F1 of a KNN pipeline for one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `k` (2 to 15), `weights` and `metric`.

    Returns
    -------
    float
        F1 score on `(x_validation, y_validation)` of the pipeline fitted on
        `(x_train_split, y_train_split)`.
    """
    k = trial.suggest_int('k', 2, 15)  # upper bound lowered from 25 to 15
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    # NOTE: with KNeighborsClassifier's default p=2, 'minkowski' is the same distance as
    # 'euclidean', so these two options explore the same model.
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])

    # Each trial gets its own unfitted copy of the preprocessor: trials may run in
    # parallel, and fitting the shared `preprocessor_knn` instance from several trials at
    # once would have them overwrite each other's fitted state.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor_knn)),
        ('knn', KNeighborsClassifier(n_neighbors=k, weights=weights, metric=metric)),
    ])
    pipeline.fit(x_train_split, y_train_split)

    y_pred = pipeline.predict(x_validation)
    return f1_score(y_validation, y_pred)


# Seed the sampler so the suggested configurations can be repeated.
# NOTE: with n_jobs=-1 the trials run in parallel, so their completion order (and hence
# exact run-to-run reproducibility) is still not guaranteed; use n_jobs=1 for a strictly
# reproducible search.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=10),
)
study.optimize(objective, n_trials=10, n_jobs=-1)

study.best_params

[I 2024-10-31 21:04:08,418] A new study created in memory with name: no-name-7c807b01-1f15-4575-80c8-107d56894579
[I 2024-10-31 21:05:40,750] Trial 1 finished with value: 0.2613124851930822 and parameters: {'k': 5, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 0.2613124851930822.
[I 2024-10-31 21:07:15,078] Trial 2 finished with value: 0.15693215339233038 and parameters: {'k': 12, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 1 with value: 0.2613124851930822.
[I 2024-10-31 21:14:39,546] Trial 0 finished with value: 0.27609635577516983 and parameters: {'k': 4, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 0 with value: 0.27609635577516983.
[I 2024-10-31 21:16:12,672] Trial 4 finished with value: 0.16100230414746544 and parameters: {'k': 8, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.27609635577516983.
[I 2024-10-31 21:17:41,065] Trial 3 finished with value: 0.25783475783475784 and parameters: {'k': 

{'k': 4, 'weights': 'distance', 'metric': 'manhattan'}

In [None]:
# Final KNN, configured with the best hyperparameters reported by the Optuna study.
knn = KNeighborsClassifier(
    metric='manhattan',
    weights='distance',
    n_neighbors=4,
)

In [None]:
# Evaluate through the same preprocessing used during the hyperparameter search. Fitting
# KNN directly on the raw features would compute distances on unscaled, un-encoded
# columns, which is not the model that was tuned.
# A clone of the preprocessor is used so the shared `preprocessor_knn` is not refitted here.
final_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor_knn)),
    ('knn', knn),
])
final_pipeline.fit(x_train, y_train)

y_pred = final_pipeline.predict(x_test)

f1_score(y_test, y_pred)

0.27027528037816295

In [None]:


# Confusion matrix first, then accuracy, recall and precision on the test set (in that order).
print(confusion_matrix(y_test, y_pred))
for metric_fn in (accuracy_score, recall_score, precision_score):
    print(metric_fn(y_test, y_pred))

[[41405  2262]
 [ 5611  1458]]
0.844824187953327
0.20625265242608573
0.3919354838709677


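Acurácia aparentemente alta (≈ 0,845), mas F1 baixo (≈ 0,27). Como cerca de 86% dos exemplos de teste pertencem à classe 0 (43.667 de 50.736), essa acurácia fica abaixo até da de um classificador que sempre prevê "sem diabetes" (≈ 0,861). O recall de ≈ 0,21 mostra que o modelo identifica poucos casos positivos: ele não consegue generalizar para a classe minoritária (diabetes).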

# OBS - treinar 30 vezes com shuffle e fazer média e desvio padrão das métricas obtidas


# OBS 2 - Reavaliar separação dos dados. Executar divisão entre Treino, Teste e Validação.