In [1]:
import pandas as pd
import numpy as np

# Leer Dataset

In [2]:
# Remote location of the heart_2020_cleaned.csv dataset.
DATA_URL = 'https://raw.githubusercontent.com/LEON240196/PF_ML/main/heart_2020_cleaned.csv'

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [4]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

¿Cuántas variables hay?

In [5]:
# Tally the columns overall and by dtype family (object vs. numeric).
n = df.shape[1]
n_cat = df.select_dtypes(include="object").shape[1]
n_num = df.select_dtypes(include="number").shape[1]

for label, value in (
    ("número de variables", n),
    ("número de variables categóricas", n_cat),
    ("número de variables numéricas", n_num),
):
    print(f"{label}: {value}")

número de variables: 18
número de variables categóricas: 14
número de variables numéricas: 4


## Análisis Exploratorio de Datos

In [6]:
# Summary statistics of the numeric columns, transposed so each variable is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BMI,319795.0,28.325399,6.3561,12.02,24.03,27.34,31.42,94.85
PhysicalHealth,319795.0,3.37171,7.95085,0.0,0.0,0.0,2.0,30.0
MentalHealth,319795.0,3.898366,7.955235,0.0,0.0,0.0,3.0,30.0
SleepTime,319795.0,7.097075,1.436007,1.0,6.0,7.0,8.0,24.0


In [7]:
# Number of rows (observations) in the dataset.
df.shape[0]

319795

* No hay valores nulos!
* La media de BMI parece tener al menos un orden de magnitud de diferencia respecto a las otras variables numéricas
* Aun así, el máximo de BMI no supera en un orden de magnitud al del resto de las variables. Probablemente su distribución esté sesgada

In [8]:
# Summary (count / unique / top / freq) of the object-dtype columns, one row per variable.
df.describe(include="object").T

Unnamed: 0,count,unique,top,freq
HeartDisease,319795,2,No,292422
Smoking,319795,2,No,187887
AlcoholDrinking,319795,2,No,298018
Stroke,319795,2,No,307726
DiffWalking,319795,2,No,275385
Sex,319795,2,Female,167805
AgeCategory,319795,13,65-69,34151
Race,319795,6,White,245212
Diabetic,319795,4,No,269653
PhysicalActivity,319795,2,Yes,247957


Sabemos que tenemos 319 795 observaciones, de las cuales 292 422 (≈91 %) pertenecen a la clase negativa. Hay que balancear los datos.

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Density of each numeric feature on a 2x2 grid.
# sns.distplot is deprecated (seaborn >= 0.11); histplot(kde=True, stat="density")
# is its documented replacement. bins=50 mirrors distplot's upper cap on bin count.
# A single loop replaces four copy-pasted plotting stanzas.
fig, ax = plt.subplots(2, 2, figsize = (20, 10))

numeric_panels = [
    ("BMI", "Densidad de probabilidad - BMI"),
    ("PhysicalHealth", "Densidad de probabilidad - Physical Health"),
    ("MentalHealth", "Densidad de probabilidad - Mental Health"),
    ("SleepTime", "Densidad de probabilidad - Sleep Time"),
]

# ax.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for axis, (column, title) in zip(ax.flat, numeric_panels):
    sns.histplot(df[column], kde=True, stat="density", bins=50, ax=axis)
    axis.set_title(title)



Parece que Sleep Time y BMI son más o menos simétricas. Dependiendo de qué modelo decidamos usar, puede que a las otras debamos aplicarles alguna transformación
logarítmica.

In [None]:
# Per-class density of each numeric feature on a 2x2 grid.
# sns.distplot is deprecated (seaborn >= 0.11); histplot with `hue` replaces the
# paired calls and adds a legend, so the 'No' and 'Yes' curves can be told apart.
# common_norm=False normalises each class on its own, as the separate distplot
# calls did; bins=50 mirrors distplot's upper cap on bin count.
fig, ax = plt.subplots(2, 2, figsize = (20, 10))

class_panels = [
    ("BMI", "Densidad de probabilidad - BMI"),
    ("PhysicalHealth", "Densidad de probabilidad - Physical Health"),
    ("MentalHealth", "Densidad de probabilidad - Mental Health"),
    ("SleepTime", "Densidad de probabilidad - Sleep Time"),
]

# ax.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for axis, (column, title) in zip(ax.flat, class_panels):
    sns.histplot(
        data=df,
        x=column,
        hue="HeartDisease",
        hue_order=["No", "Yes"],
        stat="density",
        common_norm=False,
        kde=True,
        bins=50,
        ax=axis,
    )
    axis.set_title(title)

Hay muy poca separabilidad entre las clases.
### Ahora veamos las variables categóricas

In [None]:
# Rows per (Sex, HeartDisease) pair; the BMI column only carries the group count.
group_keys = ['Sex', 'HeartDisease']
sex_df = df.groupby(group_keys)['BMI'].count().reset_index()
sex_df

In [None]:
## since there are no nulls, the count of any column can be used as the group size

sns.barplot(x="Sex", y="BMI", hue="HeartDisease", data = sex_df)

Parece que proporcionalmente, hay más hombres con enfermedades cardiacas. 
Variables como beber alcohol o fumar seguramente proveen separabilidad.

In [None]:
# Rows per (AlcoholDrinking, HeartDisease) pair, stored in the BMI column.
alcohol_df = (
    df.groupby(['AlcoholDrinking', 'HeartDisease'])
    .BMI.count()
    .reset_index()
)

sns.barplot(data=alcohol_df, x="AlcoholDrinking", y="BMI", hue="HeartDisease")

Difícil decir a simple vista. Veamos las proporciones.

In [None]:
# Boolean masks over the aggregated (AlcoholDrinking, HeartDisease) count table.
abstains = alcohol_df["AlcoholDrinking"] == 'No'
drinks = alcohol_df["AlcoholDrinking"] == 'Yes'
is_sick = alcohol_df["HeartDisease"] == 'Yes'
group_sizes = alcohol_df["BMI"]

# Group totals regardless of heart-disease status.
non_drinkers = group_sizes[abstains].sum()
drinkers = group_sizes[drinks].sum()

# .sum() collapses the matching row(s) to a scalar instead of a Series.
clean_and_sick = group_sizes[abstains & is_sick].sum()
drink_and_sick = group_sizes[drinks & is_sick].sum()

sick_rate_non_drinkers = float(clean_and_sick) / non_drinkers
sick_rate_drinkers = float(drink_and_sick) / drinkers

print(f"{sick_rate_non_drinkers} es la proporción de gente enferma que no toma alcohol")
print(f"{sick_rate_drinkers} es la proporción de gente enferma que sí toma alcohol")

No nos dice mucho 

In [None]:
# Rows per (Stroke, HeartDisease) pair; BMI carries the count.
test_df = df.groupby(["Stroke", 'HeartDisease'])['BMI'].count().reset_index()
# Rows per HeartDisease class; Smoking carries the count.
class_sizes = df.groupby(['HeartDisease'])['Smoking'].count().reset_index()

# Attach the class size to every pair and express the pair count as a share of its class.
total_df = test_df.merge(class_sizes, how="left", on="HeartDisease")
total_df['percent'] = total_df['BMI'].astype('float') / total_df['Smoking']

total_df

In [None]:
# Drop the raw counts, then stack the Stroke shares within each HeartDisease class.
plot_df = total_df.drop(columns=['Smoking', 'BMI'])
stroke_shares = plot_df.pivot(index="HeartDisease", columns="Stroke", values="percent")
stroke_shares.plot(kind='bar', stacked=True)

In [None]:
def plot_categorical(df: pd.DataFrame, feature: str, target: str = "HeartDisease"):
    """Draw a stacked bar chart of how ``feature`` is distributed within each class.

    For every value of ``target`` the bar shows the share of that class's rows
    falling in each category of ``feature``, so each bar sums to 1.

    The previous implementation counted rows through the ``BMI`` and ``Smoking``
    columns and merged the two counts; with ``feature="Smoking"`` the merge
    renamed the columns to ``Smoking_x`` / ``Smoking_y`` and the function failed.
    ``pd.crosstab`` needs no helper columns, so any categorical column works.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both ``feature`` and ``target`` columns.
    feature : str
        Categorical column whose per-class distribution is plotted.
    target : str, default "HeartDisease"
        Class column; one bar is drawn per distinct value.

    Raises
    ------
    KeyError
        If ``feature`` or ``target`` is not a column of ``df``.
    """
    # normalize="index" divides each row (one per class) by that class's total.
    shares = pd.crosstab(df[target], df[feature], normalize="index")
    shares.plot(kind='bar', stacked=True)

In [None]:
# Distribution of age groups within each HeartDisease class.
plot_categorical(df, "AgeCategory")

Parece que ciertos grupos de edad están mejor representados en la clase positiva. Podría ser una buena variable.

Ahora veamos la misma gráfica para el resto de las variables categóricas

In [None]:
# Columns that are skipped here: already charted, handled in their own cell, or the target itself.
plotted = ['AgeCategory', 'Stroke', 'HeartDisease', "Smoking"]
categorical_columns = df.select_dtypes(include="object").columns
to_be_plotted = list(filter(lambda name: name not in plotted, categorical_columns))

# One stacked-bar chart per remaining categorical column.
for col in to_be_plotted:
    plot_categorical(df, col)

Aunque no hay ninguna variable que sea notoriamente diferente para cada una de las clases, variables como si la persona tuvo diabetes, cáncer de piel, enfermedad renal o actividad física sí muestran algunas diferencias.

In [None]:
# Rows per (Smoking, HeartDisease) pair; BMI carries the count.
test_df = df.groupby(["Smoking", 'HeartDisease'])['BMI'].count().reset_index()
# Rows per HeartDisease class; Smoking carries the count.
class_sizes = df.groupby(['HeartDisease'])['Smoking'].count().reset_index()

# Both frames have a "Smoking" column, so the merge suffixes them:
# Smoking_x is the category label, Smoking_y is the class size.
total_df = test_df.merge(class_sizes, how="left", on="HeartDisease")
total_df['percent'] = total_df['BMI'].astype('float') / total_df['Smoking_y']
total_df = total_df.drop(columns=['Smoking_y', 'BMI'])
total_df

In [None]:
# One bar per HeartDisease class, stacked by smoking status (Smoking_x).
smoking_shares = total_df.pivot(index="HeartDisease", columns="Smoking_x", values="percent")
smoking_shares.plot(kind='bar', stacked=True)

También hay un poco de diferencia en fumadores

## Preprocesamiento de Datos

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

Vamos a separar los conjuntos de prueba y entrenamiento

In [None]:

# Separate target and features WITHOUT mutating `df`.
# The previous `df.pop('HeartDisease')` removed the column in place, so re-running
# this cell raised KeyError and every earlier cell that reads df.HeartDisease
# stopped working once this cell had run.
TARGET = 'HeartDisease'

y = df[TARGET]
X = df.drop(columns=[TARGET])

In [None]:
# Display the target series.
y

In [None]:
# Display the feature frame.
X

In [None]:
# There are no null values, so preprocessing only has to scale the numeric
# variables and one-hot encode the categorical ones.

scale_numeric = Pipeline(steps=[("scaling", StandardScaler())])
# handle_unknown="ignore": a category unseen during fit encodes as all zeros
# instead of raising at transform time.
encode_category = Pipeline(steps=[("encoding", OneHotEncoder(handle_unknown="ignore"))])


columns_number = X.select_dtypes(include="number").columns.to_list()
# The categorical columns are stored with dtype `object`, not pandas `category`.
# Selecting only "category" returned an empty list, and because ColumnTransformer
# drops unlisted columns by default, every categorical feature was silently
# discarded. Accept both dtypes so the selection also works after an astype("category").
columns_category = X.select_dtypes(include=["object", "category"]).columns.to_list()


preprocessing = ColumnTransformer(transformers=[("numeric", scale_numeric, columns_number),
                                                ("category", encode_category, columns_category)
])

In [None]:
##### 
# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 34

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

X_train

In [None]:
### prepare the data

# Learn the label -> integer mapping from the training targets, then apply it.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

y_train = label_encoder.transform(y_train)
y_train

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Fit the scaler/encoder on the training split only and replace X_train with the
# transformed matrix.
# NOTE(review): X_train is overwritten, so re-running this cell would feed the
# already-transformed matrix back into the transformer; re-run the split first.
X_train = preprocessing.fit_transform(X_train)
X_train

In [None]:
# Oversample the minority class with SMOTE, on the training split only.
# A fixed random_state makes the synthetic samples, and therefore every model
# trained on them, reproducible across kernel restarts.
over_sampler = SMOTE(random_state=34)
X_train_res, y_train_res = over_sampler.fit_resample(X_train, y_train)

# Share of positive labels after resampling; 0.5 means the classes are balanced.
# Vectorised mean instead of Python-level sum()/len() over a large array.
y_train_res.mean()

In [None]:
### instantiate models

# random_state pins the tree's tie-breaking between equally good splits, so
# repeated runs of the notebook produce the same fitted tree.
tree_model = DecisionTreeClassifier(random_state=34)
logit_model = LogisticRegression()

In [None]:
# Train both classifiers on the SMOTE-resampled training data.
tree_model.fit(X_train_res, y_train_res)
logit_model.fit(X_train_res, y_train_res)

Evaluemos el modelo en el conjunto de prueba

In [None]:
# Apply the already-fitted preprocessing and label encoder to the held-out split
# (transform only, no refitting). Both names are overwritten with the transformed data.
X_test = preprocessing.transform(X_test)
y_test = label_encoder.transform(y_test)

In [None]:
# Predict labels for the preprocessed test split with each model.
tree_pred = tree_model.predict(X_test)
logit_pred = logit_model.predict(X_test)

In [None]:
from sklearn.metrics import recall_score, precision_score

In [None]:
# The first metric is computed with recall_score, so it is labelled "recall"
# (it was previously printed under the misleading label "accuracy").
print(f"Conjunto de prueba (arbol) - recall: {recall_score(y_test, tree_pred)} precision: {precision_score(y_test, tree_pred)}")
print(f"Conjunto de prueba (logit) - recall: {recall_score(y_test, logit_pred)} precision: {precision_score(y_test, logit_pred)}")

In [None]:
### compare against the training set

# Predict on the same resampled data the models were fitted on.
tree_pred_train = tree_model.predict(X_train_res)
logit_pred_train = logit_model.predict(X_train_res)

In [None]:
# These scores are computed on the resampled TRAINING data, so the label says
# "entrenamiento" (it previously said "prueba"); the first metric is recall_score,
# so it is labelled "recall" rather than "accuracy".
print(f"Conjunto de entrenamiento (arbol) - recall: {recall_score(y_train_res, tree_pred_train)} precision: {precision_score(y_train_res, tree_pred_train)}")
print(f"Conjunto de entrenamiento (logit) - recall: {recall_score(y_train_res, logit_pred_train)} precision: {precision_score(y_train_res, logit_pred_train)}")

La precisión (precision) cayó mucho en el conjunto de prueba respecto al de entrenamiento. Probablemente hay algo de overfitting o algún problema con el over sampling

# Búsqueda de Hiperparámetros

In [None]:
import optuna
import sklearn

In [None]:
# Variables de entrenamiento X_train_res, y_train_res

In [None]:
def objective(trial):
    """Optuna objective: mean 2-fold cross-validated score of a sampled classifier.

    The trial first picks a model family (SVC, LogisticRegression or
    RandomForest), then samples that family's hyperparameters, and finally
    scores the configured estimator with ``cross_val_score`` on the resampled
    training data (module-level ``X_train_res`` / ``y_train_res``).

    Fixes relative to the previous version:
    * a missing comma merged 'LogisticRegression' and 'RandomForest' into the
      single choice 'LogisticRegressionRandomForest', so the logistic branch
      was unreachable;
    * ``suggest_categorical`` for the penalty was called without a parameter name;
    * penalty and solver were sampled independently, producing combinations
      scikit-learn rejects (e.g. 'l1' with 'lbfgs'); the penalty is now drawn
      from the set the sampled solver supports, and 'elasticnet' also samples
      the required ``l1_ratio``;
    * the 'none' penalty string is dropped because its spelling depends on the
      scikit-learn version; a large ``C`` approximates it;
    * ``max_features='auto'`` is no longer accepted by recent scikit-learn; for
      classifiers it meant 'sqrt', which remains in the search space;
    * estimators are imported explicitly instead of relying on ``sklearn.svm`` /
      ``sklearn.ensemble`` being reachable after a bare ``import sklearn``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean cross-validation score (the estimator's default scorer).
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    from sklearn.svm import SVC

    x, y = X_train_res, y_train_res

    classifier_name = trial.suggest_categorical(
        'classifier', ['SVC', 'LogisticRegression', 'RandomForest'])

    if classifier_name == 'SVC':
        # NOTE(review): SVC training cost grows quickly with the number of rows;
        # consider subsampling x / y if these trials are too slow.
        svc_c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        svc_kernel = trial.suggest_categorical('svc_kernel', ['linear', 'poly', 'sigmoid', 'rbf'])
        svc_gamma = trial.suggest_categorical('svc_gamma', ['auto', 'scale'])
        classifier_obj = SVC(C=svc_c, gamma=svc_gamma, kernel=svc_kernel)
    elif classifier_name == 'LogisticRegression':
        # Penalties each solver supports, per the scikit-learn documentation.
        penalties_by_solver = {
            'newton-cg': ['l2'],
            'lbfgs': ['l2'],
            'liblinear': ['l1', 'l2'],
            'sag': ['l2'],
            'saga': ['l1', 'l2', 'elasticnet'],
        }
        LogisticRegression_solver = trial.suggest_categorical(
            'LogisticRegression_solver', list(penalties_by_solver))
        # Optuna requires a fixed choice list per parameter name, so the penalty
        # parameter is named after the solver it belongs to.
        LogisticRegression_penalty = trial.suggest_categorical(
            f'LogisticRegression_penalty_{LogisticRegression_solver}',
            penalties_by_solver[LogisticRegression_solver])
        LogisticRegression_c = trial.suggest_float('LogisticRegression_c', 1e-10, 5, log=True)
        # l1_ratio is only meaningful (and required) for the elastic-net penalty.
        LogisticRegression_l1_ratio = None
        if LogisticRegression_penalty == 'elasticnet':
            LogisticRegression_l1_ratio = trial.suggest_float(
                'LogisticRegression_l1_ratio', 0.0, 1.0)
        classifier_obj = LogisticRegression(
            penalty=LogisticRegression_penalty, C=LogisticRegression_c,
            solver=LogisticRegression_solver, l1_ratio=LogisticRegression_l1_ratio)
    else:
        rf_max_depth = trial.suggest_int('rf_max_depth', 2, 200, log=True)
        rf_n_estimators = trial.suggest_int('rf_n_estimators', 100, 500, log=True)
        rf_criterion = trial.suggest_categorical('rf_criterion', ['gini', 'entropy'])
        rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2'])
        classifier_obj = RandomForestClassifier(
            max_depth=rf_max_depth, n_estimators=rf_n_estimators, criterion=rf_criterion,
            max_features=rf_max_features)

    score = cross_val_score(classifier_obj, x, y, n_jobs=-1, cv=2)
    accuracy = score.mean()
    return accuracy


# Search for the configuration that maximises the value returned by `objective`.
N_TRIALS = 2

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS)

print(study.best_trial)