In [None]:
### Funciones

"""
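Función: conjunto de instrucciones que resuelve una tarea específica.

Sintaxis de llamada:  funcion(parámetros)

Sintaxis de definición:

def funcion(parametros):
  instrucciones
  return(resultado)

Partes de una función:
  input: los parámetros que recibe.
  definición de la función: las instrucciones que ejecuta.
  output: el valor que entrega con return.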
"""

# Example list mixing strings, integers and a float.
elements = ['Juan', 2, 56, 'Maria', 9.6, 'Catalina']

def string_cleaner(elements):
  """Return the string items of ``elements``, preserving their order.

  Args:
    elements: Any iterable of arbitrary objects.

  Returns:
    A new list holding every item that is a ``str`` (instances of ``str``
    subclasses included); all other items are dropped. An empty input
    yields an empty list.
  """
  # isinstance, unlike an exact ``type(...) == str`` comparison, also accepts
  # str subclasses, which are still text.
  return [element for element in elements if isinstance(element, str)]


In [None]:
# A longer mixed list used as input for string_cleaner.
elements2 = ['Juan', 2, 56, 'Maria', 9.6, 'Catalina', 'Francisca', 'Mateo', 123.45]

# Last expression of the cell, so the notebook displays the returned list.
string_cleaner(elements2)


In [None]:
import numpy as np
import pandas as pd
import stats
import json
import requests
from collections import Counter
import scipy.stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics as mt


## Visualización
from plotnine import *
import matplotlib.pyplot as plt

## Descarga de token de kaggle que permite acceder al set de datos
json_response= requests.get("https://raw.githubusercontent.com/HectorHenriquez/basic-programming-for-radiologists/main/kaggle.json")

token = json.loads(json_response.text)
with open("kaggle.json", "w") as outfile:
    json.dump(token, outfile)

## Carga de set de datos desde kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d alexteboul/diabetes-health-indicators-dataset
!unzip diabetes-health-indicators-dataset.zip

In [None]:
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Relative path: `unzip` extracts the CSV into the current working directory
# (presumably /content when run on Colab), so this no longer depends on one
# specific absolute location.
data = pd.read_csv('diabetes_binary_health_indicators_BRFSS2015.csv')

# (rows, columns) of the loaded table.
data.shape

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
## Show the column names
data.columns

In [None]:
## Labels:
## NOTE(review): the previous comment listed three classes (0: normal;
## 1: pre-diabetes; 2: diabetes), but this column is named Diabetes_binary and
## the rest of the notebook only uses labels 0 and 1. Presumably 0 = no diabetes
## and 1 = pre-diabetes or diabetes; confirm against the dataset description.
data.Diabetes_binary.unique()

In [None]:
## Number of rows (patients) in each class
Counter(data.Diabetes_binary)

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
data.info()

In [None]:
# Descriptive statistics for each column.
data.describe()

### Exploración de datos

In [None]:
# Preview the first five rows again before the exploratory plots
# (same output as the earlier data.head(5) cell).
data.head(5)

In [None]:

# Box plot of Income for each target class.
# Income is referenced by column name (not as the global `data.Income` Series)
# so it is resolved against the frame passed to ggplot(), like x and fill.
(
 ggplot(data)+
 aes(y='Income', x='factor(Diabetes_binary)', fill='factor(Diabetes_binary)')+
 geom_boxplot(show_legend=False) +
 labs(title='Distribución de Income según la variable target', x ='Clase', fill='Clase')+
 theme_bw()
)

In [None]:
# Box plot of BMI for each target class.
# BMI is referenced by column name (not as the global `data.BMI` Series) so it
# is resolved against the frame passed to ggplot(), like x and fill.
(
 ggplot(data)+
 aes(y='BMI', x='factor(Diabetes_binary)', fill='factor(Diabetes_binary)')+
 geom_boxplot(show_legend=False) +
 labs(title='Distribución de BMI según la variable target', x ='Clase', fill='Clase')+
 theme_bw()
)

In [None]:
### BMI evaluation

# Display the BMI column.
data.BMI

In [None]:
# Boolean mask: True for the rows where Diabetes_binary equals 1.
data.Diabetes_binary == 1

In [None]:
# BMI values of each target class, selected in one .loc step per group.
bmi_diabetes = data.loc[data.Diabetes_binary == 1, 'BMI']
bmi_normal = data.loc[data.Diabetes_binary == 0, 'BMI']

# Mean BMI per group: class 1 first, then class 0.
for bmi_group in (bmi_diabetes, bmi_normal):
    print(bmi_group.mean())

In [None]:
## Check normality visually: histogram of bmi_normal with 100 bins
(
 ggplot()+
 aes(x=bmi_normal)+
 geom_histogram(bins=100, fill='dodgerblue', color='black') +
 theme_bw()
)

In [None]:
## Non-parametric two-sample test (Mann-Whitney U) comparing BMI between groups

# Called through the `scipy.stats` module imported in the import cell; the
# bare name `mannwhitneyu` is not imported there and raised NameError.
U1, p = scipy.stats.mannwhitneyu(bmi_diabetes, bmi_normal)
print(p)

In [None]:
### Fit a simple model with a few variables

# List the available columns before choosing the ones to keep.
data.columns

In [None]:
# Columns kept for the model; Diabetes_binary (the target) is listed first.
variables_select = ['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker','PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump','Stroke', 'HeartDiseaseorAttack','DiffWalk','Sex', 'Age', 'Education',
       'Income']


# All rows, only the selected columns, in the order listed above.
data_select = data.loc[:,variables_select]
data_select.head()

In [None]:
# Every column except the first one, i.e. data_select without Diabetes_binary.
data_select.iloc[:,1:]

In [None]:

# Hold out 30% of the rows for testing. The first column of data_select is the
# target and the remaining columns are the predictors; random_state fixes the
# shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data_select.iloc[:, 1:],
    data_select.iloc[:, 0],
    test_size=0.3,
    random_state=10,
)

# Shapes printed in the order: X_train, y_train, X_test, y_test.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:

# Baseline tree with no depth limit. random_state is fixed (same seed as the
# train/test split) so the fitted tree, and every metric derived from it, is
# the same on each run.
tree_clf = DecisionTreeClassifier(random_state=10)
tree_clf = tree_clf.fit(X_train, y_train)

In [None]:

# Predictions on both splits (pred_train is not used further in this cell).
pred_train= tree_clf.predict(X_train)
pred_test = tree_clf.predict(X_test)

# Classification report computed on the test split.
print(classification_report(y_test, pred_test))

# Confusion matrix on the test split, with labels ordered as 0 then 1.
confusion_matrix(y_test, pred_test, labels=[0, 1])

In [None]:

## Results are stored here: one F1 score per evaluated depth
F1Train = []
F1Test = []

## Maximum depth to evaluate (inclusive)
depth = 30

# range(1, depth + 1) so that `depth` itself is evaluated; range(1, depth)
# stopped one level short of the stated maximum.
for i in range(1, depth + 1):

    # Fixed random_state so the curve is identical on every run.
    tree_clf = DecisionTreeClassifier(splitter='best', max_depth=i, random_state=10)
    tree_clf = tree_clf.fit(X_train, y_train)
    pred_train = tree_clf.predict(X_train)
    pred_test = tree_clf.predict(X_test)

    # F1 of the positive class (label 1) on each split.
    modelF1Train = mt.f1_score(y_train, pred_train, pos_label=1)
    modelF1Test = mt.f1_score(y_test, pred_test, pos_label=1)
    F1Train.append(modelF1Train)
    F1Test.append(modelF1Test)

# One row per depth, built in a single step from the collected scores.
tempDF = pd.DataFrame({
    'Profundidad': list(range(1, depth + 1)),
    'F1 Score Train': F1Train,
    'F1 Score Test': F1Test,
})

In [None]:
# F1 score against tree depth: one line for the train split (steelblue) and one
# for the test split (darkred), each labelled with a text annotation.
# NOTE(review): 'F1 Score' is not a column of tempDF; both geom_line layers
# override y with a real column, so the plot-level y presumably only supplies
# the axis label. Confirm it is never evaluated.
(
ggplot(tempDF) +
    aes(x='Profundidad', y='F1 Score') +
    geom_line(aes(y='F1 Score Train'), color="steelblue") +
    geom_line(aes(y='F1 Score Test'), color ="darkred") +
    theme_bw() +
    scale_x_continuous(breaks=(list(range(0,depth+1))))+
    annotate(geom = "text", label= "Train", x=20, y=0.5, size = 12, color="steelblue") +
    annotate(geom = "text", label= "Test", x=20, y=0.4, size = 12, color="darkred") +
    labs(title='F1 Score según profundidad')
)

In [None]:

# Depth-limited tree (max_depth=6).
# NOTE(review): 6 was presumably picked from the depth-vs-F1 curve; confirm.
# random_state is fixed so the reported metrics are reproducible.
tree_clf = DecisionTreeClassifier(max_depth=6, random_state=10)
tree_clf = tree_clf.fit(X_train, y_train)
pred_test = tree_clf.predict(X_test)

# Classification report computed on the test split.
print(classification_report(y_test, pred_test))

# Confusion matrix on the test split, with labels ordered as 0 then 1.
confusion_matrix(y_test, pred_test, labels=[0, 1])