<a href="https://colab.research.google.com/github/MonicaACM/Web_app_Machine_learning/blob/main/Flujo_completo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tableone > NULL

In [None]:
# For processing data
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats
import numpy as np
import re

from tableone import TableOne

## To modeling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier


# Assesment
from sklearn.metrics import classification_report


# Funciones limpieza y preprocesamiento

In [None]:
def percentage_nulls(df,nan_str):
    """
    This function returns a dictionary with the column and
    the porcentage of missing values

    nan_str: array con los valores nulos no reconocidos (ej. nan, NA, null) identificados
    """
    df = df.replace(nan_str,np.nan,regex=True)

    N_rows = df.shape[0]
    percentage_vars = {}
    for var in df.columns:
        percentage_vars[var]=(df[var].isnull().sum() / N_rows)
    return percentage_vars

In [None]:
#bloque para clasificar variables
def binary(data):
  bin_reg = r"^[01](?:\.0)?\.?$"
  return bool(re.findall(bin_reg, str(data)))

def flotante(data):
  float_reg = r"\b\d+(?:\.)?\d*\b"
  return bool(re.findall(float_reg, str(data)))

def vars_type(base, umbral):
  binarias = [] #se guardan primero las binarias y se agregan las string al final
  numericas = []
  cat_str = []
  for columna in base.columns:
    if base[columna].apply(binary).sum()/len(base[columna]) > umbral:
      binarias.append(columna)
    elif base[columna].apply(flotante).sum()/len(base[columna]) > umbral:
      numericas.append(columna)
    else:
      cat_str.append(columna)
  #return pd.DataFrame({"cat_bin": categoricas, "num":numericas, 'cat_str':cat_str})
  #print('cat binarias, cat string, númericas')
  return binarias, cat_str, numericas

def normalidad(df,numericas):
    nonormal = []
    normal = []
    for columna in df[numericas]:
        if columna in numericas:
                n,p = stats.shapiro(df[columna])
                if p<0.05:
                    nonormal.append(columna)
                else:
                    normal.append(columna)
    return nonormal, normal


def remove_nan(df_, percentages_dict, threshold, nonormal, normal):
    """
    Receive a dictionary with the percatege of missing of each varaible and drop them
    according to the threshold defined
    """
    df = df_.copy()
    for var in percentages_dict:
        if percentages_dict[var] > threshold:
            df.drop(columns = [var], inplace=True)
        elif var in nonormal:
          mediana = df[var].median()
          df = df.replace(np.nan,mediana) #variable no normal pon mediana
        elif var in normal:
          media = df[var].mean()
          df = df.replace(np.nan,media) #variable normal pon la media
    return df

def std_scaler(nums, df_):
    """
    standardizing nums(array con nombres de numerical) variables
    con standard scaler
    """
    df = df_.copy()
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df[nums]), columns=nums)
    df.drop(columns=nums, inplace=True)
    df = pd.concat([df,df_scaled], axis=1)
    return df

def dummies_ohe(df_,cats):
    """
    Returns a dataframe with dummies,and dropped the categorical in original
    the cats arguments receive the cats to transform.
    """
    df = df_.copy()
    ohe = OneHotEncoder(drop='first',handle_unknown='ignore', sparse_output=False)
    dummies = pd.DataFrame(ohe.fit_transform(df[cats]))
    dummies.columns = ohe.get_feature_names_out()  #Names ohe.get_feature_names_out()-> all dummies
    df.drop(columns=cats, inplace=True)
    df = pd.concat([df,dummies], axis=1)
    return df


def split_df(df,target):
    """
    Split the data in X,y to ML implementations
    """
    X = df.loc[ : , df.columns != target]
    y = df[target]

    print(f'next: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state = 666, stratify=y)')
    return X,y


# Procesamiento

In [None]:
#usarla nos ayuda a ver desde antes (del análisis) si las variables son informativas o no
#copiar y pegar
'''
TableOne(df,
         nonnormal = nonormal,
         categorical=cat_str,
         groupby='', #por cual se quiere agrupar
         pval=True,
         htest_name=True)
'''

"\nTableOne(df,\n         nonnormal = nonormal,\n         categorical=cat_str,\n         groupby='', #por cual se quiere agrupar\n         pval=True,\n         htest_name=True)\n"

In [None]:
## Decision tree grid
def grid_dt(X_train, y_train):
    model = DecisionTreeClassifier(random_state=1000)
    class_weight =  [{0:0.05, 1:0.95}, {0:0.1, 1:0.9}, {0:0.2, 1:0.8}, {0:0.5, 1:0.5}]
    max_depth = None,
    min_samples_leaf = [5, 10, 20, 50, ]
    criterion  = ["gini", "entropy"]
    grid = dict(class_weight=class_weight, max_depth=max_depth, min_samples_leaf=min_samples_leaf, criterion=criterion)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy',error_score='raise')
    grid_result = grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

def best_pred_dt(X_train,y_train, X_test):
  best_model = grid_dt(X_train, y_train) #entreno el modelo con X_train y y_train
  preds_dt = best_model.predict(X_test)
  return best_model,preds_dt

In [None]:
# Grid search hyperparameters for a logistic regression model
def grid_search_lr(X_train, y_train):
    model = LogisticRegression(random_state=666, max_iter=1000)
    class_weight =  [{0:0.05, 1:0.95}, {0:0.1, 1:0.9}, {0:0.2, 1:0.8}]
    solvers = ['liblinear']
    penalty = ['l2','l1']
    c_values = [ 10, 1.0, 0.1, 0.01, 0.001, ]
    grid = dict(solver=solvers,penalty=penalty,C=c_values, class_weight= class_weight)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='f1',error_score=0)
    grid_result = grid_search.fit(X_train, y_train)
    best_params = grid_result.best_params_
    return grid_result.best_estimator_

# best_model = grid_search_lr(X_train, y_train) #entreno el modelo con X_train y y_train
# preds_lr = best_model.predict(X_test)

In [None]:
## MLP grid perceptrón
def grid_MLP(X_train, y_train):
    model = MLPClassifier(random_state=1,
                          max_iter=100)
    hidden_layer_sizes = [(8,), (100,), (3,3,16,) ,(5,5,5,) ]
    activation =  ['tanh', 'relu', 'logistic']
    solver =  ['sgd', 'adam']
    alpha  = [0.0001, 0.05]
    learning_rate = ['constant','adaptive']
    grid = dict(hidden_layer_sizes=hidden_layer_sizes, activation= activation, solver= solver, learning_rate=learning_rate, alpha=alpha)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='f1',error_score='raise')
    grid_result = grid_search.fit(X_train, y_train)
    return  grid_result.best_estimator_

# best_model = grid_MLP(X_train, y_train)
# preds_MLP = best_model.predict(X_test)

In [None]:
# print(classification_report(y_test, preds_dt))  # recall igual a sensibilidad
# print(classification_report(y_test, preds_MLP))
# print(classification_report(y_test, preds_lr))

#Aplicación con diabetes.csv

# PRE

In [None]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
df = pd.read_csv(url)

In [None]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
(df['BloodPressure'] == 0).value_counts()

Unnamed: 0_level_0,count
BloodPressure,Unnamed: 1_level_1
False,733
True,35


In [None]:
(df['Insulin']== 0).value_counts()

Unnamed: 0_level_0,count
Insulin,Unnamed: 1_level_1
False,394
True,374


In [None]:
(df['Glucose'] == 0).value_counts() #posible NAN

Unnamed: 0_level_0,count
Glucose,Unnamed: 1_level_1
False,763
True,5


In [None]:
df['Glucose'] = df['Glucose'].replace(0,np.nan) #prueba para funcion remove nan

In [None]:
(df['Glucose'] == 0).value_counts()

Unnamed: 0_level_0,count
Glucose,Unnamed: 1_level_1
False,768


In [None]:
valores = []
for columna in df.columns:
  valores.append(df[columna].unique())

In [None]:
nan_str = ['NAN']
porcent_nulos = percentage_nulls(df,nan_str)
print(porcent_nulos)

{'Pregnancies': np.float64(0.0), 'Glucose': np.float64(0.006510416666666667), 'BloodPressure': np.float64(0.0), 'SkinThickness': np.float64(0.0), 'Insulin': np.float64(0.0), 'BMI': np.float64(0.0), 'DiabetesPedigreeFunction': np.float64(0.0), 'Age': np.float64(0.0), 'Outcome': np.float64(0.0)}


In [None]:
binarias, categoricas, numericas = vars_type(df, 0.8)
print(binarias, categoricas, numericas)

['Outcome'] [] ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


In [None]:
nonormal, normal = normalidad(df,numericas)
print(nonormal, normal)

['Pregnancies', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'] ['Glucose']


In [None]:
df = remove_nan(df, porcent_nulos, 0.65, nonormal, normal)

In [None]:
df_scaled = std_scaler(numericas, df)
df_scaled

Unnamed: 0,Outcome,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,0.639947,0.849656,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,0,-0.844885,-1.126584,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1,1.233880,1.947567,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,0,-0.844885,-1.001109,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,1,-1.141852,0.504598,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...,...
763,0,1.827813,-0.624682,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,0,-0.547919,0.034065,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0,0.342981,0.002696,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,1,-0.844885,0.159540,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


In [None]:
#df = dummies_ohe(df_scaled,categoricas) #si hubiera categóricas, aquí se convertirían

In [None]:
TableOne(df,
         nonnormal = nonormal,
         categorical=categoricas,
         groupby='Outcome', #por cual se quiere agrupar
         pval=True,
         htest_name=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Outcome,Grouped by Outcome,Grouped by Outcome,Grouped by Outcome,Grouped by Outcome,Grouped by Outcome
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,0,1,P-Value,Test
n,,,768,500,268,,
"Pregnancies, median [Q1,Q3]",,0.0,"3.0 [1.0,6.0]","2.0 [1.0,5.0]","4.0 [1.8,8.0]",<0.001,Kruskal-Wallis
"Glucose, mean (SD)",,0.0,120.9 (31.9),110.0 (26.1),141.3 (31.8),<0.001,Welch’s T-test
"BloodPressure, median [Q1,Q3]",,0.0,"72.0 [62.0,80.0]","70.0 [62.0,78.0]","74.0 [66.0,82.0]",<0.001,Kruskal-Wallis
"SkinThickness, median [Q1,Q3]",,0.0,"23.0 [0.0,32.0]","21.0 [0.0,31.0]","27.0 [0.0,36.0]",0.013,Kruskal-Wallis
"Insulin, median [Q1,Q3]",,0.0,"30.5 [0.0,127.2]","39.0 [0.0,105.0]","0.0 [0.0,167.2]",0.066,Kruskal-Wallis
"BMI, median [Q1,Q3]",,0.0,"32.0 [27.3,36.6]","30.1 [25.4,35.3]","34.2 [30.8,38.8]",<0.001,Kruskal-Wallis
"DiabetesPedigreeFunction, median [Q1,Q3]",,0.0,"0.4 [0.2,0.6]","0.3 [0.2,0.6]","0.4 [0.3,0.7]",<0.001,Kruskal-Wallis
"Age, median [Q1,Q3]",,0.0,"29.0 [24.0,41.0]","27.0 [23.0,37.0]","36.0 [28.0,44.0]",<0.001,Kruskal-Wallis


#Procesamiento

In [None]:
X,y = split_df(df,'Outcome')

next: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state = 666, stratify=y)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state = 666, stratify=y) #modificar si se quiere

In [None]:
#Decision tree
grid_dt(X_train, y_train)

In [None]:
best_model, preds_dt = best_pred_dt(X_train,y_train, X_test)

In [None]:
#LR
best_lr = grid_search_lr(X_train, y_train)
best_lr

In [None]:
preds_lr = best_lr.predict(X_test)

In [None]:
best_MLP = grid_MLP(X_train, y_train)



In [None]:
best_MLP

In [None]:
preds_MLP = best_MLP.predict(X_test)

In [None]:
print(f'Decision tree: \n {classification_report(y_test, preds_dt)}')
print(f'Logistic regression:\n {classification_report(y_test, preds_lr)}')
print(f'MLP: \n {classification_report(y_test, preds_MLP)}')

Decision tree: 
               precision    recall  f1-score   support

           0       0.78      0.76      0.77       100
           1       0.58      0.61      0.59        54

    accuracy                           0.71       154
   macro avg       0.68      0.69      0.68       154
weighted avg       0.71      0.71      0.71       154

Logistic regression:
               precision    recall  f1-score   support

           0       0.85      0.53      0.65       100
           1       0.49      0.83      0.62        54

    accuracy                           0.64       154
   macro avg       0.67      0.68      0.64       154
weighted avg       0.73      0.64      0.64       154

MLP: 
               precision    recall  f1-score   support

           0       0.75      0.79      0.77       100
           1       0.56      0.50      0.53        54

    accuracy                           0.69       154
   macro avg       0.65      0.65      0.65       154
weighted avg       0.68     