# Librearias

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
import pickle

# Lectura de datos

In [2]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
#Reconociendo los tipos de datos
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [4]:
#renombrando las columnas para que los nombres sean iguales a las del request
df.columns = ['age', 'sex', 'chestPainType', 'restingBP', 'cholesterol', 'fastingBS', 'restingECG', 'maxHR', 'exerciseAngina', 'oldpeak', 'sTSlope', 'HeartDisease']

In [5]:
#Separando las columnas de las variables en categoricas y numericas
numericas= df.drop(['HeartDisease'], axis=1).select_dtypes('number').columns

categoricas = df.select_dtypes('object').columns

# Preprocesamiento

In [6]:
encoder = OneHotEncoder(handle_unknown='ignore')
column_transformer = make_column_transformer((encoder,categoricas), remainder='passthrough')  

## Separacion del conjunto de entrenamiento y de test

In [7]:
x = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [8]:
#Entrenamiento del column transformer
x_train_t = column_transformer.fit_transform(x_train)
x_test_t = column_transformer.transform(x_test)

In [9]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42).split(X=x_train_t, y=y_train)

# Construccion y entrenamiento del modelo

In [10]:
model = LGBMClassifier(random_state=0, objective='binary')

In [13]:
# creando parametros del gridsearch para optimizar hiperparametros

param_grid = {
    'boosting_type': ['gbdt', 'dart', 'goss', 'rf'],
    'num_leaves': [ 10, 20, 25, 30], 
    'max_depth': [-1, 10, 20],
    'min_data_in_leaf': [10, 20, 30, 40, 50]
}

# se utilizara como score el recall porque se busca reducir el error de tipo II, ya que puede llegar a ser mortal
# pronosticar un falso negativo sobre una enfermedad cardiaca.
g_search = GridSearchCV(model, param_grid=param_grid, cv=kfold, scoring='recall')


In [14]:
model = g_search.fit(x_train_t,y_train)
model



300 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "d:\files.al\Python3.9\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\files.al\Python3.9\lib\site-packages\lightgbm\sklearn.py", line 967, in fit
    super().fit(X, _y, sample_weight=sample_weight, init_score=init_score, eval_set=valid_sets,
  File "d:\files.al\Python3.9\lib\site-packages\lightgbm\sklearn.py", line 748, in fit
    self._Booster = train(
  File "d:\files.al\Python3.9\lib\site-packages\lightgbm\engine.py", line 271, in train
    booster = Booster(params=p

In [15]:
print(model.best_params_, model.best_score_)

{'boosting_type': 'goss', 'max_depth': -1, 'min_data_in_leaf': 40, 'num_leaves': 20} 0.9023183232481484


## Elaborando modelo en limpio

In [22]:
lgbm_model = LGBMClassifier(boosting_type= 'goss', max_depth= -1, min_data_in_leaf= 40, num_leaves= 20, random_state=0, objective='binary')

In [23]:
lgbm_model.fit(x_train_t, y_train)



## Pipeline

In [24]:
pipe = make_pipeline(column_transformer, lgbm_model)

# Evaluacion del modelo

In [25]:
y_pred = pipe.predict(x_test)

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.75      0.78        92
           1       0.84      0.88      0.86       138

    accuracy                           0.83       230
   macro avg       0.83      0.82      0.82       230
weighted avg       0.83      0.83      0.83       230



# Exportacion del pipe

In [27]:
pickle.dump( pipe, open( "../artifacts/pipe_file.pkl", "wb" ) )