# Selección de modelos con PyCaret

In [2]:
# Imports
import pandas as pd
import pycaret

In [3]:
# Load the raw diabetes dataset from the project's data folder and preview it
data = pd.read_csv(filepath_or_buffer='../data/raw/diabetes_data_upload.csv')
data.head(n=5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [4]:
# Create the classification experiment.
# Only `setup` is imported (instead of `*`): every later cell works through the
# experiment object returned here, so no other module-level name is required
# and the notebook namespace stays clean.
from pycaret.classification import setup

# `class` is the column to predict; `session_id` pins the experiment's seed so
# that re-running the notebook reproduces the same results.
experimentoClasificacion = setup(
    data,
    target = 'class',
    session_id = 123
    )

Unnamed: 0,Description,Value
0,Session id,123
1,Target,class
2,Target type,Binary
3,Target mapping,"Negative: 0, Positive: 1"
4,Original data shape,"(520, 17)"
5,Transformed data shape,"(520, 17)"
6,Transformed train set shape,"(364, 17)"
7,Transformed test set shape,"(156, 17)"
8,Numeric features,1
9,Categorical features,15


In [5]:
# Model comparison: evaluate the candidate estimators and keep the top-ranked
# one. The ranking metric is spelled out; 'Accuracy' is the library default.
mejorModelo = experimentoClasificacion.compare_models(
    sort='Accuracy',
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9696,0.9963,0.9696,0.971,0.9696,0.9363,0.9376,0.05
gbc,Gradient Boosting Classifier,0.9696,0.981,0.9696,0.9707,0.9696,0.9361,0.9372,0.037
et,Extra Trees Classifier,0.9669,0.9968,0.9669,0.9682,0.9669,0.9306,0.9317,0.043
lightgbm,Light Gradient Boosting Machine,0.9641,0.9849,0.9641,0.9662,0.9642,0.925,0.9269,0.062
dt,Decision Tree Classifier,0.9394,0.9427,0.9394,0.9436,0.9397,0.874,0.8775,0.026
qda,Quadratic Discriminant Analysis,0.9312,0.989,0.9312,0.9359,0.9296,0.85,0.8572,0.023
ada,Ada Boost Classifier,0.9285,0.9715,0.9285,0.9298,0.9283,0.8485,0.8501,0.033
lr,Logistic Regression,0.9147,0.9769,0.9147,0.9174,0.9144,0.8193,0.8224,0.74
nb,Naive Bayes,0.9146,0.9715,0.9146,0.9153,0.9141,0.8178,0.8196,0.023
ridge,Ridge Classifier,0.8845,0.9721,0.8845,0.8966,0.8856,0.7637,0.7725,0.025


In [6]:
# Show the selected estimator and its hyperparameters. A bare last expression
# lets the notebook use the object's rich display instead of plain print text.
mejorModelo

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       monotonic_cst=None, n_estimators=100, n_jobs=-1,
                       oob_score=False, random_state=123, verbose=0,
                       warm_start=False)


In [7]:
# Model analysis: opens the interactive "Plot Type" selector for the chosen
# model. The widget is for exploration only; nothing downstream depends on it.
experimentoClasificacion.evaluate_model(estimator=mejorModelo)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [8]:
# Prediction: no new data is passed, so the experiment scores the chosen model
# on its own data (presumably the hold-out test split - confirm). The returned
# frame adds `prediction_label` and `prediction_score` columns.
experimentoClasificacion.predict_model(estimator=mejorModelo)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9551,0.9934,0.9551,0.9562,0.9553,0.9061,0.9068


Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class,prediction_label,prediction_score
232,56,Male,No,Yes,No,Yes,Yes,No,Yes,Yes,No,Yes,No,Yes,Yes,No,Negative,Positive,0.57
381,53,Male,Yes,No,Yes,No,No,No,No,No,No,Yes,Yes,No,No,No,Positive,Positive,0.89
332,50,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative,Negative,0.99
44,40,Female,Yes,Yes,Yes,Yes,No,No,Yes,Yes,No,No,Yes,Yes,No,No,Positive,Positive,0.98
396,30,Male,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Negative,Negative,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356,43,Male,No,No,Yes,No,No,Yes,No,No,No,Yes,No,No,Yes,No,Negative,Negative,0.97
408,59,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Negative,Negative,0.87
391,58,Male,No,Yes,No,No,No,No,Yes,Yes,No,No,No,Yes,No,No,Negative,Negative,0.93
512,43,Male,No,No,No,No,No,No,No,No,No,No,No,No,Yes,No,Negative,Negative,0.99


In [9]:
# Finalize the chosen model and display the resulting object.
# `modeloFinal` is the artifact intended for persistence/deployment.
modeloFinal = experimentoClasificacion.finalize_model(
    estimator=mejorModelo,
)
modeloFinal

In [14]:
# Save the model.
# Persist the finalized model (`modeloFinal`), not the pre-finalization
# `mejorModelo`: otherwise the result of `finalize_model` is never saved and the
# stored artifact is the estimator fitted before finalization.
# NOTE: PyCaret appends the `.pkl` extension to `model_name` itself.
experimentoClasificacion.save_model(
    model=modeloFinal,
    model_name='../model/classificationModel_randomForestClass_v1'
)


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None, include=['Age '],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean')...
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                  

In [10]:
# Export with joblib.
# The target file is named `pipeline_...`, so dump the finalized pipeline
# (`modeloFinal`) rather than `mejorModelo`, which is only the bare
# RandomForestClassifier and carries none of the preprocessing steps.
import joblib
joblib.dump(modeloFinal, '../pipelines/pipeline_classificationModel_randomForestClass_v1.joblib')

['../pipelines/pipeline_classificationModel_randomForestClass_v1.joblib']