# Eligiendo el mejor modelo de regresión con Pycaret

### 1. Instalamos la librería Pycaret y traemos un Dataset de ella

In [12]:
# Instalamos la librería pycaret
!pip install pycaret==3.3.2
!pip install --upgrade pycaret
# Importamos la librería pycaret y la clase det_data
import pycaret 
from pycaret.datasets import get_data
#Creamos una instancia (objeto, variable) que contendrá los datos de df diabetes
Datos_prueba = get_data(dataset = 'diabetes') 



Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### 2. Configuramos la prueba de regresión

In [2]:
# Import the RegressionExperiment class from pycaret.regression
from pycaret.regression import RegressionExperiment
# Create the experiment object; every later cell (setup, compare, tune,
# finalize, save, load) is called on this same instance
ExperimentoRegresion = RegressionExperiment()

In [3]:
# Configure the experiment: Datos_prueba is the data, 'Class variable' is the
# column to predict, and session_id=123 fixes the session id (per the PyCaret
# docs this is the seed that makes runs repeatable).
# Because the call is the last expression of the cell, its return value (the
# experiment object) is echoed below the summary table.
# NOTE(review): in the data preview 'Class variable' only shows 0/1 values, so
# this looks like a classification target being modelled as a regression —
# confirm that this is intended.
ExperimentoRegresion.setup(Datos_prueba, target='Class variable',session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Class variable
2,Target type,Regression
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(537, 9)"
6,Transformed test set shape,"(231, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


<pycaret.regression.oop.RegressionExperiment at 0x77ee903370d0>

##### El dataset tiene 768 registros y 9 variables; una de ellas es el target de la predicción, por lo que quedan 8 características. Del total, el modelo se entrenará con 537 registros y probará sus resultados con 231.
#####  Usamos la VALIDACIÓN CRUZADA K-Fold, una técnica que divide el conjunto de datos en K partes (o folds); en esta ejecución K = 10. En cada iteración del proceso se utiliza uno de los folds como conjunto de prueba (con el que se validan las predicciones) y los otros K-1 como conjunto de entrenamiento. Esto se repite K veces, de modo que cada fold sirve una vez como conjunto de prueba.

### 3. Analizamos los errores de las pruebas y escogemos las 3 mejores

In [6]:
# Run the model comparison with default arguments; the cell output is a table
# with one row of error metrics (MAE, MSE, RMSE, R2, RMSLE, MAPE) per model.
# The value returned by compare_models is kept in Total_modelos.
Total_modelos = ExperimentoRegresion.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.3223,0.1671,0.4074,0.2395,0.2875,0.4652,0.103
rf,Random Forest Regressor,0.3166,0.1686,0.4078,0.2343,0.2868,0.4671,0.162
ridge,Ridge Regression,0.3463,0.1683,0.4094,0.2338,0.2871,0.4958,0.013
lr,Linear Regression,0.3462,0.1684,0.4095,0.2336,0.2872,0.4958,0.019
lar,Least Angle Regression,0.3456,0.1686,0.4098,0.2322,0.2871,0.4952,0.016
br,Bayesian Ridge,0.3507,0.1691,0.4104,0.2309,0.2881,0.5036,0.013
gbr,Gradient Boosting Regressor,0.3213,0.1698,0.4096,0.2296,0.288,0.4687,0.077
ada,AdaBoost Regressor,0.375,0.177,0.42,0.195,0.302,0.4849,0.037
en,Elastic Net,0.3676,0.177,0.4201,0.1949,0.2935,0.5415,0.016
llar,Lasso Least Angle Regression,0.378,0.1825,0.4266,0.1699,0.2981,0.5605,0.013


In [7]:
# Re-run the comparison restricted to five model ids. These ids are the first
# five rows of the previous comparison table and are hard-coded here, so they
# must be updated by hand if that ranking changes.
Mejores_5modelos = ExperimentoRegresion.compare_models(include=['et','rf','ridge','lr','lar'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.3223,0.1671,0.4074,0.2395,0.2875,0.4652,0.103
rf,Random Forest Regressor,0.3166,0.1686,0.4078,0.2343,0.2868,0.4671,0.151
ridge,Ridge Regression,0.3463,0.1683,0.4094,0.2338,0.2871,0.4958,0.012
lr,Linear Regression,0.3462,0.1684,0.4095,0.2336,0.2872,0.4958,0.014
lar,Least Angle Regression,0.3456,0.1686,0.4098,0.2322,0.2871,0.4952,0.017


In [13]:
# Run the comparison again over all models (this does not reuse the previous
# table), ranking by R2. n_select=3 asks for the three top-ranked models; the
# result is indexed like a list in the next cell.
Mejores_3modelosR2 = ExperimentoRegresion.compare_models(sort = 'R2', n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.3223,0.1671,0.4074,0.2395,0.2875,0.4652,0.099
rf,Random Forest Regressor,0.3166,0.1686,0.4078,0.2343,0.2868,0.4671,0.181
ridge,Ridge Regression,0.3463,0.1683,0.4094,0.2338,0.2871,0.4958,0.013
lr,Linear Regression,0.3462,0.1684,0.4095,0.2336,0.2872,0.4958,0.012
lar,Least Angle Regression,0.3456,0.1686,0.4098,0.2322,0.2871,0.4952,0.013
br,Bayesian Ridge,0.3507,0.1691,0.4104,0.2309,0.2881,0.5036,0.012
gbr,Gradient Boosting Regressor,0.3213,0.1698,0.4096,0.2296,0.288,0.4687,0.077
ada,AdaBoost Regressor,0.375,0.177,0.42,0.195,0.302,0.4849,0.033
en,Elastic Net,0.3676,0.177,0.4201,0.1949,0.2935,0.5415,0.012
llar,Lasso Least Angle Regression,0.378,0.1825,0.4266,0.1699,0.2981,0.5605,0.013


In [14]:
# Pick the best model: the first entry of the top-3 selection
# (presumably ordered best-first by R2 — confirm against the table above)
Mejor_modelo = Mejores_3modelosR2[0]

### 4. Cambiamos los hiperparámetros del modelo escogido y evaluamos con cuál modelo nos quedamos

In [15]:
# Tune the hyperparameters of the chosen model to see whether a better one is found.
# Per the log this cell prints, when the original model scores better than the
# tuned candidate the original is what gets returned, while the displayed
# per-fold metrics still belong to the tuned candidate.
Mejor_modelo_tuneado = ExperimentoRegresion.tune_model(estimator=Mejor_modelo)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3864,0.1829,0.4277,0.1519,0.3047,0.5551
1,0.423,0.2128,0.4613,0.144,0.3086,0.5822
2,0.3659,0.1666,0.4082,0.2276,0.2879,0.5422
3,0.3791,0.1895,0.4353,0.1474,0.3008,0.6057
4,0.3551,0.1661,0.4075,0.1723,0.2906,0.5789
5,0.3446,0.1456,0.3816,0.274,0.2762,0.5112
6,0.3548,0.156,0.395,0.2516,0.2803,0.5618
7,0.3873,0.1832,0.4281,0.2201,0.2977,0.5477
8,0.3413,0.1437,0.3791,0.3592,0.2682,0.4862
9,0.3514,0.1586,0.3983,0.3103,0.2704,0.5657


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


##### Escogemos el modelo original porque, en promedio sobre los 10 folds, tiene mayor R2 (0.2395 > 0.2258) y menor MSE (0.1671 < 0.1705) que el modelo tuneado.

### 5. Terminamos, guardamos y cargamos el modelo ganador

In [17]:
# Finalize the winning model (per the PyCaret docs, finalize_model refits the
# estimator on the whole dataset, hold-out included — confirm for this version)
Modelo_final = ExperimentoRegresion.finalize_model(estimator=Mejor_modelo)
# Bare name as last expression: displays the finalized object as the cell output
Modelo_final

In [18]:
# Save the winning model with PyCaret under the name './Modelo_final'.
# The log reports that both the transformation pipeline and the model are saved.
# The call's return value is echoed as the cell output.
ExperimentoRegresion.save_model(model=Modelo_final, model_name= './Modelo_final')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Number of times pregnant',
                                              'Plasma glucose concentration a 2 '
                                              'hours in an oral glucose '
                                              'tolerance test',
                                              'Diastolic blood pressure (mm Hg)',
                                              'Triceps skin fold thickness (mm)',
                                              '2-Hour serum insulin (mu U/ml)',
                                              'Body mass index (weight in '
                                              'kg/(height in m)^2)',
                                              'Diabetes pedigree function',
                                              'Age (years)'],
                                     transformer=SimpleImputer())),
                 ('categorical_i

##### El guardado con PyCaret te permite conservar el pipeline completo, lo que es conveniente cuando quieres reutilizar más adelante el mismo flujo de trabajo (preprocesamiento, selección de características, etc.).

In [20]:
# Save the model with plain Python tooling so it can be used without PyCaret.
# joblib serializes (saves) objects to files and can load them back later.
import joblib
# Dump the model to 'Modelo_final_joblib.pkl'.
# NOTE(review): the object written here is Mejor_modelo (the model selected
# before finalize_model was called), not Modelo_final, although the file name
# says "Modelo_final" — confirm which of the two is meant to be persisted.
joblib.dump(Mejor_modelo, 'Modelo_final_joblib.pkl')

['Modelo_final_joblib.pkl']

##### Joblib es un método más flexible y ligero si solo necesitas guardar y cargar el modelo final y no todo el pipeline. Puedes usarlo fuera de PyCaret o en otros contextos, pero no conserva los detalles del preprocesamiento hecho en PyCaret.

In [23]:
# Load the model saved earlier with save_model under the name './Modelo_final',
# the same way it would be loaded to use it through PyCaret.
# The log reports that the transformation pipeline and the model are loaded.
Modelo_cargado_desde_disco=ExperimentoRegresion.load_model('./Modelo_final')

Transformation Pipeline and Model Successfully Loaded
