In [None]:
# pycaret demo - regression

Using the insurance dataset, this demo covers some of the steps used to perform regression analysis with PyCaret. Prior to running the notebook, ensure you have the following packages installed.

the relevant packages are:
- Pandas
- Numpy
- Matplotlib
- Seaborn
- PyCaret
- MLFlow
- PyCaret[Analysis]



In [None]:
### Importing packages and dataset

In [None]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Pycaret
import pycaret
import mlflow
from pycaret.utils import version
from pycaret.regression import *

# MLflow tracking
# Paste the URI below into a separate browser window to view your
# experiment(s) once the notebook code has finished running.
MLFLOW_TRACKING_URI = "http://localhost:5000"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
# import dataset
# Load PyCaret's 'insurance' sample dataset; `data` is used as a pandas
# DataFrame by every later cell in this notebook.
from pycaret.datasets import get_data
data = get_data('insurance')

In [None]:
### check out dataset

In [None]:
# review data: column names, dtypes and non-null counts
data.info()

In [None]:
# total number of missing values across the whole frame
# (per-column counts first, then summed into one number)
data.isna().sum().sum()

In [None]:
### Exploratory data analysis

In [None]:
# summary statistics for every column (numeric and categorical), transposed
# so that each row describes one column
data.describe(include='all').transpose()

In [None]:
# Partition the columns by dtype so that each group can be analysed separately:
# num_cols is a plain list of names, cat_cols is kept as a pandas Index.
num_cols = data.select_dtypes(include=np.number).columns.tolist()
cat_cols = data.select_dtypes(include=['object']).columns
print("Categorical Variables:", cat_cols, sep="\n")
print("Numberical Variables:", num_cols, sep="\n")

In [None]:
# Numerical columns first: report the skew, then draw a histogram (left)
# and a box plot (right) side by side for each column.
for num_col in num_cols:
    values = data[num_col]
    print(num_col)
    print('Skew :', round(values.skew(), 2))
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 4))
    values.hist(grid=False, ax=hist_ax)
    hist_ax.set_ylabel('count')
    sns.boxplot(x=values, ax=box_ax)
    plt.show()
    

In [None]:
# Plot the categorical variables: one count plot per column, with the bars
# ordered from the most to the least frequent category.
fig, axes = plt.subplots(1, 3, figsize=(16, 7))
# Figure-level title. The matplotlib method is `suptitle`; `subtitle` does
# not exist on Figure and raises AttributeError.
fig.suptitle('Bar plots for categorical variables')

for ax, cat_col in zip(axes, ["sex", "smoker", "region"]):
    sns.countplot(ax=ax, data=data, x=cat_col, order=data[cat_col].value_counts().index)


In [None]:
### setting up the regression

In [None]:
# regression setup
# - target: the column to predict
# - session_id: fixed seed so repeated runs give the same results
# - log_experiment / experiment_name: log the run to MLflow as 'insurance1'
reg_insurance = setup(
    data=data,
    target='charges',
    session_id=123,
    experiment_name='insurance1',
    log_experiment=True,
)


In [None]:
# Look at the transformed data held in the experiment configuration, if needed.
get_config('dataset_transformed')

In [None]:
### compare baseline

In [None]:
# return best model
# Compares the available models using 5-fold cross-validation. n_select is
# not set here, so (per the PyCaret docs) a single top-ranked model is returned.
best = compare_models(fold = 5)


In [None]:
# Display the best model returned by compare_models above.
best

In [None]:
# get parameters of best model (the hyperparameters it was created with,
# i.e. before any tuning)
best.get_params()

In [None]:
### hyperparameter tuning

In [None]:
# Tuning the best model: 10 search iterations, 5-fold cross-validation,
# optimising for MAE. tune_model returns the tuned estimator and leaves
# `best` untouched, so keep the result in its own name for later reuse.
tuned_best = tune_model(best, fold=5, n_iter=10, optimize='MAE')
tuned_best

In [None]:
### blending models

In [None]:
# taking the top 3 MAE models
# sort='MAE' ranks the comparison by MAE and n_select=3 asks for three
# models back, so best_mae_3 holds several models rather than one.
best_mae_3 = compare_models(sort = 'MAE', n_select = 3, fold = 5)

In [None]:
# Display the three models selected above.
best_mae_3

In [None]:
# Blending the top 3 MAE models together, evaluated with 5-fold
# cross-validation. Keep the returned estimator so the blend can be reused
# (plotted, saved, used for predictions) instead of being discarded.
blended = blend_models(best_mae_3, fold=5)
blended

In [None]:
### plot/evaluate the model

In [None]:
# plotting the model
# No `plot` argument is given, so PyCaret's default plot type is used
# (the residuals plot, per the PyCaret regression documentation).
plot_model(best)

In [None]:
# plotting the prediction error (predicted vs. actual target values)
plot_model(best, plot = 'error')

In [None]:
# plotting the feature importance
plot_model(best,plot = 'feature')

In [None]:
# evaluate model
# Note: this widget contains most of the graphs above; switch between them
# by clicking the grey boxes.
evaluate_model(best)

In [None]:
### model interpretation

In [None]:
# Interpret Model - note, this only supports tree-based models, so an
# XGBoost model is trained here as an example.
# n_iter is a tune_model option rather than a create_model one (create_model
# would only forward it to the estimator), so it is not passed here.
xgboost = create_model('xgboost', fold=5)
# The function is `interpret_model`; the misspelt `interpet_model` raised NameError.
interpret_model(xgboost)

In [None]:
# Correlation (dependence) plot for the xgboost model.
interpret_model(xgboost,plot='correlation')

In [None]:
# Reason plot explaining the prediction for a single observation (index 12).
# NOTE(review): the index presumably refers to the hold-out set - confirm
# against the PyCaret version in use.
interpret_model(xgboost, plot='reason',observation=12)

In [None]:
### Saving/Load Model

In [None]:
# saving model
# Persists the model under the name 'best-model'; the same name is what
# load_model expects in the next cell.
save_model(best, model_name = 'best-model')

In [None]:
# loading model
# Reload what save_model stored under 'best-model' and print it to confirm
# that the load worked.
loaded_bestmodel = load_model('best-model')
print(loaded_bestmodel)

In [None]:
### predictions

In [None]:
# predict on holdout
# No `data` argument is passed, so predictions are made on the hold-out
# split that setup() created.
predict_model(best)

In [None]:
# Build an "unseen" frame for prediction: a new DataFrame without the target
# column. `data` itself is left unchanged.
data_unseen = data.drop(columns='charges')

predict_model(best, data = data_unseen)

In [None]:
# Finalize the model: finalize_model returns an estimator refit on the
# entire dataset (hold-out included) and does not modify `best` in place,
# so keep the result in its own name instead of discarding it.
final_best = finalize_model(best)
final_best

In [None]:
# End the currently active MLflow run now that the experiment is complete.
mlflow.end_run()

In [None]:
### Interactive Dashboard

In [None]:
# creating interactive dashboard for trained model
# NOTE(review): presumably needs the PyCaret[Analysis] extra listed at the
# top of the notebook - confirm. The cell keeps running until it is stopped.
dashboard(best)

When the cell above runs, it shows a link that you can click to open the dashboard in a new window. The cell will keep running until you stop it.