### In this notebook we will learn:

* Getting Data: How to import data from PyCaret repository
* Setting up the sqlite database: To store the model artifacts, it is important to have a persistent database through sqlite.
* Setting up Environment: How to setup an experiment in PyCaret and get started with building classification models
* Create & Compare Models: How to create and compare various models, perform stratified cross validation and evaluate classification metrics
* Predict Model: How to make predictions on new / unseen data

In [1]:
#make sure your path is set to source folder
%cd /home

/home


In [2]:
!pwd

/home


## Loading Data 

### 1.1 Importing packages

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install pycaret


In [None]:
!pip uninstall -y numpy cupy cuml


!pip install numpy==1.23.5
!pip install "cupy-cuda11x"  # Replace 11x with your CUDA version (e.g., 110, 117, etc.) or use "cupy" for CPU
!pip install pycaret==3.2

!pip install cupy


Found existing installation: numpy 1.23.5
Uninstalling numpy-1.23.5:
  Successfully uninstalled numpy-1.23.5
[0mFound existing installation: cuml 21.12.0a0+116.g4ce5bd609
Uninstalling cuml-21.12.0a0+116.g4ce5bd609:
  Successfully uninstalled cuml-21.12.0a0+116.g4ce5bd609
[0mCollecting numpy==1.23.5
  Using cached numpy-1.23.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
visions 0.8.1 requires pandas>=2.0.0, but you have pandas 1.5.3 which is incompatible.
dask-cudf 21.12.0a0+293.g0930f712e6 requires dask<=2021.11.2,>=2021.11.1, but you have dask 2023.5.0 which is incompatible.
dask-cudf 21.12.0a0+293.g0930f712e6 requires pandas<1.4.0dev0,>=1.0, but you have pandas 1.5.3 which is incompatible.
dask-cuda 21.12.0 requires dask<=2021.11.2,>=2021.11.1, b

In [8]:
!pip install joblib==1.2.0 --force-reinstall


Collecting joblib==1.2.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: joblib
  Attempting uninstall: joblib
    Found existing installation: joblib 1.4.2
    Uninstalling joblib-1.4.2:
      Successfully uninstalled joblib-1.4.2
Successfully installed joblib-1.2.0
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [4]:
# Imported Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *
# Other Libraries
import mlflow

In [5]:
from scripts import utils

root_folder = "/home/"
data_directory = root_folder+"data/raw/"
data_profile_path = root_folder+"/data/profile_report/"
intermediate_data_path = root_folder+"data/interim/"
database_path = root_folder+"database/"
print("directory loaded")

directory loaded


 ### 1.2 Reading the merged data

In [6]:
%%time
interim_data = "final_train_data_interim_1748666766.csv" # set the data recieved from the previous notebook
dataset = utils.load_data( [f"{intermediate_data_path}{interim_data}",
                            ]
                         )[0] #since we are only loading single data, we can access it with index 0, since it return multiple dfs in list
dataset.shape

CPU times: user 1.14 s, sys: 151 ms, total: 1.29 s
Wall time: 1.29 s


(324000, 24)

In [7]:
dataset.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,is_churn,payment_method_id,payment_plan_days,plan_list_price,...,is_cancel,transaction_date_y,membership_expire_date_y,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,/7XuYVGXYHPggWsdtok0JEurQ10CtUO1Y8dDgy1/B0M=,1,0,others,7,20161223,0,41.0,30.0,149.0,...,0.0,2017-02-22,2017-03-22,0.0,0.0,0.0,0.0,2.289867,2.289867,7.808594
1,gB3/kawEQSauWFArU9Z0kZo+ikw9GqJv0rXqNbpVnTY=,1,0,others,7,20161223,0,41.0,30.0,99.0,...,0.0,2017-02-23,2017-03-23,0.274653,0.0,0.0,0.0,2.845647,2.28193,8.363281
2,2aFAPs3QmxD+bNcCe8beuWcI7SZHg1k+1irALOxiw3k=,15,23,female,4,20161224,0,40.0,30.0,149.0,...,0.0,2017-02-27,2017-03-26,0.0,0.0,0.0,0.0,4.708342,4.6837,10.236328
3,FjEZAhwFky8sWoaNGTp+p/r3/hH30WxLr396iSho3gs=,1,0,others,7,20161225,0,41.0,30.0,99.0,...,0.0,2017-02-24,2017-03-24,0.621227,0.173287,0.0,0.346574,3.070758,2.640511,8.630859
4,C5PNTuQxUQmHOXPptQnokhqH1XQoAHHL8pMWIX0nAh0=,1,0,others,7,20161225,0,41.0,30.0,99.0,...,0.0,2017-02-24,2017-03-24,0.0,0.0,0.0,0.0,1.595831,1.499937,7.083334


 ### 1.3 Splitting the data to seen and unseen

In [None]:
# this function is also available in utils.py 
# def get_validation_unseen_set(dataframe, validation_frac=0.05, sample=False, sample_frac=0.1):
#     if not sample:
#         dataset = dataframe.copy()
#     else:
#         dataset = dataframe.sample(frac=sample_frac)
#     data = dataset.sample(frac=(1-validation_frac), random_state=786)
#     data_unseen = dataset.drop(data.index)
#     data.reset_index(inplace=True, drop=True)
#     data_unseen.reset_index(inplace=True, drop=True)
#     return data, data_unseen

In [8]:
data_for_model, data_unseen = utils.get_validation_unseen_set(dataset, validation_frac=0.05, sample=False, sample_frac=0.1)
print('Data for Modeling: ' + str(data_for_model.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (307800, 24)
Unseen Data For Predictions: (16200, 24)


In [9]:
data_for_model.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,is_churn,payment_method_id,payment_plan_days,plan_list_price,...,is_cancel,transaction_date_y,membership_expire_date_y,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,vGJs8h3uZxW36keVUW50uB7NW70pkwKEbCkgoOZ8jKc=,13,39,female,3,20150628,0,39.142857,29.047619,142.095238,...,0.0,2017-02-27,2017-03-26,1.914855,0.405704,0.250397,0.250397,3.759292,3.612866,9.35197
1,fG4vbNXL63LB0/d/0buvDjkvtbC2WkB7s/vsy3N+rtg=,1,0,others,7,20151126,0,41.0,30.0,99.0,...,0.0,2017-02-26,2017-03-26,2.15288,0.170598,0.247943,0.338321,1.846531,2.6823,7.569824
2,bVsIGCC3TL7DrEHeUPX89aJmRU435hiIr4OpSjfV3pQ=,5,18,male,7,20160324,0,41.0,30.0,149.0,...,0.0,2017-02-24,2017-03-24,0.673611,0.164635,0.163826,0.132419,3.404267,3.465027,8.966634
3,V5RaZnBDu75Lyt6RgwXJ7Asz8/h2lm8YaYFfJ5cPbes=,13,24,male,4,20160224,0,36.5,30.0,172.25,...,0.0,2017-02-24,2017-03-26,2.167712,0.866918,0.924841,0.099021,2.058752,2.995891,8.03683
4,NIvWn3PzuH15beiEt2dHxilH4dh5vt2LxlJtyKWo0z0=,1,0,others,7,20160226,0,41.0,30.0,99.0,...,0.0,2017-02-26,2017-03-26,0.693204,0.19356,0.069315,0.089588,2.419033,2.417244,7.997656


In [27]:
data_for_model.columns

Index(['msno', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'is_churn', 'payment_method_id',
       'payment_plan_days', 'plan_list_price', 'actual_amount_paid',
       'is_auto_renew', 'transaction_date_x', 'membership_expire_date_x',
       'is_cancel', 'transaction_date_y', 'membership_expire_date_y', 'num_25',
       'num_50', 'num_75', 'num_985', 'num_100', 'num_unq', 'total_secs'],
      dtype='object')

In [29]:
data_for_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307800 entries, 0 to 307799
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   msno                      307800 non-null  object 
 1   city                      307800 non-null  int64  
 2   bd                        307800 non-null  int64  
 3   gender                    307800 non-null  object 
 4   registered_via            307800 non-null  int64  
 5   registration_init_time    307800 non-null  int64  
 6   is_churn                  307800 non-null  int64  
 7   payment_method_id         307800 non-null  float64
 8   payment_plan_days         307800 non-null  float64
 9   plan_list_price           307800 non-null  float64
 10  actual_amount_paid        307800 non-null  float64
 11  is_auto_renew             307800 non-null  float64
 12  transaction_date_x        307800 non-null  object 
 13  membership_expire_date_x  307800 non-null  o

### 2 Setting up the sqlite database

In [10]:
data_for_model['registration_init_time'] = pd.to_datetime(data_for_model['registration_init_time'], errors='coerce')
data_for_model['transaction_date_x'] = pd.to_datetime(data_for_model['transaction_date_x'], errors='coerce')
data_for_model['membership_expire_date_x'] = pd.to_datetime(data_for_model['membership_expire_date_x'], errors='coerce')

In [None]:
# this function is also available in utils.py 

#create a sqlite db fo storing all the model artifacts etc
# import sqlite3
# from sqlite3 import Error

# def create_sqlit_connection(db_path,db_file):
#     """ create a database connection to a SQLite database """
#     conn = None
#     # opening the conncetion for creating the sqlite db
#     try:
#         conn = sqlite3.connect(db_path+db_file)
#         print(sqlite3.version)
#     # return an error if connection not established
#     except Error as e:
#         print(e)
#     # closing the connection once the database is created
#     finally:
#         if conn:
#             conn.close()

In [11]:
database_path

'/home/database/'

In [12]:
utils.create_sqlit_connection(database_path,r"mlflow_v01.db")

2.6.0


In [13]:
mlflow.set_tracking_uri("http://0.0.0.0:6006")

In [None]:
# do not go ahead unless you execute this step and mlflow is isntalled. 
 
#MAKE mlruns FOLDER on root folder
#run this on terminal where you are on root folder. 
# Makse sure to point the database to correct address. Assuming you have same folder structure you can use this
#mlflow server --backend-store-uri='sqlite:///database/mlflow_v01.db' --default-artifact-root="/home/mlruns/" --port=6006 --host=0.0.0.0

In [None]:
#pip install pycaret==2.3.8

### 3 Setting up Environment: 

The `setup()` function initializes the environment in pycaret and creates the transformation pipeline to prepare the data for modeling and deployment. `setup()`must be called before executing any other function in pycaret. 
* It takes two mandatory parameters: a pandas dataframe and the name of the target column. 
* All other parameters are optional and are used to customize the pre-processing pipeline (we will see them in later tutorials).

When `setup()` is executed, PyCaret's inference algorithm will automatically infer the data types for all features based on certain properties. The data type should be inferred correctly but this is not always the case. To account for this, PyCaret displays a table containing the features and their inferred data types after setup() is executed. If all of the data types are correctly identified enter can be pressed to continue or quit can be typed to end the expriment. Ensuring that the data types are correct is of fundamental importance in PyCaret as it automatically performs a few pre-processing tasks which are imperative to any machine learning experiment. These tasks are performed differently for each data type which means it is very important for them to be correctly configured.

In [14]:
# No Pre-Processing 
Baseline_model_exp01 = setup(data = data_for_model, target = 'is_churn', 
                   session_id = 42,fix_imbalance=False,ignore_features=['msno'],
                   date_features=['registration_init_time','transaction_date_x','membership_expire_date_x'],
                   n_jobs=-1,use_gpu=True,
                   log_experiment=True,experiment_name='Baseline_model_exp01',
                   log_plots=True, log_data=True, verbose=True,
                   log_profile=False)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


Unnamed: 0,Description,Value
0,Session id,42
1,Target,is_churn
2,Target type,Binary
3,Original data shape,"(307800, 24)"
4,Transformed data shape,"(307800, 31)"
5,Transformed train set shape,"(215460, 31)"
6,Transformed test set shape,"(92340, 31)"
7,Ignore features,1
8,Numeric features,16
9,Date features,3


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
2025/05/31 06:01:20 INFO mlflow.tracking.fluent: Experiment with name 'Baseline_model_exp01' does not exist. Creating a new experiment.


In [15]:
# this function is also available in utils.py 
def get_train_test_set_from_setup():
     return get_config(variable="X_train"),\
             get_config(variable="y_train"),\
             get_config(variable="X_test"),\
             get_config(variable="y_test")

def get_x_y_from_setup():
     return get_config(variable="X"),\
             get_config(variable="y")
    
def get_transformation_pipeline_from_setup():
     return get_config(variable="pipeline")

In [16]:
X_train, y_train, X_test, y_test = get_train_test_set_from_setup()
#you can also get X,y
# X,y = utils.get_x_y_from_setup()
X_train.head()

Unnamed: 0,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,...,is_cancel,transaction_date_y,membership_expire_date_y,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
207224,1,0,others,7,1970-01-01 00:00:00.020160722,41.0,30.0,99.0,99.0,1.0,...,0.0,2017-02-21,2017-03-21,0.231049,0.0,0.0,0.0,1.283458,1.380667,6.795573
118021,1,0,others,7,1970-01-01 00:00:00.020160110,41.0,30.0,99.0,99.0,1.0,...,0.0,2017-02-10,2017-03-10,0.962697,0.180648,0.043322,0.129965,2.875238,2.939704,8.446289
245802,1,0,others,7,1970-01-01 00:00:00.020150423,41.0,28.695652,142.521744,149.0,1.0,...,0.0,2017-02-23,2017-03-23,1.25655,0.199084,0.115525,0.077016,2.293911,2.484759,7.804688
67811,1,0,others,7,1970-01-01 00:00:00.020151019,41.0,30.0,99.0,99.0,1.0,...,0.0,2017-02-19,2017-03-19,0.57631,0.173287,0.057762,0.057762,1.576408,2.131095,7.284831
42955,1,0,others,4,1970-01-01 00:00:00.020151030,38.0,30.0,149.0,149.0,0.0,...,0.0,2017-02-19,2017-03-21,2.610296,1.577235,1.211369,1.504617,3.623843,3.869413,9.331055


In [46]:
pipeline = get_transformation_pipeline_from_setup()
pipeline

In [None]:
# p = get_config(variable="prep_pipe")
# p.fit_transform(get_config(variable="data_before_preprocess"))

In [None]:
# models(internal=True)[['Name', 'GPU Enabled']]

### 4 Compare models: 

In [17]:
best_model = compare_models(fold = 5) #exclude=['xgboost']

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9603,0.9783,0.7018,0.7109,0.7062,0.6849,0.685,2.034
xgboost,Extreme Gradient Boosting,0.9593,0.9793,0.6708,0.7142,0.6918,0.6701,0.6705,1.884
rf,Random Forest Classifier,0.9589,0.9704,0.6593,0.7152,0.6861,0.6642,0.6648,7.8
gbc,Gradient Boosting Classifier,0.9581,0.9702,0.6804,0.697,0.6885,0.6661,0.6662,59.038
et,Extra Trees Classifier,0.9574,0.9662,0.6338,0.7098,0.6697,0.647,0.6482,4.876
lda,Linear Discriminant Analysis,0.9536,0.9507,0.6892,0.6504,0.6692,0.6443,0.6447,1.882
ridge,Ridge Classifier,0.9534,0.0,0.6461,0.662,0.654,0.629,0.6291,1.254
ada,Ada Boost Classifier,0.9509,0.9599,0.6329,0.6411,0.6368,0.6105,0.6106,13.654
dt,Decision Tree Classifier,0.946,0.7929,0.6157,0.6007,0.6081,0.5791,0.5792,3.558
lr,Logistic Regression,0.9426,0.9183,0.4425,0.6076,0.512,0.4823,0.4892,14.566


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

2025/05/31 06:13:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run Light Gradient Boosting Machine at: http://0.0.0.0:6006/#/experiments/1/runs/f1629b6f4a124168aa6f98f3947121ab.
2025/05/31 06:13:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://0.0.0.0:6006/#/experiments/1.
2025/05/31 06:13:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run Extreme Gradient Boosting at: http://0.0.0.0:6006/#/experiments/1/runs/ad256ec44dcb46158ea52570d1b2adaf.
2025/05/31 06:13:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://0.0.0.0:6006/#/experiments/1.
2025/05/31 06:13:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest Classifier at: http://0.0.0.0:6006/#/experiments/1/runs/cd1451c72bef42228d7f559f4ad28cc4.
2025/05/31 06:13:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://0.0.0.0:6006/#/experiments/1.
2025/05/31 06:13:11 INFO mlflow.tracking._tracking_service.client: 🏃

* Two simple words of code (not even a line) have created over 15 models using 10 fold stratified cross validation and evaluated the 6 most commonly used classification metrics (Accuracy, AUC, Recall, Precision, F1, Kappa). 

* The score grid printed above highlights the highest performing metric for comparison purposes only. The grid by default is sorted using 'Accuracy' (highest to lowest) which can be changed by passing the sort parameter. For example compare_models(sort = 'Recall') will sort the grid by Recall instead of Accuracy. 

* If you want to change the fold parameter from the default value of 10 to a different value then you can use the fold parameter. For example compare_models(fold = 5) will compare all models on 5 fold cross validation. Reducing the number of folds will improve the training time.

In [18]:
#selecting the best model
lgbm  = create_model('lightgbm', fold = 5) 

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9603,0.9778,0.6997,0.7121,0.7059,0.6846,0.6846
1,0.9594,0.9785,0.6858,0.7089,0.6972,0.6754,0.6755
2,0.9602,0.9787,0.7151,0.705,0.71,0.6886,0.6887
3,0.9606,0.9782,0.7045,0.7133,0.7088,0.6877,0.6877
4,0.9607,0.9784,0.7038,0.715,0.7094,0.6883,0.6883
Mean,0.9603,0.9783,0.7018,0.7109,0.7062,0.6849,0.685
Std,0.0005,0.0003,0.0095,0.0035,0.0048,0.005,0.0049


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

2025/05/31 06:13:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run Light Gradient Boosting Machine at: http://0.0.0.0:6006/#/experiments/1/runs/1cdc3a4c9856476c90da618027a82144.
2025/05/31 06:13:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://0.0.0.0:6006/#/experiments/1.


In [49]:
lgbm

### 5 Analyzing the model performance

5.1 Learning Curve

In [19]:
%matplotlib inline

In [None]:
plot_model(lgbm, plot = 'learning')

5.2 ROC Curve

In [None]:
plot_model(lgbm, plot = 'auc')

5.3 Precision-recall Curve

In [None]:
plot_model(lgbm, plot = 'pr')

5.4 Confusion Matrix

In [None]:
plot_model(lgbm, plot = 'confusion_matrix', plot_kwargs = {'percent' : True})

5.5 Feature Importance

In [None]:
#top 10 features
plot_model(lgbm, plot='feature') #feature_all -> to check for all features 

5.6 Prediction class distribution

In [None]:
plot_model(lgbm, plot='error')

5.7 Model Interpretability

In [None]:
#pip install shap

In [None]:
# interpret model
interpret_model(lgbm)

In [None]:
interpret_model(lgbm,plot='correlation',feature='is_cancel')

In [None]:
interpret_model(lgbm,plot='reason',observation=0) # index of observation in test data

In [None]:
#pip install interpret

In [None]:
interpret_model(lgbm,plot='msa')

5.8 Model Evaluation

In [None]:
predict_model(lgbm, data_unseen);