### Setting up

In [12]:
!pip install shap
!pip install interpret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.4/572.4 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting interpret
  Downloading interpret-0.3.2-py3-none-any.whl (1.4 kB)
Collecting interpret-core[dash,debug,decisiontree,ebm,lime,linear,notebook,plotly,required,sensitivity,shap,skoperules,treeinterpreter]==0.3.2
  Downloading interpret_core-0.3.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
Coll

In [25]:
import pandas as pd
import numpy as np
import sklearn 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import plotly.express as px
from interpret.blackbox import LimeTabular
from interpret import show

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only API). The dataset CSVs
# read further down are addressed through this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
### Helper method
# You may consider changing the colour scheme in practice

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the y axis ('True label') and
        columns along the x axis ('Predicted label').
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting, and cell values are shown with two decimals. A row that
        sums to zero yields NaN entries.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The matrix is printed and drawn on the current matplotlib figure.
    """
    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Cells above half the maximum are drawn with a dark colour by the
        # default colormap, so their label must be white to stay readable
        # (previously both branches were "black").
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [4]:
from sklearn.preprocessing import StandardScaler
# Names of the numeric feature columns (presumably the `columns` argument
# intended for normalise_cols -- confirm against the preprocessing notebook).
numerical_cols = [
    "lead_time",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "booking_changes",
    "days_in_waiting_list",
    "adr",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "length_of_stay",
    "adults",
    "num.guests",
    "stays",
]

def normalise_cols(df, columns):
    """Return a copy of ``df`` with ``columns`` rescaled by a StandardScaler.

    A fresh scaler is fitted on ``df[columns]`` on every call, so two frames
    normalised separately are each scaled with their own statistics. The
    input frame itself is left unmodified.
    """
    scaled = df.copy()
    scaled[columns] = StandardScaler().fit_transform(scaled[columns])
    return scaled

### Loading Data 

In [6]:
# Both CSVs live in the same mounted-Drive folder; naming it once means the
# dataset location can be changed in a single place.
DATA_DIR = '/content/drive/MyDrive/ST4248 Project/dataset'

df_train = pd.read_csv(f'{DATA_DIR}/train_normalised.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test_normalised.csv')

In [7]:
# Quick sanity check: (rows, columns) of the training frame.
df_train.shape

(81957, 26)

In [8]:
def _split_features_target(frame, target="is_canceled"):
    """Return (features, target) where features is ``frame`` minus ``target``."""
    return frame.drop(columns=target), frame[target]


# Same feature/target separation for both splits.
X_train, y_train = _split_features_target(df_train)
X_test, y_test = _split_features_target(df_test)

### LIME Models

#### LIME on Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest hyper-parameters gathered in one mapping so they can be read
# (and tweaked) without touching the fitting code.
rf_params = dict(
    random_state=42,
    n_estimators=428,
    max_depth=25,
    min_samples_split=3,
    min_samples_leaf=5,
)

# Fit on the training split, then predict labels for the test split.
model_RF = RandomForestClassifier(**rf_params)
model_RF.fit(X_train, y_train)
y_predict = model_RF.predict(X_test)

In [22]:
# LIME explainer wrapped around the random forest's probability output,
# using the training features as its reference data.
lime = LimeTabular(
    model=model_RF.predict_proba,
    data=X_train,
    random_state=42,
)

# Local explanations for the first ten test rows.
first_rows = slice(0, 10)
lime_local = lime.explain_local(
    X_test[first_rows],
    y_test[first_rows],
    name='LIME',
)

In [26]:
# Display the most recently computed `lime_local` (the random-forest
# explanations when the cells are run top to bottom).
show(lime_local)

#### LIME on LightGBM

In [27]:
import lightgbm as lgb

# LightGBM hyper-parameters gathered in one mapping.
# NOTE(review): `bagging_fraction` is set but no `bagging_freq` is given;
# per the LightGBM parameter docs bagging only takes effect when
# `bagging_freq` is non-zero -- confirm whether row subsampling was intended.
lgbm_params = dict(
    random_state=42,
    n_estimators=520,
    learning_rate=0.13,
    num_leaves=1140,
    max_depth=19,
    min_data_in_leaf=125,
    lambda_l1=10,
    lambda_l2=50,
    min_gain_to_split=0.0,
    max_bin=230,
    bagging_fraction=0.5,
    feature_fraction=0.8,
)

# Positional indices of the columns to treat as categorical.
# NOTE(review): presumably positions within X_train's columns; these break
# silently if the column order changes -- consider using column names.
categorical_idx = [6, 10, 14, 15, 16, 18, 17, 19, 20, 23, 24]

# Fit on the training split, then predict labels for the test split.
model_LGBM = lgb.LGBMClassifier(**lgbm_params)
model_LGBM.fit(X_train, y_train,
               categorical_feature=categorical_idx)
y_predict = model_LGBM.predict(X_test)


Using categorical_feature in Dataset.





In [32]:
# LIME explainer wrapped around the LightGBM model's probability output,
# using the training features as its reference data.
lime = LimeTabular(
    model=model_LGBM.predict_proba,
    data=X_train,
    random_state=42,
)

# Local explanations for the first ten test rows.
first_rows = slice(0, 10)
lime_local = lime.explain_local(
    X_test[first_rows],
    y_test[first_rows],
    name='LIME',
)

In [33]:
# Display the most recently computed `lime_local` (the LightGBM
# explanations when the cells are run top to bottom).
show(lime_local)