In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from helper import get_performance

EPOCHS = 700
BATCH_SIZE = 2048
ACTIVATION = 'swish'
LEARNING_RATE = 0.0007
FOLDS = 5

In [13]:
# Reading the dataset
data = pd.read_csv("dataset/Hotel_Booking/hotel_bookings.csv")
data = data.sample(frac=0.2, replace=True, random_state=1).reset_index(drop=True)

data = data.drop(['company'], axis = 1)
data['children'] = data['children'].fillna(0)
data['hotel'] = data['hotel'].map({'Resort Hotel':0, 'City Hotel':1})

data['arrival_date_month'] = data['arrival_date_month'].map({'January':1, 'February': 2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7,
                                                            'August':8, 'September':9, 'October':10, 'November':11, 'December':12})
def family(data):
    if ((data['adults'] > 0) & (data['children'] > 0)):
        val = 1
    elif ((data['adults'] > 0) & (data['babies'] > 0)):
        val = 1
    else:
        val = 0
    return val

def deposit(data):
    if ((data['deposit_type'] == 'No Deposit') | (data['deposit_type'] == 'Refundable')):
        return 0
    else:
        return 1
    
def feature(data):
    data["is_family"] = data.apply(family, axis = 1)
    data["total_customer"] = data["adults"] + data["children"] + data["babies"]
    data["deposit_given"] = data.apply(deposit, axis=1)
    data["total_nights"] = data["stays_in_weekend_nights"]+ data["stays_in_week_nights"]
    return data

data = feature(data)
# Information of these columns is also inside of new features, so it is better to drop them.
# I did not drop stays_nights features, I can't decide which feature is more important there.
data = data.drop(columns = ['adults', 'babies', 'children', 'deposit_type', 'reservation_status_date'])

indices = data.loc[pd.isna(data["country"]), :].index 
data = data.drop(data.index[indices])   
data = data.drop(columns = ['arrival_date_week_number', 'stays_in_weekend_nights', 'arrival_date_month', 'agent'], axis = 1)

df1 = data.copy()
#one-hot-encoding
df1 = pd.get_dummies(data = df1, columns = ['meal', 'market_segment', 'distribution_channel',
                                            'reserved_room_type', 'assigned_room_type', 'customer_type', 'reservation_status'])
le = LabelEncoder()
df1['country'] = le.fit_transform(df1['country']) 
# There are more than 300 classes, so I wanted to use label encoder on this feature.

df2 = df1.drop(columns = ['reservation_status_Canceled', 'reservation_status_Check-Out', 'reservation_status_No-Show'], axis = 1)
df2.rename(columns={'market_segment_Offline TA/TO' : 'market_segment_Offline_TA_TO',
                    'market_segment_Online TA' : 'market_segment_Online_TA',
                    'distribution_channel_TA/TO' : 'distribution_channel_TA_TO',
                    'customer_type_Transient-Party' : 'customer_type_Transient_Party'}, inplace=True)

y = df2["is_canceled"]
X = df2.drop(["is_canceled"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

print("Train data: ", X_train.shape)
print("Test data: ", X_test.shape)

Train data:  (16648, 59)
Test data:  (7135, 59)


In [14]:
y_train.value_counts()

0    10537
1     6111
Name: is_canceled, dtype: int64

In [15]:
y_test.value_counts()

0    4504
1    2631
Name: is_canceled, dtype: int64

# Default Model

In [16]:
model_default = svm.SVC()
scores_default = cross_val_score(model_default, X=X_train, y=y_train, cv = FOLDS)
model_default.fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.679502614215771
Recall: 0.6850735809390329
F1 Score: 0.6471423330777778


(0.679502614215771, 0.6850735809390329, 0.6471423330777778, None)

In [17]:
pd.DataFrame(y_pred_default).value_counts()

0    5951
1    1184
dtype: int64

In [18]:
import time 
import sys
sys.path.insert(1, './mmd')
from mmd import diagnoser
from scipy import stats as st
import numpy
#notebook's library
%matplotlib inline
from helper import get_top_f1_rules, get_relevent_attributs_target, get_MMD_results, get_biased_features, get_BGMD_results
from helper import generateTrain_data_Weights

In [19]:
default_result = pd.concat([X_test, y_test], axis=1, join='inner')
default_result.loc[:,"pred"] = y_pred_default

In [20]:
def mispredict_label(row):
    if row['is_canceled'] == row['pred']:
        return False
    return True

In [21]:
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()
X_test_copy['mispredict'] = default_result_copy.apply(lambda row: mispredict_label(row), axis=1)

In [22]:
settings = diagnoser.Settings
settings.all_rules = True
# Get relevent attributes and target 
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate MMD rules and correspodning information
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

#Get biased attributes this time 
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	7135
# Cols:	60
% Target in dataset 31.49%
[1mSubgroup: stays_in_week_nights>11[0m
% of subgroup in population (Full Dataset):	0.38% (27 rows)
Precision: P(mispredict=True | stays_in_week_nights>11) = 59.26%
Recall: P(stays_in_week_nights>11 | mispredict=True) = 0.71%
[1mSubgroup: total_nights>14[0m
% of subgroup in population (Full Dataset):	0.48% (34 rows)
Precision: P(mispredict=True | total_nights>14) = 55.88%
Recall: P(total_nights>14 | mispredict=True) = 0.85%
[1mSubgroup: required_car_parking_spaces<=0[0m
% of subgroup in population (Full Dataset):	93.69% (6685 rows)
Precision: P(mispredict=True | required_car_parking_spaces<=0) = 33.13%
Recall: P(required_car_parking_spaces<=0 | mispredict=True) = 98.58%

###############################

BGMD Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	7135
# Cols:

([[0.4959695476936857,
   0.33133881824981304,
   0.9857587894971073,
   'required_car_parking_spaces<=0'],
  [0.016659359929855327,
   0.5588235294117647,
   0.008455718736092568,
   'total_nights>14'],
  [0.014072119613016711,
   0.5925925925925926,
   0.007120605251446373,
   'stays_in_week_nights>11']],
 [[0.4959695476936857,
   0.33133881824981304,
   0.9857587894971073,
   'required_car_parking_spaces<=0'],
  [0.016659359929855327,
   0.5588235294117647,
   0.008455718736092568,
   'total_nights>14'],
  [0.014072119613016711,
   0.5925925925925926,
   0.007120605251446373,
   'stays_in_week_nights>11']])

# Decision Tree

In [23]:
model_default = DecisionTreeClassifier()
scores_default = cross_val_score(model_default, X=X_train, y=y_train, cv = FOLDS)
model_default.fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.8340642154231357
Recall: 0.8333566923615977
F1 Score: 0.8336697129858249


(0.8340642154231357, 0.8333566923615977, 0.8336697129858249, None)

In [24]:
default_result = pd.concat([X_test, y_test], axis=1, join='inner')
default_result.loc[:,"pred"] = y_pred_default
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()
X_test_copy['mispredict'] = default_result_copy.apply(lambda row: mispredict_label(row), axis=1)
settings = diagnoser.Settings
settings.all_rules = True
# Get relevent attributes and target 
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate MMD rules and correspodning information
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

#Get biased attributes this time 
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	7135
# Cols:	60
% Target in dataset 16.66%
[1mSubgroup: total_customer>4.0[0m
% of subgroup in population (Full Dataset):	0.14% (10 rows)
Precision: P(mispredict=True | total_customer>4.0) = 60.0%
Recall: P(total_customer>4.0 | mispredict=True) = 0.5%
[1mSubgroup: deposit_given<=0[0m
% of subgroup in population (Full Dataset):	87.83% (6267 rows)
Precision: P(mispredict=True | deposit_given<=0) = 18.96%
Recall: P(deposit_given<=0 | mispredict=True) = 99.92%
[1mSubgroup: total_nights>14[0m
% of subgroup in population (Full Dataset):	0.48% (34 rows)
Precision: P(mispredict=True | total_nights>14) = 29.41%
Recall: P(total_nights>14 | mispredict=True) = 0.84%

###############################

BGMD Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	7135
# Cols:	60
% Target in dataset 16.66%
[1mSubgroup: total_customer

([[0.3186695278969957,
   0.18956438487314506,
   0.9991589571068125,
   'deposit_given<=0'],
  [0.016353229762878167,
   0.29411764705882354,
   0.008410428931875526,
   'total_nights>14'],
  [0.010008340283569641, 0.6, 0.005046257359125316, 'total_customer>4.0']],
 [[0.3186695278969957,
   0.18956438487314506,
   0.9991589571068125,
   'deposit_given<=0'],
  [0.016353229762878167,
   0.29411764705882354,
   0.008410428931875526,
   'total_nights>14'],
  [0.010008340283569641, 0.6, 0.005046257359125316, 'total_customer>4.0']])