In [1]:
# Mount Google Drive at ./gdrive so the dataset path used by the loading cell resolves.
from google.colab import drive
drive.mount('./gdrive')

Mounted at ./gdrive


In [2]:
%%capture
# The %%capture line above must stay first in the cell; it hides the installer output.
import sys

# Third-party packages that are imported further down in this notebook.
!{sys.executable} -m pip install -U pandas-profiling[notebook]
!jupyter nbextension enable --py widgetsnbextension
!pip install phik==0.10.0
!pip install association-metrics
# NOTE(review): `!!` is IPython's output-returning shell form; a single `!` would match the lines above — confirm intent.
!!pip install catboost

In [3]:
%matplotlib inline 
import os
import numpy as np                 # linear algebra
import pandas as pd                # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import sklearn as sk               # Useful in data analysis
from datetime import date, timedelta
import matplotlib.pyplot as plt

import phik
import association_metrics
import pandas_profiling
from pandas_profiling.utils.cache import cache_file

In [99]:
# Folder holding the dataset CSVs inside the mounted Drive.
data_dir = "./gdrive/MyDrive/DataStorm-2.0/datasets"

# Read the three splits in the order train, validation, test.
csv_names = ("Hotel-A-train.csv", "Hotel-A-validation.csv", "Hotel-A-test.csv")
train_df, valid_df, test_df = [
    pd.read_csv(os.path.join(data_dir, csv_name)) for csv_name in csv_names
]

# Preprocessing - General

In [100]:
#Convert Dates to continuous variables
def date_to_num(d):
    """Convert a 'month/day/year' string into a day count relative to 2015-01-01.

    Args:
        d: Date string with three '/'-separated integer fields (month, day, year).

    Returns:
        Signed number of days between the date and 1 January 2015
        (0 for that day itself, negative for earlier dates).

    Raises:
        ValueError: If a field is not an integer, the string does not have
            exactly three fields, or the fields do not form a valid date.
    """
    month, day, year = [int(field) for field in d.split('/')]
    return (date(year, month, day) - date(2015, 1, 1)).days

def date_to_month(d):
    """Return the month number of a 'month/day/year' string.

    All three fields are parsed as integers (so malformed strings still
    raise ValueError), but only the month is returned.
    """
    month, _day, _year = [int(field) for field in d.split('/')]
    return month

def date_is_weekend(d1, duration):
    """Flag whether a stay touches a Saturday or Sunday.

    Args:
        d1: Check-in date as a 'month/day/year' string.
        duration: Number of days after check-in to inspect. Days
            ``0 .. duration`` inclusive are checked, so the final day counts too.

    Returns:
        1 if any inspected day is a Saturday or Sunday, otherwise 0
        (also 0 when ``duration`` is negative, since no day is inspected).
    """
    month, day, year = [int(field) for field in d1.split('/')]
    first_day = date(year, month, day)
    # date.weekday(): Monday == 0 ... Sunday == 6, so 5 and 6 are the weekend.
    touches_weekend = any(
        (first_day + timedelta(days=offset)).weekday() in (5, 6)
        for offset in range(duration + 1)
    )
    return 1 if touches_weekend else 0

def date_to_day(d1, duration, i):
    """Flag whether a stay includes a given day of the week.

    Args:
        d1: Check-in date as a 'month/day/year' string.
        duration: Number of days after check-in to inspect. Days
            ``0 .. duration`` inclusive are checked.
        i: Weekday index to look for, using date.weekday() numbering
            (Monday == 0 ... Sunday == 6).

    Returns:
        1 if any inspected day falls on weekday ``i``, otherwise 0.
    """
    month, day, year = [int(field) for field in d1.split('/')]
    first_day = date(year, month, day)
    includes_day = any(
        (first_day + timedelta(days=offset)).weekday() == i
        for offset in range(duration + 1)
    )
    return 1 if includes_day else 0

# Weekday column names; list position matches date.weekday() (Monday == 0).
days = [
    'mon',
    'tue',
    'wed',
    'thur',
    'fri',
    'sat',
    'sun',
]


In [101]:
# Derive the date-based features on each split, in place.
for df in [train_df, test_df, valid_df]:
    print ('starting dataset')
    checkin_dates = df['Expected_checkin']
    checkout_dates = df['Expected_checkout']

    # Day offsets from the reference date, and the stay length between them.
    df['in_day'] = checkin_dates.map(date_to_num)
    df['out_day'] = checkout_dates.map(date_to_num)
    df['duration'] = df['out_day'] - df['in_day']
    df['in_month'] = checkin_dates.map(date_to_month)
    df['out_month'] = checkout_dates.map(date_to_month)

    print ('calculating booking day lag')
    booking_day = df['Booking_date'].map(date_to_num)
    df['booking_lag'] = df['in_day'] - booking_day

    # Row-wise features that need both the check-in date and the duration.
    df['weekend'] = df.apply(
        lambda row: date_is_weekend(row.Expected_checkin, row.duration), axis=1
    )
    # Total spend = duration x room rate x (1 - discount %).
    df['Spend'] = df['duration']*df['Room_Rate'].astype('float')*(1-df['Discount_Rate'].astype('float')/100)

    # One 0/1 column per weekday that the stay includes.
    for weekday_idx, weekday_name in enumerate(days):
        df[weekday_name] = df.apply(
            lambda row: date_to_day(row.Expected_checkin, row.duration, weekday_idx),
            axis = 1,
        )
    

starting dataset
calculating booking day lag
starting dataset
calculating booking day lag
starting dataset
calculating booking day lag


# Preprocessing - General Guidelines

All Columns - 

1. 'Reservation-id'
2. 'Gender'
3. 'Age'
4. 'Ethnicity'
5. 'Educational_Level'
6. 'Income'
7. 'Country_region'
8. 'Hotel_Type'
9. 'Expected_checkin'
10. 'Expected_checkout'
11. 'Booking_date'
12. 'Adults'
13. 'Children'
14. 'Babies'
15. 'Meal_Type'
16. 'Visted_Previously'
17. 'Previous_Cancellations'
18. 'Deposit_type'
19. 'Booking_channel'
20. 'Required_Car_Parking'
21. 'Reservation_Status'
22. 'Use_Promotion'
23. 'Discount_Rate'
24. 'Room_Rate'
25. 'in_day'
26. 'out_day'
27. 'duration'
28. 'in_month'
29. 'out_month'
30. 'booking_lag'
31. 'weekend'
32. 'Spend'


## Columns to Drop

1. 'Reservation-id' - unique identifier
9. 'Expected_checkin' - wrong format
10. 'Expected_checkout' - wrong format
11. 'Booking_date' - wrong format
22. 'Use_Promotion' - 'Discount_Rate' contains more information
25. 'in_day' - not useful
26. 'out_day' - not useful
29. 'out_month' - redundant, generally the same as in_month

## Ordered Categorical Columns to encode in order

5. 'Educational_Level'
6. 'Income'

## Non-Ordered Categoricals to one-hot encode

4. 'Ethnicity'
7. 'Country_region'
8. 'Hotel_Type'
15. 'Meal_Type'
18. 'Deposit_type'
19. 'Booking_channel'
28. 'in_month'

## Boolean Categoricals - Encode as 0,1

2. 'Gender'
16. 'Visted_Previously'
17. 'Previous_Cancellations'
20. 'Required_Car_Parking'
31. 'weekend'

## Continuous Variables - Normalize to \[0 1\] (Check for outliers)

3. 'Age'
12. 'Adults'*
13. 'Children'*
14. 'Babies'*
23. 'Discount_Rate'*
24. 'Room_Rate'
27. 'duration'
30. 'booking_lag'
32. 'Spend'
\* \- Indicates that this variable might benefit from being an OHE categorical

## Variables to leave out due to low discriminative power **

**This section is subject to trial and error on val section

2. 'Gender' - drop
3. 'Age' - seems irrelevant, drop
5. 'Educational_Level' - weak
6. 'Income' - weak
13. 'Children' - possibly drop
16. 'Visted_Previously' - possibly? Weak correlation
19. 'Booking_channel' - weak, maybe drop
20. 'Required_Car_Parking' - drop
24. 'Room_Rate' - seems weak, maybe drop
27. 'duration' - weakish


In [266]:
# Work on deep copies so the raw frames stay untouched by the steps below.
processed_train, processed_val, processed_test = [
    raw_frame.copy(deep=True) for raw_frame in (train_df, valid_df, test_df)
]
sets = [processed_train, processed_val, processed_test]

## Drop Columns

1. 'Reservation-id' - unique identifier
9. 'Expected_checkin' - wrong format
10. 'Expected_checkout' - wrong format
11. 'Booking_date' - wrong format
22. 'Use_Promotion' - 'Discount_Rate' contains more information
25. 'in_day' - not useful
26. 'out_day' - not useful
29. 'out_month' - redundant, generally the same as in_month

In [267]:
# Identifier, raw date strings, and derived columns that are not used as features.
unused_columns = [
    'Reservation-id',
    'Expected_checkin',
    'Expected_checkout',
    'Booking_date',
    'Use_Promotion',
    'out_day',
    'in_day',
    'out_month',
]

# Drop in place: `sets` holds references to the processed_* frames.
for df in sets:
    df.drop(columns=unused_columns, inplace = True)

## Ordered Categorical Encoding

5. 'Educational_Level'
6. 'Income'

In [268]:
def educational_to_cat(level):
    """Map an education label to its ordinal code.

    Args:
        level: One of 'Mid-School', 'High-School', 'College', 'Grad'.

    Returns:
        Integer rank 0..3, in the order listed above.

    Raises:
        ValueError: If ``level`` is not one of the known labels. The message
            includes the offending value.
    """
    ranks = {
        'Mid-School': 0,
        'High-School': 1,
        'College': 2,
        'Grad': 3,
    }
    try:
        return ranks[level]
    except KeyError:
        # A bare `raise` here would only yield "No active exception to reraise".
        raise ValueError(f"Unknown Educational_Level: {level!r}") from None

def income_to_cat(level):
    """Map an income-bracket label to its ordinal code.

    Args:
        level: One of '<25K', '25K --50K', '50K -- 100K', '>100K'
            (spacing must match exactly).

    Returns:
        Integer rank 0..3, in the order listed above.

    Raises:
        ValueError: If ``level`` is not one of the known labels. The message
            includes the offending value.
    """
    ranks = {
        '<25K': 0,
        '25K --50K': 1,
        '50K -- 100K': 2,
        '>100K': 3,
    }
    try:
        return ranks[level]
    except KeyError:
        # A bare `raise` here would only yield "No active exception to reraise".
        raise ValueError(f"Unknown Income bracket: {level!r}") from None

In [269]:
# Replace the two ordered categoricals with their integer ranks, in place.
ordinal_encoders = (
    ('Educational_Level', educational_to_cat),
    ('Income', income_to_cat),
)
for df in sets:
    for column_name, encoder in ordinal_encoders:
        df[column_name] = df[column_name].map(encoder)

## Unordered Categoricals to One Hot Encoding (Process Boolean as well)

Note: For decision forests etc. OHE not needed, can just change to levels

4. 'Ethnicity'
7. 'Country_region'
8. 'Hotel_Type'
15. 'Meal_Type'
18. 'Deposit_type'
19. 'Booking_channel'
28. 'in_month'

Boolean - 

2. 'Gender'
16. 'Visted_Previously'
17. 'Previous_Cancellations'
20. 'Required_Car_Parking'
31. 'weekend'

In [270]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

def check_sim(in_dfs, col):
    """Check whether train, validation and test share the same categories.

    Args:
        in_dfs: Sequence whose first three items are the train, validation
            and test DataFrames, in that order.
        col: Name of the column to compare.

    Returns:
        True if the sorted unique values of ``col`` are identical in all
        three frames, otherwise False.
    """
    train_uniq = np.sort(np.array(in_dfs[0][col].unique()))
    valid_uniq = np.sort(np.array(in_dfs[1][col].unique()))
    test_uniq  = np.sort(np.array(in_dfs[2][col].unique()))
    print (train_uniq, valid_uniq)
    # array_equal also handles differing lengths, where `==` would not
    # produce an element-wise array.
    return bool(
        np.array_equal(train_uniq, valid_uniq)
        and np.array_equal(train_uniq, test_uniq)
    )

def OHE(in_dfs, col):
    """One-hot encode ``col`` in every frame of ``in_dfs``, replacing the column.

    The encoder drops the first category. It is fitted on the training frame
    (``in_dfs[0]``) when check_sim reports identical categories across the
    splits, and on the values of all frames combined otherwise.

    Args:
        in_dfs: Mutable list of DataFrames (train first). Each element is
            REPLACED by a new frame, so callers must re-read the list.
        col: Name of the categorical column to encode.

    Returns:
        None. New columns are named ``f"{col}{k}"`` for k = 0, 1, ...
    """
    enc = OneHotEncoder(drop='first')

    if check_sim(in_dfs, col):
        enc.fit(in_dfs[0][[col]])  # fits only on the train
    else:
        # Categories differ between splits: fit on every value seen anywhere.
        # (np.array(set(...)) would give a 0-d object array, not a column.)
        combined = pd.concat([frame[[col]] for frame in in_dfs], ignore_index=True)
        enc.fit(combined)

    for n in range(len(in_dfs)):
        frame = in_dfs[n]
        # Reuse the frame's own index so join() aligns row-for-row even when
        # the index is not a default RangeIndex.
        ohe_df = pd.DataFrame(enc.transform(frame[[col]]).toarray(), index=frame.index)
        ohe_df = ohe_df.add_prefix(col)
        in_dfs[n] = frame.join(ohe_df).drop(columns=col)



In [271]:
# Columns to one-hot encode: true categoricals first, then the two-valued flags.
orderless_categoricals = [
    # Unordered categoricals
    'Ethnicity',
    'Country_region',
    'Hotel_Type',
    'Meal_Type',
    'Deposit_type',
    'Booking_channel',
    'in_month',
    # Booleans
    'Gender',
    'Visted_Previously',
    'Previous_Cancellations',
    'Required_Car_Parking',
    'weekend',
]

In [272]:
# Encode each categorical in turn; OHE swaps new frames into `sets`.
for column_name in orderless_categoricals:
    print (column_name)
    OHE(sets, column_name)

# Re-bind the names, since the list now holds different DataFrame objects.
processed_train, processed_val, processed_test = sets

Ethnicity
['African American' 'Asian American' 'Latino' 'caucasian'] ['African American' 'Asian American' 'Latino' 'caucasian']
Country_region
['East' 'North' 'South' 'West'] ['East' 'North' 'South' 'West']
Hotel_Type
['Airport Hotels' 'City Hotel' 'Resort'] ['Airport Hotels' 'City Hotel' 'Resort']
Meal_Type
['BB' 'FB' 'HB'] ['BB' 'FB' 'HB']
Deposit_type
['No Deposit' 'Non-Refundable' 'Refundable'] ['No Deposit' 'Non-Refundable' 'Refundable']
Booking_channel
['Agent' 'Direct' 'Online'] ['Agent' 'Direct' 'Online']
in_month
[ 1  2  3  4  5  6  7  8  9 10 11 12] [ 1  2  3  4  5  6  7  8  9 10 11 12]
Gender
['F' 'M'] ['F' 'M']
Visted_Previously
['No' 'Yes'] ['No' 'Yes']
Previous_Cancellations
['No' 'Yes'] ['No' 'Yes']
Required_Car_Parking
['No' 'Yes'] ['No' 'Yes']
weekend
[0 1] [0 1]


In [273]:
processed_train.head(10)

Unnamed: 0,Age,Educational_Level,Income,Adults,Children,Babies,Reservation_Status,Discount_Rate,Room_Rate,duration,booking_lag,Spend,mon,tue,wed,thur,fri,sat,sun,Ethnicity0,Ethnicity1,Ethnicity2,Country_region0,Country_region1,Country_region2,Hotel_Type0,Hotel_Type1,Meal_Type0,Meal_Type1,Deposit_type0,Deposit_type1,Booking_channel0,Booking_channel1,in_month0,in_month1,in_month2,in_month3,in_month4,in_month5,in_month6,in_month7,in_month8,in_month9,in_month10,Gender0,Visted_Previously0,Previous_Cancellations0,Required_Car_Parking0,weekend0
0,40,3,0,2,2,0,Check-In,10,218,1,41,196.2,0,0,1,1,0,0,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,49,0,2,3,3,0,Check-In,0,185,1,36,185.0,0,0,1,1,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,42,3,0,3,3,0,Check-In,0,119,4,3,476.0,1,0,0,1,1,1,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,25,2,3,4,3,0,Check-In,5,144,1,12,136.8,0,0,0,1,1,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,62,1,1,1,1,0,Check-In,10,242,1,13,217.8,0,0,0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,34,2,3,5,2,1,Check-In,10,143,1,13,128.7,0,0,0,0,1,1,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
6,53,1,0,2,1,0,Check-In,25,212,1,-1,159.0,0,0,0,0,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
7,40,0,3,2,1,0,Check-In,0,170,1,8,170.0,0,0,0,0,1,1,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
8,59,0,3,3,2,0,Check-In,10,245,1,84,220.5,0,0,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,21,3,2,2,3,0,Check-In,20,212,3,72,508.8,0,0,1,1,1,1,0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


## Continuous Variables Normalization (No Outliers Found)

3. 'Age'
12. 'Adults'*
13. 'Children'*
14. 'Babies'*
23. 'Discount_Rate'*
24. 'Room_Rate'
27. 'duration'
30. 'booking_lag'
32. 'Spend'

In [274]:
# Numeric columns to min-max scale.
continuous_vars = [
    'Age',
    'Room_Rate',
    'duration',
    'Adults',
    'Children',
    'Babies',
    'Discount_Rate',
    'booking_lag',
    'Spend',
]

In [275]:
def scale_cols(in_dfs, col):
    """Min-max scale ``col`` in every frame using the first frame's range.

    The minimum and maximum are taken from ``in_dfs[0]`` only and then applied
    to all frames, so values in the other frames may fall outside [0, 1].

    Args:
        in_dfs: Sequence of DataFrames; each is modified in place.
        col: Name of the numeric column to scale.

    Raises:
        AssertionError: If the column is constant in ``in_dfs[0]`` (zero range).
    """
    reference = in_dfs[0][col]
    low, high = reference.min(), reference.max()
    span = high - low
    assert (span != 0)
    for frame in in_dfs:
        frame[col] = frame[col].astype('float').sub(low).div(span)


In [276]:
# Scale every continuous column, using the training frame's range for all splits.
for column_name in continuous_vars:
    scale_cols(sets, column_name)

In [277]:
processed_train.head(10)

Unnamed: 0,Age,Educational_Level,Income,Adults,Children,Babies,Reservation_Status,Discount_Rate,Room_Rate,duration,booking_lag,Spend,mon,tue,wed,thur,fri,sat,sun,Ethnicity0,Ethnicity1,Ethnicity2,Country_region0,Country_region1,Country_region2,Hotel_Type0,Hotel_Type1,Meal_Type0,Meal_Type1,Deposit_type0,Deposit_type1,Booking_channel0,Booking_channel1,in_month0,in_month1,in_month2,in_month3,in_month4,in_month5,in_month6,in_month7,in_month8,in_month9,in_month10,Gender0,Visted_Previously0,Previous_Cancellations0,Required_Car_Parking0,weekend0
0,0.423077,3,0,0.25,0.5,0.0,Check-In,0.25,0.786667,0.0,0.063202,0.144894,0,0,1,1,0,0,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.596154,0,2,0.5,1.0,0.0,Check-In,0.0,0.566667,0.0,0.05618,0.132979,0,0,1,1,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.461538,3,0,0.5,1.0,0.0,Check-In,0.0,0.126667,1.0,0.009831,0.442553,1,0,0,1,1,1,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.134615,2,3,0.75,1.0,0.0,Check-In,0.125,0.293333,0.0,0.022472,0.081702,0,0,0,1,1,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.846154,1,1,0.0,0.0,0.0,Check-In,0.25,0.946667,0.0,0.023876,0.167872,0,0,0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.307692,2,3,1.0,0.5,0.5,Check-In,0.25,0.286667,0.0,0.023876,0.073085,0,0,0,0,1,1,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
6,0.673077,1,0,0.25,0.0,0.0,Check-In,0.625,0.746667,0.0,0.004213,0.105319,0,0,0,0,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
7,0.423077,0,3,0.25,0.0,0.0,Check-In,0.0,0.466667,0.0,0.016854,0.117021,0,0,0,0,1,1,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
8,0.788462,0,3,0.5,0.5,0.0,Check-In,0.25,0.966667,0.0,0.123596,0.170745,0,0,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,0.057692,3,2,0.25,1.0,0.0,Check-In,0.5,0.746667,0.666667,0.106742,0.477447,0,0,1,1,1,1,0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [278]:
# Integer codes for the target classes.
map_dict = {'Check-In':1, 'Canceled':2, 'No-Show':3}

# Only the train and validation frames are re-coded here.
for labelled_frame in (processed_train, processed_val):
    labelled_frame['Reservation_Status'] = labelled_frame['Reservation_Status'].map(map_dict)

In [279]:
# One boolean column per class code, so each class can be correlated separately.
class_flags = (('Checkin', 1), ('Canceled', 2), ('NoShow', 3))
for flag_name, class_code in class_flags:
    for labelled_frame in (processed_train, processed_val):
        labelled_frame[flag_name] = labelled_frame['Reservation_Status'].map(
            lambda status, code=class_code: status == code
        )

In [280]:
# Correlation of every column with each class flag, on train and validation.
# The correlation matrix is computed once per frame and then sliced.
train_corr = processed_train.corr()
valid_corr = processed_val.corr()

conf_matrix_train1 = train_corr['Checkin']
conf_matrix_valid1 = valid_corr['Checkin']
conf_matrix_train2 = train_corr['Canceled']
conf_matrix_valid2 = valid_corr['Canceled']
conf_matrix_train3 = train_corr['NoShow']
conf_matrix_valid3 = valid_corr['NoShow']

# Side-by-side raw correlations. Shown explicitly: only a cell's last
# expression is displayed automatically, so this table was previously lost.
display(pd.concat([conf_matrix_train1, conf_matrix_valid1, conf_matrix_train2, conf_matrix_valid2, conf_matrix_train3, conf_matrix_valid3], axis=1))

# Train x validation product, scaled by 10000: positive when both splits
# agree on the sign of the correlation.
pd.concat([10000*conf_matrix_train1.multiply(conf_matrix_valid1), 10000*conf_matrix_train2.multiply(conf_matrix_valid2), 10000*conf_matrix_train3.multiply(conf_matrix_valid3)], axis=1)

Unnamed: 0,Checkin,Canceled,NoShow
Age,0.423188,0.938035,0.007879
Educational_Level,0.100999,-1.09963,-2.722919
Income,-0.35953,0.315532,-0.764054
Adults,0.101026,1.090418,0.553935
Children,0.119925,-1.140143,0.706828
Babies,-0.011766,3.215485,4.748926
Reservation_Status,8423.155438,1766.771305,6561.381
Discount_Rate,-0.145137,0.209165,0.59134
Room_Rate,-0.615329,0.235154,-1.032374
duration,-0.346911,0.038581,-0.024468


## DROP UNNECESSARY FEATURES: 

Trial 1: 58.6

2. 'Gender' - drop
3. 'Age' - seems irrelevant, drop
20. 'Required_Car_Parking' - drop

Trial 2: 58.57: 58.56

2. 'Gender' - drop
3. 'Age' - seems irrelevant, drop
5. 'Educational_Level' - weak
6. 'Income' - weak
24. 'Room_Rate' - seems weak, maybe drop
20. 'Required_Car_Parking' - drop

Trial 3: Don't drop anything - 58.56

Trial 4: Everything

2. 'Gender' - drop
3. 'Age' - seems irrelevant, drop
5. 'Educational_Level' - weak
6. 'Income' - weak
13. 'Children' - possibly drop
16. 'Visted_Previously' - possibly? Weak correlation
19. 'Booking_channel' - weak, maybe drop
20. 'Required_Car_Parking' - drop
24. 'Room_Rate' - seems weak, maybe drop
27. 'duration' - weakish

Trial 5: Everything ++

2. 'Gender' - drop
3. 'Age' - seems irrelevant, drop
5. 'Educational_Level' - weak
6. 'Income' - weak
13. 'Children' - possibly drop
16. 'Visted_Previously' - possibly? Weak correlation
19. 'Booking_channel' - weak, maybe drop
20. 'Required_Car_Parking' - drop
24. 'Room_Rate' - seems weak, maybe drop
27. 'duration' - weakish
1. Ethnicity
2. Country Region
3. Spend
4. Adults

Trial 6:

2. 'Gender' - drop
3. 'Age' - seems irrelevant, drop
20. 'Required_Car_Parking' - drop
24. 'Room_Rate' - seems weak, maybe drop
27. 'Discount' - weakish
1. Ethnicity
2. Country Region

Trial 7: Include only:

1. 'Meal_Type'
2. 'tues'
3. 'sun'
4. 'in_month'
5. 'discount_rate'
6. 'Babies'

Trial 8: 
1. Gender no
1. Ethnicity no
1. Country region yes
1. Hotel type maybe
1. meal type yes?
1. visited previously no
1. prev cancellations maybe
1. deposit type maybe?
1. booking channel no
1. req car parking no
1. use promotion maybe
1. month no
1. weekend yes?
1. days: mon, tues,sat,sun
1. education level no
1. income no
1. adults no
1. children yes
1. babies yes
1. discount rate yes (OHE?)
1. duration no
1. age - no
1. room rate no
1. booking lag ??
1. spend - no


In [250]:
# Substrings identifying columns to remove from every frame.
Exclude_cats = ['Gender', 'Ethnicity', 'Spend', 'Room_Rate', 
                'duration',  'Income', 'Education', 'mon', 'wed','thur', 'tue','sat','sun',
                'fri', 'Parking', 'Booking_channel', 'Previously', 'booking_lag', 'Cancellations', 'Country', 'Deposit',
                'Checkin', 'Canceled', 'NoShow', 'Hotel']

for df in sets:
    # Matching is by substring (so 'mon' also hits the 'in_month*' columns);
    # 'in_month8' is explicitly kept. The list is built before dropping so a
    # column matching several patterns is dropped exactly once, and the frame
    # is not modified while its columns are being iterated.
    to_drop = [
        cat for cat in df.columns
        if cat != 'in_month8' and any(ex in cat for ex in Exclude_cats)
    ]
    df.drop(columns=to_drop, inplace = True)
        


In [281]:
# Whitelist: every column not named here is removed from each frame.
include_cats=['Meal_Type0','Meal_Type1','weekend0',
'Reservation_Status']

for df in sets:
    surplus = [cat for cat in df.columns if cat not in include_cats]
    df.drop(columns=surplus, inplace=True)

print (processed_train.head(10))

   Reservation_Status  Meal_Type0  Meal_Type1  weekend0
0                   1         0.0         0.0       0.0
1                   1         0.0         0.0       0.0
2                   1         0.0         0.0       1.0
3                   1         0.0         0.0       0.0
4                   1         0.0         0.0       1.0
5                   1         0.0         0.0       1.0
6                   1         0.0         0.0       1.0
7                   1         1.0         0.0       1.0
8                   1         0.0         0.0       0.0
9                   1         0.0         1.0       1.0


In [282]:
# Make 'Reservation_Status' the last column: pop() removes it and the
# assignment re-appends it at the end.
for labelled_frame in (processed_train, processed_val):
    labelled_frame['Reservation_Status'] = labelled_frame.pop('Reservation_Status')
print (processed_train.head(5))

   Meal_Type0  Meal_Type1  weekend0  Reservation_Status
0         0.0         0.0       0.0                   1
1         0.0         0.0       0.0                   1
2         0.0         0.0       1.0                   1
3         0.0         0.0       0.0                   1
4         0.0         0.0       1.0                   1


In [283]:
print (processed_train.columns)

Index(['Meal_Type0', 'Meal_Type1', 'weekend0', 'Reservation_Status'], dtype='object')


In [284]:
# Aliases (not copies) used by the modelling cells below.
prep_train, prep_test, prep_valid = processed_train, processed_test, processed_val

# Converting the Dataset to a trainable format

In [285]:
print(f'Training dataframe shape :: {prep_train.shape}')
# The last column is the label; everything before it is a feature.
column_names = prep_train.columns.tolist()
features, label = column_names[:-1], column_names[-1]

Training dataframe shape :: (27499, 4)


In [286]:
# Correlation of each remaining column with the label, train vs. validation.
conf_matrix_train = prep_train.corr()['Reservation_Status']
conf_matrix_valid = prep_valid.corr()['Reservation_Status']
pd.concat([conf_matrix_train, conf_matrix_valid], axis=1)

Unnamed: 0,Reservation_Status,Reservation_Status.1
Meal_Type0,0.168576,0.029467
Meal_Type1,0.044998,0.012094
weekend0,0.031958,0.018162
Reservation_Status,1.0,1.0


In [287]:
# Per-class sample counts taken from the label (last) column.
label_counts = prep_train.iloc[:,-1].value_counts()
class_biases = label_counts.to_dict()
print(class_biases)
total_samples = label_counts.sum()
# Inverse-frequency weights, divided by the three classes.
class_weights = {k:(1 / val)*(total_samples)/3.0 for k, val in class_biases.items()}
print(class_weights)


{1: 21240, 2: 4134, 3: 2125}
{1: 0.4315599497802887, 2: 2.217303660699887, 3: 4.313568627450981}


In [288]:
from sklearn.feature_selection import SelectKBest, chi2

# Split the training frame into feature columns and the trailing label column.
train_features = prep_train.iloc[:,:-1]
train_labels = prep_train.iloc[:,-1]

# k='all' keeps every feature; the selector is fitted on training data only.
feat_selector = SelectKBest(chi2, k='all').fit(train_features, train_labels)

x_train = feat_selector.transform(train_features)
x_valid = feat_selector.transform(prep_valid.iloc[:,:-1])
# prep_test is passed whole, without slicing off a last column.
x_test = feat_selector.transform(prep_test)
print (x_train.shape)
print (type(x_train))
y_train = np.array(train_labels)
print (y_train.shape)
print (type(y_train))
y_valid = np.array(prep_valid.iloc[:,-1])

(27499, 3)
<class 'numpy.ndarray'>
(27499,)
<class 'numpy.ndarray'>


In [289]:
from scipy import stats
# Print summary statistics per feature, train first and validation second.
for n, (train_column, valid_column) in enumerate(zip(x_train.T, x_valid.T)):
    print (prep_train.columns[n])
    print(stats.describe(train_column))
    print(stats.describe(valid_column))
    #bad: booking lag, Meal_Type0


Meal_Type0
DescribeResult(nobs=27499, minmax=(0.0, 1.0), mean=0.18887959562165896, variance=0.15320966544261777, skewness=1.5897297617752102, kurtosis=0.5272407154738672)
DescribeResult(nobs=2749, minmax=(0.0, 1.0), mean=0.34085121862495454, variance=0.22475342363479528, skewness=0.6715203232253493, kurtosis=-1.5490604554953218)
Meal_Type1
DescribeResult(nobs=27499, minmax=(0.0, 1.0), mean=0.2938652314629623, variance=0.2075160035110845, skewness=0.9050307698153985, kurtosis=-1.1809193056873464)
DescribeResult(nobs=2749, minmax=(0.0, 1.0), mean=0.32375409239723535, variance=0.2190170515889594, skewness=0.7533366194188204, kurtosis=-1.432483937842623)
weekend0
DescribeResult(nobs=27499, minmax=(0.0, 1.0), mean=0.5162369540710571, variance=0.24974544330523218, skewness=-0.06498208890616185, kurtosis=-1.995777328121392)
DescribeResult(nobs=2749, minmax=(0.0, 1.0), mean=0.5183703164787196, variance=0.24975338392206137, skewness=-0.07353091141113205, kurtosis=-1.9945932050670485)


In [290]:
# To correct for class imbalance, create oversampled data.
# A dedicated seeded generator makes the resample and shuffle reproducible
# across kernel restarts.
OVERSAMPLE_SEED = 42
rng = np.random.RandomState(OVERSAMPLE_SEED)

# Split the training set by class code (1 = Check-In, 2 = Canceled, 3 = No-Show).
checkin = x_train[y_train == 1]
print (checkin.shape)
canceled = x_train[y_train == 2]
print (canceled.shape)
no_show = x_train[y_train == 3]
print (no_show.shape)
checkin_labels = y_train[y_train == 1]
canceled_labels = y_train[y_train == 2]
no_show_labels = y_train[y_train == 3]

# Sample the two smaller classes with replacement up to the size of class 1.
ids_canceled = np.arange(len(canceled))
ids_no_show = np.arange(len(no_show))
canceled_choices = rng.choice(ids_canceled, len(checkin))
no_show_choices = rng.choice(ids_no_show, len(checkin))

res_canceled = canceled[canceled_choices]
res_canceled_labels = canceled_labels[canceled_choices]
res_no_show = no_show[no_show_choices]
res_no_show_labels = no_show_labels[no_show_choices]

res_x_train = np.concatenate([checkin, res_canceled, res_no_show], axis=0)
res_y_train = np.concatenate([checkin_labels, res_canceled_labels, res_no_show_labels], axis=0)

# Shuffle features and labels with one shared permutation.
order = np.arange(len(res_y_train))
rng.shuffle(order)
res_x_train = res_x_train[order]
res_y_train = res_y_train[order]

# NOTE(review): the model cells visible in this notebook fit on x_train /
# y_train, not on res_x_train / res_y_train — confirm whether that is intended.
print(res_x_train.shape, res_y_train.shape)
print (y_train)
print (res_y_train)

(21240, 3)
(4134, 3)
(2125, 3)
(63720, 3) (63720,)
[1 1 1 ... 1 2 3]
[1 3 2 ... 2 2 3]


# Model Fitting and Results

Models to try (OHE)

1. Neural Network

Models to try (Categorical)

1. XGBoost
1. Decision Tree
2. Decision Forest
3. Logistic Regression

## XGBoost Model

In [292]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Candidate grid; not used in this cell, which fits one fixed configuration.
xgb_params = {
    'n_estimators': [100, 500, 1000],     
    'max_depth': [3, 5, 10]
}

xgbModel = xgb.XGBClassifier(use_label_encoder=False, n_estimators = 500, max_depth=10)

# Report multiclass error on both training and validation data during fitting.
eval_set = [(x_train, y_train), (x_valid, y_valid)]
eval_metric = ["merror"]

# NOTE(review): labels here are 1..3; some xgboost versions require 0-based
# labels when use_label_encoder=False — confirm against the installed version.
xgbModel.fit(x_train, y_train, eval_metric=eval_metric, eval_set=eval_set)
xgb_valid = xgbModel.predict(x_valid)
# Predictions are passed through round() before scoring.
xgb_valid = [round(value) for value in xgb_valid]
accuracy = sk.metrics.accuracy_score(y_valid, xgb_valid)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-merror:0.227608	validation_1-merror:0.414332
[1]	validation_0-merror:0.227608	validation_1-merror:0.414332
[2]	validation_0-merror:0.227608	validation_1-merror:0.414332
[3]	validation_0-merror:0.227608	validation_1-merror:0.414332
[4]	validation_0-merror:0.227608	validation_1-merror:0.414332
[5]	validation_0-merror:0.227608	validation_1-merror:0.414332
[6]	validation_0-merror:0.227608	validation_1-merror:0.414332
[7]	validation_0-merror:0.227608	validation_1-merror:0.414332
[8]	validation_0-merror:0.227608	validation_1-merror:0.414332
[9]	validation_0-merror:0.227608	validation_1-merror:0.414332
[10]	validation_0-merror:0.227608	validation_1-merror:0.414332
[11]	validation_0-merror:0.227608	validation_1-merror:0.414332
[12]	validation_0-merror:0.227608	validation_1-merror:0.414332
[13]	validation_0-merror:0.227608	validation_1-merror:0.414332
[14]	validation_0-merror:0.227608	validation_1-merror:0.414332
[15]	validation_0-merror:0.227608	validation_1-merror:0.414332
[1

In [None]:
# Hyper-parameter grid for the XGBoost classifier.
# `min_samples_split` is a scikit-learn tree parameter that XGBoost does not
# define; `min_child_weight` is XGBoost's own minimum-leaf-weight control.
xgb_params = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 10],
    'min_child_weight': [1, 5, 10],
}
# Keyword arguments forwarded to GridSearchCV.
params = {
    'estimator': xgbModel,
    'param_grid': xgb_params,
    'cv': 2,
    'refit': False,  # the winning configuration is re-fitted manually afterwards
    'n_jobs': -1,
    'verbose': 2,
    'scoring': 'recall_micro',
}

In [None]:
# Run the grid search configured in `params`.
xgb_cv = GridSearchCV(**params)
_ = xgb_cv.fit(x_train, y_train)

print(xgb_cv.best_params_)
# Re-fit with the best parameters. The class is reached through the `xgb`
# module (a bare `XGBClassifier` is not imported), and the result is bound to
# `xgbModel` instead of overwriting the `xgb` module name, so the evaluation
# cell that follows scores the tuned model.
xgbModel = xgb.XGBClassifier(**xgb_cv.best_params_).fit(x_train, y_train)

In [None]:
# Score the current `xgbModel` on the validation split.
xgb_valid = list(map(round, xgbModel.predict(x_valid)))
accuracy = sk.metrics.accuracy_score(y_valid, xgb_valid)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Logistic Regression Classifier

In [293]:
from sklearn.linear_model import LogisticRegression

# Fit a multiclass logistic regression on the unbalanced training set.
log_model = LogisticRegression(random_state=42, max_iter=1000)
log_model.fit(x_train, y_train)

# Training accuracy
log_train = log_model.predict(x_train)
train_accuracy = sk.metrics.accuracy_score(y_train, log_train)
print("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))

# Validation accuracy
log_valid = log_model.predict(x_valid)
accuracy = sk.metrics.accuracy_score(y_valid, log_valid)
print("Val Accuracy: %.2f%%" % (accuracy * 100.0))

Train Accuracy: 77.24%
Val Accuracy: 58.57%


## Naive Bayes Gaussian

In [294]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes baseline; score() reports mean accuracy.
gnb_model = GaussianNB().fit(x_train, y_train)
for split_name, split_x, split_y in (
    ('Train Accuracy', x_train, y_train),
    ('Val Accuracy', x_valid, y_valid),
):
    print(split_name, gnb_model.score(split_x, split_y))

Train Accuracy 0.772391723335394
Val Accuracy 0.5856675154601674


## CatBoost Classifier

In [295]:
from catboost import CatBoostClassifier

# CatBoost classifier; use_best_model keeps the iteration that scores best on eval_set.
cat_model = CatBoostClassifier(
    eval_metric='Accuracy',
    colsample_bylevel=0.5,
    max_depth=5,
    n_estimators=1000,
    learning_rate=0.5,
    use_best_model=True,
    random_seed=42,
)
cat_model.fit(x_train, y_train, eval_set=(x_valid,y_valid))

0:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 11.1ms	remaining: 11.1s
1:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 19.4ms	remaining: 9.66s
2:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 28.8ms	remaining: 9.56s
3:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 37ms	remaining: 9.21s
4:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 45.1ms	remaining: 8.97s
5:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 53.1ms	remaining: 8.8s
6:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 62.8ms	remaining: 8.91s
7:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 71.9ms	remaining: 8.92s
8:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 80.5ms	remaining: 8.86s
9:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 97.6ms	remaining: 9.66s
10:	learn: 0.7723917	test: 0.5856675	best: 0.5856675 (0)	total: 106ms	remaining: 9.57s
11:	learn: 0.7723917	test: 0.5856675	best: 0.5

<catboost.core.CatBoostClassifier at 0x7f19ec13b0d0>

## Neural Network

In [246]:
import tensorflow
from keras.layers import Dense, Dropout, Conv1D, LSTM, MaxPooling1D, Reshape
from keras.models import Model, Sequential
from keras.optimizers import Adam, SGD, RMSprop
from keras.callbacks import ReduceLROnPlateau,EarlyStopping, ModelCheckpoint

# Defining the model

start_dim = 256   # width of the first Dense layer
exp_dim = 4       # number of steps the Dense output is reshaped into
pool_dim = 2      # max-pooling window
n_classes = 3     # Check-In / Canceled / No-Show

model = Sequential()
# Expand the feature vector, then fold it into (exp_dim, start_dim // exp_dim)
# so the 1-D convolutions have a sequence axis to slide over.
model.add(Dense(start_dim, input_dim=x_train.shape[1], activation='relu'))
model.add(Reshape((exp_dim, start_dim//exp_dim)))
model.add(Conv1D(filters = start_dim//2, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size = pool_dim))
model.add(Conv1D(filters = start_dim//(2*pool_dim), kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size = pool_dim))
# model.add(LSTM(start_dim//(2*pool_dim*pool_dim)))
# NOTE(review): there is no Flatten before the Dense head; the recorded
# summary shows (None, 1, 16) here, so the softmax output presumably keeps
# the length-1 sequence axis — confirm this is intended.
model.add(Dense(start_dim//(2*2*pool_dim*pool_dim), activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(start_dim//(2*2*2*pool_dim*pool_dim), activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(n_classes, activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 256)               3072      
_________________________________________________________________
reshape_2 (Reshape)          (None, 4, 64)             0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 4, 128)            41088     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 2, 128)            0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 2, 64)             41024     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 1, 64)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 1, 16)            

In [247]:
# Stop after 5 epochs without val_acc improvement and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_acc',
    patience=5,
    restore_best_weights=True,
)
# Cut the learning rate tenfold after 2 stagnant epochs.
reducer = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.1,
    patience=2,
)
callbacks = [early_stop, reducer]

In [248]:

# Balanced class weights. Arguments are passed by keyword because recent
# scikit-learn releases reject the positional form.
class_weights = sk.utils.class_weight.compute_class_weight(class_weight='balanced',
                                                 classes=np.unique(y_train),
                                                 y=y_train)
# Keys 0..2 line up with the zero-based labels (y - 1) used for fitting.
class_weight_dict = dict(enumerate(class_weights))
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.01), metrics=['acc'])
# NOTE(review): class_weight_dict is built but not passed to fit() in this
# cell — confirm whether the unweighted run is deliberate.
history = model.fit(x_train, y_train-1, validation_data=(x_valid, y_valid-1) ,epochs=1000, batch_size=16, callbacks=callbacks)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000


In [None]:
# Take the most probable class index along the last axis and add 1 to return
# to the 1..3 label codes (the network is trained on y - 1).
nn_test = np.argmax(model.predict(x_test), axis=-1) + 1

### Simple Neural Network

In [None]:
start_dim = 256   # base layer width
n_classes = 3     # Check-In / Canceled / No-Show

# Plain fully connected network: widen, then narrow, then a softmax over the classes.
model = Sequential()
model.add(Dense(start_dim, input_dim=x_train.shape[1], activation='tanh'))
model.add(Dense(start_dim*2, activation='tanh'))
model.add(Dense(start_dim, activation='tanh'))
model.add(Dense(start_dim//2, activation='tanh'))
model.add(Dense(n_classes, activation='softmax'))

model.summary()

In [None]:
# Stop after 10 epochs without val_acc improvement and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_acc',
    patience=10,
    restore_best_weights=True,
)
# Cut the learning rate tenfold after 2 stagnant epochs.
reducer = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.1,
    patience=2,
)
callbacks = [early_stop, reducer]

In [None]:

# Balanced class weights. Arguments are passed by keyword because recent
# scikit-learn releases reject the positional form.
class_weights = sk.utils.class_weight.compute_class_weight(class_weight='balanced',
                                                 classes=np.unique(y_train),
                                                 y=y_train)
# Keys 0..2 line up with the zero-based labels (y - 1) used for fitting.
class_weight_dict = dict(enumerate(class_weights))
print (class_weights)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.01), metrics=['acc'])
history = model.fit(x_train, y_train-1, validation_data=(x_valid, y_valid-1) ,epochs=1000, batch_size=16, callbacks=callbacks, class_weight=class_weight_dict)

### Cascaded NN

In [None]:
start_dim = 1024  # base layer width
n_classes = 1     # single sigmoid unit for a binary target

# Binary network with one sigmoid output.
model1 = Sequential()
model1.add(Dense(start_dim, input_dim=x_train.shape[1], activation='tanh'))
model1.add(Dense(start_dim*2, activation='tanh'))
model1.add(Dense(n_classes, activation='sigmoid'))

model1.summary()

In [None]:
# Binary targets: 1 where the original label is class 1, else 0.
y_train_1 = (y_train == 1).astype('int')
y_valid_1 = (y_valid == 1).astype('int')
print (y_train, y_train_1)
print (y_valid_1)

In [None]:
# Stop after 10 epochs without val_acc improvement and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_acc',
    patience=10,
    restore_best_weights=True,
)
# Cut the learning rate tenfold after 2 stagnant epochs.
reducer = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.1,
    patience=2,
)
callbacks = [early_stop, reducer]

In [None]:
# Train the binary network with SGD against the class-1 vs. rest targets.
stage1_optimizer = SGD(lr=0.01, momentum=0.001)
model1.compile(loss='binary_crossentropy', optimizer=stage1_optimizer, metrics=['acc'])
history = model1.fit(
    x_train,
    y_train_1,
    validation_data=(x_valid, y_valid_1),
    epochs=1000,
    batch_size=16,
    callbacks=callbacks,
)