# DataStorm 2.0

## Import Data Sets

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

train_data = pd.read_csv('../Data/Hotel-A-train.csv')

validation_data = pd.read_csv('../Data/Hotel-A-validation.csv')

test_data = pd.read_csv('../Data/Hotel-A-test.csv')

train_data.head(10)

Unnamed: 0,Reservation-id,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Expected_checkin,Expected_checkout,Booking_date,Adults,Children,Babies,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Booking_channel,Required_Car_Parking,Reservation_Status,Use_Promotion,Discount_Rate,Room_Rate
0,39428300,F,40,Latino,Grad,<25K,North,City Hotel,7/1/2015,7/2/2015,5/21/2015,2,2,0,BB,No,No,No Deposit,Online,Yes,Check-In,Yes,10,218
1,77491756,F,49,Latino,Mid-School,50K -- 100K,East,City Hotel,7/1/2015,7/2/2015,5/26/2015,3,3,0,BB,No,No,Refundable,Online,Yes,Check-In,No,0,185
2,73747291,F,42,caucasian,Grad,<25K,East,City Hotel,7/2/2015,7/6/2015,6/29/2015,3,3,0,BB,No,No,No Deposit,Online,Yes,Check-In,No,0,119
3,67301739,M,25,African American,College,>100K,South,Airport Hotels,7/2/2015,7/3/2015,6/20/2015,4,3,0,BB,No,No,Refundable,Agent,Yes,Check-In,Yes,5,144
4,77222321,F,62,Latino,High-School,25K --50K,East,Resort,7/3/2015,7/4/2015,6/20/2015,1,1,0,BB,No,No,No Deposit,Direct,No,Check-In,Yes,10,242
5,55152245,M,34,African American,College,>100K,North,Airport Hotels,7/3/2015,7/4/2015,6/20/2015,5,2,1,BB,Yes,No,No Deposit,Online,No,Check-In,Yes,10,143
6,87139343,F,53,African American,High-School,<25K,East,Airport Hotels,7/3/2015,7/4/2015,7/4/2015,2,1,0,BB,Yes,Yes,Refundable,Online,Yes,Check-In,Yes,25,212
7,7647912,M,40,Latino,Mid-School,>100K,North,City Hotel,7/3/2015,7/4/2015,6/25/2015,2,1,0,FB,No,No,No Deposit,Online,Yes,Check-In,No,0,170
8,94296865,M,59,African American,Mid-School,>100K,West,Resort,5/6/2015,5/7/2015,2/11/2015,3,2,0,BB,No,No,Non-Refundable,Direct,Yes,Check-In,Yes,10,245
9,93087487,M,21,Latino,Grad,50K -- 100K,West,City Hotel,4/22/2015,4/25/2015,2/9/2015,2,3,0,HB,Yes,No,Refundable,Online,No,Check-In,Yes,20,212


In [2]:
train_headers = train_data.columns
print(len(train_headers))
train_headers

24


Index(['Reservation-id', 'Gender', 'Age', 'Ethnicity', 'Educational_Level',
       'Income', 'Country_region', 'Hotel_Type', 'Expected_checkin',
       'Expected_checkout', 'Booking_date', 'Adults', 'Children', 'Babies',
       'Meal_Type', 'Visted_Previously', 'Previous_Cancellations',
       'Deposit_type', 'Booking_channel', 'Required_Car_Parking',
       'Reservation_Status', 'Use_Promotion', 'Discount_Rate', 'Room_Rate'],
      dtype='object')

## Data Formatting

In [3]:
def conv_date(data_set):
    booking_date = pd.to_datetime(data_set['Booking_date'])
    expected_checkin = pd.to_datetime(data_set['Expected_checkin'])
    expected_checkout = pd.to_datetime(data_set['Expected_checkout'])

    booking_delta = pd.DataFrame((expected_checkin - booking_date).dt.days, columns=['booking_delta'])
    stay_delta =  pd.DataFrame((expected_checkout - expected_checkin).dt.days, columns=['stay_delta'])
    return pd.concat([data_set, booking_delta, stay_delta], axis=1)


train_data = conv_date(train_data)
validation_data = conv_date(validation_data)
test_data = conv_date(test_data)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# columns
cat_data = ['Gender','Ethnicity','Educational_Level','Income','Country_region','Hotel_Type','Meal_Type','Visted_Previously','Previous_Cancellations','Deposit_type','Booking_channel','Required_Car_Parking','Use_Promotion']
num_data = ['Age', 'Adults', 'Children', 'Babies', 'Discount_Rate', 'Room_Rate']

drop_data = ['Reservation-id','Expected_checkin', 'Expected_checkout', 'Booking_date']

for element in drop_data:
    if element in cat_data:
        cat_data.remove(element)
    if element in num_data:
        num_data.remove(element)

print("Total Fields =", len(cat_data + drop_data + num_data))

def format_data(data_set, is_test=False):
    f_data_set = pd.get_dummies(data_set, prefix=cat_data, columns=cat_data)
    f_data_set = f_data_set.drop(drop_data, axis = 1)
    if(is_test):
        x = f_data_set
        return x
    else:
        x = f_data_set.drop('Reservation_Status', axis = 1)
        y = f_data_set['Reservation_Status']
        return x, y

x_train, y_train = format_data(train_data)
x_val, y_val = format_data(validation_data)
x_test = format_data(test_data, is_test=True)

# x_val1, x_val2, y_val1, y_val2 = train_test_split(x_val, y_val, test_size=0.8, random_state=10)

print(len(x_train), len(x_val), len(x_test))
y_val.head(10)

Total Fields = 23
27499 2749 4318


0     No-Show
1    Canceled
2    Canceled
3    Check-In
4    Check-In
5    Canceled
6     No-Show
7    Canceled
8    Check-In
9    Check-In
Name: Reservation_Status, dtype: object

## Normalize Data

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
x_val = pd.DataFrame(scaler.transform(x_val), columns=x_val.columns)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)

x_train.head(10)

Unnamed: 0,Age,Adults,Children,Babies,Discount_Rate,Room_Rate,booking_delta,stay_delta,Gender_F,Gender_M,Ethnicity_African American,Ethnicity_Asian American,Ethnicity_Latino,Ethnicity_caucasian,Educational_Level_College,Educational_Level_Grad,Educational_Level_High-School,Educational_Level_Mid-School,Income_25K --50K,Income_50K -- 100K,Income_<25K,Income_>100K,Country_region_East,Country_region_North,Country_region_South,Country_region_West,Hotel_Type_Airport Hotels,Hotel_Type_City Hotel,Hotel_Type_Resort,Meal_Type_BB,Meal_Type_FB,Meal_Type_HB,Visted_Previously_No,Visted_Previously_Yes,Previous_Cancellations_No,Previous_Cancellations_Yes,Deposit_type_No Deposit,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Agent,Booking_channel_Direct,Booking_channel_Online,Required_Car_Parking_No,Required_Car_Parking_Yes,Use_Promotion_No,Use_Promotion_Yes
0,0.423077,0.25,0.5,0.0,0.25,0.786667,0.063202,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,0.596154,0.5,1.0,0.0,0.0,0.566667,0.05618,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.461538,0.5,1.0,0.0,0.0,0.126667,0.009831,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0.134615,0.75,1.0,0.0,0.125,0.293333,0.022472,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.846154,0.0,0.0,0.0,0.25,0.946667,0.023876,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
5,0.307692,1.0,0.5,0.5,0.25,0.286667,0.023876,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
6,0.673077,0.25,0.0,0.0,0.625,0.746667,0.004213,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
7,0.423077,0.25,0.0,0.0,0.0,0.466667,0.016854,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
8,0.788462,0.5,0.5,0.0,0.25,0.966667,0.123596,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
9,0.057692,0.25,1.0,0.0,0.5,0.746667,0.106742,0.666667,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


## UpSampling

In [6]:
from imblearn.over_sampling import SMOTENC
cat_indx =range(8, len(x_train.columns))
sm = SMOTENC(categorical_features=cat_indx, random_state=0)

import numpy as np
x_train_res, y_train_res = sm.fit_resample(np.array(x_train), np.array(y_train))


## Correlation Matrix

In [7]:
corr = x_train.join(y_train).corr()

corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Age,Adults,Children,Babies,Discount_Rate,Room_Rate,booking_delta,stay_delta,Gender_F,Gender_M,Ethnicity_African American,Ethnicity_Asian American,Ethnicity_Latino,Ethnicity_caucasian,Educational_Level_College,Educational_Level_Grad,Educational_Level_High-School,Educational_Level_Mid-School,Income_25K --50K,Income_50K -- 100K,Income_<25K,Income_>100K,Country_region_East,Country_region_North,Country_region_South,Country_region_West,Hotel_Type_Airport Hotels,Hotel_Type_City Hotel,Hotel_Type_Resort,Meal_Type_BB,Meal_Type_FB,Meal_Type_HB,Visted_Previously_No,Visted_Previously_Yes,Previous_Cancellations_No,Previous_Cancellations_Yes,Deposit_type_No Deposit,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Agent,Booking_channel_Direct,Booking_channel_Online,Required_Car_Parking_No,Required_Car_Parking_Yes,Use_Promotion_No,Use_Promotion_Yes
Age,1.0,0.005065,0.004385,0.003809,0.006335,-0.000178,0.012485,-0.006275,-0.002908,0.002908,-0.004262,-0.003573,0.000564,0.007279,0.000112,0.002557,-0.001302,-0.001383,-0.003441,0.005889,-0.000219,-0.002891,-0.000774,0.002372,0.007174,-0.010437,0.003564,-0.010096,0.006525,0.001967,-0.015809,0.011426,0.002933,-0.002933,-0.001231,0.001231,0.005916,-0.003512,-0.004201,0.005974,-0.005704,0.000974,-0.000855,0.000855,-0.004318,0.004318
Adults,0.005065,1.0,-0.003966,0.004384,0.003839,-0.006613,-0.003353,-0.000363,-0.000557,0.000557,-0.007843,0.000573,0.003845,0.003471,-0.015993,0.002716,0.009258,0.007634,-0.001658,-0.005202,-0.002993,0.012689,0.012217,0.003409,-0.012991,0.000259,0.007898,0.00493,-0.012864,0.000941,-0.01355,0.010611,0.008322,-0.008322,0.006798,-0.006798,-0.007179,0.009483,0.001797,0.00416,0.007348,-0.009674,-0.005898,0.005898,-0.000214,0.000214
Children,0.004385,-0.003966,1.0,0.000518,-0.010817,0.007783,-0.009894,0.009788,-0.00381,0.00381,0.000795,0.004701,0.000367,-0.005852,-0.001888,-0.003374,0.002551,0.003125,-0.004497,0.0104,-0.007381,0.001822,-0.004873,0.000771,0.004637,-0.001597,-0.003822,-0.001456,0.005295,-0.01048,0.001323,0.01036,0.00259,-0.00259,0.003639,-0.003639,0.012325,-0.006653,-0.009173,-0.000555,0.003011,-0.002359,0.002301,-0.002301,0.000587,-0.000587
Babies,0.003809,0.004384,0.000518,1.0,-0.006798,-0.004881,-0.00194,-0.005837,-0.009121,0.009121,-0.000215,0.003693,-0.00082,-0.002651,-0.002357,-0.005059,0.002976,0.004955,0.00272,-0.008094,0.008665,-0.004156,0.00073,0.001688,-0.005803,0.004678,0.007442,-0.001881,-0.00559,-0.00372,0.004554,0.000168,-0.012201,0.012201,-0.006439,0.006439,0.004363,-0.005611,-0.001188,-0.00544,0.007007,-0.002545,0.004404,-0.004404,0.001957,-0.001957
Discount_Rate,0.006335,0.003839,-0.010817,-0.006798,1.0,-0.0042,0.013085,0.003288,-0.006692,0.006692,-0.01223,0.013424,0.004044,-0.005143,-0.000309,0.002575,-0.008695,0.00653,0.001134,0.006254,-0.007795,0.000454,-0.004179,-0.005275,0.005771,0.002452,0.008925,-0.006346,-0.002611,0.004114,-0.006591,0.00115,0.015827,-0.015827,0.011255,-0.011255,0.000123,0.009413,-0.006086,-0.005886,0.010191,-0.00514,0.001601,-0.001601,-0.640924,0.640924
Room_Rate,-0.000178,-0.006613,0.007783,-0.004881,-0.0042,1.0,0.003737,-0.012215,0.007197,-0.007197,-0.000977,-0.00596,0.001832,0.0051,0.004503,-0.009435,0.0099,-0.006044,0.001175,0.000636,0.005324,-0.009159,-0.003422,0.002104,-0.006633,0.009429,-0.003039,0.007925,-0.00488,-0.001792,0.001881,0.00035,-0.010196,0.010196,-0.001033,0.001033,0.001528,-0.006964,0.002745,0.002556,-0.009736,0.007088,-0.01191,0.01191,-0.003229,0.003229
booking_delta,0.012485,-0.003353,-0.009894,-0.00194,0.013085,0.003737,1.0,-0.015712,0.000454,-0.000454,-0.00283,0.010938,0.005427,-0.013475,0.008321,-0.0036,0.003576,-0.010207,-0.00097,-0.009904,0.008192,0.003539,-0.001576,-0.000732,0.002631,-0.000908,0.002973,0.000383,-0.003368,-0.047863,-0.010239,0.061302,-0.003431,0.003431,-0.00548,0.00548,-0.005931,-0.000678,0.006867,0.007451,0.002923,-0.007965,0.003339,-0.003339,-0.005904,0.005904
stay_delta,-0.006275,-0.000363,0.009788,-0.005837,0.003288,-0.012215,-0.015712,1.0,-0.00922,0.00922,-0.002565,-0.006236,-0.001733,0.01052,-0.001567,0.001636,-5.2e-05,0.000344,-0.003184,-0.000259,4.5e-05,0.004378,0.00419,-0.003679,-0.00173,0.001664,0.002292,-0.004934,0.002636,0.002491,-0.001724,-0.00125,-0.003092,0.003092,0.000463,-0.000463,0.005167,-0.002151,-0.004249,0.003185,0.00764,-0.009248,-0.006859,0.006859,4.3e-05,-4.3e-05
Gender_F,-0.002908,-0.000557,-0.00381,-0.009121,-0.006692,0.007197,0.000454,-0.00922,1.0,-1.0,-0.005191,0.008698,-0.000885,-0.002585,-0.000281,0.002957,-0.006162,0.003575,-0.010164,0.006856,0.009149,-0.007487,0.002401,0.006974,-0.003144,-0.005621,-0.006105,-0.002693,0.008825,-0.000816,-0.001366,0.002068,-0.003861,0.003861,-0.002152,0.002152,0.000948,-0.001212,-0.000263,-0.000138,0.003483,-0.003087,-0.005176,0.005176,0.003402,-0.003402
Gender_M,0.002908,0.000557,0.00381,0.009121,0.006692,-0.007197,-0.000454,0.00922,-1.0,1.0,0.005191,-0.008698,0.000885,0.002585,0.000281,-0.002957,0.006162,-0.003575,0.010164,-0.006856,-0.009149,0.007487,-0.002401,-0.006974,0.003144,0.005621,0.006105,0.002693,-0.008825,0.000816,0.001366,-0.002068,0.003861,-0.003861,0.002152,-0.002152,-0.000948,0.001212,0.000263,0.000138,-0.003483,0.003087,0.005176,-0.005176,-0.003402,0.003402


## Model Training

In [8]:
# Random Forrest Classifier
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=10)
# classifier.fit(x_train, y_train)
classifier.fit(x_train_res, y_train_res)

classifier.score(x_val, y_val)

0.5158239359767188

In [9]:
# XGBoost Classifier
# from xgboost import XGBClassifier
# import numpy as np

# classifier = XGBClassifier(n_estimators=10)
# classifier.fit(np.array(x_train), np.array(y_train))

# classifier.score(np.array(x_val), np.array(y_val))


In [10]:
from sklearn.metrics import f1_score

y_pred = classifier.predict(np.array(x_val))

f1_score(y_val, y_pred, average='macro')

0.3211770074150832

## Inference

In [11]:
y_test = classifier.predict(x_test)

y_test

array(['Check-In', 'Check-In', 'Check-In', ..., 'Canceled', 'Check-In',
       'Check-In'], dtype=object)

In [12]:
y_test_mapped = pd.Series(y_test).map({'Check-In':1,'Canceled':2,'No-Show':3})
y_test_mapped.index = test_data['Reservation-id']
y_test_mapped.to_csv('results/Submission.csv', header=['Reservation_status'])

y_test_mapped.head(20)


Reservation-id
62931593    1
70586099    1
4230648     1
25192322    1
80931528    1
64700386    2
16938050    1
90487908    2
5595445     1
17502557    1
92229895    1
27386334    1
76765181    1
40697301    2
63963888    1
27282946    1
8696141     1
52621292    1
21821230    1
12316731    1
dtype: int64