# Classification Methods: Predicting Hotel Cancellations

### Python version

In [1]:
from platform import python_version
print(python_version())

3.8.10


### Import Libraries

In [2]:
import csv
import imblearn
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
seed(1)

import os
import pandas as pd
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

### Import Data From AWS S3 to Sagemaker

In [3]:
# import boto3
# import botocore
# from sagemaker import get_execution_role

# role = get_execution_role()

# bucket = 'enterbucketname'
# data_key_train = 'H1.csv'
# data_location_train = 's3://{}/{}'.format(bucket, data_key_train)

# train_df = pd.read_csv(data_location_train)

### Import Data From Azure Blob Storage to Azure Machine Learning Studio

In [4]:
# from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

# #download csv file from Azure blob

# sas_url = "enter url here"
# blob_client = BlobClient.from_blob_url(sas_url)
# downloaded_blo = blob_client.download_blob()

In [5]:
# from io import StringIO
# blob_data = blob_client.download_blob()
# train_df = pd.read_csv(StringIO(blob_data.content_as_text()))
# print(train_df)

### Import Data Through CSV

In [6]:
train_df = pd.read_csv('H1.csv')
a=train_df.head()
b=train_df
b
b.sort_values(['ArrivalDateYear','ArrivalDateWeekNumber'], ascending=True)

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,0,737,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,0,7,2015,July,27,1,0,1,1,0,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,0,13,2015,July,27,1,0,1,1,0,...,No Deposit,304,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,0,14,2015,July,27,1,0,2,2,0,...,No Deposit,240,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40055,0,212,2017,August,35,31,2,8,2,1,...,No Deposit,143,,0,Transient,89.75,0,0,Check-Out,2017-09-10
40056,0,169,2017,August,35,30,2,9,2,0,...,No Deposit,250,,0,Transient-Party,202.27,0,1,Check-Out,2017-09-10
40057,0,204,2017,August,35,29,4,10,2,0,...,No Deposit,250,,0,Transient,153.57,0,3,Check-Out,2017-09-12
40058,0,211,2017,August,35,31,4,10,2,0,...,No Deposit,40,,0,Contract,112.80,0,1,Check-Out,2017-09-14


In [7]:
IsCanceled = train_df['IsCanceled']
y = IsCanceled

### Numerical Variables

In [8]:
leadtime = train_df['LeadTime']
arrivaldateyear = train_df['ArrivalDateYear']
arrivaldateweekno = train_df['ArrivalDateWeekNumber']
arrivaldatedayofmonth = train_df['ArrivalDateDayOfMonth']
staysweekendnights = train_df['StaysInWeekendNights']
staysweeknights = train_df['StaysInWeekNights']
adults = train_df['Adults']
children = train_df['Children']
babies = train_df['Babies']
previouscancellations = train_df['PreviousCancellations']
previousbookingsnotcanceled = train_df['PreviousBookingsNotCanceled']
bookingchanges = train_df['BookingChanges']
dayswaitinglist = train_df['DaysInWaitingList']
adr = train_df['ADR']
rcps = train_df['RequiredCarParkingSpaces']
totalsqr = train_df['TotalOfSpecialRequests']

### Categorical Variables

In [9]:
arrivaldatemonth = train_df.ArrivalDateMonth.astype("category").cat.codes
arrivaldatemonthcat=pd.Series(arrivaldatemonth)
mealcat=train_df.Meal.astype("category").cat.codes
mealcat=pd.Series(mealcat)
countrycat=train_df.Country.astype("category").cat.codes
countrycat=pd.Series(countrycat)
marketsegmentcat=train_df.MarketSegment.astype("category").cat.codes
marketsegmentcat=pd.Series(marketsegmentcat)
distributionchannelcat=train_df.DistributionChannel.astype("category").cat.codes
distributionchannelcat=pd.Series(distributionchannelcat)
reservedroomtypecat=train_df.ReservedRoomType.astype("category").cat.codes
reservedroomtypecat=pd.Series(reservedroomtypecat)
assignedroomtypecat=train_df.AssignedRoomType.astype("category").cat.codes
assignedroomtypecat=pd.Series(assignedroomtypecat)
deposittypecat=train_df.DepositType.astype("category").cat.codes
deposittypecat=pd.Series(deposittypecat)
customertypecat=train_df.CustomerType.astype("category").cat.codes
customertypecat=pd.Series(customertypecat)
reservationstatuscat=train_df.ReservationStatus.astype("category").cat.codes
reservationstatuscat=pd.Series(reservationstatuscat)
isrepeatedguestcat = train_df.IsRepeatedGuest.astype("category").cat.codes
isrepeatedguestcat=pd.Series(isrepeatedguestcat)
agentcat = train_df.Agent.astype("category").cat.codes
agentcat=pd.Series(agentcat)
companycat = train_df.Company.astype("category").cat.codes
companycat=pd.Series(companycat)

In [10]:
x = np.column_stack((leadtime,arrivaldateyear,arrivaldatemonthcat,arrivaldateweekno,arrivaldatedayofmonth,staysweekendnights,staysweeknights,adults,children,babies,mealcat,countrycat,marketsegmentcat,distributionchannelcat,isrepeatedguestcat,previouscancellations,previousbookingsnotcanceled,reservedroomtypecat,assignedroomtypecat,bookingchanges,deposittypecat,dayswaitinglist,customertypecat,adr,rcps,totalsqr,reservationstatuscat))
x = sm.add_constant(x, prepend=True)

## Feature Selection

### Wrapper-Based: Forward Search

In [11]:
# from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# from sklearn.metrics import roc_auc_score
# from mlxtend.feature_selection import SequentialFeatureSelector

# forward_feature_selector = SequentialFeatureSelector(RandomForestClassifier(n_jobs=-1),
#            k_features=6,
#            forward=True,
#            verbose=2,
#            scoring='roc_auc',
#            cv=4)

In [12]:
# fselector = forward_feature_selector.fit(x, y)

In [13]:
# fselector.k_feature_names_

### Wrapper-Based: Backward Search

In [14]:
# from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# from sklearn.metrics import roc_auc_score 

# backward_feature_selector = SequentialFeatureSelector(RandomForestClassifier(n_jobs=-1),
#            k_features=6,
#            forward=False,
#            verbose=2,
#            scoring='roc_auc',
#            cv=4)

In [15]:
# bselector = backward_feature_selector.fit(x, y)

In [16]:
# bselector.k_feature_names_

### Extra Trees Classifier

In [17]:
# from sklearn.ensemble import ExtraTreesClassifier
# model = ExtraTreesClassifier()
# model.fit(x, y)
# print(model.feature_importances_)

In [18]:
# ext=pd.DataFrame(model.feature_importances_,columns=["extratrees"])
# ext
# ext.sort_values(['extratrees'], ascending=True)

### Selected Features

In [19]:
x1 = np.column_stack((leadtime,countrycat,marketsegmentcat,deposittypecat,customertypecat,rcps,arrivaldateweekno))
x1 = sm.add_constant(x1, prepend=True)

In [20]:
x1_train, x1_val, y1_train, y1_val = train_test_split(x1, y, random_state=0)

In [21]:
clf = RandomForestClassifier(random_state=0 )
clf.fit(x1_train, y1_train)
R_y_pred = clf.predict(x1_val)
print(classification_report(y_test, R_y_pred))

NameError: name 'RandomForestClassifier' is not defined