<img src="images/aaib.PNG" style="width:400px;height:250px;">

# About Today's Practice:

1. Content-Related: **Implementing Supervised ML Models: LR, SVM, DT, RF, AdaBoost, and GB!**
\
&nbsp;
2. **Dataset:** We can see that there are 32 features (columns) and 119390 records (rows) in our dataset. Our main objective with this data is to predict if the **hotel booking** would be made by a customer, provided if they make a reservation within the constraints of out data.
\
&nbsp;

# Set-up 

In [3]:
# Commonly used libraries
import numpy as np
import pandas as pd

# From ScikitLearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Data Exploration & Preprocessing

In [4]:
# loading dataset and chekcing its heads!
data = pd.read_csv('datasets/hotel_bookings.csv')
data.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,...,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,...,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,...,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,...,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22


In [5]:
data.shape

(119390, 32)

In [6]:
# We drop some columns right away because they either have very low variance or would not be support (make sense to be used)
data.drop(inplace=True, axis=1, labels=['agent', 'company','hotel','reservation_status_date'])

In [7]:
# Let's check for any null values, if there are any...
data.isnull().sum()

is_canceled                         0
lead_time                           0
arrival_date_year                   0
arrival_date_month                  0
arrival_date_week_number            0
arrival_date_day_of_month           0
stays_in_weekend_nights             0
stays_in_week_nights                0
adults                              0
children                            4
babies                              0
meal                                0
country                           488
market_segment                      0
distribution_channel                0
is_repeated_guest                   0
previous_cancellations              0
previous_bookings_not_canceled      0
reserved_room_type                  0
assigned_room_type                  0
booking_changes                     0
deposit_type                        0
days_in_waiting_list                0
customer_type                       0
adr                                 0
required_car_parking_spaces         0
total_of_spe

In [8]:
# As the focus of the practical is not on DPT, let's simply replace the null values with the mode.
data.fillna(data.mode().iloc[0], inplace=True)

In [9]:
data.isnull().sum()

is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
reservation_status          

In [10]:
data.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
0,0,342,2015,July,27,1,0,0,2,0.0,...,C,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out
1,0,737,2015,July,27,1,0,0,2,0.0,...,C,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out
2,0,7,2015,July,27,1,0,1,1,0.0,...,A,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out
3,0,13,2015,July,27,1,0,1,1,0.0,...,A,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out
4,0,14,2015,July,27,1,0,2,2,0.0,...,A,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out


In [11]:
# Now we will separate the dependent and independent feature
# The dependent variable is "is canceled", which tells us if a reservation was canceled (or not)
# X = ?
# y = ?

X = data.iloc[:,1:]
y = data.iloc[:,0]


In [12]:
# Using onehotencoder for the categorical features!
# ct = make_column_transformer(
#     (OneHotEncoder(),['meal''distribution_channel','reservation_status','country','arrival_date_month','market_segment','deposit_type','customer_type', 'reserved_room_type','assigned_room_type']), remainder = 'passthrough')
ct = make_column_transformer(
    (OneHotEncoder(), ['meal', 'distribution_channel', 'reservation_status', 'country', 'arrival_date_month',
                       'market_segment', 'deposit_type', 'customer_type', 'reserved_room_type', 'assigned_room_type'
                      ]), remainder='passthrough')

In [13]:
# Column Transformer is given the One Hot Encoder and the list of all categorical columns. 
# Now, we simply need to apply fit and transform to our independant variables.
X = ct.fit_transform(X).toarray()


In [14]:
X

array([[  1.  ,   0.  ,   0.  , ...,   0.  ,   0.  ,   0.  ],
       [  1.  ,   0.  ,   0.  , ...,   0.  ,   0.  ,   0.  ],
       [  1.  ,   0.  ,   0.  , ...,  75.  ,   0.  ,   0.  ],
       ...,
       [  1.  ,   0.  ,   0.  , ..., 157.71,   0.  ,   4.  ],
       [  1.  ,   0.  ,   0.  , ..., 104.4 ,   0.  ,   0.  ],
       [  0.  ,   0.  ,   1.  , ..., 151.2 ,   0.  ,   2.  ]])

In [15]:
y

0         0
1         0
2         0
3         0
4         0
         ..
119385    0
119386    0
119387    0
119388    0
119389    0
Name: is_canceled, Length: 119390, dtype: int64

In [16]:
# Now, we split data between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [17]:
X.shape

(119390, 256)

# Scaling & Dimensionality Reduction 

It's important to note that the number of features just went **from 28 to 256** very quickly. That is a very big number. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) happens when a dataset has too many variables. It just means that our model will have to deal with too much unnecessary information, which will slow it down and make it less efficient.

We use methods called [Dimensionality Reduction](https://en.wikipedia.org/wiki/Dimensionality_Reduction) to avoid the curse of dimensionality. PCA, or [Principal Component Analysis](https://en.wikipedia.org/wiki/Principal_component_analysis), is one of the most popular ones. PCA has however one small requirement: the data it is used on must have a sandar scale. Which we do in the next cell:

In [18]:
# Scaling the data (Train and Test)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print("X_train ---------->\n", X_train, "\nX_test -------->\n", X_test)

X_train ---------->
 [[ 0.54036534 -0.0823272  -0.37034568 ...  1.38185952 -0.25462991
  -0.71991517]
 [ 0.54036534 -0.0823272  -0.37034568 ...  0.44713919 -0.25462991
  -0.71991517]
 [ 0.54036534 -0.0823272  -0.37034568 ...  0.98096836 -0.25462991
   1.80114067]
 ...
 [ 0.54036534 -0.0823272  -0.37034568 ... -0.19470211 -0.25462991
   3.06166858]
 [ 0.54036534 -0.0823272  -0.37034568 ... -0.24455386 -0.25462991
  -0.71991517]
 [ 0.54036534 -0.0823272  -0.37034568 ...  1.20945555 -0.25462991
  -0.71991517]] 
X_test -------->
 [[ 0.54036534 -0.0823272  -0.37034568 ... -1.33381462 -0.25462991
  -0.71991517]
 [ 0.54036534 -0.0823272  -0.37034568 ... -0.32556295 -0.25462991
  -0.71991517]
 [ 0.54036534 -0.0823272  -0.37034568 ... -0.82615762 -0.25462991
  -0.71991517]
 ...
 [-1.85059983 -0.0823272  -0.37034568 ...  0.37859303 -0.25462991
  -0.71991517]
 [ 0.54036534 -0.0823272  -0.37034568 ... -0.14069604 -0.25462991
  -0.71991517]
 [ 0.54036534 -0.0823272  -0.37034568 ... -0.47304105 -0.2

In [19]:
# Implementing PCA - To reduce Dimensionality 
pca = PCA(n_components = 50)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
explained_variance
# The number of components that we are asking to be selected is 50. IN practice you do with "None", check EV, 
# then select a threshold and compare the EV.

array([0.02163567, 0.0155279 , 0.01290689, 0.01181745, 0.01017084,
       0.00945854, 0.00856808, 0.00811739, 0.00797673, 0.00770754,
       0.00747807, 0.00722409, 0.00682987, 0.00660287, 0.00624453,
       0.00588291, 0.00570149, 0.00565886, 0.00552897, 0.00512911,
       0.00484721, 0.00472453, 0.00459591, 0.00450492, 0.00444421,
       0.00436872, 0.00435433, 0.00431191, 0.00428845, 0.00426635,
       0.00422217, 0.00416725, 0.0041111 , 0.00410826, 0.00409868,
       0.00407334, 0.00406472, 0.00405924, 0.00405361, 0.00404471,
       0.00403676, 0.00402927, 0.00402345, 0.00401892, 0.00401738,
       0.00401583, 0.00401443, 0.00401054, 0.00400857, 0.00400366])

# Models Implementation - LR

In [45]:
# Logistic Regression
classifier = LogisticRegression(random_state = 0, max_iter=1000, solver = 'lbfgs')
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=0)

Now, let's see how our model performs on the test data

In [21]:
# Let's check how our model performs on the test data
y_pred = classifier.predict(X_test)

To calculate the accuracy of our model, the simplest way is to construct a confusion matrix

In [22]:
# Confusion matrix (CM)
cm = confusion_matrix(y_test, y_pred)
cm

array([[14900,    34],
       [   30,  8914]], dtype=int64)

In [23]:
# Accuracy Score
# ac = accuracy_score(y_train, y_pred)
ac = accuracy_score(y_test, y_pred)
ac

0.9973197085183013

# Models Implementation - SVM

In [24]:
clf_svm = svm.SVC(max_iter=1000, gamma='scale', kernel = "rbf", random_state=0)
clf_svm.fit(X_train, y_train)



SVC(max_iter=1000, random_state=0)

In [25]:
y_pred = clf_svm.predict(X_test)

In [26]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[14918,    16],
       [   40,  8904]], dtype=int64)

In [27]:
ac = accuracy_score(y_test, y_pred)
ac

0.9976547449535137

# Models Implementation - DT

In [28]:
clf_tree = tree.DecisionTreeClassifier(max_depth=5, criterion = "gini", min_samples_split=100,
                                       min_samples_leaf= 30, random_state=0)
clf_tree.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, min_samples_leaf=30, min_samples_split=100,
                       random_state=0)

In [46]:
y_pred = clf_tree.predict(X_test)

In [30]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[14628,   306],
       [ 1045,  7899]], dtype=int64)

In [31]:
ac = accuracy_score(y_test, y_pred)
ac

0.9434207220035179

# Models Implementation - RF

In [32]:
clf_rf = RandomForestClassifier(n_estimators=30, max_depth=5, random_state=0)
clf_rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, n_estimators=30, random_state=0)

In [33]:
y_pred = clf_rf.predict(X_test)

In [34]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[14889,    45],
       [ 1114,  7830]], dtype=int64)

In [35]:
ac = accuracy_score(y_test, y_pred)
ac

0.9514615964486138

# Models Implementation - AB

In [36]:
clf_ab = AdaBoostClassifier(n_estimators=100, learning_rate=1, random_state=0)
clf_ab.fit(X_train, y_train)

AdaBoostClassifier(learning_rate=1, n_estimators=100, random_state=0)

In [37]:
y_pred = clf_ab.predict(X_test)

In [38]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[14901,    33],
       [   54,  8890]], dtype=int64)

In [39]:
ac = accuracy_score(y_test, y_pred)
ac

0.9963564787670659

# Models Implementation - GB

In [40]:
clf_gb = GradientBoostingClassifier(n_estimators=30, learning_rate=0.1, max_depth=1, random_state=0)
clf_gb.fit(X_train, y_train)

GradientBoostingClassifier(max_depth=1, n_estimators=30, random_state=0)

In [41]:
y_pred = clf_gb.predict(X_test)

In [42]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[14542,   392],
       [ 2114,  6830]], dtype=int64)

In [48]:
ac = accuracy_score(y_test, y_pred)
ac

0.9434207220035179

## References:

- [Dataset from Kaggle](https://www.kaggle.com/datasets/ahsan81/hotel-reservations-classification-dataset)
- [Hotel Booking (Logistic Regression) by Amit Sharma](https://www.kaggle.com/datasets/jessemostipak/hotel-booking-demand)