**INF2179 CLASSIFICATION CHALLENGE** <br> By: Lawrence Fraginal <br>Student #1004071489<br>INF2179 Introduction to Machine Learning in Python<br>Professor Mehdi Ataei<br> March 24, 2023

In [1]:
# Import necessary packages

import pandas as pd 
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Import the training data 
training = pd.read_csv("/content/drive/MyDrive/INF2179 Classification Challenge/train.csv")

In [3]:
# Getting the testing data 
testing = pd.read_csv("/content/drive/MyDrive/INF2179 Classification Challenge/test.csv")

In [4]:
# Resampling the training data to balance out the data for better model training
nc_training = training[training['booking_status'] == 'Not_Canceled'].sample(9437)
c_training = training[training['booking_status'] == 'Canceled']

In [5]:
f_training = pd.concat([nc_training, c_training])

In [6]:
# Validating the size of the updated training set
len(f_training)

18874

In [7]:
# Shuffling the new training dataset 
f_training = f_training.sample(frac = 1)

In [8]:
f_training.head(20)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
23773,INN23373,3,0,1,4,Meal Plan 1,0,Room_Type 4,32,2018,11,14,Online,0,0,0,107.1,1,Canceled
15016,INN24537,2,0,2,2,Meal Plan 1,0,Room_Type 4,93,2018,10,22,Online,0,0,0,125.33,0,Canceled
5431,INN34248,2,0,1,1,Meal Plan 2,0,Room_Type 1,247,2018,6,6,Offline,0,0,0,115.0,1,Canceled
28105,INN03309,2,0,1,2,Meal Plan 1,0,Room_Type 1,151,2018,8,5,Offline,0,0,0,72.25,0,Not_Canceled
4472,INN03276,3,0,0,1,Meal Plan 1,0,Room_Type 4,31,2018,4,30,Online,0,0,0,168.3,0,Canceled
28729,INN08456,2,0,2,0,Meal Plan 1,0,Room_Type 1,175,2018,9,18,Online,0,0,0,126.9,0,Canceled
10235,INN34220,3,0,0,4,Meal Plan 1,0,Room_Type 4,19,2017,9,30,Online,0,0,0,91.47,2,Not_Canceled
12470,INN15778,2,0,0,2,Meal Plan 1,0,Room_Type 1,18,2018,2,19,Online,0,0,0,93.0,1,Not_Canceled
22701,INN23185,2,0,0,2,Meal Plan 1,0,Room_Type 4,81,2018,8,5,Online,0,0,0,131.4,0,Canceled
17366,INN31280,2,0,1,0,Meal Plan 1,0,Room_Type 1,16,2018,8,1,Online,0,0,0,121.0,1,Not_Canceled


**Data Preprocessing**

Step 1: Get dummy variables


In [9]:
# Drop ID column

f_training= f_training.drop('Booking_ID', axis = 1)
testing= testing.drop('Booking_ID', axis = 1)

In [10]:
# One hot encoding of categorical data

training_dum = pd.get_dummies(data=f_training, columns= ['type_of_meal_plan', 'room_type_reserved','market_segment_type','booking_status'])
testing_dum = pd.get_dummies(data=testing, columns= ['type_of_meal_plan', 'room_type_reserved','market_segment_type','booking_status'])

In [11]:
training_dum

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,...,room_type_reserved_Room_Type 5,room_type_reserved_Room_Type 6,room_type_reserved_Room_Type 7,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online,booking_status_Canceled,booking_status_Not_Canceled
23773,3,0,1,4,0,32,2018,11,14,0,...,0,0,0,0,0,0,0,1,1,0
15016,2,0,2,2,0,93,2018,10,22,0,...,0,0,0,0,0,0,0,1,1,0
5431,2,0,1,1,0,247,2018,6,6,0,...,0,0,0,0,0,0,1,0,1,0
28105,2,0,1,2,0,151,2018,8,5,0,...,0,0,0,0,0,0,1,0,0,1
4472,3,0,0,1,0,31,2018,4,30,0,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12922,2,0,1,3,0,33,2018,12,1,0,...,0,0,0,0,0,0,0,1,0,1
15432,2,0,2,1,0,100,2017,12,12,0,...,0,0,0,0,0,0,0,1,0,1
12955,1,0,0,2,0,5,2018,5,24,0,...,0,0,0,0,0,1,0,0,0,1
27199,2,2,2,1,0,107,2018,12,3,0,...,0,1,0,0,0,0,0,1,1,0


In [12]:
testing_dum

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,...,room_type_reserved_Room_Type 5,room_type_reserved_Room_Type 6,room_type_reserved_Room_Type 7,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online,booking_status_Canceled,booking_status_Not_Canceled
0,1,0,2,4,0,245,2018,7,6,0,...,0,0,0,0,0,0,1,0,1,0
1,2,0,3,5,0,160,2018,7,25,0,...,0,0,0,0,0,0,0,1,1,0
2,2,0,1,1,0,151,2018,8,13,0,...,0,0,0,0,0,0,0,1,1,0
3,2,0,0,2,0,74,2017,9,18,0,...,0,0,0,0,0,0,1,0,0,1
4,2,0,0,2,0,102,2017,10,16,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7250,3,0,0,2,0,3,2018,4,7,0,...,0,0,1,0,0,0,0,1,0,1
7251,2,0,0,4,0,156,2018,8,2,0,...,0,0,0,0,0,0,0,1,1,0
7252,1,0,0,2,0,188,2018,6,15,0,...,0,0,0,0,0,0,1,0,1,0
7253,2,0,0,3,0,213,2018,6,7,0,...,0,0,0,0,0,0,1,0,1,0


In [13]:
# number of columns are mis-matched in the testing data for some reason; need to add 'type_of_meal_plan_3' column
my_list = ['0'] * 7255

testing_dum.insert(16,"type_of_meal_plan_Meal Plan 3", value = my_list, allow_duplicates = True)

Step 2: Splitting the data

In [14]:
# Split the data into features and target
X_train = training_dum.drop(labels=["booking_status_Canceled", "booking_status_Not_Canceled"], axis=1)
y_train = training_dum["booking_status_Canceled"]

X_test = testing_dum.drop(labels=["booking_status_Canceled", "booking_status_Not_Canceled"], axis=1)
y_test = testing_dum["booking_status_Canceled"]

Step 3: Normalization

In [15]:
# Normalize the training data 
scaler = StandardScaler ()

X_train[["no_of_adults", 'no_of_children',"no_of_weekend_nights","no_of_week_nights","lead_time","arrival_year","arrival_month","arrival_date","no_of_previous_cancellations","no_of_previous_bookings_not_canceled", "avg_price_per_room", "no_of_special_requests"]] = scaler.fit_transform(X_train[["no_of_adults", 'no_of_children',"no_of_weekend_nights","no_of_week_nights","lead_time","arrival_year","arrival_month","arrival_date","no_of_previous_cancellations","no_of_previous_bookings_not_canceled", "avg_price_per_room", "no_of_special_requests"]])
X_test[["no_of_adults", 'no_of_children',"no_of_weekend_nights","no_of_week_nights","lead_time","arrival_year","arrival_month","arrival_date","no_of_previous_cancellations","no_of_previous_bookings_not_canceled", "avg_price_per_room", "no_of_special_requests"]] = scaler.fit_transform(X_test[["no_of_adults", 'no_of_children',"no_of_weekend_nights","no_of_week_nights","lead_time","arrival_year","arrival_month","arrival_date","no_of_previous_cancellations","no_of_previous_bookings_not_canceled", "avg_price_per_room", "no_of_special_requests"]])

In [16]:
pd.set_option('display.max_columns', None)

**Model Building**

*LOGISTIC REGRESSION*

In [17]:
# Train the model

model_norm = LogisticRegression(tol = 0.001, max_iter = 500, solver='liblinear')
model_norm.fit(X_train, y_train)

In [18]:
# display coefficients for weights 
weights = model_norm.coef_
intercept = model_norm.intercept_

print("Weights:", weights)
print("Intercept:", intercept)

Weights: [[ 0.00788393  0.04017437  0.14430418  0.07875447 -1.4646746   1.484274
   0.13681668 -0.13323052  0.02858904 -2.01328861  0.05397989 -0.07401401
   0.63057411 -1.11299118 -0.33836068 -0.15864871  0.16501355  0.00671766
   0.45861357  0.27543957  0.12947557  0.27908997 -0.31943315 -0.46989212
  -0.67857159  0.85320711 -1.04728213  0.04302259 -0.99223949  0.81801374]]
Intercept: [-0.32527818]


In [19]:
# make predictions on the training data

x_pred = model_norm.predict(X_train)

train_acc = accuracy_score(y_train, x_pred)
print("Estimated Accuracy:", train_acc)

Estimated Accuracy: 0.776041114761047


In [20]:
# make predictions on the test data
y_pred = model_norm.predict(X_test)

model_acc = accuracy_score(y_test, y_pred)
print("Accuracy:", model_acc)

Accuracy: 0.7643004824259132


**Saving Results**

In [21]:
# Re-applying categorical labels to the predictions
y_pred = pd.DataFrame(y_pred, columns= ["Prediction"])

y_pred.replace({0: "Not_Cancelled", 1:"Cancelled"})

Unnamed: 0,Prediction
0,Cancelled
1,Cancelled
2,Cancelled
3,Not_Cancelled
4,Not_Cancelled
...,...
7250,Cancelled
7251,Cancelled
7252,Cancelled
7253,Cancelled


In [22]:
# Convert accuracy scores to DataFrames
est_accuracy = {'Estimated_Accuracy':[train_acc]}
ea = pd.DataFrame(data= est_accuracy)

accuracy = {'Accuracy': [model_acc]}
a = pd.DataFrame(data= accuracy)

In [23]:
# Exporting predictions to CSV file
y_pred.to_csv('pred.csv', index= False, header= False)
ea.to_csv('ea.csv', index= False, header= False)
a.to_csv('a.csv', index= False, header= False)

*DECISION TREE* 

In [24]:
# Importing the Decision Tree Classifier to compare the accuracy vs the previous Logistic Regression Model 
from sklearn.tree import DecisionTreeClassifier

In [25]:
clf = DecisionTreeClassifier()

In [26]:
# Training the model
clf = clf.fit(X_train, y_train)

In [27]:
# Make predictions on the training data

x_pred1 = model_norm.predict(X_train)

train_acc1 = accuracy_score(y_train, x_pred1)
print("Estimated Accuracy:", train_acc1)

Estimated Accuracy: 0.776041114761047


In [28]:
# Make predictions on the test data
y_pred1 = clf.predict(X_test)

model_acc1 = accuracy_score(y_test, y_pred1)
print("Accuracy:", model_acc1)

Accuracy: 0.7838731909028256


UPDATED: Eliminating features to improve accuracy

In [30]:
# Intuitively, the meal plan that a customer chooses should not have an impact on the cancellation of their reservation
# Let's eliminate these features to see whether eliminating unnecessary features will improve the performance of the model 
X_train_new = X_train.drop(labels=['type_of_meal_plan_Meal Plan 1', 'type_of_meal_plan_Meal Plan 2', 'type_of_meal_plan_Meal Plan 3', 'type_of_meal_plan_Not Selected'], axis = 1)
X_test_new = X_test.drop(labels=['type_of_meal_plan_Meal Plan 1', 'type_of_meal_plan_Meal Plan 2', 'type_of_meal_plan_Meal Plan 3', 'type_of_meal_plan_Not Selected'], axis = 1)

In [31]:
# Train the updated model 
model_norm.fit(X_train_new, y_train)

In [32]:
# Make new predictions on the test data 
x_pred_new = model_norm.predict(X_train_new)

In [33]:
# Observe the model performance 
updated_train_acc = accuracy_score(y_train, x_pred_new)
print("Estimated Accuracy:", updated_train_acc)

Estimated Accuracy: 0.7747695242132033
