# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re # Regular expression library
import string


# Load the raw reservations table. The relative path means the CSV is looked up
# in the notebook's working directory.
data = pd.read_csv('Hotel Reservations.csv')

# `read_csv` already returns a DataFrame, and passing a DataFrame to
# `pd.DataFrame(...)` does not make an independent copy by default, so `data`
# and `df` could end up sharing the same values. Take an explicit copy instead:
# `data` stays the untouched raw table, `df` is the working frame that the
# preprocessing cells below modify.
df = data.copy()
df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [2]:


# Matches any single ASCII punctuation character; compiled once and reused for
# every cell value.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def lower(x):
    """Return `x` lowercased, with every punctuation character replaced by a space.

    Example: 'Not_Canceled' -> 'not canceled'.

    Parameters
    ----------
    x : str
        The text to normalise.

    Returns
    -------
    str
        The normalised text (same length as the input).
    """
    return _PUNCTUATION_RE.sub(' ', x.lower())


# Normalise every text column of the DataFrame. Text columns are recognised by
# their 'object' dtype. `na_action='ignore'` passes missing values (NaN) through
# unchanged; without it a single NaN in a text column raises AttributeError,
# because NaN is a float and has no `.lower()`.
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].map(lower, na_action='ignore')

# List the distinct categories that remain after normalisation; these are the
# labels that the integer encoding in the next step has to cover.
unique_meal_values = df['type_of_meal_plan'].unique()
unique_room_values = df['room_type_reserved'].unique()
unique_segment_values = df['market_segment_type'].unique()
print(sorted(unique_meal_values))
print(sorted(unique_room_values))
print(unique_segment_values)


['meal plan 1', 'meal plan 2', 'meal plan 3', 'not selected']
['room type 1', 'room type 2', 'room type 3', 'room type 4', 'room type 5', 'room type 6', 'room type 7']
['offline' 'online' 'corporate' 'aviation' 'complementary']


In [3]:
# Integer code for each category of the three categorical features. The keys are
# the normalised labels (lowercase, punctuation replaced by spaces) that the
# columns contain at this point.
MEAL_PLAN_CODES = {'not selected': 0, 'meal plan 1': 1, 'meal plan 2': 2, 'meal plan 3': 3}
ROOM_TYPE_CODES = {'room type 1': 0, 'room type 2': 1, 'room type 3': 2, 'room type 4': 3,
                   'room type 5': 4, 'room type 6': 5, 'room type 7': 6}
MARKET_SEGMENT_CODES = {'offline': 0, 'online': 1, 'corporate': 2, 'aviation': 3, 'complementary': 4}

# Column name -> code table used to encode that column.
CATEGORY_CODES = {
    'type_of_meal_plan': MEAL_PLAN_CODES,
    'room_type_reserved': ROOM_TYPE_CODES,
    'market_segment_type': MARKET_SEGMENT_CODES,
}

# `replace` leaves any value it has no code for untouched, so on its own an
# unexpected label would silently stay a string inside a feature column, and the
# resulting dtype relies on pandas downcasting object -> int implicitly (newer
# pandas versions deprecate that implicit downcast). The explicit
# `astype('int64')` makes the dtype deterministic and raises immediately if a
# label (or a missing value) is not covered by the table.
# Re-running this cell is safe: already-encoded integers match no key and pass
# through `replace` unchanged.
for column, codes in CATEGORY_CODES.items():
    df[column] = df[column].replace(codes).astype('int64')

# Distinct codes present after encoding, kept for inspection.
encoded_unique_meal_values = df['type_of_meal_plan'].unique()
encoded_unique_room_values = df['room_type_reserved'].unique()
encoded_unique_segment_values = df['market_segment_type'].unique()

### Type of Meal Plan

    Not Selected = 0
    Meal Plan 1 = 1
    Meal Plan 2 = 2
    Meal Plan 3 = 3

### Room Type

    Room Type 1 = 0
    Room Type 2 = 1
    Room Type 3 = 2
    Room Type 4 = 3
    Room Type 5 = 4
    Room Type 6 = 5
    Room Type 7 = 6

### Market Segment

    offline = 0
    online = 1
    corporate = 2
    aviation = 3
    complementary = 4

# Split the data into train and test sets

In [6]:
# Columns used as model inputs, in the order they appear in the feature matrix.
# `Booking_ID` and `booking_status` are not part of this list.
feature_columns = [
    'no_of_adults',
    'no_of_children',
    'no_of_weekend_nights',
    'no_of_week_nights',
    'type_of_meal_plan',
    'required_car_parking_space',
    'room_type_reserved',
    'lead_time',
    'arrival_year',
    'arrival_month',
    'arrival_date',
    'market_segment_type',
    'repeated_guest',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'avg_price_per_room',
    'no_of_special_requests',
]

# Feature matrix (X) and target vector (y).
X = df[feature_columns]
y = df['booking_status']

# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
0,2,0,1,2,1,0,0,224,2017,10,2,0,0,0,0,65.0,0
1,2,0,2,3,0,0,0,5,2018,11,6,1,0,0,0,106.68,1
2,1,0,2,1,1,0,0,1,2018,2,28,1,0,0,0,60.0,0
3,2,0,0,2,1,0,0,211,2018,5,20,1,0,0,0,100.0,0
4,2,0,1,1,0,0,0,48,2018,4,11,1,0,0,0,94.5,0


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition as a quick sanity check.
for caption, part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(caption, part.shape)

Shape of X_train: (29020, 17)
Shape of X_test: (7255, 17)
Shape of y_train: (29020,)
Shape of y_test: (7255,)
