In [66]:
import pandas as pd
import sweetviz as sv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import warnings
import mlflow
import pickle
import category_encoders as ce
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import(
    ShuffleSplit,
    KFold,
    cross_validate,
    cross_val_predict,
    GridSearchCV,
    train_test_split
)
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_selection import RFE, SelectKBest, f_classif


In [67]:
df = pd.read_csv("hotel_bookings.csv")

## Creating train / test split file

In [68]:
df['index'] = df.index
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_val, df_train = train_test_split(df_train, train_size = 0.05, random_state=42)

df_train.to_csv('Data/train_data.csv', index=False)
df_val.to_csv('Data/validation_data.csv', index=False)
df_test.to_csv('Data/test_data.csv', index=False)

print('Train size:', len(df_train), 'Test size:', len(df_test), 'Validation size:', len(df_val))


Train size: 90737 Test size: 23878 Validation size: 4775


In [69]:
df_train = pd.read_csv("Data/train_data.csv")

# Creating processing pipeline

In [70]:
def data_cleaning(raw_data):
    #Replacing the small amount of null values in children and country columns with sensible values
    null_replacements = {"children": 0.0,"country": "Unknown"} 
    clean_data = raw_data.fillna(null_replacements) 
    #Dropping only row with room type L
    clean_data = clean_data[clean_data['assigned_room_type'] != 'L']
    #Replace error entries with nulls
    clean_data['market_segment'].replace('Undefined', np.nan, inplace=True)
    clean_data['distribution_channel'].replace('Undefined', np.nan, inplace=True)
    #Dropping bookings with no guests
    empty_bookings = (clean_data.children == 0) & (clean_data.adults == 0) & (clean_data.babies == 0) # There are no guests when these conditions are true
    clean_data = clean_data[~empty_bookings] 
    #Dropping all the duplicates
    clean_data = clean_data.drop_duplicates()
    clean_data = clean_data.reset_index(drop=True)
    return clean_data
df_train = data_cleaning(df_train)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clean_data['market_segment'].replace('Undefined', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clean_data['distribution_channel'].replace('Undefined', np.nan, inplace=True)


In [71]:
def feature_engineering(df):
    # Add a boolean variable for whether or not an agent was used to book
    df['used_agent'] = np.where(df['agent'].isnull(),0,1)
    # Combining weekdays and weekends for a total nights value
    df['total_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
    # Adding a column to map hotel to 0/1
    df['hotel_binary'] = df['hotel'].map({'Resort Hotel': 0, 'City Hotel': 1})
    # Creating a flags for guests coming from Portugal and EU
    df['portugal_flag'] = (df['country'] == 'PRT').astype('float64')
    eu_countries = ['AUT, BEL, BGR, HRV, CYP, CZE, DNK, EST, FIN, FRA, DEU, GRC, HUN, IRL, ITA, LVA, LTU, LUX, MLT, NLD, POL']
    df['eu_flag'] = df['country'].isin(eu_countries).astype('float64')
    
    # Now finding the day of the week for each booking 

    # Dictionary to map the month to its corresponding number
    month_to_number = {
        "January": 1,
        "February": 2,
        "March": 3,
        "April": 4,
        "May": 5,
        "June": 6,
        "July": 7,
        "August": 8,
        "September": 9,
        "October": 10,
        "November": 11,
        "December": 12
    }
    # adding columns for the date of arrivaL
    df['arrival_date_year'] = df['arrival_date_year'].astype(str)
    df['arrival_date_month_number'] = df['arrival_date_month'].map(month_to_number).astype(str).str.zfill(2)  
    df['arrival_date_day_of_month'] = df['arrival_date_day_of_month'].astype(str).str.zfill(2)  
    # string in the format "YYYY-MM-DD"
    df['raw_date_string'] = df['arrival_date_year'] + "-" + \
                                    df['arrival_date_month_number'] + "-" + \
                                    df['arrival_date_day_of_month']
    df['raw_date_string']

    # Creating a function that transforms the date into its corresponding day of the week
    def find_day(raw_date_string):
        date_object = datetime.strptime(raw_date_string, "%Y-%m-%d")
        day_of_week = date_object.weekday()
        return day_of_week
    df['day_of_week_on_arrival'] = df['raw_date_string'].apply(find_day)

    # Adding a column to encode the seasonal affect of which week in the year the booking was made
    period = df['arrival_date_week_number'].nunique()
    two_pi = 2*np.pi
    df['sin_week'] = np.sin(two_pi * df['arrival_date_week_number'] / period)
    df['cos_week'] = np.cos(two_pi * df['arrival_date_week_number'] / period)
    
    df['arrival_date_day_of_month'] = df['arrival_date_day_of_month'].astype(int)
    df['arrival_date_month_number'] = df['arrival_date_month_number'].astype(int)
    
    return df
df_train = feature_engineering(df_train)

In [72]:
df_train

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,used_agent,total_nights,hotel_binary,portugal_flag,eu_flag,arrival_date_month_number,raw_date_string,day_of_week_on_arrival,sin_week,cos_week
0,City Hotel,0,13,2017,August,32,6,2,1,2,...,1,3,1,0.0,0.0,8,2017-08-06,6,-0.606800,-0.794854
1,Resort Hotel,0,1,2016,April,17,23,0,1,2,...,1,1,0,0.0,0.0,4,2016-04-23,5,0.902798,-0.430065
2,City Hotel,0,194,2017,April,13,1,1,1,2,...,1,2,1,0.0,0.0,4,2017-04-01,5,0.999561,0.029633
3,City Hotel,0,159,2016,September,40,30,2,3,2,...,1,5,1,0.0,0.0,9,2016-09-30,4,-0.999561,0.029633
4,City Hotel,1,214,2017,May,18,3,0,4,2,...,1,4,1,0.0,0.0,5,2017-05-03,2,0.845596,-0.533823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90596,City Hotel,1,261,2015,October,42,16,0,1,2,...,1,1,1,1.0,0.0,10,2015-10-16,4,-0.964636,0.263587
90597,City Hotel,1,35,2017,August,33,16,0,4,2,...,1,4,1,1.0,0.0,8,2017-08-16,2,-0.696551,-0.717507
90598,City Hotel,0,8,2016,May,20,12,0,2,2,...,1,2,1,1.0,0.0,5,2016-05-12,3,0.696551,-0.717507
90599,City Hotel,0,0,2015,October,41,7,0,2,2,...,1,2,1,1.0,0.0,10,2015-10-07,2,-0.989040,0.147647


In [73]:
df_train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90601 entries, 0 to 90600
Data columns (total 43 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           90601 non-null  object 
 1   is_canceled                     90601 non-null  int64  
 2   lead_time                       90601 non-null  int64  
 3   arrival_date_year               90601 non-null  object 
 4   arrival_date_month              90601 non-null  object 
 5   arrival_date_week_number        90601 non-null  int64  
 6   arrival_date_day_of_month       90601 non-null  int32  
 7   stays_in_weekend_nights         90601 non-null  int64  
 8   stays_in_week_nights            90601 non-null  int64  
 9   adults                          90601 non-null  int64  
 10  children                        90601 non-null  float64
 11  babies                          90601 non-null  int64  
 12  meal                            

In [74]:
numeric_feats = [
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "previous_cancellations",
    'previous_bookings_not_canceled',
    "days_in_waiting_list",
    "adr",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "total_nights",
    "day_of_week_on_arrival",
    "sin_week",
    "cos_week"
]

binary_feats = [
    "is_repeated_guest",
    "portugal_flag",
    "eu_flag",
    "used_agent",
    "hotel_binary",
]

categorical_feats = [
    "arrival_date_month",
    "meal",
    "country",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "assigned_room_type", # check this
    "deposit_type",
    "customer_type"
]

all_feats = numeric_feats + binary_feats + categorical_feats 

print(f"Number of features: {len(all_feats)}")

Number of features: 32


In [75]:
columns = df.columns
set(columns)-set(all_feats)

{'agent',
 'arrival_date_year',
 'booking_changes',
 'company',
 'hotel',
 'index',
 'is_canceled',
 'reservation_status',
 'reservation_status_date'}

In [76]:
simple_imputer = ColumnTransformer(
    [
        ("numeric", SimpleImputer(strategy="median", add_indicator=True), numeric_feats),
        ("categorical", SimpleImputer(strategy="constant", fill_value="missing"), categorical_feats),
        ("binary", SimpleImputer(strategy="most_frequent", add_indicator=True), binary_feats)
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

simple_imputer.set_output(transform="pandas")

In [77]:
for feat in categorical_feats:
    print(df_train[feat].value_counts())

arrival_date_month
August       10529
July          9561
May           9003
October       8542
April         8413
June          8252
September     8017
March         7452
February      6081
November      5151
December      5113
January       4487
Name: count, dtype: int64
meal
BB           70150
HB           10982
SC            7995
Undefined      880
FB             594
Name: count, dtype: int64
country
PRT    36757
GBR     9195
FRA     7920
ESP     6515
DEU     5545
       ...  
CYM        1
DMA        1
NIC        1
MDG        1
MRT        1
Name: count, Length: 174, dtype: int64
market_segment
Online TA        43007
Offline TA/TO    18366
Groups           15006
Direct            9518
Corporate         3971
Complementary      552
Aviation           179
Name: count, dtype: int64
distribution_channel
TA/TO        74409
Direct       11042
Corporate     5006
GDS            142
Name: count, dtype: int64
reserved_room_type
A    65165
D    14669
E     4986
F     2202
G     1601
B      839
C

In [78]:
# Lower cardinality features on hot encoding
ohe_feats = [
    "meal",
    "market_segment",
    "distribution_channel",
    "deposit_type",
    "customer_type"
]

# Higher cardinality features target encoding
target_encoded_feats = list(set(categorical_feats) - set(ohe_feats))

target_encoded_feats

encoder = ColumnTransformer(
    [
    ("one_hot", ce.OneHotEncoder(use_cat_names=True), ohe_feats),
    ("target", ce.TargetEncoder(smoothing=100, min_samples_leaf=1000), target_encoded_feats)
    ],
    remainder="passthrough",
    verbose_feature_names_out=False
)

encoder.set_output(transform="pandas")

In [79]:
scaler = StandardScaler()
scaler.set_output(transform="pandas")

prep_pipeline = Pipeline(
    [
        ("imputation", simple_imputer),
        ("encoding", encoder),
        ("scaling", scaler)
    ]
)

prep_pipeline.fit_transform(df_train, df_train['is_canceled'])

Unnamed: 0,meal_BB,meal_SC,meal_HB,meal_FB,meal_Undefined,market_segment_Online TA,market_segment_Offline TA/TO,market_segment_Groups,market_segment_Direct,market_segment_Aviation,...,total_of_special_requests,total_nights,day_of_week_on_arrival,sin_week,cos_week,is_repeated_guest,portugal_flag,eu_flag,used_agent,hotel_binary
0,0.539937,-0.311102,-0.371392,-0.081237,-0.099036,1.051978,-0.504236,-0.445539,-0.342616,-0.044493,...,-0.720842,-0.167365,1.540109,-0.809480,-0.954567,-0.179301,-0.826231,0.0,0.397049,0.709860
1,0.539937,-0.311102,-0.371392,-0.081237,-0.099036,1.051978,-0.504236,-0.445539,-0.342616,-0.044493,...,-0.720842,-0.953139,1.027567,1.267506,-0.408832,-0.179301,-0.826231,0.0,0.397049,-1.408729
2,0.539937,-0.311102,-0.371392,-0.081237,-0.099036,-0.950591,1.983200,-0.445539,-0.342616,-0.044493,...,-0.720842,-0.560252,1.027567,1.400637,0.278890,-0.179301,-0.826231,0.0,0.397049,0.709860
3,0.539937,-0.311102,-0.371392,-0.081237,-0.099036,1.051978,-0.504236,-0.445539,-0.342616,-0.044493,...,1.802202,0.618408,0.515025,-1.349861,0.278890,-0.179301,-0.826231,0.0,0.397049,0.709860
4,0.539937,-0.311102,-0.371392,-0.081237,-0.099036,1.051978,-0.504236,-0.445539,-0.342616,-0.044493,...,1.802202,0.225521,-0.510058,1.188804,-0.564057,-0.179301,-0.826231,0.0,0.397049,0.709860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90596,-1.852066,-0.311102,2.692574,-0.081237,-0.099036,-0.950591,1.983200,-0.445539,-0.342616,-0.044493,...,-0.720842,-0.953139,0.515025,-1.301809,0.628892,-0.179301,1.210316,0.0,0.397049,0.709860
90597,-1.852066,3.214375,-0.371392,-0.081237,-0.099036,1.051978,-0.504236,-0.445539,-0.342616,-0.044493,...,0.540680,0.225521,-0.510058,-0.932964,-0.838853,-0.179301,1.210316,0.0,0.397049,0.709860
90598,0.539937,-0.311102,-0.371392,-0.081237,-0.099036,-0.950591,1.983200,-0.445539,-0.342616,-0.044493,...,-0.720842,-0.560252,0.002483,0.983740,-0.838853,-0.179301,1.210316,0.0,0.397049,0.709860
90599,0.539937,-0.311102,-0.371392,-0.081237,-0.099036,-0.950591,-0.504236,2.244471,-0.342616,-0.044493,...,-0.720842,-0.560252,-0.510058,-1.335386,0.455441,-0.179301,1.210316,0.0,0.397049,0.709860


In [80]:
lr_model = LogisticRegression()
lr_pipe = Pipeline([("prep", clone(prep_pipeline)), ("model", lr_model)])
lr_pipe.fit(df_train, df_train["is_canceled"])

In [81]:
kf = KFold(n_splits=5)
scoring = ['neg_log_loss', "accuracy", "precision", "recall", "f1"]

cv_results_lr = cross_validate(
    clone(lr_pipe),
    df_train,
    df_train['is_canceled'],
    cv=kf,
    return_train_score=True,
    scoring=scoring
)

results = pd.DataFrame(columns=['model', 'neg_log_loss', 'accuracy', 'precision', 'recall', 'f1'])

entry = {
    "model": "BaseModel",
    "neg_log_loss": np.mean(cv_results_lr['test_neg_log_loss']),
    "accuracy": np.mean(cv_results_lr['test_accuracy']),
    "precision": np.mean(cv_results_lr['test_precision']),
    "recall": np.mean(cv_results_lr['test_recall']),
    "f1": np.mean(cv_results_lr['test_f1']),
}
entry = pd.DataFrame([entry])
results = pd.concat([results, entry], ignore_index=True)
results

  results = pd.concat([results, entry], ignore_index=True)


Unnamed: 0,model,neg_log_loss,accuracy,precision,recall,f1
0,BaseModel,-0.390758,0.808048,0.804682,0.634732,0.709647


### TODO

- JS encoding
- Log transform
- Interactions
- Hinges

In [82]:
def log_transform(df, cols):
    df_copy = df.copy(deep=True)

    df_copy.loc[:, cols] = df_copy[cols].clip(lower=0)

    df_copy.loc[:, cols] = df_copy[cols].apply(np.log1p)

    return df_copy

In [83]:
# Lower cardinality features one-hot encoding
ohe_feats = [
    "meal",
    "market_segment",
    "distribution_channel",
    "deposit_type",
    "customer_type"
]

# Higher cardinality features target encoding
js_encoded_feats = list(set(categorical_feats) - set(ohe_feats))

encoder = ColumnTransformer(
    [
        ("one_hot", ce.OneHotEncoder(use_cat_names=True), ohe_feats),
        ("james_stein", ce.JamesSteinEncoder(), js_encoded_feats)
    ],
    remainder="passthrough",
    verbose_feature_names_out=False
)

encoder.set_output(transform="pandas")

prep_pipeline_w_log = Pipeline(
    [
        ("imputation", simple_imputer),
        ("encoding", encoder),
        ("log_transform", FunctionTransformer(log_transform, kw_args={"cols": numeric_feats + js_encoded_feats})),
        ("scaling", scaler)
    ]
)

lr_pipe_w_log = Pipeline([("prep", clone(prep_pipeline_w_log)), ("model", lr_model)])

lr_pipe_w_log.fit(df_train, df_train["is_canceled"])

In [84]:
cv_results_lr = cross_validate(
    clone(lr_pipe_w_log),
    df_train,
    df_train["is_canceled"],
    cv=kf,
    return_train_score=True,
    scoring=scoring
)

entry = {
    "model": "Log + JS",
    "neg_log_loss": np.mean(cv_results_lr["test_neg_log_loss"]),
    "accuracy": np.mean(cv_results_lr["test_accuracy"]),
    "precision": np.mean(cv_results_lr["test_precision"]),
    "recall": np.mean(cv_results_lr["test_recall"]),
    "f1": np.mean(cv_results_lr["test_f1"]),
}
entry = pd.DataFrame([entry])
results = pd.concat([results, entry], ignore_index=True)
results

Unnamed: 0,model,neg_log_loss,accuracy,precision,recall,f1
0,BaseModel,-0.390758,0.808048,0.804682,0.634732,0.709647
1,Log + JS,-0.367912,0.824847,0.812155,0.684391,0.7428


In [85]:
class AddInteractionTerms(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None, **fit_params):
        
        self.input_columns = X.columns

        self.added_columns = []
        
        if 'assigned_room_type' in X.columns and 'reserved_room_type' in X.columns:
            self.added_columns.append('assigned_room_type_X_reserved_room_type')
        
        if 'previous_bookings_not_canceled' in X.columns and 'is_repeated_guest' in X.columns:
            self.added_columns.append('previous_bookings_not_canceled_X_is_repeated_guest')
        
        return self
    
    def transform(self, X):
        X_copy = X.copy(deep=True)

        if 'assigned_room_type' in X.columns and 'reserved_room_type' in X.columns:
            X_copy['assigned_room_type_X_reserved_room_type'] = X_copy['assigned_room_type'] * X_copy['reserved_room_type']
    
        if 'previous_bookings_not_canceled' in X.columns and 'is_repeated_guest' in X.columns:
            X_copy['previous_bookings_not_canceled_X_is_repeated_guest'] = X_copy['previous_bookings_not_canceled'] * X_copy['is_repeated_guest']

        return X_copy
    
    def get_feature_names_out(self, *args, **params):
        return self.input_columns + self.added_columns

In [86]:
prep_pipeline_w_log_interactions = Pipeline(
    [
        ("imputation", simple_imputer),
        ("encoding", encoder),
        ("log_transform", FunctionTransformer(log_transform, kw_args={"cols": numeric_feats + js_encoded_feats})),
        ("interactions", AddInteractionTerms()),
        ("scaling", scaler)
    ]
)
lr_model = LogisticRegression(max_iter=10000)
lr_pipe_w_log_interactions = Pipeline([("prep", clone(prep_pipeline_w_log_interactions)), ("model", lr_model)])
lr_pipe_w_log_interactions.fit(df_train, df_train["is_canceled"])

In [87]:
cv_results_lr = cross_validate(
    clone(lr_pipe_w_log_interactions),
    df_train,
    df_train["is_canceled"],
    cv=kf,
    return_train_score=True,
    scoring=scoring
)

entry = {
    "model": "Log + JS + Interactions",
    "neg_log_loss": np.mean(cv_results_lr["test_neg_log_loss"]),
    "accuracy": np.mean(cv_results_lr["test_accuracy"]),
    "precision": np.mean(cv_results_lr["test_precision"]),
    "recall": np.mean(cv_results_lr["test_recall"]),
    "f1": np.mean(cv_results_lr["test_f1"]),
}
entry = pd.DataFrame([entry])
results = pd.concat([results, entry], ignore_index=True)
results

Unnamed: 0,model,neg_log_loss,accuracy,precision,recall,f1
0,BaseModel,-0.390758,0.808048,0.804682,0.634732,0.709647
1,Log + JS,-0.367912,0.824847,0.812155,0.684391,0.7428
2,Log + JS + Interactions,-0.366862,0.825499,0.812859,0.685735,0.743882
