In [1]:
import pandas as pd
import sweetviz as sv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import warnings
import mlflow
import pickle
import category_encoders as ce
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_evaluations, plot_objective
from sklearn.model_selection import (
    StratifiedKFold,
    ShuffleSplit,
    KFold,
    cross_validate,
    cross_val_predict,
    GridSearchCV,
    train_test_split
)
import lightgbm as lgb
from lightgbm import LGBMClassifier, early_stopping
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from imblearn.over_sampling import RandomOverSampler

  from pandas.core import (


In [2]:
df_train = pd.read_csv("Data/train_data.csv")

In [3]:
def data_cleaning(raw_data):
    #Replacing the small amount of null values in children and country columns with sensible values
    null_replacements = {"children": 0.0,"country": "Unknown"} 
    clean_data = raw_data.fillna(null_replacements) 
    #Dropping only row with room type L
    clean_data = clean_data[clean_data['assigned_room_type'] != 'L']
    #Replace error entries with nulls
    clean_data['market_segment'].replace('Undefined', np.nan, inplace=True)
    clean_data['distribution_channel'].replace('Undefined', np.nan, inplace=True)
    #Dropping bookings with no guests
    empty_bookings = (clean_data.children == 0) & (clean_data.adults == 0) & (clean_data.babies == 0) # There are no guests when these conditions are true
    clean_data = clean_data[~empty_bookings] 
    #Dropping all the duplicates
    clean_data = clean_data.drop_duplicates()
    clean_data = clean_data.reset_index(drop=True)
    return clean_data
df_train = data_cleaning(df_train)

def feature_engineering(df):
    # Add a boolean variable for whether or not an agent was used to book
    df['used_agent'] = np.where(df['agent'].isnull(),0,1)
    # Combining weekdays and weekends for a total nights value
    df['total_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
    # Adding a column to map hotel to 0/1
    df['hotel_binary'] = df['hotel'].map({'Resort Hotel': 0, 'City Hotel': 1})
    # Creating a flags for guests coming from Portugal and EU
    df['portugal_flag'] = (df['country'] == 'PRT').astype('float64')
    eu_countries = ['AUT, BEL, BGR, HRV, CYP, CZE, DNK, EST, FIN, FRA, DEU, GRC, HUN, IRL, ITA, LVA, LTU, LUX, MLT, NLD, POL']
    df['eu_flag'] = df['country'].isin(eu_countries).astype('float64')
    
    # Now finding the day of the week for each booking 

    # Dictionary to map the month to its corresponding number
    month_to_number = {
        "January": 1,
        "February": 2,
        "March": 3,
        "April": 4,
        "May": 5,
        "June": 6,
        "July": 7,
        "August": 8,
        "September": 9,
        "October": 10,
        "November": 11,
        "December": 12
    }
    # adding columns for the date of arrivaL
    df['arrival_date_year'] = df['arrival_date_year'].astype(str)
    df['arrival_date_month_number'] = df['arrival_date_month'].map(month_to_number).astype(str).str.zfill(2)  
    df['arrival_date_day_of_month'] = df['arrival_date_day_of_month'].astype(str).str.zfill(2)  
    # string in the format "YYYY-MM-DD"
    df['raw_date_string'] = df['arrival_date_year'] + "-" + \
                                    df['arrival_date_month_number'] + "-" + \
                                    df['arrival_date_day_of_month']
    df['raw_date_string']

    # Creating a function that transforms the date into its corresponding day of the week
    def find_day(raw_date_string):
        date_object = datetime.strptime(raw_date_string, "%Y-%m-%d")
        day_of_week = date_object.weekday()
        return day_of_week
    df['day_of_week_on_arrival'] = df['raw_date_string'].apply(find_day)

    # Adding a column to encode the seasonal affect of which week in the year the booking was made
    period = df['arrival_date_week_number'].nunique()
    two_pi = 2*np.pi
    df['sin_week'] = np.sin(two_pi * df['arrival_date_week_number'] / period)
    df['cos_week'] = np.cos(two_pi * df['arrival_date_week_number'] / period)
    
    df['arrival_date_day_of_month'] = df['arrival_date_day_of_month'].astype(int)
    df['arrival_date_month_number'] = df['arrival_date_month_number'].astype(int)
    
    return df
df_train = feature_engineering(df_train)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clean_data['market_segment'].replace('Undefined', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clean_data['distribution_channel'].replace('Undefined', np.nan, inplace=True)


In [4]:
# Balancing the data set with a random sampler
ros = RandomOverSampler(random_state=42)

df_train, df_train_y = ros.fit_resample(df_train, df_train['is_canceled'])
df_train['is_canceled'].value_counts() * 100 / len(df_train)

is_canceled
0    50.0
1    50.0
Name: count, dtype: float64

In [5]:
numeric_feats = [
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "previous_cancellations",
    'previous_bookings_not_canceled',
    "days_in_waiting_list",
    "adr",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "total_nights",
    "day_of_week_on_arrival",
    "sin_week",
    "cos_week"
]

binary_feats = [
    "is_repeated_guest",
    "portugal_flag",
    "eu_flag",
    "used_agent",
    "hotel_binary",
]

categorical_feats = [
    "arrival_date_month",
    "meal",
    "country",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "assigned_room_type", 
    "deposit_type",
    "customer_type"
]

all_feats = numeric_feats + binary_feats + categorical_feats 

print(f"Number of features: {len(all_feats)}")

Number of features: 32


In [6]:
prep = ColumnTransformer(
    transformers = [
        ('pass', 'passthrough', numeric_feats + binary_feats),
        ('ordinal', ce.OrdinalEncoder(handle_missing="return_nan"), categorical_feats)
    ],
    remainder="drop",
    verbose_feature_names_out=False
).set_output(transform="pandas")

prep.fit(df_train, df_train['is_canceled'])

In [7]:
# Define the LGBM model
lgbm_model = LGBMClassifier(
    objective="binary",
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    num_leaves=100,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_split_gain=0.01,
    ming_child_weight=0.1,
    subsample=1.0,
    colsample_bytree=1.0,
    cat_smooth=1.0,
    random_state=42,
    verbose=-1
)

# Define the pipeline
lgbm_pipe = Pipeline([
    ("prep", clone(prep)),  # Assuming 'prep' is defined elsewhere
    ("model", clone(lgbm_model))
])

lgbm_pipe


In [8]:
fit_params = {"model__categorical_feature": categorical_feats}
scoring = ["neg_log_loss", "accuracy", "precision", "recall", "f1", "roc_auc"]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Since the pipeline drops the target, just pass df_train as it is
cv_results = cross_validate(
    clone(lgbm_pipe),
    df_train,
    df_train["is_canceled"],  # Target
    cv=kf,
    fit_params=fit_params,
    return_train_score=True,
    scoring=scoring
)

# Create a dictionary for the results
entry = {
    "model": "BaseModel",
    "neg_log_loss": np.mean(cv_results['test_neg_log_loss']),
    "accuracy": np.mean(cv_results['test_accuracy']),
    "precision": np.mean(cv_results['test_precision']),
    "recall": np.mean(cv_results['test_recall']),
    "f1": np.mean(cv_results['test_f1']),
    "roc_auc": np.mean(cv_results['test_roc_auc'])
}

# Convert to a DataFrame and output
results = pd.DataFrame([entry])
results



Unnamed: 0,model,neg_log_loss,accuracy,precision,recall,f1,roc_auc
0,BaseModel,-0.252452,0.889033,0.881457,0.898982,0.890126,0.96096


In [9]:
ss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
cv_train_idx, cv_val_idx = next(ss.split(df_train))
df_train_cv = df_train.iloc[cv_train_idx]
df_val_cv = df_train.iloc[cv_val_idx]

assert len(set(df_train_cv.index) & set(df_val_cv.index)) == 0 

In [10]:
len(df_val_cv)

22847

In [11]:
prep_cv = clone(prep)
X_train_cv = prep_cv.fit_transform(df_train_cv)
X_val_cv = prep_cv.transform(df_val_cv) 

In [12]:
fit_params = {
    "model__eval_set": [(X_val_cv, df_val_cv["is_canceled"])],
    "model__categorical_feature": categorical_feats,
    "model__callbacks": [early_stopping(30, first_metric_only=True)]
    }

In [13]:
bayes_space = [
    Integer(3, 15, name='max_depth'),
    Integer(32, 512, name='num_leaves'),
    Real(0.0, 100, name='reg_alpha'),
    Real(0.0, 100, name='reg_lambda'),
    Real(0.0, 10, name='min_split_gain'),
    Real(0.01, 1000, prior='log-uniform', name='min_child_weight'),
    Real(0.5, 1, name='subsample'),
    Real(0.5, 1, name='colsample_bytree'),
    Real(0.01, 150, name='cat_smooth'),
    Real(0.001, 0.1, prior='log-uniform', name='learning_rate'),
    Integer(5, 100, name='min_child_samples')
]

cv_params = {
    "max_depth": 5,
    "num_leaves": 100,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "min_split_gain": 0.01,
    "min_child_weight": 0.1,  
    "subsample": 1.0,
    "colsample_bytree": 1.0,
    "cat_smooth": 1.0,
    "learning_rate": 0.1,
    "min_child_samples": 20,  
    "random_state": 42,
    "verbose": -1 
}

In [14]:
early_stopping_data = []

@use_named_args(bayes_space)
def objective(**params):
    global early_stopping_data

    print(f"Evaluating params: {params}")
    cv_params_i = cv_params.copy()
    for key, value in params.items():
        cv_params_i[key] = value

    lgbm_pipe['model'].set_params(**cv_params_i)

    lgbm_cv_results = cross_validate(
        lgbm_pipe,
        df_train_cv,
        df_train_cv["is_canceled"],
        cv=kf,
        fit_params=fit_params,
        return_estimator=True,
        return_train_score=True,
        scoring='neg_log_loss',
        error_score='raise'  # Raise errors directly
    )

    cv_models = [estimator['model'] for estimator in lgbm_cv_results['estimator']]
    # Collect best_iteration_ from each model
    best_iterations = [model.best_iteration_ for model in cv_models]

    # Append the median best_iteration_ to the global list
    median_best_iteration = int(np.median(best_iterations))
    early_stopping_data.append(median_best_iteration)

    metric_to_min = -np.mean(lgbm_cv_results['test_score'])
    print(f"Mean Negative Log Loss: {metric_to_min}")
    return metric_to_min

In [None]:
res_gp = gp_minimize(
    objective,
    bayes_space,
    n_initial_points=30,
    n_calls=100,
    random_state=42,
    verbose=False
)

In [16]:
bayes_best_iter = np.argmin(res_gp.func_vals)
best_iter_early_stopping = early_stopping_data[bayes_best_iter]
n_estimators_refit = best_iter_early_stopping
print(f"n_estimators for Refit: {n_estimators_refit}")


bayes_best_score = res_gp.fun
bayes_best_hps = res_gp.x
bayes_best_score, bayes_best_hps

n_estimators for Refit: 225


(0.26653427646005057,
 [15, 512, 0.0, 0.0, 0.0, 0.03903657674190941, 0.5, 0.5, 150.0, 0.1, 5])

In [17]:
bayes_best_params = {res_gp.space[i][1].name: res_gp.x[i] for i in range(len(res_gp.x))}
bayes_best_params

{'max_depth': 15,
 'num_leaves': 512,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'min_split_gain': 0.0,
 'min_child_weight': 0.03903657674190941,
 'subsample': 0.5,
 'colsample_bytree': 0.5,
 'cat_smooth': 150.0,
 'learning_rate': 0.1,
 'min_child_samples': 5}

In [18]:
# Update the best parameters with the number of estimators for refit
bayes_best_params.update({"n_estimators": n_estimators_refit})

# Define the LGBM model with the best parameters
lgbm_best_model = LGBMClassifier(**bayes_best_params)

# Fit the model on the entire training set
lgbm_best_model.fit(X_train_cv, df_train_cv["is_canceled"])

# Define the pipeline
lgbm_pipe_best = Pipeline([
    ("prep", clone(prep)),  
    ("model", clone(lgbm_best_model))
])

In [19]:

fit_params = {"model__categorical_feature": categorical_feats}
scoring = ["neg_log_loss", "accuracy", "precision", "recall", "f1", "roc_auc"]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Since the pipeline drops the target, just pass df_train as it is
cv_results = cross_validate(
    clone(lgbm_pipe_best),
    df_train.drop(columns=["is_canceled"]),
    df_train["is_canceled"],  # Target
    cv=kf,
    fit_params=fit_params,
    return_train_score=True,
    scoring=scoring
)

# Create a dictionary for the results
entry = {
    "model": "BayesOptModel",
    "neg_log_loss": np.mean(cv_results['test_neg_log_loss']),
    "accuracy": np.mean(cv_results['test_accuracy']),
    "precision": np.mean(cv_results['test_precision']),
    "recall": np.mean(cv_results['test_recall']),
    "f1": np.mean(cv_results['test_f1']),
    "roc_auc": np.mean(cv_results['test_roc_auc'])
}
entry = pd.DataFrame([entry])

# Concatenate the new entry to the results DataFrame
results = pd.concat([results, entry], ignore_index=True)
results



Unnamed: 0,model,neg_log_loss,accuracy,precision,recall,f1,roc_auc
0,BaseModel,-0.252452,0.889033,0.881457,0.898982,0.890126,0.96096
1,BayesOptModel,-0.194456,0.921677,0.909904,0.936031,0.92278,0.977357
