In [30]:
# Load the datasets
import pandas as pd

# Load both H1 (Resort Hotel) and H2 (City Hotel)
data_h1 = pd.read_csv('dataset/H1.csv')
data_h2 = pd.read_csv('dataset/H2.csv')

# Concatenate datasets (Resort and City hotel)
data_combined = pd.concat([data_h1, data_h2], ignore_index=True)

# Initial data inspection
data_combined.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,July,27,1,0,0,2,0.0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,0,737,2015,July,27,1,0,0,2,0.0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,0,7,2015,July,27,1,0,1,1,0.0,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,0,13,2015,July,27,1,0,1,1,0.0,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,0,14,2015,July,27,1,0,2,2,0.0,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [31]:
data_cleaned = data_combined.drop_duplicates()

# Convert categorical month to numerical 1-12
months = ['January', 'February', 'March', 'April', 'May', 'June', 
          'July', 'August', 'September', 'October', 'November', 'December']
data_cleaned.loc[:, 'ArrivalDateMonth'] = data_cleaned['ArrivalDateMonth'].apply(lambda x: months.index(x) + 1)

# 'Children' missing values can be filled with 0
data_cleaned.fillna({'Children': 0}, inplace=True)

# Change the undefined value in the meal feature to "SC"
# SC is the same as Undefined, from the data description of the literature
data_cleaned['Meal'].replace('Undefined', 'SC', inplace=True)

# deletion on rows that have missing distribution_channel, market_segment feature values.
data_cleaned.dropna(subset=['DistributionChannel', 'MarketSegment'], inplace=True)

data_cleaned.drop(columns=['Country'], inplace=True)

# 'Agent' and 'Company' - Replace 'NULL' with 'Unknown' for these fields
data_cleaned['Agent'].replace('NULL', 'Unknown', inplace=True)
data_cleaned['Company'].replace('NULL', 'Unknown', inplace=True)

# Inspect the cleaned data
data_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned.fillna({'Children': 0}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned['Meal'].replace('Undefined', 'SC', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['Meal'].replace('Undefined', 'SC', inplace=True)
A value is trying to be s

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,7,27,1,0,0,2,0.0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,0,737,2015,7,27,1,0,0,2,0.0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,0,7,2015,7,27,1,0,1,1,0.0,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,0,13,2015,7,27,1,0,1,1,0.0,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,0,14,2015,7,27,1,0,2,2,0.0,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [32]:
data_cleaned.sort_values(by=['ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateDayOfMonth'], inplace=True)
X = data_cleaned.drop(columns='IsCanceled')
y = data_cleaned['IsCanceled']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned.sort_values(by=['ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateDayOfMonth'], inplace=True)


In [33]:
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime, timedelta

class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.adr_grouped = None
        pass
    
    def fit(self, X, y=None):        
        # Calculate the 75th percentile of ADR for each group
        self.adr_grouped = X.groupby(['DistributionChannel', 'ReservedRoomType', 'ArrivalDateYear', 'ArrivalDateMonth'])['ADR'].quantile(0.75)

        return self
    
    def transform(self, X):
        X = X.drop(columns=['ArrivalDateWeekNumber',   
                            'AssignedRoomType',
                            'RequiredCarParkingSpaces'])

        # Create 'LiveTime' feature
        X['LiveTime'] = X.apply(lambda row: self.calculate_livetime(row), axis=1)

        # Clean data to remove any leading/trailing spaces
        X['ReservedRoomType'] = X['ReservedRoomType'].str.strip()
        X['DistributionChannel'] = X['DistributionChannel'].str.strip()
        
        # Calculate ADRThirdQuartileDeviation
        X['ADRThirdQuartileDeviation'] = X.apply(
            lambda row: row['ADR'] / self.adr_grouped.get(
                (row['DistributionChannel'], row['ReservedRoomType'], row['ArrivalDateYear'], row['ArrivalDateMonth']), 1),
            axis=1
        )
        
        X = X.drop(columns=['LeadTime', 
                            'ADR', 
                            'ArrivalDateYear',
                            'ArrivalDateMonth', 
                            'ArrivalDateDayOfMonth', 
                            'ReservationStatus', 
                            'ReservationStatusDate',
                            'ReservedRoomType'])
        
        return X

    def calculate_livetime(self, row):
        current_date = datetime.now()
        arrival_date = datetime(row['ArrivalDateYear'], row['ArrivalDateMonth'], row['ArrivalDateDayOfMonth'])
        booking_date = arrival_date - timedelta(days=row['LeadTime'])
        
        if row['ReservationStatus'] == 'Check-Out':  # "A" type (effective bookings)
            return row['LeadTime']
        elif row['ReservationStatus'] == 'Canceled' or row['ReservationStatus'] == 'No-Show':  # "B" type (canceled bookings)
            reservation_status_date = datetime.strptime(row['ReservationStatusDate'], "%Y-%m-%d")
            return (reservation_status_date - booking_date).days
        else:  # "C" type (future bookings)
            return (current_date - booking_date).days


In [34]:
import numpy as np

class LogitOddsEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, target_col='IsCanceled', min_frequency=0.02, smoothing=1e-6):
        self.target_col = target_col
        self.min_frequency = min_frequency
        self.smoothing = smoothing
        self.logit_odds_map = {}
        self.global_mean_logit_odds = None
    
    def fit(self, X, y=None):
        # Store global logit odds as a fallback for unseen categories
        global_mean = y.mean()
        self.global_mean_logit_odds = np.log(global_mean / (1 - global_mean))

        for col in X.columns:
            counts = X[col].value_counts(normalize=True)
            common_levels = counts[counts >= self.min_frequency].index
            group_data = X[[col]].copy()
            group_data[self.target_col] = y
            
            # Calculate logit odds with proper handling of 0 and 1 values
            logit_odds = group_data.groupby(col)[self.target_col].mean().apply(
                lambda x: np.log(np.clip(x, self.smoothing, 1 - self.smoothing) / 
                                 (1 - np.clip(x, self.smoothing, 1 - self.smoothing)))
            )
            self.logit_odds_map[col] = logit_odds.reindex(common_levels).fillna(self.global_mean_logit_odds)
        return self
    
    def transform(self, X):
        X_encoded = X.copy()
        for col in X.columns:
            # Map the logit odds to the column, filling with the global mean logit odds if not found
            X_encoded[col] = X[col].map(self.logit_odds_map[col]).fillna(self.global_mean_logit_odds)
        return X_encoded



In [35]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Define preprocessing for numerical and categorical columns
numerical_features = ['LiveTime', 
                      'StaysInWeekendNights', 
                      'StaysInWeekNights', 
                      'Adults', 
                      'Children', 
                      'Babies', 
                      'ADRThirdQuartileDeviation', 
                      'IsRepeatedGuest', 
                      'PreviousCancellations', 
                      'PreviousBookingsNotCanceled', 
                      'BookingChanges', 
                      'DaysInWaitingList', 
                      'TotalOfSpecialRequests']
categorical_features = ['DepositType', 
                        'DistributionChannel', 
                        'CustomerType', 
                        'Meal', 
                        'MarketSegment', 
                        'Agent', 
                        'Company']

# Preprocessing pipeline for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features[:-2]),  # Apply OHE to other categorical features
        ('logit_odds', LogitOddsEncoder(), ['Agent', 'Company']) 
    ])


In [36]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, make_scorer
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


# Definisci tutti i modelli da valutare
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Bagging': BaggingClassifier(random_state=42),
    'Bayesian': GaussianNB(),
    'XGBoost': XGBClassifier(random_state=42)
}

def confusion_matrix_scorer(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm

# Define scoring metrics for cross_validate
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc',
    # 'confusion_matrix': make_scorer(confusion_matrix_scorer)  # Add confusion matrix to scoring
}

# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Lista per salvare i risultati
all_results = []

# Esegui TimeSeriesSplit per ognuno dei modelli
for model_name, model in models.items():

    # Create a pipeline with SMOTE and the model
    pipeline = Pipeline(steps=[
        ('feature_engineering', FeatureEngineering()),  # Feature engineering
        ('preprocessing', preprocessor),  # Preprocessing (StandardScaler, OneHotEncoder, Logit-Odds)
        ('smote', SMOTE(random_state=42)),  # Apply SMOTE to balance the classes
        ('model', model)  # The current model
    ])
    
    # Perform cross-validation with the pipeline and TimeSeriesSplit
    cv_results = cross_validate(pipeline, X, y, cv=tscv, scoring=scoring, return_train_score=False)
    
    # Store the results
    all_results.append({
        'Model': model_name,
        'Accuracy': np.mean(cv_results['test_accuracy']),
        'Precision': np.mean(cv_results['test_precision']),
        'Recall': np.mean(cv_results['test_recall']),
        'F1 Score': np.mean(cv_results['test_f1']),
        'ROC AUC': np.mean(cv_results['test_roc_auc']),
        # 'Confusion Matrices': cv_results['test_confusion_matrix']  # This will store the confusion matrices for each fold
    })

results_df = pd.DataFrame(all_results)

results_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,Random Forest,0.777193,0.668861,0.435248,0.523313,0.767072
1,Logistic Regression,0.720568,0.509742,0.667334,0.577228,0.778349
2,AdaBoost,0.741796,0.541871,0.626248,0.580634,0.778406
3,Decision Tree,0.709694,0.491818,0.456147,0.472566,0.633218
4,K-Nearest Neighbors,0.703282,0.48673,0.573719,0.525827,0.713301
5,Bagging,0.763257,0.6232,0.419518,0.499203,0.729614
6,Bayesian,0.678608,0.506249,0.421995,0.419314,0.706956
7,XGBoost,0.792778,0.712859,0.453583,0.549741,0.79396
