In [1]:
# Load the datasets
import pandas as pd

# Load both H1 (Resort Hotel) and H2 (City Hotel)
data_h1 = pd.read_csv('dataset/H1.csv')
data_h1['Hotel'] = 'H1'
data_h2 = pd.read_csv('dataset/H2.csv')
data_h2['Hotel'] = 'H2'

# Concatenate datasets (Resort and City hotel)
data_combined = pd.concat([data_h1, data_h2], ignore_index=True)

# Initial data inspection
data_combined.head()

data_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   IsCanceled                   119390 non-null  int64  
 1   LeadTime                     119390 non-null  int64  
 2   ArrivalDateYear              119390 non-null  int64  
 3   ArrivalDateMonth             119390 non-null  object 
 4   ArrivalDateWeekNumber        119390 non-null  int64  
 5   ArrivalDateDayOfMonth        119390 non-null  int64  
 6   StaysInWeekendNights         119390 non-null  int64  
 7   StaysInWeekNights            119390 non-null  int64  
 8   Adults                       119390 non-null  int64  
 9   Children                     119386 non-null  float64
 10  Babies                       119390 non-null  int64  
 11  Meal                         119390 non-null  object 
 12  Country                      118902 non-null  object 
 13 

In [10]:
# data_cleaned = data_combined.drop_duplicates()
data_cleaned = data_combined.copy()

# Convert categorical month to numerical 1-12
months = ['January', 'February', 'March', 'April', 'May', 'June', 
          'July', 'August', 'September', 'October', 'November', 'December']
data_cleaned['ArrivalDateMonth'] = data_cleaned['ArrivalDateMonth'].apply(lambda x: months.index(x) + 1)

# 'Children' missing values can be filled with 0
data_cleaned['Children'] = data_cleaned['Children'].fillna(0)

# Replace SC with Undefined in Meal column
data_cleaned['Meal'] = data_cleaned['Meal'].str.strip()
data_cleaned['Meal'] = data_cleaned['Meal'].replace('SC', 'Undefined')

# deletion on rows that have missing distribution_channel, market_segment feature values.
data_cleaned = data_cleaned.dropna(subset=['DistributionChannel', 'MarketSegment'])
data_cleaned = data_cleaned[data_cleaned['DistributionChannel'] != 'Undefined']

data_cleaned = data_cleaned.drop(columns=['Country'])

# Inspect the cleaned data
data_cleaned.head()

ReservationStatus
Check-Out    75165
Canceled     43013
No-Show       1207
Name: count, dtype: int64

In [3]:
data_cleaned.sort_values(by=['ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateDayOfMonth'], inplace=True)
X = data_cleaned.drop(columns='IsCanceled')
y = data_cleaned['IsCanceled']

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime, timedelta

class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.adr_grouped = None
        pass
    
    def fit(self, X, y=None):        
        # Calculate the 75th percentile of ADR for each group
        self.adr_grouped = X.groupby(['DistributionChannel', 'ReservedRoomType', 'ArrivalDateYear', 'ArrivalDateMonth'])['ADR'].quantile(0.75)

        return self
    
    def transform(self, X):
        X = X.drop(columns=['ArrivalDateWeekNumber',   
                            'AssignedRoomType',
                            'RequiredCarParkingSpaces'])

        # Create 'LiveTime' feature
        X['LiveTime'] = X.apply(lambda row: self.calculate_livetime(row), axis=1)

        # Clean data to remove any leading/trailing spaces
        X['ReservedRoomType'] = X['ReservedRoomType'].str.strip()
        X['DistributionChannel'] = X['DistributionChannel'].str.strip()
        
        # Calculate ADRThirdQuartileDeviation
        X['ADRThirdQuartileDeviation'] = X.apply(
            lambda row: row['ADR'] / self.adr_grouped.get(
                (row['DistributionChannel'], row['ReservedRoomType'], row['ArrivalDateYear'], row['ArrivalDateMonth']), 1),
            axis=1
        )
        
        X = X.drop(columns=['LeadTime', 
                            'ADR', 
                            'ArrivalDateYear',
                            'ArrivalDateMonth', 
                            'ArrivalDateDayOfMonth', 
                            'ReservationStatus', 
                            'ReservationStatusDate',
                            'ReservedRoomType'])
        
        return X

    def calculate_livetime(self, row):
        current_date = datetime.now()
        arrival_date = datetime(row['ArrivalDateYear'], row['ArrivalDateMonth'], row['ArrivalDateDayOfMonth'])
        booking_date = arrival_date - timedelta(days=row['LeadTime'])
        
        if row['ReservationStatus'] == 'Check-Out':  # "A" type (effective bookings)
            return row['LeadTime']
        elif row['ReservationStatus'] == 'Canceled' or row['ReservationStatus'] == 'No-Show':  # "B" type (canceled bookings)
            reservation_status_date = datetime.strptime(row['ReservationStatusDate'], "%Y-%m-%d")
            return (reservation_status_date - booking_date).days
        else:  # "C" type (future bookings)
            return (current_date - booking_date).days


In [5]:
import numpy as np

class LogitOddsEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, target_col='IsCanceled', min_frequency=0.02, smoothing=1e-6):
        self.target_col = target_col
        self.min_frequency = min_frequency
        self.smoothing = smoothing
        self.logit_odds_map = {}
        self.global_mean_logit_odds = None
    
    def fit(self, X, y=None):
        # Store global logit odds as a fallback for unseen categories
        global_mean = y.mean()
        self.global_mean_logit_odds = np.log(global_mean / (1 - global_mean))

        for col in X.columns:
            counts = X[col].value_counts(normalize=True)
            common_levels = counts[counts >= self.min_frequency].index
            group_data = X[[col]].copy()
            group_data[self.target_col] = y
            
            # Calculate logit odds with proper handling of 0 and 1 values
            logit_odds = group_data.groupby(col)[self.target_col].mean().apply(
                lambda x: np.log(np.clip(x, self.smoothing, 1 - self.smoothing) / 
                                 (1 - np.clip(x, self.smoothing, 1 - self.smoothing)))
            )
            self.logit_odds_map[col] = logit_odds.reindex(common_levels).fillna(self.global_mean_logit_odds)
        return self
    
    def transform(self, X):
        X_encoded = X.copy()
        for col in X.columns:
            # Map the logit odds to the column, filling with the global mean logit odds if not found
            X_encoded[col] = X[col].map(self.logit_odds_map[col]).fillna(self.global_mean_logit_odds)
        return X_encoded



In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Define preprocessing for numerical and categorical columns
numerical_features = ['LiveTime', 
                      'StaysInWeekendNights', 
                      'StaysInWeekNights', 
                      'Adults', 
                      'Children', 
                      'Babies', 
                      'ADRThirdQuartileDeviation', 
                      'IsRepeatedGuest', 
                      'PreviousCancellations', 
                      'PreviousBookingsNotCanceled', 
                      'BookingChanges', 
                      'DaysInWaitingList', 
                      'TotalOfSpecialRequests']
categorical_features = ['DepositType', 
                        'DistributionChannel', 
                        'CustomerType', 
                        'Meal', 
                        'MarketSegment', 
                        'Hotel',
                        'Agent', 
                        'Company']

# Preprocessing pipeline for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features[:-2]),  # Apply OHE to other categorical features
        ('logit_odds', LogitOddsEncoder(), ['Agent', 'Company']) 
    ])


In [7]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, make_scorer
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


# Definisci tutti i modelli da valutare
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Bagging': BaggingClassifier(random_state=42),
    'Bayesian': GaussianNB(),
    'XGBoost': XGBClassifier(random_state=42)
}

def confusion_matrix_scorer(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm

# Define scoring metrics for cross_validate
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc',
    # 'confusion_matrix': make_scorer(confusion_matrix_scorer)  # Add confusion matrix to scoring
}

# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Lista per salvare i risultati
all_results = []

# Esegui TimeSeriesSplit per ognuno dei modelli
for model_name, model in models.items():

    # Create a pipeline with SMOTE and the model
    pipeline = Pipeline(steps=[
        ('feature_engineering', FeatureEngineering()),  # Feature engineering
        ('preprocessing', preprocessor),  # Preprocessing (StandardScaler, OneHotEncoder, Logit-Odds)
        ('smote', SMOTE(random_state=42)),  # Apply SMOTE to balance the classes
        ('model', model)  # The current model
    ])
    
    # Perform cross-validation with the pipeline and TimeSeriesSplit
    cv_results = cross_validate(pipeline, X, y, cv=tscv, scoring=scoring, return_train_score=False)
    
    # Store the results
    all_results.append({
        'Model': model_name,
        'Accuracy': np.mean(cv_results['test_accuracy']),
        'Precision': np.mean(cv_results['test_precision']),
        'Recall': np.mean(cv_results['test_recall']),
        'F1 Score': np.mean(cv_results['test_f1']),
        'ROC AUC': np.mean(cv_results['test_roc_auc']),
        # 'Confusion Matrices': cv_results['test_confusion_matrix']  # This will store the confusion matrices for each fold
    })

results_df = pd.DataFrame(all_results)

results_df



Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,Random Forest,0.799176,0.835994,0.570319,0.676322,0.832236
1,Logistic Regression,0.792391,0.799127,0.597567,0.67977,0.828843
2,AdaBoost,0.801578,0.813556,0.611209,0.693573,0.838618
3,Decision Tree,0.734844,0.65187,0.604913,0.626979,0.7083
4,K-Nearest Neighbors,0.747037,0.67771,0.599467,0.634686,0.769474
5,Bagging,0.787013,0.793915,0.573484,0.664643,0.807102
6,Bayesian,0.642871,0.52078,0.683091,0.585712,0.743116
7,XGBoost,0.810283,0.844743,0.597217,0.697826,0.852476
