### Dataset available [here](https://www.sciencedirect.com/science/article/pii/S2352340918315191?via%3Dihub)

# Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score

# load data
data_h1 = pd.read_csv('./dataset/H1.csv')
# data_h2 = pd.read_csv('./dataset/H2.csv')
# data = pd.concat([data_h1, data_h2], ignore_index=True)
data = data_h1

data.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,0,737,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,0,7,2015,July,27,1,0,1,1,0,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,0,13,2015,July,27,1,0,1,1,0,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,0,14,2015,July,27,1,0,2,2,0,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


# Data Cleaning

In [2]:
# Remove duplicates
data_cleaned = data.drop_duplicates()

# Convert ArrivalDateMonth to numerical
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
data_cleaned.loc[:, 'ArrivalDateMonth'] = data_cleaned['ArrivalDateMonth'].apply(lambda x: months.index(x) + 1)

data_cleaned.shape

(33968, 31)

# Data Splitting

Instead of making a stratified split by the outcome (IsCanceled) based on the dimension of the booking creation date, an approach usually employed in time series is applied, convenience splitting. Order the dataset by arrival date of bookings and create blocks of "month/year". Merge a 75% stratified split of each block into a training dataset and the remaining 25% into a test dataset.

In [None]:
from sklearn.model_selection import train_test_split, TimeSeriesSplit

def split_data(data):
    """
    Perform stratified split on the data

    Parameters
    ----------
    data : pd.DataFrame
        The data to split

    Returns
    -------
    X_train : pd.DataFrame
        The training data
    X_test : pd.DataFrame   
        The testing data
    y_train : pd.Series
        The training target
    y_test : pd.Series
        The testing target
    """
    
    data = data.sort_values(by=['ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateDayOfMonth'])
    
    # y = data['IsCanceled']
    # X = data.drop(columns=['IsCanceled'])
    
    # X['YearMonth'] = X['ArrivalDateYear'].astype(str) + '-' + X['ArrivalDateMonth'].astype(str)
    
    data['YearMonth'] = data['ArrivalDateYear'].astype(str) + '-' + data['ArrivalDateMonth'].astype(str)

    # X_train_list, X_test_list, y_train_list, y_test_list = [], [], [], []
    
    # # Loop over each block and perform stratified split
    # for block in X['YearMonth'].unique():
    #     X_block = X[X['YearMonth'] == block].drop(columns=['YearMonth'])
    #     y_block = y[X['YearMonth'] == block]
        
    #     X_train_block, X_test_block, y_train_block, y_test_block = train_test_split(
    #         X_block, y_block, test_size=0.25, stratify=y_block, random_state=42)
        
    #     X_train_list.append(X_train_block)
    #     X_test_list.append(X_test_block)
    #     y_train_list.append(y_train_block)
    #     y_test_list.append(y_test_block)
    
    # X_train = pd.concat(X_train_list)
    # X_test = pd.concat(X_test_list)
    # y_train = pd.concat(y_train_list)
    # y_test = pd.concat(y_test_list)

    train_list, test_list = [], []
    
    # Loop over each block and perform stratified split
    for block in data['YearMonth'].unique():
        block_data = data[data['YearMonth'] == block]
        
        train_block, test_block = train_test_split(
            block_data, test_size=0.25, stratify=block_data['IsCanceled'], random_state=42)
        
        train_list.append(train_block)
        test_list.append(test_block)

    train_data = pd.concat(train_list)
    test_data = pd.concat(test_list)

    # return X_train, X_test, y_train, y_test
    return train_data, test_data

def split_data_time_series(data, target_column, n_splits=5):
    """
    Perform time series split on the data

    Parameters
    ----------
    data : pd.DataFrame
        The data to split
    target_column : str
        The target column
    n_splits : int
        The number of splits to perform

    Returns
    -------
    X_train : pd.DataFrame
        The training data
    X_test : pd.DataFrame
        The testing data
    y_train : pd.Series
        The training target
    y_test : pd.Series  
        The testing target
    """
    data = data.sort_values(by=['ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateDayOfMonth'])
    
    X = data.drop(target_column, axis=1)
    y = data[target_column]
    
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Ensure stratification by checking the distribution of the target variable
        if np.all(np.bincount(y_train) > 0) and np.all(np.bincount(y_test) > 0):
            break
    
    return X_train, X_test, y_train, y_test


# Feature Selection and Engineering

In [3]:
# from sklearn.base import BaseEstimator, TransformerMixin
# import datetime
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.impute import SimpleImputer

# Custom transformer for feature engineering
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        # self.categorical_features = categorical_features
        # self.cardinality_threshold = cardinality_threshold
        # self.encoders = {}
        # self.imputers = {}

        # self.value_counts = {}
        # self.logit_odds_map = {}
        self.cat_imputer = SimpleImputer(strategy='most_frequent')  # For categorical columns
        self.num_imputer = SimpleImputer(strategy='mean')  # For numerical columns
        self.encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # One-hot encoder
        self.scaler = StandardScaler()  
    
    def fit(self, X, y=None):
        # Fit any transformers (e.g., imputer learns most frequent values, scaler learns the mean and std)
        X_cat = X[['Meal', 'Country', 'MarketSegment', 'DistributionChannel', 'CustomerType', 'ReservationStatus']]
        X_num = X[['LeadTime', 'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children', 'ADR']]
        
        # self.fit_categorical_encodings(X)

        # Fit the transformers
        self.cat_imputer.fit(X_cat)
        self.num_imputer.fit(X_num)
        self.encoder.fit(self.cat_imputer.transform(X_cat))
        self.scaler.fit(self.num_imputer.transform(X_num))
        return self
    
    def transform(self, X):
        # X = X.copy()

        # # Replace LeadTime with LiveTime
        # X['LiveTime'] = X.apply(self.calculate_livetime, axis=1)
        
        # # Replace ADR with ADRThirdQuartileDeviation
        # X = self.calculate_adr_third_quartile_deviation(X)
        
        # # Apply categorical encodings
        # X = self.apply_categorical_encodings(X)
        
        # # Drop the original columns
        # columns_to_drop = ['LeadTime', 'ADR', 'ArrivalDateMonth', 'ArrivalDateDayOfMonth', 
        #                    'ArrivalDateWeekNumber', 'ArrivalDateYear', 'Country', "AssignedRoomType",
        #                    'RequiredCarParkingSpaces', 'ReservedRoomType']
        # columns_to_drop.extend(self.categorical_features)
        # X.drop(columns=columns_to_drop, inplace=True)
         
        # Apply transformations to the data
        X_cat = X[['Meal', 'Country', 'MarketSegment', 'DistributionChannel', 'CustomerType', 'ReservationStatus']]
        X_num = X[['LeadTime', 'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children', 'ADR']]

       # Apply transformations
        X_cat_imputed = self.cat_imputer.transform(X_cat)
        X_num_imputed = self.num_imputer.transform(X_num)
        X_cat_encoded = self.encoder.transform(X_cat_imputed)
        X_num_scaled = self.scaler.transform(X_num_imputed)
        
        # Return combined transformed data
        return np.hstack((X_cat_encoded, X_num_scaled))

        
        return X
    
    def calculate_livetime(self, row):
        arrival_date = datetime.date(row['ArrivalDateYear'], row['ArrivalDateMonth'], row['ArrivalDateDayOfMonth'])
        booking_date = arrival_date - datetime.timedelta(days=row['LeadTime'])
        reservation_status_date = datetime.datetime.strptime(row['ReservationStatusDate'], '%Y-%m-%d').date()

        if row['ReservationStatus'] == 'Check-Out' or row['ReservationStatus'] == 'No-Show':
            return row['LeadTime']
        else:
            # If the reservation is canceled, the live time is the difference between the booking date and the cancel date
            return (reservation_status_date - booking_date).days
    
    def calculate_adr_third_quartile_deviation(self, X):
        # Group by the required factors
        grouped = X.groupby(['DistributionChannel', 'ReservedRoomType', 'ArrivalDateWeekNumber', 'ArrivalDateYear'])

        # Calculate the third quartile for each group
        adr_third_quartile = grouped['ADR'].transform(lambda x: x.quantile(0.75))

        # Calculate the deviation
        X['ADRThirdQuartileDeviation'] = X['ADR'] / adr_third_quartile

        # Handle cases where third quartile is 0 (unlikely, but possible)
        X['ADRThirdQuartileDeviation'] = X['ADRThirdQuartileDeviation'].fillna(0)

        return X
    
    def fit_categorical_encodings(self, X):
        for feature in self.categorical_features:
            # Calculate value counts and keep only those above the threshold
            value_counts = X[feature].value_counts(normalize=True)
            frequent_categories = value_counts[value_counts >= self.cardinality_threshold].index.tolist()
            
            # Fit OneHotEncoder for frequent categories
            onehot = OneHotEncoder(handle_unknown='ignore')
            onehot.fit(X[feature].values.reshape(-1, 1))
            self.encoders[feature] = onehot
            
            # Calculate logit-odds
            feature_grouped = pd.DataFrame({
                'total': X.groupby(feature).size(),
                'positive': X.groupby(feature)['IsCanceled'].sum()
            })
            feature_grouped['negative'] = feature_grouped['total'] - feature_grouped['positive']
            feature_grouped['logit_odds'] = np.log((feature_grouped['positive'] + 0.5) / (feature_grouped['negative'] + 0.5))
            
            # Fit imputer for logit-odds
            imputer = SimpleImputer(strategy='median')
            imputer.fit(feature_grouped['logit_odds'].values.reshape(-1, 1))
            self.imputers[feature] = imputer

            # Store value counts and logit-odds map for later use
            self.value_counts[feature] = value_counts
            self.logit_odds_map[feature] = feature_grouped['logit_odds'].to_dict()

    def apply_categorical_encodings(self, X):
        for feature in self.categorical_features:
            # Apply OneHotEncoder
            onehot_encoded = self.encoders[feature].transform(X[feature].values.reshape(-1, 1)).toarray()
            onehot_columns = [f"{feature}_onehot_{i}" for i in range(onehot_encoded.shape[1])]

            # Ensure the number of columns match
            if onehot_encoded.shape[1] == len(onehot_columns):
                onehot_df = pd.DataFrame(onehot_encoded, columns=onehot_columns, index=X.index)
                X = pd.concat([X, onehot_df], axis=1)
            else:
                raise ValueError(f"Mismatch in number of one-hot encoded columns for feature {feature}")
            
            # Apply logit-odds encoding
            # logit_odds = self.imputers[feature].transform(X[feature].map(self.logit_odds_map[feature]).values.reshape(-1, 1))
            # X[f"{feature}_logit_odds"] = logit_odds
            
            # Calculate prevalence
            X[f"{feature}_prevalence"] = X[feature].map(self.value_counts[feature])
        
        return X

In [None]:
def preprocess_data(data, categorical_features):
    
    # 1. Perform time-series stratified split
    # X_train, X_test, y_train, y_test = split_data(data)
    # X_train, X_test, y_train, y_test = split_data_time_series(data, 'IsCanceled', n_splits=5)
    train_data, test_data = split_data(data)

    # 2. Feature Engineering
    feature_engineer = FeatureEngineering(categorical_features)
    feature_engineer.fit(train_data)
    train_data_transformed = feature_engineer.transform(train_data)
    test_data_transformed = feature_engineer.transform(test_data)
    
    # 3. Separate features and target
    y_train = train_data_transformed['IsCanceled']
    X_train = train_data_transformed.drop(columns=['IsCanceled'])
    
    y_test = test_data_transformed['IsCanceled']
    X_test = test_data_transformed.drop(columns=['IsCanceled'])

    return X_train, X_test, y_train, y_test


In [None]:
categorical_features = ['Hotel', 
                        'Meal', 
                        'MarketSegment', 
                        'DistributionChannel', 
                        'DepositType', 
                        'CustomerType', 
                        'ReservationStatus',
                        'ReservationStatusDate',
                        'Agent',
                        'Company']

X_train, X_test, y_train, y_test = preprocess_data(data_cleaned, categorical_features)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

def create_model_pipeline(classifier):
    return Pipeline([
        ('feature_engineering', FeatureEngineering(categorical_features)),
        ('classifier', classifier)
    ])

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Splitting the cleaned data for pipeline training
X = data_cleaned.drop(columns='IsCanceled')
y = data_cleaned['IsCanceled']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating the full pipeline with feature engineering and model
pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineering()),  # Feature engineering step
    ('model', RandomForestClassifier())  # You can replace this with any model
])

# Fitting the pipeline to the training data
pipeline.fit(X_train, y_train)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Evaluate with cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validated accuracy: {scores.mean():.4f}')


Cross-validated accuracy: 1.0000
