In [1]:
# Load the datasets
import pandas as pd

# Load both H1 (Resort Hotel) and H2 (City Hotel)
data_h1 = pd.read_csv('dataset/H1.csv')
data_h2 = pd.read_csv('dataset/H2.csv')

# Concatenate datasets (Resort and City hotel)
data_combined = pd.concat([data_h1, data_h2], ignore_index=True)

# Initial data inspection
data_combined.info(), data_combined.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 31 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   IsCanceled                   119390 non-null  int64  
 1   LeadTime                     119390 non-null  int64  
 2   ArrivalDateYear              119390 non-null  int64  
 3   ArrivalDateMonth             119390 non-null  object 
 4   ArrivalDateWeekNumber        119390 non-null  int64  
 5   ArrivalDateDayOfMonth        119390 non-null  int64  
 6   StaysInWeekendNights         119390 non-null  int64  
 7   StaysInWeekNights            119390 non-null  int64  
 8   Adults                       119390 non-null  int64  
 9   Children                     119386 non-null  float64
 10  Babies                       119390 non-null  int64  
 11  Meal                         119390 non-null  object 
 12  Country                      118902 non-null  object 
 13 

(None,
    IsCanceled  LeadTime  ArrivalDateYear ArrivalDateMonth  \
 0           0       342             2015             July   
 1           0       737             2015             July   
 2           0         7             2015             July   
 3           0        13             2015             July   
 4           0        14             2015             July   
 
    ArrivalDateWeekNumber  ArrivalDateDayOfMonth  StaysInWeekendNights  \
 0                     27                      1                     0   
 1                     27                      1                     0   
 2                     27                      1                     0   
 3                     27                      1                     0   
 4                     27                      1                     0   
 
    StaysInWeekNights  Adults  Children  ...      DepositType        Agent  \
 0                  0       2       0.0  ...  No Deposit              NULL   
 1               

In [2]:
# Step 1: Remove duplicates
data_cleaned = data_combined.drop_duplicates()

# Step 2: Convert 'ArrivalDateMonth' to numerical (January = 1, February = 2, etc.)
months = ['January', 'February', 'March', 'April', 'May', 'June', 
          'July', 'August', 'September', 'October', 'November', 'December']
data_cleaned.loc[:, 'ArrivalDateMonth'] = data_cleaned['ArrivalDateMonth'].apply(lambda x: months.index(x) + 1)

# Step 3: Handle missing values by filling with reasonable defaults
# 'Children' missing values can be filled with 0
data_cleaned.fillna({'Children': 0}, inplace=True)

# 'Country' - Replace missing values with 'Unknown'
data_cleaned.fillna({'Country': 'Unknown'}, inplace=True)

# 'Agent' and 'Company' - Replace 'NULL' with 'Unknown' for these fields
data_cleaned['Agent'].replace('NULL', 'Unknown', inplace=True)
data_cleaned['Company'].replace('NULL', 'Unknown', inplace=True)

# Inspect the cleaned data
data_cleaned.info(), data_cleaned.head()


<class 'pandas.core.frame.DataFrame'>
Index: 87396 entries, 0 to 119389
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   IsCanceled                   87396 non-null  int64  
 1   LeadTime                     87396 non-null  int64  
 2   ArrivalDateYear              87396 non-null  int64  
 3   ArrivalDateMonth             87396 non-null  object 
 4   ArrivalDateWeekNumber        87396 non-null  int64  
 5   ArrivalDateDayOfMonth        87396 non-null  int64  
 6   StaysInWeekendNights         87396 non-null  int64  
 7   StaysInWeekNights            87396 non-null  int64  
 8   Adults                       87396 non-null  int64  
 9   Children                     87396 non-null  float64
 10  Babies                       87396 non-null  int64  
 11  Meal                         87396 non-null  object 
 12  Country                      87396 non-null  object 
 13  MarketSegment       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned.fillna({'Children': 0}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned.fillna({'Country': 'Unknown'}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned['Agent'].replace('NULL', 'Unknown', inplace=True)
A value is trying to be set 

(None,
    IsCanceled  LeadTime  ArrivalDateYear ArrivalDateMonth  \
 0           0       342             2015                7   
 1           0       737             2015                7   
 2           0         7             2015                7   
 3           0        13             2015                7   
 4           0        14             2015                7   
 
    ArrivalDateWeekNumber  ArrivalDateDayOfMonth  StaysInWeekendNights  \
 0                     27                      1                     0   
 1                     27                      1                     0   
 2                     27                      1                     0   
 3                     27                      1                     0   
 4                     27                      1                     0   
 
    StaysInWeekNights  Adults  Children  ...      DepositType        Agent  \
 0                  0       2       0.0  ...  No Deposit              NULL   
 1               

In [3]:
# Step 4: Convenience splitting
# Sort data by 'ArrivalDateYear' and 'ArrivalDateMonth'
data_sorted = data_cleaned.sort_values(by=['ArrivalDateYear', 'ArrivalDateMonth'])

# Split the sorted data: 75% for training and 25% for testing
train_size = int(0.75 * len(data_sorted))
train_data = data_sorted[:train_size]
test_data = data_sorted[train_size:]

# Separate the features (X) and the target (y) for both training and test sets
X_train = train_data.drop(columns='IsCanceled')
y_train = train_data['IsCanceled']

X_test = test_data.drop(columns='IsCanceled')
y_test = test_data['IsCanceled']

# Output the size of the train and test sets
X_train.shape, y_train.shape, X_test.shape, y_test.shape


((65547, 30), (65547,), (21849, 30), (21849,))

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.adr_grouped = None
        pass
    
    def fit(self, X, y=None):        
        return self
    
    def transform(self, X):
        # Keep 'ArrivalDateMonth' for grouping but remove other date-related columns
        X = X.drop(columns=['ArrivalDateYear', 
                            'ArrivalDateWeekNumber', 
                            'ArrivalDateDayOfMonth', 
                            'ReservationStatus', 
                            'ReservationStatusDate'])
        
        # Clean data to remove any leading/trailing spaces
        X['ReservedRoomType'] = X['ReservedRoomType'].str.strip()
        X['DistributionChannel'] = X['DistributionChannel'].str.strip()

        # Create 'LiveTime' feature
        X['LiveTime'] = X['LeadTime']

        # Compute quartiles if not already computed (i.e., for the training set)
        if self.adr_grouped is None:
            self.adr_grouped = X.groupby(['DistributionChannel', 'ReservedRoomType', 'ArrivalDateMonth'])['ADR'].quantile(0.75)
        
        # Calculate ADRThirdQuartileDeviation
        X['ADRThirdQuartileDeviation'] = X.apply(
            lambda row: row['ADR'] / self.adr_grouped.get(
                (row['DistributionChannel'], row['ReservedRoomType'], row['ArrivalDateMonth']), 1),
            axis=1
        )
        
        # Drop 'LeadTime', 'ADR', and 'ArrivalDateMonth' (after computation)
        X = X.drop(columns=['LeadTime', 'ADR', 'ArrivalDateMonth'])
        
        return X


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Define preprocessing for numerical and categorical columns
numerical_features = ['LiveTime', 'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children', 'Babies', 'ADRThirdQuartileDeviation']
categorical_features = ['Meal', 'Country', 'MarketSegment', 'DistributionChannel', 'Agent', 'Company', 'CustomerType']

# Preprocessing pipeline for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Final pipeline: Feature Engineering -> Preprocessing -> RandomForestClassifier
pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineering()),           # Feature engineering 
    ('preprocessing', preprocessor),                         # Preprocessing pipeline
    ('model', RandomForestClassifier(random_state=42))       # Random forest model
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model using different metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print out the metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# Print out the confusion matrix
print('Confusion Matrix:')
conf_matrix

  lambda row: row['ADR'] / self.adr_grouped.get(


Accuracy: 0.7076
Precision: 0.6095
Recall: 0.4189
F1 Score: 0.4965
Confusion Matrix:


array([[12311,  2018],
       [ 4370,  3150]])