In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.utils.class_weight import compute_class_weight

# Load data sheet
df = pd.read_csv('../data/results/hotel_booking_cancellation_prediction_one_hot_deposit.csv')

In [2]:
# Convert the months in 'arrival_date' to numerical values
df['arrival_date_month'] = pd.to_datetime(df['arrival_date']).dt.month

# Define the X 
X = df.drop(columns=['is_canceled', 'arrival_date', 'stays_in_weeks_nights',
                      'children', 'babies'])


# Target variable
y = df['is_canceled']
accuracy_list = []


In [3]:
# Split the data 80% train/ 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


# Scale 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Create the model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
   BatchNormalization(),
   Dropout(0.4),

    tf.keras.layers.Dense(128, activation='relu'),
   BatchNormalization(),
   Dropout(0.4),
   
    tf.keras.layers.Dense(64, activation='relu'),
   BatchNormalization(),
   Dropout(0.3),

   tf.keras.layers.Dense(32, activation='relu'),
   BatchNormalization(),
   Dropout(0.2),

   tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC(name='auc'), 
                        tf.keras.metrics.Recall(name='recall'),
                        'accuracy'])

# Calculate the class weight
neg, pos = np.bincount(y_train)
total = neg + pos
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}


# Train the model
early_stop = EarlyStopping(
    monitor='val_auc', 
    patience=50,
    verbose=1,
    mode='max',
    restore_best_weights=True
)

history = model.fit(
    X_train_scaled, 
    y_train,
    epochs=500,
    batch_size=256,
    validation_split=0.2,
    class_weight=class_weight,
    callbacks=[early_stop]
)

# Evaluation
test_predictions = model.predict(X_test_scaled)


report = classification_report(y_test, test_predictions > 0.5, output_dict=True)



accuracy_list.append(report['accuracy'])

print("Classification report: ", classification_report(y_test, test_predictions > 0.5))



Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.5360 - auc: 0.5632 - loss: 0.8025 - recall: 0.5775 - val_accuracy: 0.7641 - val_auc: 0.8017 - val_loss: 0.5984 - val_recall: 0.6248
Epoch 2/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6504 - auc: 0.7303 - loss: 0.6370 - recall: 0.7156 - val_accuracy: 0.7876 - val_auc: 0.8455 - val_loss: 0.5583 - val_recall: 0.7002
Epoch 3/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7076 - auc: 0.7845 - loss: 0.5732 - recall: 0.7121 - val_accuracy: 0.8047 - val_auc: 0.8676 - val_loss: 0.5127 - val_recall: 0.7152
Epoch 4/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7307 - auc: 0.8062 - loss: 0.5361 - recall: 0.7156 - val_accuracy: 0.8186 - val_auc: 0.8813 - val_loss: 0.4752 - val_recall: 0.7387
Epoch 5/500
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/ste