# Hotel Booking Prediction

Predict whether a user will book a hotel based on search data using a deep learning model. The dataset includes ~7M training samples and 300K test samples. The target is `is_booking` (binary), and the evaluation metric is ROC AUC.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tensorflow import keras
from tensorflow.keras import layers
import category_encoders as ce
from imblearn.over_sampling import SMOTE
import plotly.graph_objects as go
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

## Data Loading

In [2]:
# Load data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

## Preprocessing

In [3]:
# Drop irrelevant column
train = train.drop("user", axis=1)
test = test.drop("user", axis=1)

# Handle missing values in check-in/out dates
train = train.dropna(subset=["checkIn_date", "checkOut_date"])

# Fill missing destination_distance with mean by user_location_city and destination
mean_distances = train.groupby(["user_location_city", "destination"])["destination_distance"].mean().to_dict()
train["destination_distance"] = train.apply(
    lambda x: mean_distances.get((x["user_location_city"], x["destination"]), 0) 
    if pd.isna(x["destination_distance"]) else x["destination_distance"], axis=1
)
test["destination_distance"] = test.apply(
    lambda x: mean_distances.get((x["user_location_city"], x["destination"]), 0) 
    if pd.isna(x["destination_distance"]) else x["destination_distance"], axis=1
)

# Convert date columns to datetime
date_cols = ["search_date", "checkIn_date", "checkOut_date"]
for col in date_cols:
    train[col] = pd.to_datetime(train[col])
    test[col] = pd.to_datetime(test[col])

In [10]:
train["destination_distance"]=train["destination_distance"].fillna(0)
test["destination_distance"]=test["destination_distance"].fillna(0)

## Feature Engineering

In [4]:
# Create temporal features
train["stay_duration"] = (train["checkOut_date"] - train["checkIn_date"]).dt.days
train["days_to_checkin"] = (train["checkIn_date"] - train["search_date"]).dt.days
train["search_hour"] = train["search_date"].dt.hour
train["search_dayofweek"] = train["search_date"].dt.dayofweek
train["checkin_dayofweek"] = train["checkIn_date"].dt.dayofweek
train["search_month"] = train["search_date"].dt.month
train["checkin_month"] = train["checkIn_date"].dt.month

test["stay_duration"] = (test["checkOut_date"] - test["checkIn_date"]).dt.days
test["days_to_checkin"] = (test["checkIn_date"] - test["search_date"]).dt.days
test["search_hour"] = test["search_date"].dt.hour
test["search_dayofweek"] = test["search_date"].dt.dayofweek
test["checkin_dayofweek"] = test["checkIn_date"].dt.dayofweek
test["search_month"] = test["search_date"].dt.month
test["checkin_month"] = test["checkIn_date"].dt.month

# Additional features
train["is_long_stay"] = (train["stay_duration"] > 7).astype(int)
train["is_early_booking"] = (train["days_to_checkin"] > 30).astype(int)
train["is_weekend_checkin"] = (train["checkin_dayofweek"] >= 5).astype(int)
train["total_persons"] = train["n_adults"] + train["n_children"]
train["persons_per_room"] = train["total_persons"] / train["n_rooms"].replace(0, 1)

test["is_long_stay"] = (test["stay_duration"] > 7).astype(int)
test["is_early_booking"] = (test["days_to_checkin"] > 30).astype(int)
test["is_weekend_checkin"] = (test["checkin_dayofweek"] >= 5).astype(int)
test["total_persons"] = test["n_adults"] + test["n_children"]
test["persons_per_room"] = test["total_persons"] / test["n_rooms"].replace(0, 1)

# Drop original date and redundant columns
train = train.drop(["search_date", "checkIn_date", "checkOut_date", "n_adults", "n_children", 
                    "user_location_city", "destination", "hotel_market", "user_location_region"], axis=1)
test = test.drop(["search_date", "checkIn_date", "checkOut_date", "n_adults", "n_children", 
                  "user_location_city", "destination", "hotel_market", "user_location_region"], axis=1)

## Data Visualization

In [5]:
# Plot booking vs non-booking by search hour
booked = train[train["is_booking"] == 1]["search_hour"].value_counts().sort_index() / len(train[train["is_booking"] == 1])
not_booked = train[train["is_booking"] == 0]["search_hour"].value_counts().sort_index() / len(train[train["is_booking"] == 0])

fig = go.Figure(data=[
    go.Bar(name="Booked", x=booked.index, y=booked.values),
    go.Bar(name="Not Booked", x=not_booked.index, y=not_booked.values)
])
fig.update_layout(title="Search Hour Distribution", xaxis_title="Hour", yaxis_title="Percentage", barmode="group")
fig.write_json("search_hour.json")

In [7]:
fig.show()

## Feature Encoding and Scaling

In [6]:
# Numerical and categorical columns
num_cols = ["destination_distance", "stay_duration", "days_to_checkin", "search_count", 
            "total_persons", "n_rooms", "persons_per_room", "search_hour", "search_dayofweek", 
            "checkin_dayofweek", "search_month", "checkin_month"]
cat_cols = ["user_location_country", "channel", "destination_type", "hotel_continent", "hotel_country"]

# Scale numerical features
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

# Encode categorical features
binary_encoder = ce.BinaryEncoder(cols=cat_cols)
train_encoded = binary_encoder.fit_transform(train[cat_cols])
test_encoded = binary_encoder.transform(test[cat_cols])

train = train.drop(cat_cols, axis=1).join(train_encoded)
test = test.drop(cat_cols, axis=1).join(test_encoded)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True

In [9]:
train.isna().sum()

destination_distance       2537339
is_mobile                        0
is_package                       0
search_count                     0
n_rooms                          0
hotel_category                   0
is_booking                       0
stay_duration                    0
days_to_checkin                  0
search_hour                      0
search_dayofweek                 0
checkin_dayofweek                0
search_month                     0
checkin_month                    0
is_long_stay                     0
is_early_booking                 0
is_weekend_checkin               0
total_persons                    0
persons_per_room                 0
user_location_country_0          0
user_location_country_1          0
user_location_country_2          0
user_location_country_3          0
user_location_country_4          0
user_location_country_5          0
user_location_country_6          0
user_location_country_7          0
channel_0                        0
channel_1           

## Handle Imbalanced Data

In [11]:
# Apply SMOTE to balance classes
X = train.drop("is_booking", axis=1)
y = train["is_booking"]
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

MemoryError: Unable to allocate 2.11 GiB for an array with shape (6284974, 45) and data type float64

## Model Training

In [None]:
# Split data
X_train, X_valid, y_train, y_valid = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define model
model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation="relu", kernel_regularizer=keras.regularizers.l2(0.005)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.2),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

# Compile model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
              loss="binary_crossentropy",
              metrics=["accuracy"])

# Train model
history = model.fit(X_train, y_train,
                    batch_size=256,
                    epochs=10,
                    validation_data=(X_valid, y_valid),
                    callbacks=[keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)],
                    verbose=1)

## Model Evaluation

In [None]:
# Evaluate model
y_pred_proba = model.predict(X_valid, batch_size=512).flatten()
roc_auc = roc_auc_score(y_valid, y_pred_proba)
print(f"Validation ROC AUC: {roc_auc:.4f}")

## Prediction and Submission

In [None]:
# Predict on test set
test = test.drop("Unnamed: 0", axis=1, errors="ignore")
predictions = model.predict(test, batch_size=512).flatten()
submission = pd.DataFrame({"prediction": predictions})
submission.to_csv("submission.csv", index=False)

## Save Results

In [None]:
# Create result zip
import zipfile

def compress(file_names):
    with zipfile.ZipFile("result.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        for file_name in file_names:
            zf.write(file_name, file_name)

file_names = ["will_not_travel_again.ipynb", "submission.csv", "search_hour.json"]
compress(file_names)