In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ndcg_score
import xgboost as xgb

# Data

In [None]:
def load_dataset(filename: str) -> pd.DataFrame:
    """Read the CSV file at `filename` into a DataFrame using pandas' default parser settings."""
    dataset = pd.read_csv(filename)
    return dataset

# Raw training / test CSV files
training_file_raw = 'training_set_VU_DM.csv'
test_file_raw = 'test_set_VU_DM.csv'
# CSV files with the per-property statistics joined in; these are the files
# read back by the Models and Submission generator sections
training_file_stats = 'training_set_stats_VU_DM.csv'
test_file_stats = 'test_set_stats_VU_DM.csv'

In [None]:
# Load the raw training set and list its column names
df = load_dataset(training_file_raw)
print(df.columns)

#### Features
|Feature  |Type | Description |Potential predictor|
|:------- |:----|:------------|------------------:|
|srch_id                     |int      |ID of search/user| |
|date_time                   |Datetime |Time of search| |
|site_id                     |int      |ID of Expedia link (.com/.co.uk/.co.jp...)| |
|visitor_location_country_id |int      |ID of user's country| *|
|visitor_hist_starrating     |float    |mean star rating of the customer's hotel purchases| *|
|visitor_hist_adr_usd        |float    |mean price per night of the customer's hotel purchases| *|
|prop_country_id             |int      |ID of the hotel's country| *|
|prop_id                     |int      |ID of hotel| |
|prop_starrating             |int      |star rating of hotel| *|
|prop_review_score           |float    |review score of hotel (rounded to 0.5)| **|
|prop_brand_bool             |int      |part of major hotel chain (1) or not (0)| *|
|prop_location_score1        |float    |desirability score of the hotel's location (primary score)                   |                  ** |
|prop_location_score2        |float    |desirability score of the hotel's location (secondary score)                 |                  ** |
|prop_log_historical_price   |float    |log of mean price of hotel in the last trading period                        |                   |
|price_usd                   |float    |displayed price of the hotel                                                 |                  ** |
|promotion_flag              |int      |1 if hotel had a sale price promotion                                        |                  ** |
|srch_destination_id         |int      |ID of the searched destination                                               |                   |
|srch_length_of_stay         |int      |number of nights in the stay                                                 |                  * |
|srch_booking_window         |int      |days between search and stay start                                           |                  * |
|srch_adults_count           |int      |number of adults in the search                                               |                  * |
|srch_children_count         |int      |number of children in the search                                             |                  * |
|srch_room_count             |int      |number of rooms in the search                                                |                  * |
|srch_saturday_night_bool    |bool     |1 if the stay includes a Saturday night                                      |                   |
|srch_query_affinity_score   |float    |log probability a hotel will be clicked on the internet                      |                   |
|orig_destination_distance   |float    |physical distance between hotel and customer                                 |                   |
|random_bool                 |bool     |1 if results were shown in random order                                      |                   |
|comp1_rate                  |int      |price comparison vs. competitor 1 (-1: higher, 0: same, +1: lower)           |                   |
|comp1_inv                   |int      |availability vs. competitor 1 (+1: competitor unavailable, 0: both available)|                   |
|comp1_rate_percent_diff     |float    |absolute percentage price difference with competitor 1                       |                   |
|comp2_rate                  |int      |same as comp1_rate for competitor 2                                          |                   |
|comp2_inv                   |int      |same as comp1_inv for competitor 2                                           |                   |
|comp2_rate_percent_diff     |float    |same as comp1_rate_percent_diff for competitor 2                             |                   |
|...                         |...      |same structure for competitors 3 through 8                                   |                   |
|position                    |int      |rank of hotel in search results (training data only)                         |                   |
|click_bool                  |bool     |1 if user clicked on the hotel  (training data only)                         |                   |
|booking_bool                |bool     |1 if user booked the hotel      (training data only)                         |                   |
|gross_bookings_usd           |float    |actual value of the booking (includes taxes, fees, etc.) (training data only)|                   |    

In [None]:
def dataset_stats(df):
    """Print a summary of the size and missing-value structure of `df`.

    Reports the number of columns and rows, how many rows and columns contain
    at least one missing value and, for every column with missing values, the
    percentage of its entries that are present.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None; the summary is written to stdout.
    """
    total_observations = len(df)
    # Build the null mask once and reuse it for the row-wise and column-wise
    # counts instead of rescanning the whole frame for every printed line
    missing_mask = df.isnull()
    missing_per_feature = missing_mask.sum()
    features_with_missing = missing_per_feature[missing_per_feature > 0]

    print(f'Number of features: {len(df.columns)}')
    print(f'Number of observations: {total_observations}')
    print(f'Number of rows with missing values: {missing_mask.any(axis=1).sum()}')
    print(f'Number of columns with missing values: {len(features_with_missing)}')
    print('Percentage not-missing data for features with missing values:')
    for feature, n_missing in features_with_missing.items():
        print(f"{feature}: {100*(total_observations - n_missing)/total_observations:.2f}% not missing")

# Print the size / missing-value summary for the currently loaded frame
dataset_stats(df)

## Feature Engineering

In [None]:
# Reload the raw training set as the starting point for feature engineering
df = load_dataset(training_file_raw)

Column 'relevance' is created containing the NDCG@5 relevance scores as explained in the assignment.

In [None]:
# Graded relevance used for NDCG@5 at model evaluation:
# booked -> 5, clicked but not booked -> 1, everything else -> 0
was_clicked = df['click_bool'].eq(1)
was_booked = df['booking_bool'].eq(1)
df['relevance'] = was_clicked.astype('int64').mask(was_booked, 5)

Per property id (prop_id), the mean, standard deviation, and median are computed for each numerical feature available in the test file. This allows for the model to learn relative differences between searches.

In [None]:
# Statistics are computed for every numeric column except the grouping key (prop_id)
# and the columns that only exist in the training set
excluded_cols = ['prop_id', 'position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'relevance']
numeric_cols = df.select_dtypes(include='number').columns.drop(excluded_cols)

In [None]:
# Mean of every selected numeric column per property; columns get a '_mean' suffix
prop_means = df.groupby('prop_id')[numeric_cols].agg('mean').add_suffix('_mean')

In [None]:
# Standard deviation of every selected numeric column per property; columns get a '_std' suffix
prop_stds = df.groupby('prop_id')[numeric_cols].agg('std').add_suffix('_std')

In [None]:
# Median of every selected numeric column per property; columns get a '_median' suffix
prop_medians = df.groupby('prop_id')[numeric_cols].agg('median').add_suffix('_median')

In [None]:
# One row per prop_id holding all three statistics side by side (joined on the prop_id index)
prop_stats = prop_means.join(prop_stds).join(prop_medians)

In [None]:
# Attach the per-property statistics to every row; the left join keeps all original rows
df_with_stats = pd.merge(df, prop_stats, how='left', on='prop_id')

In [None]:
# Persist under training_file_stats, the exact path the Models section loads from;
# a hard-coded 'train_set_...' name would leave that path without a file
df_with_stats.to_csv(training_file_stats, index=False)

In [None]:
# The test file has to contain the test-set rows, not the training frame:
# attach the per-property statistics to the raw test set before writing it.
# Properties without statistics in prop_stats get NaN in the statistic columns.
# NOTE(review): this reuses the statistics computed from the training data — confirm
# that is the intended source rather than statistics recomputed on the test set.
test_with_stats = load_dataset(test_file_raw).merge(prop_stats, on='prop_id', how='left')
test_with_stats.to_csv(test_file_stats, index=False)

**Note:** the merge and the CSV export above are slow (35+ minutes for the training set).

# Models

### XGBoost rank (relevance score)

In [None]:
# Training data with per-property statistics, loaded without the date_time column
df = load_dataset(training_file_stats).drop(columns='date_time')

In [None]:
# Features are restricted to what the test file also offers: the training-only
# columns and the target (relevance) are removed from X
training_only_cols = ['position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'relevance']
y = df['relevance']
X = df.drop(columns=training_only_cols)

In [None]:
# Split on whole searches: all rows sharing a srch_id end up on the same side
# of the 80/20 split, so no search is divided between training and validation
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx, test_idx = next(gss.split(X, y, groups=df['srch_id']))

In [None]:
# Materialise both sides of the split from the positional indices
X_train = X.iloc[train_idx]
y_train = y.iloc[train_idx]
X_test = X.iloc[test_idx]
y_test = y.iloc[test_idx]

In [None]:
# Per-property statistics of training-only columns must never be used as features.
# errors='ignore': numeric_cols excludes these base columns when the statistics are
# built, so the derived columns may be absent and a strict drop would raise KeyError.
leaky_stat_cols = [
    f'{col}_{stat}'
    for col in ('position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'relevance')
    for stat in ('mean', 'std', 'median')
]
X_train_2 = X_train.drop(columns=leaky_stat_cols, errors='ignore')

In [None]:
# Per-property statistics of training-only columns must never be used as features.
# errors='ignore': numeric_cols excludes these base columns when the statistics are
# built, so the derived columns may be absent and a strict drop would raise KeyError.
leaky_stat_cols = [
    f'{col}_{stat}'
    for col in ('position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'relevance')
    for stat in ('mean', 'std', 'median')
]
X_test_2 = X_test.drop(columns=leaky_stat_cols, errors='ignore')

In [None]:
# Wrap features and relevance labels in XGBoost's DMatrix container
dtrain = xgb.DMatrix(data=X_train_2, label=y_train)
dtest = xgb.DMatrix(data=X_test_2, label=y_test)

In [None]:
# Set group info: one size per search, listed in the order the rows appear in the DMatrix
train_srch_ids = df['srch_id'].iloc[train_idx]
test_srch_ids = df['srch_id'].iloc[test_idx]
for srch_ids in (train_srch_ids, test_srch_ids):
    # Sizes of consecutive row blocks only describe the searches correctly if every
    # search forms a single contiguous run of rows (number of runs == number of ids)
    assert srch_ids.ne(srch_ids.shift()).sum() == srch_ids.nunique(), \
        'rows of the same srch_id must be contiguous'
# sort=False keeps the groups in row order instead of ordering them by srch_id value;
# grouping the id column alone also avoids copying the full feature frame
group_train = train_srch_ids.groupby(train_srch_ids, sort=False).size().to_numpy()
group_test = test_srch_ids.groupby(test_srch_ids, sort=False).size().to_numpy()
dtrain.set_group(group_train)
dtest.set_group(group_test)

In [None]:
# Learning-to-rank configuration (ranking as opposed to regression or binary classification)
params = dict(
    objective="rank:ndcg",
    tree_method="hist",
    device="cuda",            # train on the GPU
    eval_metric="ndcg@5",
)
# Data sets scored during training; the last entry drives early stopping
evals = [(dtrain, "train"), (dtest, "validation")]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=3000,      # upper bound only, chosen large so that early stopping ends training
    evals=evals,
    early_stopping_rounds=20,  # give up after 20 rounds without improvement (limits overfitting)
    verbose_eval=2,            # print the metrics every 2 rounds
)

In [None]:
# Save model to json file to avoid having to train again
# (this stores whatever `model` currently refers to)
model.save_model('xgboost_model_fifth_split.json')

In [None]:
preds = model.predict(dtest)

# Only the search id and the true relevance are needed next to the predictions
df_test = df[['srch_id', 'relevance']].iloc[test_idx].copy()
df_test['pred'] = preds

# Compute NDCG@5 per srch_id and take the average.
# k=5 already limits the score to the five highest-ranked predictions, so searches
# with fewer than five results are valid queries and stay in the average.
ndcg_scores = []
for srch_id, group in df_test.groupby('srch_id'):
    if len(group) < 2:
        # A single result cannot be ranked; recent scikit-learn versions raise
        # a ValueError for one-document input, so these searches are skipped
        continue
    true_relevance = group['relevance'].values.reshape(1, -1)
    predicted_scores = group['pred'].values.reshape(1, -1)
    ndcg_scores.append(ndcg_score(true_relevance, predicted_scores, k=5))
# Guard the empty case so np.mean does not emit a warning on an empty list
mean_ndcg = np.mean(ndcg_scores) if ndcg_scores else float('nan')
print(f"Mean NDCG@5: {mean_ndcg:.4f}")

### Results
all but date_time: 0.3821  
all but date_time + mean, median, std per prop_id of all features (200 rounds, 0.8/0.2 train/test): 0.4006  
all but date_time + mean, median, std per prop_id of all features (286 rounds, 0.8/0.2 train/test): 0.4035

### XGBoost categorical (yes/no booking)

In [None]:
# Features must not contain the target or anything derived from it:
# `relevance` is set to 5 exactly where booking_bool is 1, so keeping it in X leaks the label.
# errors='ignore': date_time is already removed when df is loaded for the ranking model,
# and the statistics of training-only columns may be absent from the file.
non_feature_cols = ['position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'relevance', 'date_time']
non_feature_cols += [
    f'{col}_{stat}'
    for col in ('position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'relevance')
    for stat in ('mean', 'std', 'median')
]
X = df.drop(columns=non_feature_cols, errors='ignore')
# booking_bool is used directly as the binary label (1 = booked), no encoding step is applied
y = df[['booking_bool']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
# Binary classifier on booking_bool; three metrics are reported for both data sets
params = dict(
    objective="binary:logistic",
    tree_method="hist",
    device="cuda",
    eval_metric=["error", "logloss", "auc"],
)
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]
n = 200  # maximum number of boosting rounds

model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=n,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=10,
)

In [None]:
# Turn predicted probabilities into hard labels at a 0.5 threshold and compare
# them with the labels stored in the validation DMatrix
y_true = dtest_reg.get_label()
y_proba = model.predict(dtest_reg)
y_pred = (y_proba > 0.5).astype(int)
print(y_pred[:10])
print(y_true[:10])

acc = accuracy_score(y_true, y_pred)
print("Accuracy:", acc)

for heading, table in (
    ("\nClassification Report:", classification_report(y_true, y_pred)),
    ("\nConfusion Matrix:", confusion_matrix(y_true, y_pred)),
):
    print(heading)
    print(table)

### Results
all but date_time: 0.9719390409870384

## Submission generator

In [None]:
# Restore a previously trained booster from disk instead of retraining.
# NOTE(review): this file name differs from the one written by the save cell
# ('xgboost_model_fifth_split.json') — confirm which model the submission should use.
model = xgb.Booster(model_file='xgboost_model.json')

In [None]:
# Keep only the columns the ranking model was trained on: remove date_time, the
# training-only columns and every statistic derived from them.
# errors='ignore': most of these columns are not expected in a test file (the relevance
# statistics, for instance, are excluded when the per-property statistics are built),
# so a strict drop would raise KeyError.
non_feature_cols = ['position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'relevance', 'date_time']
non_feature_cols += [
    f'{col}_{stat}'
    for col in ('position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'relevance')
    for stat in ('mean', 'std', 'median')
]
test_df = load_dataset(test_file_stats).drop(columns=non_feature_cols, errors='ignore')

In [None]:
# The prediction cell adds a 'pred' column to test_df; excluding it here keeps this
# cell re-runnable without feeding earlier predictions to the model as a feature
dtest = xgb.DMatrix(test_df.drop(columns='pred', errors='ignore'), enable_categorical=True)

In [None]:
# Score every test row and store the score next to its srch_id / prop_id
# (note: this adds a 'pred' column to test_df in place)
y_pred = model.predict(dtest)
test_df['pred'] = y_pred

In [None]:
# Submission order: searches ascending by srch_id, and within each search the
# property with the highest predicted score first
test_df_filtered = test_df.loc[:, ['srch_id', 'prop_id', 'pred']]
test_df_sorted = test_df_filtered.sort_values(['srch_id', 'pred'], ascending=[True, False])

In [None]:
# Preview the first 30 ranked rows as a sanity check of the ordering
test_df_sorted.head(30)

#### XGBoost rank

In [None]:
# The submission keeps only the ordered ids; the score column is removed
df_submission = test_df_sorted.drop('pred', axis=1)
df_submission.to_csv('VU-DM-2025-Group-23 (1).csv', index=False)

#### XGBoost categorical

In [None]:
# The submission keeps only the ordered ids; the score column is removed.
# NOTE(review): this exports whatever ranking is currently held in test_df_sorted —
# confirm the intended model was the one used to produce those predictions.
df_submission = test_df_sorted.drop('pred', axis=1)
df_submission.to_csv('VU-DM-2025-Group-23.csv', index=False)

----------------------------

In [None]:
# Sanity check: (number of rows, number of columns) of the exported submission
df_submission.shape