In [6]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
import polars as pl
from sklearn.model_selection import train_test_split
import optuna
from sklearn.metrics import ndcg_score

# loading in the cleaned training data
train_data = pl.read_csv('data/cleaned_training_data.csv')
test_data = pl.read_csv('data/cleaned_test_data.csv')

# I will drop unnecessary columns
train_data = train_data.drop(['prop_log_historical_price', 'price_usd', 'parsed_date', 'year', 'month', 'day', 'search_hour', 'day_of_week', 'year_month', 'date_time', 'prop_historical_price', 'price_usd_per_night_test', 'price_ratio', 'position', 'gross_bookings_usd', 'click_bool']). \
    rename({'price_usd_without_promo': 'price_usd'})

test_data = test_data.drop(['prop_log_historical_price', 'price_usd', 'parsed_date', 'year', 'month', 'day', 'search_hour', 'day_of_week', 'year_month', 'date_time', 'prop_historical_price', 'price_usd_per_night_test', 'price_ratio']). \
    rename({'price_usd_without_promo': 'price_usd'})

In [7]:
# creating the dataframe for the CatBoostRanker
cbr_data = train_data.to_pandas().copy()

# I will train the model on a subset
cbr_data = cbr_data[cbr_data['srch_id'].isin(cbr_data['srch_id'].unique()[:50000])]
print(cbr_data.shape) # 2492 observations

# test model subset
test_subset = test_data.to_pandas().copy()
test_subset = test_subset[test_subset['srch_id'].isin(cbr_data['srch_id'])]

(1243975, 40)


In [8]:
# I will set up a CatBoostRanker. It handles numeric variables natively, so only the categorical ones need to be specified (it assumes everything else is numeric).

# this column is used as a sort of relevance feature for the ranking, as the most relevant hotels are the ones the customers actually booked. This will help the model decide what is most important
cbr_data['target'] = cbr_data['booking_bool']

# now we exclude target-only columns
exclude_cols = ['target', 'booking_bool', 'srch_id', 'prop_id']
feature_cols = [col for col in cbr_data.columns if col not in exclude_cols]

# specifying categorical features
cat_features = [col for col in cbr_data[feature_cols].columns if cbr_data[col].dtype == 'object' or 'id' in col]

# do the train/test split by group (srch_id). This is to prevent the data being split such that both the training and test sets contain the same search id
unique_srch_ids = cbr_data['srch_id'].unique()
train_srch, val_srch = train_test_split(unique_srch_ids, test_size=0.2, random_state=42)

# actually creating the training and validation sets
train_subset = cbr_data[cbr_data['srch_id'].isin(train_srch)].copy()
val_subset = cbr_data[cbr_data['srch_id'].isin(val_srch)].copy()

train_subset = train_subset.sort_values('srch_id')
val_subset = val_subset.sort_values('srch_id')

# impute missing numeric values
for col in feature_cols:
    if train_subset[col].dtype in [np.float64, np.int64]:
        train_subset[col] = train_subset[col].fillna(-999)
    if val_subset[col].dtype in [np.float64, np.int64]:
        val_subset[col] = val_subset[col].fillna(-999)

# same with categorical
for col in cat_features:
    train_subset[col] = train_subset[col].fillna('missing')
    val_subset[col] = val_subset[col].fillna('missing')

# Helper to compute group sizes
def get_group_sizes(df, group_key='srch_id'):
    return df.groupby(group_key).size().tolist()

# Group sizes for CatBoost
train_group_sizes = get_group_sizes(train_subset)
val_group_sizes = get_group_sizes(val_subset)

train_pool = Pool(
    data=train_subset[feature_cols],
    label=train_subset['target'],
    group_id=train_subset['srch_id'].astype(int).tolist(),
    cat_features=cat_features
)

val_pool = Pool(
    data=val_subset[feature_cols],
    label=val_subset['target'],
    group_id=val_subset['srch_id'].astype(int).tolist(),
    cat_features=cat_features
)

# hyperparameter optimization function using optuna
def objective(trial):
    params = {
        "iterations": 500,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "loss_function": "YetiRank",
        "random_seed": 42,
        "verbose": 0
    }

    model = CatBoostRanker(**params)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=30, verbose=0)

    preds = model.predict(val_pool)

    # Evaluate using NDCG@5
    val_preds = val_subset.copy()
    val_preds['pred'] = preds
    val_preds = val_preds.sort_values(by=['srch_id', 'pred'], ascending=[True, False])

    ndcg_scores = []
    for srch_id, group in val_preds.groupby('srch_id'):
        y_true = group['target'].values
        y_score = group['pred'].values
        if len(y_true) > 1:  # NDCG isn't defined for length 1
            ndcg_scores.append(ndcg_score([y_true], [y_score], k=5))

    return np.mean(ndcg_scores)

In [9]:
# optimizing the hyperparameters. This may take a while...
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2025-05-17 14:14:52,125] A new study created in memory with name: no-name-3680557c-bffb-4948-8de9-92cd061cefc9
[I 2025-05-17 14:18:20,512] Trial 0 finished with value: 0.2683810374043468 and parameters: {'learning_rate': 0.12135972776624952, 'depth': 10, 'l2_leaf_reg': 3.3335617645086293, 'random_strength': 2.1678334242715267, 'bagging_temperature': 0.6263378646502356}. Best is trial 0 with value: 0.2683810374043468.
[I 2025-05-17 14:21:06,784] Trial 1 finished with value: 0.2682413128102511 and parameters: {'learning_rate': 0.09949034094126227, 'depth': 8, 'l2_leaf_reg': 2.1698774043657916, 'random_strength': 3.8700635817340343, 'bagging_temperature': 0.5403071674999499}. Best is trial 0 with value: 0.2683810374043468.
[I 2025-05-17 14:23:52,794] Trial 2 finished with value: 0.26713757766538043 and parameters: {'learning_rate': 0.1399389830243865, 'depth': 7, 'l2_leaf_reg': 9.284922557783096, 'random_strength': 2.339575686631948, 'bagging_temperature': 0.16293968981245965}. Best is

In [13]:
with open('catboostranker_hyperparameters.txt', 'w') as f:
    f.write("Hyperparameters:\n")
    f.write(str(study.best_params))


In [17]:
# The model will now be trained on the full dataset

# I will set up a CatBoostRanker. It handles numeric variables natively, so only the categorical ones need to be specified (it assumes everything else is numeric).

# this column is used as a sort of relevance feature for the ranking, as the most relevant hotels are the ones the customers actually booked. This will help the model decide what is most important
train_data = train_data.to_pandas()
train_data['target'] = train_data['booking_bool']

# now we exclude target-only columns
exclude_cols = ['target', 'booking_bool', 'srch_id', 'prop_id']
feature_cols = [col for col in train_data.columns if col not in exclude_cols]

# specifying categorical features
cat_features = [col for col in train_data[feature_cols].columns if train_data[col].dtype == 'object' or 'id' in col]

# do the train/test split by group (srch_id). This is to prevent the data being split such that both the training and test sets contain the same search id
unique_srch_ids = train_data['srch_id'].unique()
train_srch, val_srch = train_test_split(unique_srch_ids, test_size=0.2, random_state=42)

# actually creating the training and validation sets
train_subset = train_data[train_data['srch_id'].isin(train_srch)].copy()
val_subset = train_data[train_data['srch_id'].isin(val_srch)].copy()

train_subset = train_subset.sort_values('srch_id')
val_subset = val_subset.sort_values('srch_id')

# impute missing numeric values
for col in feature_cols:
    if train_subset[col].dtype in [np.float64, np.int64]:
        train_subset[col] = train_subset[col].fillna(-999)
    if val_subset[col].dtype in [np.float64, np.int64]:
        val_subset[col] = val_subset[col].fillna(-999)

# same with categorical
for col in cat_features:
    train_subset[col] = train_subset[col].fillna('missing')
    val_subset[col] = val_subset[col].fillna('missing')

# Helper to compute group sizes
def get_group_sizes(df, group_key='srch_id'):
    return df.groupby(group_key).size().tolist()

# Group sizes for CatBoost
train_group_sizes = get_group_sizes(train_subset)
val_group_sizes = get_group_sizes(val_subset)

train_pool = Pool(
    data=train_subset[feature_cols],
    label=train_subset['target'],
    group_id=train_subset['srch_id'].astype(int).tolist(),
    cat_features=cat_features
)

val_pool = Pool(
    data=val_subset[feature_cols],
    label=val_subset['target'],
    group_id=val_subset['srch_id'].astype(int).tolist(),
    cat_features=cat_features
)

In [18]:
# applying the best hyperparameters to the model and training it
best_params = study.best_params
best_params["iterations"] = 500
best_params["loss_function"] = "YetiRank"

model = CatBoostRanker(**best_params)
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=30)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.2506159	best: 0.2506159 (0)	total: 4.41s	remaining: 36m 40s
1:	test: 0.2665479	best: 0.2665479 (1)	total: 8.67s	remaining: 35m 57s
2:	test: 0.2787950	best: 0.2787950 (2)	total: 12.4s	remaining: 34m 11s
3:	test: 0.3034428	best: 0.3034428 (3)	total: 15.8s	remaining: 32m 39s
4:	test: 0.3129781	best: 0.3129781 (4)	total: 19.9s	remaining: 32m 47s
5:	test: 0.3279341	best: 0.3279341 (5)	total: 23.3s	remaining: 32m
6:	test: 0.3334153	best: 0.3334153 (6)	total: 27.3s	remaining: 32m 1s
7:	test: 0.3346231	best: 0.3346231 (7)	total: 30.9s	remaining: 31m 38s
8:	test: 0.3414112	best: 0.3414112 (8)	total: 34.5s	remaining: 31m 23s
9:	test: 0.3428203	best: 0.3428203 (9)	total: 38.5s	remaining: 31m 27s
10:	test: 0.3477656	best: 0.3477656 (10)	total: 41.9s	remaining: 31m 4s
11:	test: 0.3479229	best: 0.3479229 (11)	total: 45.3s	remaining: 30m 42s
12:	test: 0.3474856	best: 0.3479229 (11)	total: 48.8s	remaining: 30m 27s
13:	test: 0.3477396	best: 0.

<catboost.core.CatBoostRanker at 0x1a2fcc240b0>

In [20]:
test_subset = test_data.to_pandas().copy()

# impute missing numeric values
for col in feature_cols:
    if test_subset[col].dtype in [np.float64, np.int64]:
        test_subset[col] = test_subset[col].fillna(-999)

# same with categorical
for col in cat_features:
    test_subset[col] = test_subset[col].fillna('missing')

# Apply model on test data
test_pool = Pool(test_subset[feature_cols], group_id=test_subset['srch_id'], cat_features=cat_features)
test_subset['pred_score'] = model.predict(test_pool)

# Rank hotels for each search
ranked = test_subset.sort_values(by=['srch_id', 'pred_score'], ascending=[True, False])

# the prediction score from catboost isnt a probability, it instead is a relevance score. Converting this to a probability
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

# apply softmax to the prediction scores
ranked['softmax_score'] = ranked.groupby('srch_id')['pred_score'].transform(softmax)
ranked = ranked[['srch_id', 'prop_id']]
print(ranked)

# save the ranked dataframe to a csv
ranked.to_csv('catboostranker_submission.csv', index=False)

         srch_id  prop_id
23             1    99484
9              1    54937
12             1    61934
5              1    28181
4              1    24194
...          ...      ...
4959179   332787    33959
4959178   332787    32019
4959182   332787    99509
4959181   332787    94437
4959180   332787    35240

[4959183 rows x 2 columns]
