In [16]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
import polars as pl
from sklearn.model_selection import train_test_split
import optuna
from sklearn.metrics import ndcg_score

# loading in the cleaned training data
train_data = pl.read_csv('cleaned_training_data.csv')
test_data = pl.read_csv('cleaned_test_data.csv')

# I will drop unnecessary columns
train_data = train_data.drop(['prop_log_historical_price', 'price_usd', 'parsed_date', 'year', 'month', 'day', 'search_hour', 'day_of_week', 'year_month', 'date_time', 'prop_historical_price', 'price_usd_per_night_test', 'price_ratio', 'position', 'gross_bookings_usd', 'click_bool']). \
    rename({'price_usd_without_promo': 'price_usd'})

test_data = test_data.drop(['prop_log_historical_price', 'price_usd', 'parsed_date', 'year', 'month', 'day', 'search_hour', 'day_of_week', 'year_month', 'date_time', 'prop_historical_price', 'price_usd_per_night_test', 'price_ratio']). \
    rename({'price_usd_without_promo': 'price_usd'})

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# creating the dataframe for the CatBoostRanker
cbr_data = train_data.to_pandas().copy()

# I will train the model on a subset
cbr_data = cbr_data[cbr_data['srch_id'].isin(cbr_data['srch_id'].unique()[:4000])]
print(cbr_data.shape) # 2492 observations

# test model subset
test_subset = test_data.to_pandas().copy()
test_subset = test_subset[test_subset['srch_id'].isin(cbr_data['srch_id'])]

(98835, 40)


In [17]:
# I will set up a CatBoostRanker. It handles numeric variables natively, so only the categorical ones need to be specified (it assumes everything else is numeric).

# this column is used as a sort of relevance feature for the ranking, as the most relevant hotels are the ones the customers actually booked. This will help the model decide what is most important
cbr_data['target'] = cbr_data['booking_bool']

# now we exclude target-only columns
exclude_cols = ['target', 'booking_bool', 'srch_id', 'prop_id']
feature_cols = [col for col in cbr_data.columns if col not in exclude_cols]

# specifying categorical features
cat_features = [col for col in cbr_data[feature_cols].columns if cbr_data[col].dtype == 'object' or 'id' in col]

# do the train/test split by group (srch_id). This is to prevent the data being split such that both the training and test sets contain the same search id
unique_srch_ids = cbr_data['srch_id'].unique()
train_srch, val_srch = train_test_split(unique_srch_ids, test_size=0.2, random_state=42)

# actually creating the training and validation sets
train_subset = cbr_data[cbr_data['srch_id'].isin(train_srch)].copy()
val_subset = cbr_data[cbr_data['srch_id'].isin(val_srch)].copy()

train_subset = train_subset.sort_values('srch_id')
val_subset = val_subset.sort_values('srch_id')

# impute missing numeric values
for col in feature_cols:
    if train_subset[col].dtype in [np.float64, np.int64]:
        train_subset[col] = train_subset[col].fillna(-999)
    if val_subset[col].dtype in [np.float64, np.int64]:
        val_subset[col] = val_subset[col].fillna(-999)

# same with categorical
for col in cat_features:
    train_subset[col] = train_subset[col].fillna('missing')
    val_subset[col] = val_subset[col].fillna('missing')

# Helper to compute group sizes
def get_group_sizes(df, group_key='srch_id'):
    return df.groupby(group_key).size().tolist()

# Group sizes for CatBoost
train_group_sizes = get_group_sizes(train_subset)
val_group_sizes = get_group_sizes(val_subset)

train_pool = Pool(
    data=train_subset[feature_cols],
    label=train_subset['target'],
    group_id=train_subset['srch_id'].astype(int).tolist(),
    cat_features=cat_features
)

val_pool = Pool(
    data=val_subset[feature_cols],
    label=val_subset['target'],
    group_id=val_subset['srch_id'].astype(int).tolist(),
    cat_features=cat_features
)

# hyperparameter optimization function using optuna
def objective(trial):
    params = {
        "iterations": 500,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "loss_function": "YetiRank",
        "random_seed": 42,
        "verbose": 0
    }

    model = CatBoostRanker(**params)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=30, verbose=0)

    preds = model.predict(val_pool)

    # Evaluate using NDCG@5
    val_preds = val_subset.copy()
    val_preds['pred'] = preds
    val_preds = val_preds.sort_values(by=['srch_id', 'pred'], ascending=[True, False])

    ndcg_scores = []
    for srch_id, group in val_preds.groupby('srch_id'):
        y_true = group['target'].values
        y_score = group['pred'].values
        if len(y_true) > 1:  # NDCG isn't defined for length 1
            ndcg_scores.append(ndcg_score([y_true], [y_score], k=5))

    return np.mean(ndcg_scores)

In [18]:
# optimizing the hyperparameters. This may take a while...
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2025-05-15 22:01:03,538] A new study created in memory with name: no-name-521c0b79-12bf-4120-b74d-d62026ccea6b
[I 2025-05-15 22:01:32,157] Trial 0 finished with value: 0.2712068192419567 and parameters: {'learning_rate': 0.14473774123981006, 'depth': 6, 'l2_leaf_reg': 7.322683446957264, 'random_strength': 3.4288533830143, 'bagging_temperature': 0.36046318253198717}. Best is trial 0 with value: 0.2712068192419567.
[I 2025-05-15 22:02:06,413] Trial 1 finished with value: 0.25385728772894417 and parameters: {'learning_rate': 0.1333486097718593, 'depth': 10, 'l2_leaf_reg': 9.468379064320265, 'random_strength': 0.8269161221853737, 'bagging_temperature': 0.23188740761120574}. Best is trial 0 with value: 0.2712068192419567.
[I 2025-05-15 22:02:29,568] Trial 2 finished with value: 0.2612854745602355 and parameters: {'learning_rate': 0.053242844636241474, 'depth': 5, 'l2_leaf_reg': 2.922165986232749, 'random_strength': 8.104884593927416, 'bagging_temperature': 0.9408540015767577}. Best is tr

In [19]:
# applying the best hyperparameters to the model and training it
best_params = study.best_params
best_params["iterations"] = 500
best_params["loss_function"] = "YetiRank"

model = CatBoostRanker(**best_params)
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=30)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.1755675	best: 0.1755675 (0)	total: 66.2ms	remaining: 33s
1:	test: 0.1876903	best: 0.1876903 (1)	total: 124ms	remaining: 31s
2:	test: 0.2546740	best: 0.2546740 (2)	total: 188ms	remaining: 31.2s
3:	test: 0.2560749	best: 0.2560749 (3)	total: 251ms	remaining: 31.1s
4:	test: 0.2733953	best: 0.2733953 (4)	total: 358ms	remaining: 35.5s
5:	test: 0.2720244	best: 0.2733953 (4)	total: 443ms	remaining: 36.5s
6:	test: 0.2811242	best: 0.2811242 (6)	total: 548ms	remaining: 38.6s
7:	test: 0.2989139	best: 0.2989139 (7)	total: 651ms	remaining: 40s
8:	test: 0.3025726	best: 0.3025726 (8)	total: 760ms	remaining: 41.5s
9:	test: 0.3148292	best: 0.3148292 (9)	total: 913ms	remaining: 44.7s
10:	test: 0.3151545	best: 0.3151545 (10)	total: 1.04s	remaining: 46.4s
11:	test: 0.3156747	best: 0.3156747 (11)	total: 1.17s	remaining: 47.5s
12:	test: 0.3238488	best: 0.3238488 (12)	total: 1.35s	remaining: 50.6s
13:	test: 0.3279175	best: 0.3279175 (13)	total: 1.5s	

<catboost.core.CatBoostRanker at 0x5083c12d0>

In [24]:
# impute missing numeric values
for col in feature_cols:
    if test_subset[col].dtype in [np.float64, np.int64]:
        test_subset[col] = test_subset[col].fillna(-999)

# same with categorical
for col in cat_features:
    test_subset[col] = test_subset[col].fillna('missing')

# Apply model on test data
test_pool = Pool(test_subset[feature_cols], group_id=test_subset['srch_id'], cat_features=cat_features)
test_subset['pred_score'] = model.predict(test_pool)

# Rank hotels for each search
ranked = test_subset.sort_values(by=['srch_id', 'pred_score'], ascending=[True, False])

# the prediction score from catboost isnt a probability, it instead is a relevance score. Converting this to a probability
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

# apply softmax to the prediction scores
ranked['softmax_score'] = ranked.groupby('srch_id')['pred_score'].transform(softmax)
ranked = ranked[['srch_id', 'pred_score', 'softmax_score', 'prop_id']]
print(ranked)

# save the ranked dataframe to a csv
ranked.to_csv('catboostranker_submission.csv', index=False)

        srch_id  pred_score  softmax_score  prop_id
23            1   -0.400051       0.086668    99484
12            1   -0.522541       0.076676    61934
9             1   -0.675180       0.065822    54937
4             1   -0.873562       0.053978    24194
16            1   -0.984588       0.048305    74045
...         ...         ...            ...      ...
100343     6634   -0.373659       0.028846   119844
100339     6634   -0.373721       0.028844   100637
100335     6634   -0.559014       0.023965    52684
100340     6634   -0.871218       0.017539   109828
100341     6634   -0.877161       0.017435   110643

[60628 rows x 4 columns]
