In [5]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
import polars as pl
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

# Read the cleaned training and test tables from CSV
train_data = pl.read_csv('cleaned_training_data.csv')
test_data = pl.read_csv('cleaned_test_data.csv')

# Columns removed from BOTH frames; kept in one list so the two frames are
# always stripped of exactly the same shared columns.
shared_drop_cols = [
    'prop_log_historical_price', 'price_usd', 'parsed_date', 'year', 'month',
    'day', 'search_hour', 'day_of_week', 'year_month', 'date_time',
    'prop_historical_price', 'price_usd_per_night_test', 'price_ratio',
]

# Columns removed from the training frame only.
# NOTE(review): presumably these are outcome columns absent from the test file - confirm.
train_only_drop_cols = ['position', 'gross_bookings_usd', 'click_bool']

# Drop the unnecessary columns, then rename 'price_usd_without_promo' to
# 'price_usd' (the original 'price_usd' is among the dropped columns, so the
# new name does not collide).
train_data = (
    train_data
    .drop(shared_drop_cols + train_only_drop_cols)
    .rename({'price_usd_without_promo': 'price_usd'})
)

test_data = (
    test_data
    .drop(shared_drop_cols)
    .rename({'price_usd_without_promo': 'price_usd'})
)

In [8]:
# creating the dataframe for the CatBoostRanker
cbr_data = train_data.to_pandas()

# I will train the model on a subset: the first N_TRAIN_SEARCHES distinct
# srch_id values, in order of first appearance.
N_TRAIN_SEARCHES = 4000
subset_srch_ids = cbr_data['srch_id'].unique()[:N_TRAIN_SEARCHES]

# The explicit .copy() makes the filtered result an independent frame, so
# columns can be added to it later without pandas' SettingWithCopyWarning.
cbr_data = cbr_data[cbr_data['srch_id'].isin(subset_srch_ids)].copy()
print(cbr_data.shape)  # (rows, columns); the recorded run shows (98835, 40)

# test model subset: keep only the test rows whose srch_id also occurs in the
# training subset selected above.
# NOTE(review): if train and test srch_ids identify different searches, this
# picks test searches by id coincidence only - confirm that is the intent.
test_subset = test_data.to_pandas()
test_subset = test_subset[test_subset['srch_id'].isin(subset_srch_ids)].copy()

(98835, 40)


In [10]:
# I will set up a CatBoostRanker. It handles numeric variables natively, so only the categorical ones need to be specified (it assumes everything else is numeric).

# Relevance label for the ranking: a hotel that was actually booked is the most
# relevant result of its search. `.assign` returns a new frame instead of
# writing into `cbr_data` in place, which avoids pandas' SettingWithCopyWarning
# when `cbr_data` is the result of a boolean filter.
cbr_data = cbr_data.assign(target=cbr_data['booking_bool'])

# Columns that must not be used as model inputs: the label (and its source
# column), the group key, and the property identifier.
exclude_cols = ['target', 'booking_bool', 'srch_id', 'prop_id']
feature_cols = [col for col in cbr_data.columns if col not in exclude_cols]

# Categorical features: every feature stored as object dtype, plus every
# feature whose name contains the substring 'id'.
# NOTE(review): the substring test also matches names that merely contain
# "id" somewhere - confirm that only identifier columns are selected.
cat_features = [
    col for col in feature_cols
    if cbr_data[col].dtype == 'object' or 'id' in col
]

# Split by group (srch_id) rather than by row, so that no search contributes
# rows to both the training and the validation set.
unique_srch_ids = cbr_data['srch_id'].unique()
train_srch, val_srch = train_test_split(unique_srch_ids, test_size=0.2, random_state=42)

# Materialise both sets as independent copies, ordered by srch_id so that the
# rows of each search are contiguous.
train_subset = cbr_data[cbr_data['srch_id'].isin(train_srch)].copy().sort_values('srch_id')
val_subset = cbr_data[cbr_data['srch_id'].isin(val_srch)].copy().sort_values('srch_id')

# Impute missing values, identically for both sets.
for frame in (train_subset, val_subset):
    # numeric features (float64 / int64 columns): sentinel value -999
    for col in feature_cols:
        if frame[col].dtype in [np.float64, np.int64]:
            frame[col] = frame[col].fillna(-999)
    # categorical features: explicit 'missing' level. Runs after the numeric
    # pass, so a numeric categorical column has already received -999.
    for col in cat_features:
        frame[col] = frame[col].fillna('missing')

# Helper to compute group sizes
def get_group_sizes(df, group_key='srch_id'):
    """Return the number of rows in each group of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``group_key``.
    group_key : str, default 'srch_id'
        Name of the column that identifies the group of each row.

    Returns
    -------
    list of int
        One row count per distinct key, ordered by the key's first appearance
        in ``df``. For a frame whose groups are stored contiguously this is
        exactly the row order, which is what a positional group-size list has
        to follow. For a frame already sorted by ``group_key`` the result is
        the same as with key-sorted output.

    Notes
    -----
    Rows whose key is missing (NaN) are not counted, because ``groupby``
    drops missing keys by default.
    """
    # sort=False keeps groups in order of first appearance instead of
    # re-ordering them by key value, so sizes stay aligned with the rows.
    return df.groupby(group_key, sort=False).size().tolist()

# Row counts per search for each split
train_group_sizes = get_group_sizes(train_subset)
val_group_sizes = get_group_sizes(val_subset)


def _as_pool(split):
    """Wrap one split in a CatBoost Pool: features, 'target' label, srch_id groups."""
    return Pool(
        data=split[feature_cols],
        label=split['target'],
        group_id=split['srch_id'].astype(int).tolist(),
        cat_features=cat_features,
    )


train_pool = _as_pool(train_subset)
val_pool = _as_pool(val_subset)

# Ranker configuration
ranker_params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'loss_function': 'YetiRank',
    'random_seed': 42,
    'verbose': 50,                 # log every 50th iteration
    'early_stopping_rounds': 30,   # stop after 30 iterations without improvement on the eval set
}
model = CatBoostRanker(**ranker_params)

# Fit on the training pool, evaluating (and early-stopping) on the validation pool
model.fit(train_pool, eval_set=val_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.2551497	best: 0.2551497 (0)	total: 92ms	remaining: 45.9s
50:	test: 0.3686263	best: 0.3699189 (47)	total: 3.38s	remaining: 29.7s
100:	test: 0.3783472	best: 0.3800034 (93)	total: 7.9s	remaining: 31.2s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.3800034271
bestIteration = 93

Shrink model to first 94 iterations.


<catboost.core.CatBoostRanker at 0x3002468b0>

In [13]:
# Work on an independent copy: `test_subset` may be the result of a boolean
# filter, and assigning columns into such a frame triggers pandas'
# SettingWithCopyWarning.
test_subset = test_subset.copy()

# impute missing numeric values (same -999 sentinel as for the training data)
for col in feature_cols:
    if test_subset[col].dtype in [np.float64, np.int64]:
        test_subset[col] = test_subset[col].fillna(-999)

# same with categorical
for col in cat_features:
    test_subset[col] = test_subset[col].fillna('missing')

# Apply model on test data. The group ids are passed in the same form as for
# the training and validation pools (a plain list of ints).
test_pool = Pool(
    data=test_subset[feature_cols],
    group_id=test_subset['srch_id'].astype(int).tolist(),
    cat_features=cat_features,
)
test_subset['pred_score'] = model.predict(test_pool)

# Rank hotels for each search: ascending srch_id, then highest predicted score first
ranked = test_subset.sort_values(by=['srch_id', 'pred_score'], ascending=[True, False])

In [14]:
# Display the ranked predictions: rows ordered by srch_id, then by descending pred_score
ranked

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,prop_review_score_filled,has_usable_review,num_comps_lower,num_comps_higher,num_comps_with_inventory,avg_comp_rate_percent_diff,query_affinity_score_cleaned,query_affinity_missing,price_usd,pred_score
23,1,24,216,missing,missing,219,99484,3,4.0,1,...,4.0,1.0,1,0,0,0.25,-251.2864,1,91.77,-0.126009
9,1,24,216,missing,missing,219,54937,3,4.0,1,...,4.0,1.0,0,0,0,0.00,-251.2864,1,110.79,-0.152803
12,1,24,216,missing,missing,219,61934,3,4.5,1,...,4.5,1.0,0,0,0,0.00,-251.2864,1,118.21,-0.333972
8,1,24,216,missing,missing,219,50162,2,3.5,1,...,3.5,1.0,0,1,0,1.25,-251.2864,1,66.49,-0.832956
16,1,24,216,missing,missing,219,74045,3,4.0,1,...,4.0,1.0,0,0,0,0.00,-251.2864,1,89.99,-0.888605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100338,6634,5,219,missing,missing,219,93356,3,5.0,1,...,5.0,1.0,0,0,1,0.00,-251.2864,1,309.00,0.060907
100335,6634,5,219,missing,missing,219,52684,3,4.0,1,...,4.0,1.0,0,0,0,0.00,-251.2864,1,230.00,-0.077975
100339,6634,5,219,missing,missing,219,100637,3,4.5,1,...,4.5,1.0,0,0,0,0.00,-251.2864,1,239.00,-0.211091
100341,6634,5,219,missing,missing,219,110643,0,3.5,1,...,3.5,1.0,0,3,0,1.50,-251.2864,1,170.00,-0.515999


In [15]:
# (rows, columns) of the scored test subset; the columns include the added pred_score
test_subset.shape

(60628, 40)