In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [202]:
# load the dataset
CSV_PATH = "../2/dmt-2025-2nd-assignment/training_set_VU_DM.csv"
reader = pd.read_csv(CSV_PATH, nrows=1_000_00)
df = reader.copy()

In [221]:
# combined labels for click an booking
df['label'] = df['booking_bool'] * 6 + df['click_bool'] * 1

# group search session by user id
unique_searches = df['srch_id'].unique()

# split the train and test set
search_train, search_test = train_test_split(unique_searches, test_size=0.1, random_state=42)

# create a train and test dataset
train_df = df[df['srch_id'].isin(search_train)].sort_values('srch_id')
test_df  = df[df['srch_id'].isin(search_test)].sort_values('srch_id')

print(f"Train set size: {train_df.shape}")
print(f"Test set size: {test_df.shape}")


Train set size: (89546, 55)
Test set size: (10454, 55)


In [222]:
# prepare the input and labels for the model
features = ["srch_destination_id",
        "srch_length_of_stay",
        "srch_booking_window",
        "srch_adults_count",
        "srch_children_count",
        "srch_room_count",
        "srch_saturday_night_bool",
        "prop_review_score",
        "prop_starrating",
        "price_usd",
        "promotion_flag",
        "prop_brand_bool",
        "prop_location_score1",
        "prop_log_historical_price",]
X_train = train_df[features]
y_train = train_df['label']
group_train = train_df.groupby('srch_id').size().to_list()

# these two parts should have the same size
print(f"Group train size: {sum(group_train)}")
print(f"X_train size: {X_train.shape}")

Group train size: 89546
X_train size: (89546, 14)


In [237]:
# train the models
params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": 0,
}
train_data = lgb.Dataset(X_train, label=y_train, group=group_train)
model = lgb.train(params, train_data, num_boost_round=100)

In [238]:
# prediction on the test set
X_test = test_df[features]
test_df['score'] = model.predict(X_test)
test_df['rank'] = test_df.groupby('srch_id')['score'].rank(ascending=False)
output = test_df[['srch_id', 'prop_id', 'score', 'rank']].sort_values(['srch_id', 'rank'])
print(output.head(20))

     srch_id  prop_id     score  rank
223       25    66209  0.162338   1.0
227       25    81772  0.132279   2.0
209       25    20707  0.087507   3.0
224       25    68528  0.085641   4.0
210       25    22710  0.085594   5.0
216       25    54226  0.084712   6.0
231       25   112716  0.081688   7.0
218       25    61189  0.072072   8.0
211       25    23228  0.068430   9.0
232       25   118859  0.062947  10.0
237       25   126136  0.062157  11.0
233       25   120046  0.058764  12.0
229       25    88629  0.048486  13.0
207       25     2863  0.047009  14.0
214       25    53298  0.046958  15.0
236       25   125384  0.045658  16.0
226       25    79870  0.045264  17.0
219       25    61885  0.044372  18.0
222       25    66171  0.043264  19.0
228       25    83281  0.042120  20.0


In [239]:
# evaluate the ranking 
top_preds = test_df[test_df['rank'] == 1]
hit_rate = top_preds['booking_bool'].mean()
print(f"Hit@1 (Top-1 Booking Accuracy): {hit_rate:.3f}")

# compute the ndcg score
ndcg_list = []
for srch_id, group in test_df.groupby('srch_id'):
    y_true = group['label'].values.reshape(1, -1)
    y_score = group['score'].values.reshape(1, -1)
    ndcg = ndcg_score(y_true, y_score, k = 5)
    ndcg_list.append(ndcg)
ndcg = np.mean(ndcg_list)
print(f"The ndcg score of the model is: {ndcg:.3f}")

Hit@1 (Top-1 Booking Accuracy): 0.092
The ndcg score of the model is: 0.260
