In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

"""
Citation: https://www.kaggle.com/code/divyansh22/lgbm-classifier-for-airline-recommendation/notebook
"""

'\nCitation: https://www.kaggle.com/code/divyansh22/lgbm-classifier-for-airline-recommendation/notebook\n'

In [2]:
# load the dataset
CSV_PATH = "../2/dmt-2025-2nd-assignment/training_set_VU_DM.csv"
reader = pd.read_csv(CSV_PATH,nrows=10_000_00)
df = reader.copy()

CSV_PATH2 = "../2/dmt-2025-2nd-assignment/test_set_VU_DM.csv"
reader2 = pd.read_csv(CSV_PATH2)
df2 = reader2.copy()

In [48]:
# define user features
user_features = [
    "visitor_location_country_id",
    "srch_destination_id",
]
df_groupable = df.dropna(subset=user_features + ['prop_id'])

In [49]:
# clean the data
df_groupable["srch_destination_id"] = pd.qcut(
    df['visitor_hist_adr_usd'].fillna(df['visitor_hist_adr_usd'].median()),
    q=4,
    duplicates='drop'
)
df_groupable['visitor_location_country_id'] = pd.qcut(
    df['visitor_hist_adr_usd'].fillna(df['visitor_hist_adr_usd'].median()),
    q=10,
    duplicates='drop'
)

In [50]:
# aggregate user features to get click and booking rates
agg = df_groupable.groupby(user_features + ['prop_id']).agg(
    sim_user_click_rate=('click_bool', 'mean'),
    sim_user_book_rate=('booking_bool', 'mean')
).reset_index()

# merge back to the original dataframe
df = df.merge(agg, on=user_features + ['prop_id'], how='left') # train
df2 = df2.merge(agg, on=user_features + ['prop_id'], how='left') # test

  agg = df_groupable.groupby(user_features + ['prop_id']).agg(


In [51]:
# fill in the missing values
df['sim_user_click_rate'] = df['sim_user_click_rate'].fillna(0)
df['sim_user_book_rate'] = df['sim_user_book_rate'].fillna(0)
df2['sim_user_click_rate'] = df2['sim_user_click_rate'].fillna(0)
df2['sim_user_book_rate'] = df2['sim_user_book_rate'].fillna(0)


In [52]:
# combined labels for click an booking
df['label'] = df['booking_bool'] * 5 + df['click_bool'] * 1

# group search session by user id
unique_searches = df['srch_id'].unique()

# split the train and test set
search_train, search_va = train_test_split(unique_searches, test_size=0.05, random_state=42)

# create a train and test dataset
train_df = df[df['srch_id'].isin(search_train)].sort_values('srch_id')
val_df  = df[df['srch_id'].isin(search_va)].sort_values('srch_id')

print(df['label'].value_counts())
print(f"Train set size: {train_df.shape}")
print(f"Validation set size: {val_df.shape}")


label
0    955274
6     27738
1     16988
Name: count, dtype: int64
Train set size: (949546, 57)
Validation set size: (50454, 57)


In [55]:
# prepare the input and labels for the model
features = [
        "srch_length_of_stay",
        "srch_booking_window",
        "srch_adults_count",
        "srch_children_count",
        "srch_room_count",
        "srch_saturday_night_bool",
        "prop_review_score",
        "prop_starrating",
        "price_usd",
        "promotion_flag",
        "prop_brand_bool",
        "prop_location_score1",
        "prop_log_historical_price",
        "sim_user_click_rate",
        "sim_user_book_rate",
        ]
X_train = train_df[features]
X_val = val_df[features]
y_train = train_df['label']
y_val = val_df['label'] 
group_train = train_df.groupby('srch_id').size().to_list()

# these two parts should have the same size
print(f"Group train size: {sum(group_train)}")
print(f"X_train size: {X_train.shape}")

# create a validation set
val_group = val_df.groupby('srch_id').size().to_list()
val_set = lgb.Dataset(X_val, label=y_val, group=val_group)

Group train size: 949546
X_train size: (949546, 15)


In [56]:
# train the models
params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": 0,
}
train_data = lgb.Dataset(X_train, label=y_train, group=group_train)
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_set],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
)

In [57]:
# prediction on the test set
X_test = df2[features]
df2['score'] = model.predict(X_test)
df2['rank'] = df2.groupby('srch_id')['score'].rank(ascending=False)
output = df2[['srch_id', 'prop_id', 'score', 'rank']].sort_values(['srch_id', 'rank'])
print(output.head(20))

    srch_id  prop_id     score  rank
23        1    99484  0.080885   1.0
9         1    54937  0.075727   2.0
12        1    61934  0.074005   3.0
14        1    72090  0.058078   4.0
20        1    90385  0.057916   5.0
7         1    37567  0.054481   6.0
4         1    24194  0.054430   7.0
5         1    28181  0.053690   8.0
6         1    34263  0.052208   9.0
8         1    50162  0.048823  10.0
22        1    95031  0.045193  11.0
18        1    82231  0.044895  12.0
16        1    74045  0.043972  13.0
17        1    78599  0.043812  14.0
13        1    63894  0.043341  15.0
0         1     3180  0.040648  16.0
1         1     5543  0.038564  17.0
25        1   128085  0.037398  18.0
24        1   123675  0.035264  19.0
3         1    22393  0.033581  20.0


In [28]:
# evaluate the ranking 
"""top_preds = test_df[test_df['rank'] == 1]
hit_rate = top_preds['booking_bool'].mean()
print(f"Hit@1 (Top-1 Booking Accuracy): {hit_rate:.3f}")

# compute the ndcg score
ndcg_list = []
for srch_id, group in test_df.groupby('srch_id'):
    y_true = group['label'].values.reshape(1, -1)
    y_score = group['score'].values.reshape(1, -1)
    ndcg = ndcg_score(y_true, y_score, k = 5)
    ndcg_list.append(ndcg)
ndcg = np.mean(ndcg_list)
print(f"The ndcg score of the model is: {ndcg:.3f}")"""

'top_preds = test_df[test_df[\'rank\'] == 1]\nhit_rate = top_preds[\'booking_bool\'].mean()\nprint(f"Hit@1 (Top-1 Booking Accuracy): {hit_rate:.3f}")\n\n# compute the ndcg score\nndcg_list = []\nfor srch_id, group in test_df.groupby(\'srch_id\'):\n    y_true = group[\'label\'].values.reshape(1, -1)\n    y_score = group[\'score\'].values.reshape(1, -1)\n    ndcg = ndcg_score(y_true, y_score, k = 5)\n    ndcg_list.append(ndcg)\nndcg = np.mean(ndcg_list)\nprint(f"The ndcg score of the model is: {ndcg:.3f}")'

In [59]:
# print the output as a csv file
submission = output[['srch_id', 'prop_id']].copy()
submission.to_csv('hotel_rankings.csv', index=False)