In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score


In [2]:
from data_impute_and_fe.process_A_srch import process_search_features_smoothed
from data_impute_and_fe.process_B_prop import process_hotel_features
from data_impute_and_fe.process_C_price import process_price_feature_smoothed
from data_impute_and_fe.process_D_user import process_new_user

In [None]:
# Load the raw training and test CSVs.
# Only the first 2.5M training rows are read; the test file is read in full.
CSV_PATH = "../2/dmt-2025-2nd-assignment/training_set_VU_DM.csv"
CSV_PATH2 = "../2/dmt-2025-2nd-assignment/test_set_VU_DM.csv"

reader = pd.read_csv(CSV_PATH, nrows=2_500_000)
reader2 = pd.read_csv(CSV_PATH2)

# Work on copies so the frames exactly as read stay available in `reader` / `reader2`.
df, df2 = reader.copy(), reader2.copy()

In [4]:
# Clean the training set: each step takes the frame produced by the previous one
# and returns (frame, feature columns, categorical feature columns).
df_final, cols_A, cols_categorical_A = process_search_features_smoothed(df, drop_raw_columns=False)
df_final, cols_B, cols_categorical_B = process_hotel_features(df_final, drop_raw_columns=False)
# The price step's result has to be assigned back to df_final, exactly as is done
# for the test set below; otherwise the training frame never receives its output.
df_final, cols_C, cols_categorical_C = process_price_feature_smoothed(df_final, drop_raw_columns=False)
df_final, cols_D, cols_categorical_D = process_new_user(df_final, drop_raw_columns=False)
df = df_final.copy()
print(df.columns)

# Clean the test set with the same four steps, in the same order.
# Note: the cols_* / cols_categorical_* names are overwritten by these calls.
df2_final, cols_A, cols_categorical_A = process_search_features_smoothed(df2, drop_raw_columns=False)
df2_final, cols_B, cols_categorical_B = process_hotel_features(df2_final, drop_raw_columns=False)
df2_final, cols_C, cols_categorical_C = process_price_feature_smoothed(df2_final, drop_raw_columns=False)
df2_final, cols_D, cols_categorical_D = process_new_user(df2_final, drop_raw_columns=False)
df2 = df2_final.copy()


Index(['srch_id', 'visitor_location_country_id', 'visitor_hist_starrating',
       'visitor_hist_adr_usd', 'prop_country_id', 'prop_id', 'prop_starrating',
       'prop_review_score', 'prop_brand_bool', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff', 'click_bool', '

In [5]:
# Columns used to describe "similar users": where the visitor is located
# and which destination they searched for.
user_features = ["visitor_location_country_id", "srch_destination_id"]

# Keep only rows where every grouping key (user features + property) is present.
df_groupable = df.dropna(subset=[*user_features, 'prop_id'])



In [6]:
# Mean click / booking rate per (visitor country, destination, property) group.
# NOTE(review): these rates are built from the training targets and merged back
# onto the very same rows, and the train/validation split is only made later in
# the notebook -- this looks like target leakage into validation; confirm.
sim_keys = user_features + ['prop_id']
agg = (
    df_groupable
    .groupby(sim_keys)[['click_bool', 'booking_bool']]
    .mean()
    .rename(columns={
        'click_bool': 'sim_user_click_rate',
        'booking_bool': 'sim_user_book_rate',
    })
    .reset_index()
)

# Attach the rates to both frames; groups unseen in training get NaN (left join).
df = df.merge(agg, how='left', on=sim_keys)    # train
df2 = df2.merge(agg, how='left', on=sim_keys)  # test
print(df2.columns)

Index(['srch_id', 'visitor_location_country_id', 'visitor_hist_starrating',
       'visitor_hist_adr_usd', 'prop_country_id', 'prop_id', 'prop_starrating',
       'prop_review_score', 'prop_brand_bool', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff', 'total_guests', 'review_sco

In [7]:
# Rows whose (country, destination, property) group had no match in the
# aggregate get a rate of 0 in both the train and the test frame.
rate_cols = ['sim_user_click_rate', 'sim_user_book_rate']
df[rate_cols] = df[rate_cols].fillna(0)
df2[rate_cols] = df2[rate_cols].fillna(0)


In [8]:
def assign_weight(df, booking_label, click_label, booking_weight, click_weight, default_weight):
    """
    Return a copy of ``df`` with a relevance ``label`` and a sample ``weight`` column.

    ``label`` is ``booking_bool * booking_label + click_bool * click_label``.
    ``weight`` is ``default_weight`` everywhere, ``click_weight`` on rows that were
    clicked but not booked, and ``booking_weight`` on rows that were booked.
    The input frame is not modified.
    """
    labeled = df.copy()
    clicked = labeled['click_bool']
    booked = labeled['booking_bool']

    # relevance label: sum of the two indicator columns scaled by their label values
    labeled['label'] = booked * booking_label + clicked * click_label

    # row masks for the two non-default weight classes
    click_only_rows = (clicked == 1) & (booked == 0)
    booked_rows = booked == 1

    # start from the default weight, then overwrite the clicked / booked rows
    labeled['weight'] = default_weight
    labeled.loc[click_only_rows, 'weight'] = click_weight
    labeled.loc[booked_rows, 'weight'] = booking_weight

    return labeled

In [10]:
# Combined relevance labels and sample weights for clicks and bookings.
booking_label, click_label = 10, 5
booking_weight, click_weight, default_weight = 10, 5, 1
df_labeled = assign_weight(
    df,
    booking_label=booking_label,
    click_label=click_label,
    booking_weight=booking_weight,
    click_weight=click_weight,
    default_weight=default_weight,
)

# Split on search ids (not on rows) so all results of one search stay together.
unique_searches = df['srch_id'].unique()
search_train, search_va = train_test_split(unique_searches, test_size=0.05, random_state=42)

# Build the train / validation frames, ordered by search id so that the rows of
# each search are contiguous.
in_train = df_labeled['srch_id'].isin(search_train)
in_val = df_labeled['srch_id'].isin(search_va)
train_df = df_labeled.loc[in_train].sort_values('srch_id')
val_df = df_labeled.loc[in_val].sort_values('srch_id')

print(df_labeled['label'].value_counts())
print(f"Train set size: {train_df.shape}")
print(f"Validation set size: {val_df.shape}")

label
0     95513
15     2791
5      1696
Name: count, dtype: int64
Train set size: (94629, 57)
Validation set size: (5371, 57)


In [11]:
# Feature columns fed to the ranker.
features = [
    # search-level
    "srch_length_of_stay",
    "srch_booking_window",
    "total_guests",
    "srch_saturday_night_bool",
    # property-level
    "prop_review_score",
    "prop_starrating",
    "price_usd",
    "promotion_flag",
    "prop_brand_bool",
    "prop_log_historical_price",
    "historical_price_level",
    "review_score_label",
    # similar-user rates
    "sim_user_click_rate",
    "sim_user_book_rate",
]

# Inputs and relevance labels for the training and validation splits.
X_train, y_train = train_df[features], train_df['label']
X_val, y_val = val_df[features], val_df['label']

# Number of rows per search, in srch_id order (the frames are sorted by srch_id).
group_train = train_df.groupby('srch_id').size().to_list()
val_group = val_df.groupby('srch_id').size().to_list()

# Sanity check: the group sizes must add up to the number of training rows.
print(f"Group train size: {sum(group_train)}")
print(f"X_train size: {X_train.shape}")

# Validation dataset for LightGBM.
val_set = lgb.Dataset(X_val, label=y_val, group=val_group)

Group train size: 94629
X_train size: (94629, 14)


In [14]:
# LambdaRank configuration, evaluated with NDCG.
params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=1,
)

# Training dataset: per-row weights come from assign_weight, groups are the
# per-search row counts.
train_data = lgb.Dataset(
    data=X_train,
    label=y_train,
    group=group_train,
    weight=train_df['weight'],
)

# Fit for a fixed 1000 boosting rounds, tracking both the train and validation sets.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[train_data, val_set],
    valid_names=['train', 'valid'],
)

[LightGBM] [Info] Calculating query weights...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003475 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 842
[LightGBM] [Info] Number of data points in the train set: 94629, number of used features: 14


In [15]:
# Score the test set and rank the properties within each search (rank 1 = best).
X_test = df2[features]
df2['score'] = model.predict(X_test)
# method='first' gives every row in a search a distinct rank even when scores
# tie; the default ('average') would produce fractional, duplicated ranks. This
# is the same tie-breaking the validation ranking uses.
df2['rank'] = df2.groupby('srch_id')['score'].rank(ascending=False, method='first')
output = df2[['srch_id', 'prop_id', 'score', 'rank']].sort_values(['srch_id', 'rank'])
print(output.head(20))

    srch_id  prop_id      score  rank
14        1    72090  31.876518   1.0
0         1     3180  31.734204   2.0
1         1     5543  30.234303   3.0
3         1    22393  30.007148   4.0
6         1    34263  29.931554   5.0
22        1    95031  29.916733   6.0
28        1   139162  29.819018   7.0
17        1    78599  29.735594   8.0
13        1    63894  29.719202   9.0
18        1    82231  29.683274  10.0
23        1    99484  29.606237  11.0
4         1    24194  29.281695  12.0
16        1    74045  29.199996  13.0
15        1    73666  28.648356  14.0
5         1    28181  28.595128  15.0
20        1    90385  28.535879  16.0
24        1   123675  28.495206  17.0
21        1    94729  28.362105  18.0
9         1    54937  28.253016  19.0
2         1    14142  28.227429  20.0


In [16]:
# Evaluate the ranking on the validation split.
val_df['score'] = model.predict(X_val)
val_df['rank'] = val_df.groupby('srch_id')['score'].rank(ascending=False, method='first')

# Hit@1: share of searches whose top-ranked property was booked.
top_preds = val_df[val_df['rank'] == 1]
hit_rate = (top_preds['booking_bool'] == 1).mean()

# NDCG@5, computed per search and then averaged.
ndcg_list = []
for srch_id, group in val_df.groupby('srch_id'):
    y_true = group['label'].values.reshape(1, -1)
    y_score = group['score'].values.reshape(1, -1)
    if y_true.shape[1] < 2:
        # Newer scikit-learn releases raise ValueError from ndcg_score when a
        # query has a single document. With one document the ranking is trivially
        # ideal, so score it 1.0 when the label is positive and 0.0 otherwise.
        ndcg = float(y_true[0, 0] > 0)
    else:
        ndcg = ndcg_score(y_true, y_score, k=5)
    ndcg_list.append(ndcg)
average_ndcg = np.mean(ndcg_list)
print(f"Hit Rate: {hit_rate:.4f}")
print(f"NDCG@5: {average_ndcg:.4f}")

Hit Rate: 0.5813
NDCG@5: 0.9205


In [14]:
# Write the ranked (srch_id, prop_id) pairs to a CSV file, without the index.
submission = output.loc[:, ['srch_id', 'prop_id']].copy()
submission.to_csv('hotel_ranking_test.csv', index=False)