In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

"""
Citation: https://www.kaggle.com/code/divyansh22/lgbm-classifier-for-airline-recommendation/notebook
"""

'\nCitation: https://www.kaggle.com/code/divyansh22/lgbm-classifier-for-airline-recommendation/notebook\n'

## Step 1: load the dataset and prepare the data for training##

In [2]:
# load the dataset
CSV_PATH = "../2/dmt-2025-2nd-assignment/training_set_VU_DM.csv"
reader = pd.read_csv(CSV_PATH,nrows=250_000_0)
df = reader.copy()

CSV_PATH2 = "../2/dmt-2025-2nd-assignment/test_set_VU_DM.csv"
reader2 = pd.read_csv(CSV_PATH2)
df2 = reader2.copy()

In [19]:
# feature engineering
df['price_vs_mean'] = df['price_usd'] - df.groupby('srch_id')['price_usd'].transform('mean')
df['price_rank'] = df.groupby('srch_id')['price_usd'].rank()
df['review_rank'] = df.groupby('srch_id')['prop_review_score'].rank()
df["people count"] = df["srch_adults_count"] + df["srch_children_count"]

df2['price_rank'] = df2.groupby('srch_id')['price_usd'].rank()
df2['review_rank'] = df2.groupby('srch_id')['prop_review_score'].rank()
df2["people count"] = df2["srch_adults_count"] + df["srch_children_count"]

# interaction features
df['price_review_interaction'] = df['price_usd'] * df['prop_review_score']
df2['price_review_interaction'] = df2['price_usd'] * df2['prop_review_score']

# define user features
user_features = [
    "visitor_location_country_id",
    "srch_destination_id",
]
df_groupable = df.dropna(subset=user_features + ['prop_id'])

MemoryError: Unable to allocate 801. MiB for an array with shape (42, 2500000) and data type float64

In [5]:
# clean the data
df_groupable["srch_destination_id"] = pd.qcut(
    df['visitor_hist_adr_usd'].fillna(df['visitor_hist_adr_usd'].median()),
    q=4,
    duplicates='drop'
)
df_groupable['visitor_location_country_id'] = pd.qcut(
    df['visitor_hist_adr_usd'].fillna(df['visitor_hist_adr_usd'].median()),
    q=10,
    duplicates='drop'
)

In [6]:
# aggregate user features to get click and booking rates
agg = df_groupable.groupby(user_features + ['prop_id']).agg(
    sim_user_click_rate=('click_bool', 'mean'),
    sim_user_book_rate=('booking_bool', 'mean')
).reset_index()

# merge back to the original dataframe
df = df.merge(agg, on=user_features + ['prop_id'], how='left') # train
df2 = df2.merge(agg, on=user_features + ['prop_id'], how='left') # test

  agg = df_groupable.groupby(user_features + ['prop_id']).agg(


In [7]:
# fill in the missing values
df['sim_user_click_rate'] = df['sim_user_click_rate'].fillna(0)
df['sim_user_book_rate'] = df['sim_user_book_rate'].fillna(0)
df2['sim_user_click_rate'] = df2['sim_user_click_rate'].fillna(0)
df2['sim_user_book_rate'] = df2['sim_user_book_rate'].fillna(0)


In [8]:
# write a function that assign weights to the labels
def assign_weight(df, booking_label, click_label, booking_weight, click_weight, default_weight):
    """
    This function assigns weights to the labels based on the number of clicks and bookings
    """
    df = df.copy()
    df['label'] = df['booking_bool'] * booking_label + df['click_bool'] * click_label
    
    # assign weight according to the label
    df['weight'] = default_weight
    df.loc[(df['click_bool'] == 1) & (df['booking_bool'] == 0), 'weight'] = click_weight
    df.loc[df['booking_bool'] == 1, 'weight'] = booking_weight

    return df

In [9]:
# combined labels for click an booking
booking_label = 10
click_label = 5
booking_weight = 10
click_weight = 5
default_weight = 1
df_labeled = assign_weight(df, booking_label, click_label, booking_weight, click_weight, default_weight)

# group search session by user id
unique_searches = df_labeled['srch_id'].unique()

# split the train and test set
search_train, search_va = train_test_split(unique_searches, test_size=0.05, random_state=42)

# create a train and test dataset
train_df = df_labeled[df_labeled['srch_id'].isin(search_train)].sort_values('srch_id')
val_df  = df_labeled[df_labeled['srch_id'].isin(search_va)].sort_values('srch_id')
print(df_labeled['label'].value_counts())
print(f"Train set size: {train_df.shape}")
print(f"Validation set size: {val_df.shape}")


label
0     2388229
15      69679
5       42092
Name: count, dtype: int64
Train set size: (2374566, 63)
Validation set size: (125434, 63)


In [13]:
# prepare the input and labels for the model
features = [
        "srch_length_of_stay",
        "srch_booking_window",
        "people count",
        "srch_room_count",
        "srch_saturday_night_bool",
        "prop_review_score",
        "prop_starrating",
        "price_usd",
        "promotion_flag",
        "prop_brand_bool",
        "prop_location_score1",
        "prop_log_historical_price",
        "price_vs_mean",      # relative
        "star_diff",          # relative
        "review_diff",         # relative
        'price_review_interaction',
        "sim_user_click_rate",
        "sim_user_book_rate",
        ]
X_train = train_df[features]
X_val = val_df[features]
y_train = train_df['label']
y_val = val_df['label'] 
group_train = train_df.groupby('srch_id').size().to_list()

# these two parts should have the same size
print(f"Group train size: {sum(group_train)}")
print(f"X_train size: {X_train.shape}")

# create a validation set
val_group = val_df.groupby('srch_id').size().to_list()
val_set = lgb.Dataset(X_val, label=y_val, group=val_group)

Group train size: 2374566
X_train size: (2374566, 18)


## Step 2: train the LGBM model with LGBM training set and testing set

In [14]:
# train the models
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": 1,
}
train_data = lgb.Dataset(X_train, label=y_train, weight = train_df['weight'], group=group_train)
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_set],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
)

[LightGBM] [Info] Calculating query weights...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2086
[LightGBM] [Info] Number of data points in the train set: 2374566, number of used features: 16


## Step 3: Make and prediction and produce output

In [15]:
# prediction on the test set
X_test = df2[features]
df2['score'] = model.predict(X_test)
df2['rank'] = df2.groupby('srch_id')['score'].rank(ascending=False)
output = df2[['srch_id', 'prop_id', 'score', 'rank']].sort_values(['srch_id', 'rank'])
print(output.head(20))

    srch_id  prop_id      score  rank
23        1    99484  78.893356   1.0
12        1    61934  78.869016   2.0
20        1    90385  78.867132   3.0
14        1    72090  78.863138   4.0
9         1    54937  78.859816   5.0
4         1    24194  78.858536   6.0
6         1    34263  78.852761   7.0
7         1    37567  78.848721   8.0
5         1    28181  78.835857   9.0
22        1    95031  78.831040  10.0
8         1    50162  78.824122  11.0
18        1    82231  78.817031  12.0
0         1     3180  78.807989  13.0
17        1    78599  78.804626  14.0
1         1     5543  78.796179  15.0
16        1    74045  78.793331  16.0
25        1   128085  78.776872  17.0
24        1   123675  78.773399  18.0
3         1    22393  78.750752  19.0
2         1    14142  78.750126  20.0


In [16]:
# evaluate the ranking 
val_df['score'] = model.predict(X_val)
val_df['rank'] = val_df.groupby('srch_id')['score'].rank(ascending=False, method='first')

# Step 6: Evaluate Hit@1 and NDCG@5
top_preds = val_df[val_df['rank'] == 1]
hit_rate = (top_preds['booking_bool'] == 1).mean()

ndcg_list = []
for srch_id, group in val_df.groupby('srch_id'):
    y_true = group['label'].values.reshape(1, -1)
    y_score = group['score'].values.reshape(1, -1)
    ndcg = ndcg_score(y_true, y_score, k=5)
    ndcg_list.append(ndcg)
average_ndcg = np.mean(ndcg_list)
print(f"Hit Rate: {hit_rate:.4f}")
print(f"NDCG@5: {average_ndcg:.4f}")

Hit Rate: 0.0967
NDCG@5: 0.2896


In [13]:
# print the output as a csv file
submission = output[['srch_id', 'prop_id']].copy()
submission.to_csv('hotel_rankings.csv', index=False)