In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [6]:
all_data = pd.read_parquet("data/train_cleaned.parquet")

In [7]:
# Step 1: Construct ranking label (5 for booking, 1 for click, 0 otherwise)
all_data['label'] = 5 * all_data['booking_bool'] + (1 - all_data['booking_bool']) * all_data['click_bool']

In [8]:

# Step 2: Choose features to use (exclude label, IDs, and target leakage like position)
drop_cols = [
    'date_time', 'click_bool', 'booking_bool', 'gross_bookings_usd',
    'label', 'position'  # 'position' only available in training set; not available in test
]
feature_cols = [col for col in all_data.columns if col not in drop_cols]

In [9]:

# Step 3: Train/validation split by srch_id to preserve group integrity
unique_srch_ids = all_data['srch_id'].unique()
train_ids, valid_ids = train_test_split(unique_srch_ids, test_size=0.2, random_state=42)

train_data = all_data[all_data['srch_id'].isin(train_ids)]
valid_data = all_data[all_data['srch_id'].isin(valid_ids)]

X_train = train_data[feature_cols]
y_train = train_data['label']
group_train = train_data.groupby('srch_id').size().values

X_valid = valid_data[feature_cols]
y_valid = valid_data['label']
group_valid = valid_data.groupby('srch_id').size().values

In [10]:

# Step 4: LightGBM dataset formatting
lgb_train = lgb.Dataset(X_train, label=y_train, group=group_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, group=group_valid, reference=lgb_train)

In [28]:
# Step 5: Set parameters for LambdaMART
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [5],
    'learning_rate': 0.1,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'verbosity': -1
}

# Step 6: Train model
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_valid],
    num_boost_round=200,
    callbacks=[
        lgb.early_stopping(stopping_rounds=10, verbose=True),
        lgb.log_evaluation(period=10)
    ],
)

Training until validation scores don't improve for 10 rounds
[10]	valid_0's ndcg@5: 0.352032
[20]	valid_0's ndcg@5: 0.360225
[30]	valid_0's ndcg@5: 0.365261
[40]	valid_0's ndcg@5: 0.368089
[50]	valid_0's ndcg@5: 0.370143
[60]	valid_0's ndcg@5: 0.371919
[70]	valid_0's ndcg@5: 0.37285
[80]	valid_0's ndcg@5: 0.374035
[90]	valid_0's ndcg@5: 0.374728
[100]	valid_0's ndcg@5: 0.376276
[110]	valid_0's ndcg@5: 0.376743
Early stopping, best iteration is:
[105]	valid_0's ndcg@5: 0.377084


In [29]:

# Step 7: Predict on validation set and compute NDCG@5
valid_preds = model.predict(X_valid)
valid_data = valid_data.copy()
valid_data['pred'] = valid_preds

In [30]:

# Step 8: Compute average NDCG@5 across queries
ndcg_scores = []
for srch_id, group in valid_data.groupby('srch_id'):
    if len(group) < 5:
        continue
    true_labels = group['label'].values
    preds = group['pred'].values
    score = ndcg_score([true_labels], [preds], k=5)
    ndcg_scores.append(score)

average_ndcg_5 = np.mean(ndcg_scores)
print(f"\nValidation NDCG@5: {average_ndcg_5:.4f}")



Validation NDCG@5: 0.3776


Generate predictions for the test set, to be submitted to Kaggle.

In [31]:
test_data = pd.read_parquet("data/test_cleaned.parquet")

In [32]:
len(test_data)

4959183

In [33]:
# Step 1: Prepare test features
X_test = test_data[feature_cols]  # use same feature list from training

# Step 2: Predict scores
test_data = test_data.copy()  # to avoid SettingWithCopyWarning
test_data['pred'] = model.predict(X_test, num_iteration=model.best_iteration)

# Step 3: Sort predictions per search
submission = (
    test_data
    .sort_values(by=['srch_id', 'pred'], ascending=[True, False])
    [['srch_id', 'prop_id']]
)

# Step 4: Save submission file
submission.to_csv("data/submission.csv", index=False)

In [34]:
submission = pd.read_csv("data/submission.csv")

In [35]:
len(submission)

4959183