GRID SEARCH?

In [1]:
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
import pandas as pd
from lightgbm import LGBMRanker
from tqdm import tqdm
import re
from sklearn.metrics import ndcg_score
from tqdm import tqdm

def train_test_split_by_group(data, id_column, test_size=0.2, random_state=None):
    """Split ``data`` into a train part and a test part without splitting a group.

    Rows that share a value in ``id_column`` are passed to ``GroupShuffleSplit``
    as one group, so a group ends up entirely in one of the two parts.

    Args:
        data: DataFrame to split.
        id_column: Name of the column holding the group identifier.
        test_size: Forwarded to ``GroupShuffleSplit`` as ``test_size``.
        random_state: Forwarded to ``GroupShuffleSplit`` as ``random_state``.

    Returns:
        Tuple ``(train_set, test_set)`` of positional selections from ``data``.
    """
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # n_splits=1: the first pair of index arrays produced is the only split.
    train_rows, test_rows = next(splitter.split(data, groups=data[id_column]))
    return data.iloc[train_rows], data.iloc[test_rows]


def preprocess_data(train_set, test_set, test_label=False):
    """Turn the train/test frames into feature, label and query-id parts.

    Column names are first stripped of every character that is not a letter,
    digit or underscore. The columns ``srch_id``, ``target_label``, ``position``
    and ``gross_booking_usd`` are then excluded from the feature frames.

    Args:
        train_set: Training DataFrame; must contain ``target_label`` and ``srch_id``.
        test_set: Test DataFrame; must contain ``srch_id`` (and ``target_label``
            when ``test_label`` is true).
        test_label: Whether to extract ``target_label`` from ``test_set``.

    Returns:
        ``(X_train, y_train, qid_train, X_test, y_test, qid_test)``; ``y_test`` is
        ``None`` when ``test_label`` is false.
    """
    def _clean_name(column):
        return re.sub('[^A-Za-z0-9_]+', '', column)

    non_feature_columns = ['srch_id','target_label', 'position', 'gross_booking_usd']

    train_clean = train_set.rename(columns=_clean_name)
    test_clean = test_set.rename(columns=_clean_name)

    train_keep = ~train_clean.columns.isin(non_feature_columns)
    X_train = train_clean.loc[:, train_keep]
    y_train = train_clean['target_label']
    qid_train = train_clean['srch_id']

    test_keep = ~test_clean.columns.isin(non_feature_columns)
    X_test = test_clean.loc[:, test_keep]
    # Labels are only read from the test frame on request.
    y_test = test_clean['target_label'] if test_label else None
    qid_test = test_clean['srch_id']

    return X_train, y_train, qid_train, X_test, y_test, qid_test


def train_lgbm_ranker(X_train, y_train, train_groups, params=None):
    """Fit an ``LGBMRanker`` and return it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        train_groups: Passed to ``LGBMRanker.fit`` as ``group``.
        params: Keyword arguments for the ``LGBMRanker`` constructor. When
            ``None``, a lambdarank/NDCG configuration with 2000 estimators and
            learning rate 0.05 is used.

    Returns:
        The fitted ``LGBMRanker``.
    """
    fallback_params = {
        "objective": "lambdarank",
        "metric": "ndcg",
        "n_estimators": 2000,
        "learning_rate": 0.05,
        "verbose": -1
    }
    ranker = LGBMRanker(**(fallback_params if params is None else params))
    ranker.fit(X_train, y_train, group=train_groups)
    return ranker


def predict_and_generate_submission(model, X_test, q_id_test, output_file):
    """Score every test row and write a ranked ``srch_id,prop_id`` CSV.

    Rows are ordered by ascending ``srch_id`` and, within a search, by
    descending predicted score. Only the ``srch_id`` and ``prop_id`` columns
    are written, without the index.

    The model is asked for all scores in a single ``predict`` call so that each
    score stays attached to the row it was computed for, whatever the row order
    of ``X_test``. ``X_test`` itself is left unmodified.

    Args:
        model: Object exposing ``predict(X)`` that returns one score per row.
            NOTE(review): assumes scores do not depend on which other rows are
            in the batch (the previous per-group calls passed no group
            information either) - confirm for non-LightGBM models.
        X_test: Feature DataFrame; must contain a ``prop_id`` column.
        q_id_test: Search id of each row of ``X_test``, in the same row order.
        output_file: Path or buffer handed to ``DataFrame.to_csv``.

    Raises:
        ValueError: If ``q_id_test`` and ``X_test`` differ in length.
    """
    query_ids = np.asarray(q_id_test)
    if len(query_ids) != len(X_test):
        raise ValueError("q_id_test must have exactly one entry per row of X_test")

    scores = model.predict(X_test)

    # Build a small separate frame instead of adding columns to the caller's
    # feature frame (which would leak 'preds'/'srch_id' into later predictions).
    ranking = pd.DataFrame({
        'srch_id': query_ids,
        'prop_id': X_test['prop_id'].to_numpy(),
        'preds': np.asarray(scores),
    })
    ranking = ranking.sort_values(by=['srch_id', 'preds'], ascending=[True, False])
    ranking[['srch_id', 'prop_id']].reset_index(drop=True).to_csv(output_file, index=False)



def calculate_ndcg(model, X_test, y_test, q_id_test, k=5, use_tqdm = True):
    """Return the mean NDCG@k over all queries in the test data.

    Scores are computed with one ``model.predict`` call over ``X_test``; the
    rows of each query are then located through a single stable sort of the
    query ids. This replaces building two full-length boolean masks and issuing
    one ``predict`` call per query.

    Args:
        model: Object exposing ``predict(X)`` that returns one score per row.
            NOTE(review): assumes scores do not depend on which other rows are
            in the batch - confirm for non-LightGBM models.
        X_test: Feature rows to score.
        y_test: Relevance label of each row, in the same row order as ``X_test``.
        q_id_test: Query id of each row, in the same row order as ``X_test``.
        k: Rank cut-off forwarded to ``ndcg_score``.
        use_tqdm: Whether to show a progress bar over the queries.

    Returns:
        ``np.mean`` of the per-query ``ndcg_score`` values, with queries visited
        in ascending id order.
    """
    scores = np.asarray(model.predict(X_test))
    labels = np.asarray(y_test).ravel()
    query_ids = np.asarray(q_id_test)

    # A stable sort keeps the original row order inside each query and makes
    # every query one contiguous slice of `row_order`.
    row_order = np.argsort(query_ids, kind="stable")
    _, run_starts = np.unique(query_ids[row_order], return_index=True)
    run_bounds = np.append(run_starts, len(query_ids))

    query_slots = range(len(run_starts))
    iterator = tqdm(query_slots, desc='Calculating NDCG scores') if use_tqdm else query_slots

    ndcg_scores = []
    for slot in iterator:
        rows = row_order[run_bounds[slot]:run_bounds[slot + 1]]
        ndcg_scores.append(ndcg_score([labels[rows]], [scores[rows]], k=k))
    return np.mean(ndcg_scores)


In [2]:
# Locations of the engineered-feature CSV files.
train_set_path = './dataset/train_new_feature.csv'
test_set_path = './dataset/test_new_feature.csv'

# Hold out 10% of the searches (whole srch_id groups) from the training file.
train_set, test_set = train_test_split_by_group(
    pd.read_csv(train_set_path),
    id_column='srch_id',
    test_size=0.1,
    random_state= 7,
)

In [3]:
# Split both frames into features, labels and query ids; the held-out part
# keeps its labels so it can be scored.
(
    X_train, y_train, qid_train,
    X_test, y_test, qid_test,
) = preprocess_data(train_set, test_set, test_label=True)

# Number of training rows per search, listed in ascending srch_id order.
qid_train.value_counts().sort_index()

srch_id
1         28
4         32
6          5
8         21
11        33
          ..
332776    10
332777    32
332781    15
332782    24
332785     6
Name: count, Length: 179815, dtype: int64

In [4]:
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "n_estimators": 100,
    "learning_rate": 0.1
}

# LightGBM reads `group` as consecutive run lengths over the rows of X_train,
# so count the rows of each search in the order the searches appear. Sizes
# listed in sorted-id order only match when the rows are sorted by srch_id.
train_groups = qid_train.groupby(qid_train, sort=False).size()
# Each search must occupy one contiguous run of rows; otherwise no list of
# group sizes can describe the data and the ranker would train on mixed queries.
assert int((qid_train != qid_train.shift()).sum()) == len(train_groups), \
    "rows of each srch_id must be contiguous in the training data"

model = train_lgbm_ranker(X_train, y_train, train_groups, params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.156598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2976
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 21


In [5]:
# Mean NDCG@5 of the baseline model over the held-out searches.
calculate_ndcg(
    model,
    X_test,
    y_test,
    qid_test,
    k=5,
)

Calculating NDCG scores: 100%|█| 19980/19980 [00:31<00:00, 638


0.37994632886014024

In [None]:
from itertools import product
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging


def grid_search_lgbm_ranker(X_train, y_train, train_groups, X_test, y_test, q_id_test, param_grid = None, k=5):
    """Exhaustively search ``param_grid`` and keep the model with the best NDCG@k.

    Every combination of the grid values is trained with ``train_lgbm_ranker``
    and scored with ``calculate_ndcg`` on the supplied evaluation data. Each
    time a combination beats the best score so far, its parameters and score
    are printed.

    Args:
        X_train: Training features.
        y_train: Training labels.
        train_groups: Group sizes forwarded to ``train_lgbm_ranker``.
        X_test: Evaluation features.
        y_test: Evaluation labels.
        q_id_test: Query id of each evaluation row.
        param_grid: Mapping of parameter name to a list of candidate values.
            When ``None`` or empty, a built-in grid of 324 combinations is used.
        k: Rank cut-off used for the NDCG evaluation (default 5).

    Returns:
        ``(best_model, best_params, best_ndcg)``. If no combination produced a
        score greater than ``-inf`` (for example, every score was NaN), this is
        ``(None, {}, -inf)``.
    """
    from itertools import product

    if not param_grid:
        param_grid = {
            'objective': ['lambdarank'],
            'learning_rate': [0.01, 0.05, 0.1],
            'num_leaves': [31, 64, 256],
            'min_data_in_leaf': [20, 100, 200],
            'lambda_l2': [0, 0.5, 1.0],
            'max_bin': [256, 512],
            'n_estimators': [500, 1000],
            'metric': [['ndcg']],
            "verbose": [-1]
        }

    keys, values = zip(*param_grid.items())

    # Cartesian product of all candidate values, one fresh dict per combination.
    param_combinations = [dict(zip(keys, v)) for v in product(*values)]

    best_ndcg = -np.inf
    best_model = None
    best_params = {}

    for params in tqdm(param_combinations, desc='Testing parameter combinations'):
        model = train_lgbm_ranker(X_train, y_train, train_groups, params)

        # Named `score` so the imported `ndcg_score` function is not shadowed.
        score = calculate_ndcg(model, X_test, y_test, q_id_test, k=k, use_tqdm = False)

        if score > best_ndcg:
            best_ndcg = score
            best_model = model
            best_params = params

            print(f"The best parameters yet: {best_params}")
            print(f"The best ndcg_score yet: {best_ndcg}")

    return best_model, best_params, best_ndcg


# LightGBM reads `group` as consecutive run lengths over the rows of X_train,
# so count the rows of each search in the order the searches appear. Sizes
# listed in sorted-id order only match when the rows are sorted by srch_id.
train_groups = qid_train.groupby(qid_train, sort=False).size()
# Each search must occupy one contiguous run of rows; otherwise no list of
# group sizes can describe the data and the ranker would train on mixed queries.
assert int((qid_train != qid_train.shift()).sum()) == len(train_groups), \
    "rows of each srch_id must be contiguous in the training data"

best_model, best_params, best_ndcg_score = grid_search_lgbm_ranker(
    X_train, y_train, train_groups, X_test, y_test, qid_test
)


# predict_and_generate_submission(best_model, X_test, qid_test, "output_predictions.csv")

print("Best Params:", best_params)
print("Best NDCG Score:", best_ndcg_score)


Testing parameter combinations:   0%| | 1/324 [02:10<11:43:44,

The best parameters yet: {'objective': 'lambdarank', 'learning_rate': 0.01, 'num_leaves': 31, 'min_data_in_leaf': 20, 'lambda_l2': 0, 'max_bin': 256, 'n_estimators': 500, 'metric': ['ndcg'], 'verbose': -1}
The best ndcg_score yet: 0.3734379328932991


Testing parameter combinations:   1%| | 2/324 [05:45<16:06:47,

The best parameters yet: {'objective': 'lambdarank', 'learning_rate': 0.01, 'num_leaves': 31, 'min_data_in_leaf': 20, 'lambda_l2': 0, 'max_bin': 256, 'n_estimators': 1000, 'metric': ['ndcg'], 'verbose': -1}
The best ndcg_score yet: 0.3799843565015373


Testing parameter combinations:   1%| | 4/324 [11:37<16:16:43,

The best parameters yet: {'objective': 'lambdarank', 'learning_rate': 0.01, 'num_leaves': 31, 'min_data_in_leaf': 20, 'lambda_l2': 0, 'max_bin': 512, 'n_estimators': 1000, 'metric': ['ndcg'], 'verbose': -1}
The best ndcg_score yet: 0.3810214954272664


Testing parameter combinations:   4%| | 12/324 [35:08<15:48:34

The best parameters yet: {'objective': 'lambdarank', 'learning_rate': 0.01, 'num_leaves': 31, 'min_data_in_leaf': 20, 'lambda_l2': 1.0, 'max_bin': 512, 'n_estimators': 1000, 'metric': ['ndcg'], 'verbose': -1}
The best ndcg_score yet: 0.38104075651368263


Testing parameter combinations:   6%| | 20/324 [58:48<15:53:50