GRID SEARCH?

In [1]:
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
import pandas as pd
from lightgbm import LGBMRanker
from tqdm import tqdm
import re
from sklearn.metrics import ndcg_score
from tqdm import tqdm

def train_test_split_by_group(data, id_column, test_size=0.2, random_state=None):
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    groups = data[id_column]
    train_idx, test_idx = next(gss.split(data, groups=groups))
    train_set = data.iloc[train_idx]
    test_set = data.iloc[test_idx]

    return train_set, test_set


def preprocess_data(train_set, test_set, test_label=False):
    train_set = train_set.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    test_set = test_set.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    X_train = train_set.loc[:, ~train_set.columns.isin(['srch_id','target_label', 'position', 'gross_booking_usd'])]
    y_train = train_set['target_label']
    qid_train = train_set['srch_id']
    X_test = test_set.loc[:, ~test_set.columns.isin(['srch_id','target_label', 'position', 'gross_booking_usd'])]
    if test_label:
        y_test = test_set['target_label']
    else:
        y_test = None
    qid_test = test_set['srch_id']

    return X_train, y_train, qid_train, X_test, y_test, qid_test


def train_lgbm_ranker(X_train, y_train, train_groups, params=None):
    if params is None:
        params = {
            "objective": "lambdarank",
            "metric": "ndcg",
            "n_estimators": 2000,
            "learning_rate": 0.05,
            "verbose": -1
        }

    gbm = LGBMRanker(**params)
    gbm.fit(X_train, y_train, group=train_groups)

    return gbm


def predict_and_generate_submission(model, X_test, q_id_test, output_file):
    predictions = []
    for group in tqdm(np.unique(q_id_test), desc='Processing groups'):
        preds = model.predict(X_test[q_id_test == group])
        predictions.extend(preds)

    X_test['preds'] = predictions
    X_test['srch_id'] = q_id_test

    result = X_test.sort_values(by=['srch_id', 'preds'], ascending=[True, False])
    result[['srch_id', 'prop_id']].reset_index(drop=True).to_csv(output_file, index=False)



def calculate_ndcg(model, X_test, y_test, q_id_test, k=5, use_tqdm = True):

    
    ndcg_scores = []
    qids = np.unique(q_id_test)

    iterator = tqdm(qids, desc='Calculating NDCG scores') if use_tqdm else qids
    
    for qid in iterator:
        y = y_test[q_id_test == qid].values.flatten()
        p = model.predict(X_test[q_id_test == qid])
        ndcg_scores.append(ndcg_score([y], [p], k=k))
    return np.mean(ndcg_scores)


In [2]:
train_set_path = './dataset/train_new_feature.csv'
test_set_path = './dataset/test_new_feature.csv'

train_set, test_set = train_test_split_by_group(pd.read_csv(train_set_path), id_column='srch_id', test_size=0.1, random_state= 7)

In [3]:
X_train, y_train, qid_train, X_test, y_test, qid_test = preprocess_data(train_set, test_set, test_label=True)
qid_train.value_counts().sort_index()

srch_id
1         28
4         32
6          5
8         21
11        33
          ..
332776    10
332777    32
332781    15
332782    24
332785     6
Name: count, Length: 179815, dtype: int64

In [4]:
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "n_estimators": 100,
    "learning_rate": 0.1
}
model = train_lgbm_ranker(X_train, y_train, qid_train.value_counts().sort_index(), params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.173887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3231
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 22


In [5]:
calculate_ndcg(model, X_test, y_test, qid_test, k=5)

Calculating NDCG scores: 100%|â–ˆ| 19980/19980 [00:31<00:00, 632


0.3824297543228811

In [None]:
import logging
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def grid_search_lgbm_ranker(X_train, y_train, train_groups, X_test, y_test, q_id_test, param_grid=None):
    best_ndcg = -np.inf
    best_model = None
    current_params = {}

    if not param_grid:
        param_grid = {
            'objective': ['lambdarank'],
            'learning_rate': [0.01, 0.05, 0.1],
            'num_leaves': [31, 64, 256],
            'min_data_in_leaf': [20, 100, 200],
            'lambda_l2': [0, 0.5, 1.0],
            'max_bin': [256, 512],
            'n_estimators': [500, 1000],
            'metric': [['ndcg']],
            "verbose": [-1]
        }

    # Initialize current_params with the first option for each parameter
    for key in param_grid:
        current_params[key] = param_grid[key][0]

    # Sequentially iterate over each hyperparameter
    for key in param_grid:
        logging.info(f"Starting optimization for {key} with {len(param_grid[key])} possible values.")
        for index, value in enumerate(param_grid[key]):
            current_params[key] = value
            model = train_lgbm_ranker(X_train, y_train, train_groups, current_params)
            ndcg_score = calculate_ndcg(model, X_test, y_test, q_id_test, k=5, use_tqdm=False)

            logging.info(f"Iteration {index + 1}/{len(param_grid[key])} for {key}: Testing value {value}")

            if ndcg_score > best_ndcg:
                best_ndcg = ndcg_score
                best_model = model
                best_params = current_params.copy()  # Copy current optimal parameters

                logging.info(f"New best parameters found for {key}: {best_params}")
                logging.info(f"New best ndcg_score: {best_ndcg}")

        # Fix the best parameter found for the current hyperparameter
        current_params[key] = best_params[key]

    return best_model, best_params, best_ndcg

# Example usage
train_groups = qid_train.value_counts().sort_index()
best_model, best_params, best_ndcg_score = grid_search_lgbm_ranker(
    X_train, y_train, train_groups, X_test, y_test, qid_test
)

print("Best Params:", best_params)
print("Best NDCG Score:", best_ndcg_score)


2024-05-18 22:55:31,529 - INFO - Starting optimization for objective with 1 possible values.
2024-05-18 22:57:51,789 - INFO - Iteration 1/1 for objective: Testing value lambdarank
2024-05-18 22:57:51,789 - INFO - New best parameters found for objective: {'objective': 'lambdarank', 'learning_rate': 0.01, 'num_leaves': 31, 'min_data_in_leaf': 20, 'lambda_l2': 0, 'max_bin': 256, 'n_estimators': 500, 'metric': ['ndcg'], 'verbose': -1}
2024-05-18 22:57:51,789 - INFO - New best ndcg_score: 0.37548179042298896
2024-05-18 22:57:51,790 - INFO - Starting optimization for learning_rate with 3 possible values.
2024-05-18 23:00:18,108 - INFO - Iteration 1/3 for learning_rate: Testing value 0.01
2024-05-18 23:02:26,847 - INFO - Iteration 2/3 for learning_rate: Testing value 0.05
2024-05-18 23:02:26,848 - INFO - New best parameters found for learning_rate: {'objective': 'lambdarank', 'learning_rate': 0.05, 'num_leaves': 31, 'min_data_in_leaf': 20, 'lambda_l2': 0, 'max_bin': 256, 'n_estimators': 500, 