In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
import xgboost as xgb

def train_test_split_by_group(data, id_column, test_size=0.2, random_state=None):
    """
    Partition a DataFrame into train and test parts without splitting a group.

    A single GroupShuffleSplit draw is made using the values of ``id_column``
    as group labels, so all rows of one query end up on the same side.

    Parameters:
    - data: pandas DataFrame, the dataset to be split.
    - id_column: str, name of the column holding the group (query) ids.
    - test_size: float, optional (default=0.2), forwarded to GroupShuffleSplit;
      per the scikit-learn documentation a float is the proportion of *groups*
      (not rows) placed in the test split.
    - random_state: int or RandomState instance, optional (default=None),
      forwarded to GroupShuffleSplit to control the shuffling.

    Returns:
    - train_set: pandas DataFrame, the rows selected for training.
    - test_set: pandas DataFrame, the rows selected for testing.
    """
    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )

    # split() yields positional row indices; with n_splits=1 the first (and
    # only) pair is the split we want.
    train_rows, test_rows = next(splitter.split(data, groups=data[id_column]))

    return data.iloc[train_rows], data.iloc[test_rows]

# Load the cleaned training data, then hold out part of it for evaluation.
# The split is made on 'srch_id' so that a search never has rows in both
# sets; random_state is fixed so the split is reproducible.
df = pd.read_csv('./dataset/train_clean_v1.csv')
train_set, test_set = train_test_split_by_group(
    data=df,
    id_column='srch_id',
    test_size=0.4,
    random_state=7,
)


In [2]:
# LightGBM's ranker receives per-query row counts (`group`) and matches them to
# the training rows purely by position: the first groups[0] rows are query 1,
# the next groups[1] rows are query 2, and so on. groupby('srch_id').size()
# returns the counts in ascending srch_id order, so the rows must be in that
# same order with each query contiguous. A stable sort guarantees this however
# the CSV was ordered, keeps the within-query row order, and is a no-op when
# the data is already sorted.
train_sorted = train_set.sort_values('srch_id', kind='stable')

# Features are every column except the query id and the label.
X_train = train_sorted.loc[:, ~train_sorted.columns.isin(['srch_id', 'target_label'])]
# Kept as a single-column DataFrame (not a Series), as before.
y_train = train_sorted.loc[:, train_sorted.columns.isin(['target_label'])]

# We need to keep the ids for later predictions: they are removed from the
# feature matrices but are required to regroup rows per query at evaluation.
q_id_train = train_sorted['srch_id']
q_id_test = test_set['srch_id']

# Number of rows per query, ordered by srch_id to match train_sorted.
groups = train_sorted.groupby('srch_id').size()
# Every training row must belong to exactly one group (fails e.g. on NaN ids).
assert groups.sum() == len(X_train), "group sizes do not cover all training rows"

X_test = test_set.loc[:, ~test_set.columns.isin(['srch_id', 'target_label'])]
y_test = test_set.loc[:, test_set.columns.isin(['target_label'])]
print(groups.sum())


2975504


In [None]:
from lightgbm import LGBMRanker

# LambdaRank-objective gradient-boosted ranker. `group` carries the number of
# rows belonging to each query; LightGBM applies these counts to the rows of
# X_train in order, so they must line up with the row order of X_train.
gbm = LGBMRanker(
    n_estimators=2000,
    learning_rate=0.12,
    objective="lambdarank",
    metric="ndcg",
)
gbm.fit(X=X_train, y=y_train, group=groups)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2125
[LightGBM] [Info] Number of data points in the train set: 2975504, number of used features: 19


In [None]:
from sklearn.metrics import ndcg_score

# Score every test row in a single predict() call. Predictions are per-row, so
# this gives the same scores as predicting query by query, without rebuilding
# a full-length boolean mask and paying predict() overhead for every query.
test_scores = np.asarray(gbm.predict(X_test))
test_labels = np.asarray(y_test).ravel()  # works for a 1-column frame or a Series
test_qids = np.asarray(q_id_test)

# Bring each query's rows together. The stable sort preserves the original
# within-query row order, and `starts` marks where each query begins.
order = np.argsort(test_qids, kind="stable")
qids, starts = np.unique(test_qids[order], return_index=True)
labels_per_query = np.split(test_labels[order], starts[1:])
scores_per_query = np.split(test_scores[order], starts[1:])

# One NDCG@5 value per query, in ascending query-id order.
ndcg_ = list()
for y, p in zip(labels_per_query, scores_per_query):
    # ndcg_score rejects single-document queries in recent scikit-learn
    # releases (ranking one item carries no information), so skip them
    # instead of aborting the whole evaluation.
    if len(y) < 2:
        continue
    # NOTE(review): queries whose labels are all zero are still scored here
    # and therefore count towards the mean - confirm that is intended.
    ndcg_.append(ndcg_score([y], [p], k=5))

In [None]:
# Average of the per-query NDCG@5 values collected above.
np.asarray(ndcg_).mean()
