In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
import xgboost as xgb

def train_test_split_by_group(data, id_column, test_size=0.2, random_state=None):
    """
    Partition a DataFrame into train and test sets without splitting any group.

    Every row that shares the same value in ``id_column`` (here: one search
    query) ends up on the same side of the split.

    Parameters:
    - data: pandas DataFrame, the dataset to be split.
    - id_column: str, the name of the column containing the group IDs.
    - test_size: float, optional (default=0.2), forwarded to GroupShuffleSplit;
      a float there is the proportion of *groups* placed in the test split.
    - random_state: int or RandomState instance, optional (default=None),
      forwarded to GroupShuffleSplit to control the shuffling.

    Returns:
    - train_set: pandas DataFrame, the rows selected for training.
    - test_set: pandas DataFrame, the rows selected for testing.
    """
    splitter = GroupShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )

    # Exactly one split is requested, so pull the first (and only) pair of
    # positional row-index arrays from the generator.
    train_rows, test_rows = next(splitter.split(data, groups=data[id_column]))

    # The indices are positional, hence iloc rather than loc.
    return data.iloc[train_rows], data.iloc[test_rows]

import re

# Load the engineered training table and strip every character that is not a
# letter, digit or underscore from the column names.
df = pd.read_csv('./dataset/train_new_feature.csv').rename(
    columns=lambda col: re.sub('[^A-Za-z0-9_]+', '', col)
)

# Split by search query (srch_id) so that no query is shared between the
# training and test sets; the seed is fixed for reproducibility.
train_set, test_set = train_test_split_by_group(
    df, 'srch_id', test_size=0.1, random_state=7
)
train_set

Unnamed: 0,srch_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,target_label,MEANprop_idprice_usd,SUBSTRACTprice_usdMEAN
0,1,,,219,893,3,3.5,1,2.83,0.0438,...,1,0,4,0,1,1,,0,118.758742,-13.988742
1,1,,,219,10404,4,4.0,1,2.20,0.0149,...,1,0,4,0,1,1,,0,152.054082,18.685918
2,1,,,219,21315,3,4.5,1,2.20,0.0245,...,1,0,4,0,1,1,,0,168.540871,11.259129
3,1,,,219,27348,2,4.0,1,2.83,0.0125,...,1,0,4,0,1,1,,0,82.598870,520.171130
4,1,,,219,29604,4,3.5,1,2.64,0.1241,...,1,0,4,0,1,1,,0,137.648135,5.931865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,,,219,77700,3,4.0,1,1.61,0.0471,...,1,21,3,0,1,0,,0,131.241702,-13.241702
4958343,332785,,,219,88083,3,4.0,1,1.95,0.1520,...,1,21,3,0,1,0,,0,84.545789,4.454211
4958344,332785,,,219,94508,3,3.5,1,1.10,0.0164,...,1,21,3,0,1,0,,0,116.537209,-17.537209
4958345,332785,,,219,128360,3,5.0,1,1.95,0.0662,...,1,21,3,0,1,0,,5,150.336757,-11.336757


In [2]:
# LightGBM's ranker interprets `group` as the sizes of consecutive row blocks,
# in row order.  The group sizes below come out in ascending srch_id order, so
# the training rows must be ordered by srch_id as well.  Enforce that here
# instead of silently relying on the CSV being pre-sorted; the stable sort
# keeps the original within-query row order, and the copy is skipped when the
# rows are already in order.
train_sorted = (
    train_set
    if train_set['srch_id'].is_monotonic_increasing
    else train_set.sort_values('srch_id', kind='stable')
)

# Features are every column except the query id and the relevance label.
X_train = train_sorted.drop(columns=['srch_id', 'target_label'])
# Kept as a one-column DataFrame, as before.
y_train = train_sorted[['target_label']]

q_id_train = train_sorted['srch_id']
# We need to keep the query id of every test row for the per-query evaluation.
q_id_test = test_set['srch_id']

# Number of rows per query, in the same order as the rows of X_train.
groups = train_sorted.groupby('srch_id').size()

X_test = test_set.drop(columns=['srch_id', 'target_label'])
y_test = test_set[['target_label']]

# The group sizes must add up to the number of training rows.
print(groups.sum())


4461236


In [3]:
from lightgbm import LGBMRanker

# Hyper-parameters of the LambdaRank model, gathered in one place.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=2000,
    learning_rate=0.12,
)
gbm = LGBMRanker(**ranker_params)

# `groups` carries the number of rows per srch_id so the ranker knows which
# rows belong to the same query.
gbm.fit(X_train, y_train, group=groups)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.111101 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 22


In [4]:
from sklearn.metrics import ndcg_score
from tqdm import tqdm

# Score the whole test set in a single call instead of once per query: the
# model scores each row independently, so the per-row values are unchanged.
test_scores = gbm.predict(X_test)
test_labels = y_test.to_numpy().ravel()
test_qids = np.asarray(q_id_test)

# Partition the row positions by query in one pass.  The stable sort keeps
# each query's rows in their original relative order, and np.unique returns
# the query ids in ascending order together with the start of each block.
row_order = np.argsort(test_qids, kind="stable")
qids, block_starts = np.unique(test_qids[row_order], return_index=True)
rows_per_query = np.split(row_order, block_starts[1:]) if len(qids) else []

# NDCG@5 per query, in the same order as `qids`.  Queries whose labels are all
# zero are included in the list (they are not skipped).
ndcg_ = [
    ndcg_score([test_labels[rows]], [test_scores[rows]], k=5)
    for rows in tqdm(rows_per_query)
]

100%|██████████| 19980/19980 [00:22<00:00, 884.44it/s]


In [5]:
# Average of the per-query NDCG@5 values collected in `ndcg_`.
np.asarray(ndcg_).mean()


0.41883659175091154