In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
import xgboost as xgb

train_set = pd.read_csv('./dataset/train_new_feature.csv')
import re
train_set = train_set.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test_set = pd.read_csv('./dataset/test_new_feature.csv')
test_set = test_set.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))


In [2]:
X_train = train_set.loc[:, ~train_set.columns.isin(['srch_id','target_label', 'position'])]
y_train = train_set.loc[:, train_set.columns.isin(['target_label'])]

q_id_train = train_set['srch_id']
q_id_test = test_set['srch_id']

groups = train_set.groupby('srch_id').size()

#We need to keep the id for later predictions
X_test = test_set.loc[:, ~test_set.columns.isin(['srch_id','target_label'])]
y_test = test_set.loc[:, test_set.columns.isin(['target_label'])]

In [3]:
from lightgbm import LGBMRanker

gbm = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=2000,
    learning_rate=0.12,
)
gbm.fit(X_train, y_train, group=groups)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.101179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2978
[LightGBM] [Info] Number of data points in the train set: 4958347, number of used features: 21


In [4]:
print(X_test.columns)
print(X_train.columns)

Index(['visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
       'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
       'srch_query_affinity_score', 'MEANprop_idprice_usd',
       'SUBSTRACTprice_usdMEAN'],
      dtype='object')
Index(['visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
       'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
       'srch_query_affinity_sc

In [5]:
from tqdm import tqdm

predictions = []
# Use tqdm to track progress over unique groups
for group in tqdm(np.unique(q_id_test), desc='Processing groups'):
    preds = gbm.predict(X_test[q_id_test == group])
    predictions.extend(preds)




Processing groups: 100%|██████████| 199549/199549 [09:01<00:00, 368.18it/s]


In [6]:
X_test['preds'] = predictions
X_test['srch_id'] = q_id_test

result = X_test.sort_values(by=['srch_id', 'preds'], ascending=[True, False])
result[['srch_id', 'prop_id']].reset_index(drop=True).to_csv("dataset/submission8_lightgbm.csv", index=False)

In [7]:
X_test


Unnamed: 0,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,...,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,MEANprop_idprice_usd,SUBSTRACTprice_usdMEAN,preds,srch_id
0,,,219,3180,3,4.5,1,2.94,0.0691,5.03,...,10,2,0,1,0,,125.520757,-6.520757,-0.042211,1
1,,,219,5543,3,4.5,1,2.64,0.0843,4.93,...,10,2,0,1,0,,117.532579,0.467421,0.029925,1
2,,,219,14142,2,3.5,1,2.71,0.0556,4.16,...,10,2,0,1,0,,51.886600,-2.886600,-0.404749,1
3,,,219,22393,3,4.5,1,2.40,0.0561,5.03,...,10,2,0,1,0,,133.987619,9.012381,-0.538686,1
4,,,219,24194,3,4.5,1,2.94,0.2090,4.72,...,10,2,0,1,0,,86.308224,-7.308224,0.887949,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4959178,,,117,32019,4,3.5,0,2.48,0.0551,4.53,...,7,1,0,1,0,,62.260769,3.809231,-0.149487,332787
4959179,,,117,33959,4,3.0,1,2.20,0.3344,4.39,...,7,1,0,1,0,,63.847692,3.252308,0.842564,332787
4959180,,,117,35240,4,0.0,0,1.79,,4.64,...,7,1,0,1,0,,62.955000,10.955000,-0.953241,332787
4959181,,,117,94437,4,0.0,0,2.94,0.0928,4.64,...,7,1,0,1,0,,64.810000,1.260000,-0.250466,332787
