# Learning to rank model
testing is done on validation set 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
import xgboost as xgb

def train_test_split_by_group(data, id_column, test_size=0.2, random_state=None):
    """
    Split a dataset by groups defined by a specific column.
    This function is used to split the queries into train and test set
    Parameters:
    - data: pandas DataFrame, the dataset to be split.
    - id_column: str, the name of the column containing the group IDs.
    - test_size: float, optional (default=0.2), the proportion of the dataset to include in the test split.
    - random_state: int or RandomState instance, optional (default=None), control the randomness of the shuffling.

    Returns:
    - train_set: pandas DataFrame, the training set.
    - test_set: pandas DataFrame, the test set.
    """
    # Create GroupShuffleSplit object
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)

    # Group by the specified column and apply GroupShuffleSplit
    groups = data[id_column]
    train_idx, test_idx = next(gss.split(data, groups=groups))

    # Split the dataset into train and test sets
    train_set = data.iloc[train_idx]
    test_set = data.iloc[test_idx]

    return train_set, test_set

def ndcg_at_k(true_relevance, predicted_relevance, k):
    # Ensure lengths of true_relevance and predicted_relevance are equal
    if len(true_relevance) != len(predicted_relevance):
        raise ValueError("Lengths of true_relevance and predicted_relevance must be equal")


    # Get the true relevance scores based on the sorted indices
    true_relevance_sorted = [true_relevance[i] for i in predicted_relevance]

    # Calculate DCG (Discounted Cumulative Gain) at k
    dcg_at_k = 0
    for i in range(min(k, len(true_relevance))):
        dcg_at_k += (2 ** true_relevance_sorted[i] - 1) / np.log2(i + 2)

    # Sort the true relevance scores
    true_relevance_sorted_desc = sorted(true_relevance, reverse=True)

    # Calculate ideal DCG at k
    idcg_at_k = 0
    for i in range(min(k, len(true_relevance))):
        idcg_at_k += (2 ** true_relevance_sorted_desc[i] - 1) / np.log2(i + 2)

    # Calculate NDCG at k
    if idcg_at_k == 0:
        ndcg_at_k = 0
    else:
        ndcg_at_k = dcg_at_k / idcg_at_k

    return ndcg_at_k



# Loading the data, splitting into train and validation set

In [2]:
df = pd.read_csv('./dataset/train_clean_v1_noIDs.csv')
train_set, test_set = train_test_split_by_group(df, 'srch_id', test_size=0.4, random_state=7)
train_set

Unnamed: 0,srch_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_log_historical_price,position,price_usd,promotion_flag,srch_length_of_stay,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,target_label
0,1,3.374334,176.022659,219,893,3,3.5,1,2.83,4.95,27,104.77,0,1,4,0,1,1,0
1,1,3.374334,176.022659,219,10404,4,4.0,1,2.20,5.03,26,170.74,0,1,4,0,1,1,0
2,1,3.374334,176.022659,219,21315,3,4.5,1,2.20,4.92,21,179.80,0,1,4,0,1,1,0
3,1,3.374334,176.022659,219,27348,2,4.0,1,2.83,4.39,34,602.77,0,1,4,0,1,1,0
4,1,3.374334,176.022659,219,29604,4,3.5,1,2.64,4.93,4,143.58,0,1,4,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,3.374334,176.022659,219,77700,3,4.0,1,1.61,0.00,2,118.00,0,1,3,0,1,0,0
4958343,332785,3.374334,176.022659,219,88083,3,4.0,1,1.95,0.00,3,89.00,0,1,3,0,1,0,0
4958344,332785,3.374334,176.022659,219,94508,3,3.5,1,1.10,0.00,4,99.00,0,1,3,0,1,0,0
4958345,332785,3.374334,176.022659,219,128360,3,5.0,1,1.95,0.00,1,139.00,0,1,3,0,1,0,5


# preparing the data for XGBRanker

In [3]:
X_train = train_set.loc[:, ~train_set.columns.isin(['srch_id','target_label'])]
y_train = train_set.loc[:, train_set.columns.isin(['target_label'])]

groups = train_set['srch_id'].to_numpy()

#We need to keep the id for later predictions
X_test = test_set.loc[:, ~test_set.columns.isin(['target_label'])]
y_test = test_set.loc[:, test_set.columns.isin(['srch_id','target_label'])]


In [4]:
# TODO: hyper-parameter tuning
params = {
    'tree_method': 'hist',
    'objective': 'rank:ndcg',  # Use rank:ndcg for optimizing NDCG
    'eval_metric': 'ndcg@5',    # Evaluation metric NDCG@k
    'eta': 0.1,                  # Learning rate
    'max_depth': 6               # Maximum depth of a tree
}

# Initialize the XGBRanker with modified parameters
model = xgb.XGBRanker(**params)

model.fit(X_train.to_numpy(), y_train.to_numpy(), qid = groups, verbose=True)

# Prediction
prediction is done query by query

In [5]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['srch_id', 'target_label'])])

predictions = (X_test.groupby('srch_id')
               .apply(lambda x: predict(model, x)))


modified_df = X_test.groupby('srch_id').apply(lambda group: group.assign(predicted_reg=predictions[group.name])).reset_index(drop=True)

predictions


  predictions = (X_test.groupby('srch_id')
  modified_df = X_test.groupby('srch_id').apply(lambda group: group.assign(predicted_reg=predictions[group.name])).reset_index(drop=True)


srch_id
8         [-0.6645105, -0.3880853, -0.91065043, -0.30604...
12        [-0.66960853, -1.0302794, -1.2702277, -0.21748...
17        [-1.1658673, 0.103358746, -0.5715241, -1.19735...
25        [1.8042231, -0.42806253, 1.798525, 0.23056786,...
28        [0.8711667, -1.2103298, 0.050944675, -0.036663...
                                ...                        
332765    [0.37761077, -0.39742115, -0.5439628, 0.681853...
332768    [-0.7833695, 0.46244556, -0.3092047, -0.610602...
332777    [-1.3024815, -0.91815746, 0.59281975, -0.81732...
332781    [-0.25221604, 1.6292592, 0.9338594, -0.5175794...
332784    [-1.1683971, -0.3608559, -0.038870476, -0.7946...
Length: 79918, dtype: object

In [6]:
from sklearn.metrics import ndcg_score
import numpy as np

# result = modified_df.sort_values(by=['srch_id', 'predicted_reg'], ascending=[True, False])

# Assuming `y_test['target_label'].to_numpy()` contains true relevance scores
# and `result['predicted_reg'].to_numpy()` contains predicted relevance scores

# Calculate NDCG@5 per query
ndcg_per_query = []
for query_id in modified_df['srch_id'].unique():
    true_relevance = y_test[y_test['srch_id'] == query_id]['target_label'].values
    predicted_relevance = modified_df[modified_df['srch_id'] == query_id]['predicted_reg'].values
    ndcg_per_query.append(ndcg_score([true_relevance], [predicted_relevance], k=5))


# Average over all queries
average_ndcg = np.mean(ndcg_per_query)
print("Average NDCG@5 weighted by log2:", average_ndcg)

Average NDCG@5 weighted by log2: 0.39641765916161265


In [7]:
modified_df

Unnamed: 0,srch_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_log_historical_price,position,price_usd,promotion_flag,srch_length_of_stay,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,predicted_reg
0,8,3.374334,176.022659,219,10250,3,3.5,1,1.39,4.64,15,82.00,0,1,1,0,1,0,-0.664510
1,8,3.374334,176.022659,219,13252,4,4.5,1,0.00,5.49,19,160.00,1,1,1,0,1,0,-0.388085
2,8,3.374334,176.022659,219,22756,4,4.0,1,2.83,5.11,10,259.00,0,1,1,0,1,0,-0.910650
3,8,3.374334,176.022659,219,27669,3,3.5,1,3.00,4.62,16,85.00,0,1,1,0,1,0,-0.306044
4,8,3.374334,176.022659,219,30630,3,4.0,1,0.00,4.81,20,109.00,0,1,1,0,1,0,-1.104660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1982838,332784,3.374334,176.022659,100,114177,4,4.5,1,4.75,0.00,15,300.74,0,2,1,0,1,0,-1.260272
1982839,332784,3.374334,176.022659,100,115562,4,4.0,1,4.74,0.00,19,213.15,0,2,1,0,1,0,-0.840384
1982840,332784,3.374334,176.022659,100,121275,4,4.0,0,4.98,0.00,25,193.87,1,2,1,0,1,0,-0.933107
1982841,332784,3.374334,176.022659,100,128677,3,3.5,1,4.56,0.00,8,107.05,0,2,1,0,1,0,0.146990
