# Learning to rank model
testing is done on validation set 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
import xgboost as xgb

def train_test_split_by_group(data, id_column, test_size=0.2, random_state=None):
    """
    Split a dataset by groups defined by a specific column.
    This function is used to split the queries into train and test set
    Parameters:
    - data: pandas DataFrame, the dataset to be split.
    - id_column: str, the name of the column containing the group IDs.
    - test_size: float, optional (default=0.2), the proportion of the dataset to include in the test split.
    - random_state: int or RandomState instance, optional (default=None), control the randomness of the shuffling.

    Returns:
    - train_set: pandas DataFrame, the training set.
    - test_set: pandas DataFrame, the test set.
    """
    # Create GroupShuffleSplit object
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)

    # Group by the specified column and apply GroupShuffleSplit
    groups = data[id_column]
    train_idx, test_idx = next(gss.split(data, groups=groups))

    # Split the dataset into train and test sets
    train_set = data.iloc[train_idx]
    test_set = data.iloc[test_idx]

    return train_set, test_set

def ndcg_at_k(true_relevance, predicted_relevance, k):
    # Ensure lengths of true_relevance and predicted_relevance are equal
    if len(true_relevance) != len(predicted_relevance):
        raise ValueError("Lengths of true_relevance and predicted_relevance must be equal")


    # Get the true relevance scores based on the sorted indices
    true_relevance_sorted = [true_relevance[i] for i in predicted_relevance]

    # Calculate DCG (Discounted Cumulative Gain) at k
    dcg_at_k = 0
    for i in range(min(k, len(true_relevance))):
        dcg_at_k += (2 ** true_relevance_sorted[i] - 1) / np.log2(i + 2)

    # Sort the true relevance scores
    true_relevance_sorted_desc = sorted(true_relevance, reverse=True)

    # Calculate ideal DCG at k
    idcg_at_k = 0
    for i in range(min(k, len(true_relevance))):
        idcg_at_k += (2 ** true_relevance_sorted_desc[i] - 1) / np.log2(i + 2)

    # Calculate NDCG at k
    if idcg_at_k == 0:
        ndcg_at_k = 0
    else:
        ndcg_at_k = dcg_at_k / idcg_at_k

    return ndcg_at_k



# Loading the data, splitting into train and validation set

In [2]:
df = pd.read_csv('./dataset/train_clean_v1.csv')
train_set, test_set = train_test_split_by_group(df, 'srch_id', test_size=0.4, random_state=7)
train_set

Unnamed: 0,srch_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,...,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,target_label
0,1,187,3.374334,176.022659,219,893,3,3.5,1,2.83,...,27,104.77,0,23246,1,4,0,1,1,0
1,1,187,3.374334,176.022659,219,10404,4,4.0,1,2.20,...,26,170.74,0,23246,1,4,0,1,1,0
2,1,187,3.374334,176.022659,219,21315,3,4.5,1,2.20,...,21,179.80,0,23246,1,4,0,1,1,0
3,1,187,3.374334,176.022659,219,27348,2,4.0,1,2.83,...,34,602.77,0,23246,1,4,0,1,1,0
4,1,187,3.374334,176.022659,219,29604,4,3.5,1,2.64,...,4,143.58,0,23246,1,4,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,219,3.374334,176.022659,219,77700,3,4.0,1,1.61,...,2,118.00,0,16974,1,3,0,1,0,0
4958343,332785,219,3.374334,176.022659,219,88083,3,4.0,1,1.95,...,3,89.00,0,16974,1,3,0,1,0,0
4958344,332785,219,3.374334,176.022659,219,94508,3,3.5,1,1.10,...,4,99.00,0,16974,1,3,0,1,0,0
4958345,332785,219,3.374334,176.022659,219,128360,3,5.0,1,1.95,...,1,139.00,0,16974,1,3,0,1,0,5


# preparing the data for XGBRanker

In [3]:
X_train = train_set.loc[:, ~train_set.columns.isin(['srch_id','target_label'])]
y_train = train_set.loc[:, train_set.columns.isin(['target_label'])]

groups = train_set['srch_id'].to_numpy()

#We need to keep the id for later predictions
X_test = test_set.loc[:, ~test_set.columns.isin(['target_label'])]
y_test = test_set.loc[:, test_set.columns.isin(['target_label'])]


In [4]:
# TODO: hyper-parameter tuning
params = {
    'tree_method': 'hist',
    'objective': 'rank:ndcg',  # Use rank:ndcg for optimizing NDCG
    'eval_metric': 'ndcg@5',    # Evaluation metric NDCG@k
    'eta': 0.1,                  # Learning rate
    'max_depth': 6               # Maximum depth of a tree
}

# Initialize the XGBRanker with modified parameters
model = xgb.XGBRanker(**params)

model.fit(X_train.to_numpy(), y_train.to_numpy(), qid = groups, verbose=True)

# Prediction
prediction is done query by query

In [5]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['srch_id', 'target_label'])])

predictions = (X_test.groupby('srch_id')
               .apply(lambda x: predict(model, x)))


modified_df = X_test.groupby('srch_id').apply(lambda group: group.assign(predicted_reg=predictions[group.name])).reset_index(drop=True)

predictions


  predictions = (X_test.groupby('srch_id')
  modified_df = X_test.groupby('srch_id').apply(lambda group: group.assign(predicted_reg=predictions[group.name])).reset_index(drop=True)


srch_id
8         [-0.64375854, -0.34511906, -0.94781065, -0.305...
12        [-0.62729126, -1.0236231, -1.1924335, -0.24230...
17        [-1.211541, 0.10486238, -0.6549049, -1.1793015...
25        [1.794562, -0.39742678, 1.7205083, 0.26119342,...
28        [0.89292794, -1.1600217, 0.074762106, 0.023751...
                                ...                        
332765    [0.31580502, -0.40884826, -0.6016005, 0.724629...
332768    [-0.79299444, 0.3566039, -0.33526954, -0.70068...
332777    [-1.2935704, -0.89689356, 0.6533425, -0.783709...
332781    [-0.30176976, 1.5457252, 0.9517893, -0.5357024...
332784    [-1.1668267, -0.38300407, -0.07444897, -0.7496...
Length: 79918, dtype: object

In [7]:
from sklearn.metrics import ndcg_score

result = modified_df.sort_values(by=['srch_id', 'predicted_reg'], ascending=[True, False])
print(ndcg_score([y_test['target_label'].to_numpy()], [result['predicted_reg'].to_numpy()], k=5))
# y_test['target_label']
# result['predicted_reg']
# y_test['target_label'].to_numpy()
# result['predicted_reg'].to_numpy()

0.24090885754831726
