In [33]:
import faiss
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import json
import xgboost as xgb
import catboost
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import tqdm

# The Dataset Preparation for the L2R Model

In [2]:
# Load datasets
base = pd.read_csv("./data/base.csv")
train = pd.read_csv("./data/train.csv")

In [3]:
base.head()

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,62,63,64,65,66,67,68,69,70,71
0,0-base,-115.08389,11.152912,-64.42676,-118.88089,216.48244,-104.69806,-469.070588,44.348083,120.915344,...,-42.808693,38.800827,-151.76218,-74.38909,63.66634,-4.703861,92.93361,115.26919,-112.75664,-60.830353
1,1-base,-34.562202,13.332763,-69.78761,-166.53348,57.680607,-86.09837,-85.076666,-35.637436,119.718636,...,-117.767525,41.1,-157.8294,-94.446806,68.20211,24.346846,179.93793,116.834,-84.888941,-59.52461
2,2-base,-54.233746,6.379371,-29.210136,-133.41383,150.89583,-99.435326,52.554795,62.381706,128.95145,...,-76.3978,46.011803,-207.14442,127.32557,65.56618,66.32568,81.07349,116.594154,-1074.464888,-32.527206
3,3-base,-87.52013,4.037884,-87.80303,-185.06763,76.36954,-58.985165,-383.182845,-33.611237,122.03191,...,-70.64794,-6.358921,-147.20105,-37.69275,66.20289,-20.56691,137.20694,117.4741,-1074.464888,-72.91549
4,4-base,-72.74385,6.522049,43.671265,-140.60803,5.820023,-112.07408,-397.711282,45.1825,122.16718,...,-57.199104,56.642403,-159.35184,85.944724,66.76632,-2.505783,65.315285,135.05159,-1074.464888,0.319401


In [4]:
train.head()

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,63,64,65,66,67,68,69,70,71,Target
0,0-query,-53.882748,17.971436,-42.117104,-183.93668,187.51749,-87.14493,-347.360606,38.307602,109.08556,...,70.10736,-155.80257,-101.965943,65.90379,34.4575,62.642094,134.7636,-415.750254,-25.958572,675816-base
1,1-query,-87.77637,6.806268,-32.054546,-177.26039,120.80333,-83.81059,-94.572749,-78.43309,124.9159,...,4.669178,-151.69771,-1.638704,68.170876,25.096191,89.974976,130.58963,-1035.092211,-51.276833,366656-base
2,2-query,-49.979565,3.841486,-116.11859,-180.40198,190.12843,-50.83762,26.943937,-30.447489,125.771164,...,78.039764,-169.1462,82.144186,66.00822,18.400496,212.40973,121.93147,-1074.464888,-22.547178,1447819-base
3,3-query,-47.810562,9.086598,-115.401695,-121.01136,94.65284,-109.25541,-775.150134,79.18652,124.0031,...,44.515266,-145.41675,93.990981,64.13135,106.06192,83.17876,118.277725,-1074.464888,-19.902788,1472602-base
4,4-query,-79.632126,14.442886,-58.903397,-147.05254,57.127068,-16.239529,-321.317964,45.984676,125.941284,...,45.02891,-196.09207,-117.626337,66.92622,42.45617,77.621765,92.47993,-1074.464888,-21.149351,717819-base


In [5]:
train.shape

(100000, 74)

# L2R Model Training

In [6]:
# Load the JSON file into a Python dictionary
with open("./data/predictions_targets.json", "r") as file:
    predictions_targets_dict = json.load(file)
    file.close()

# Display the first few key-value pairs in the dictionary for inspection
list(predictions_targets_dict.items())[:5]

[('0-query',
  ['361564-base',
   '1375561-base',
   '2515747-base',
   '530165-base',
   '244376-base',
   '3543241-base',
   '3411737-base',
   '499041-base',
   '877519-base',
   '91611-base']),
 ('1-query',
  ['15226-base',
   '577617-base',
   '854272-base',
   '3883204-base',
   '234491-base',
   '732136-base',
   '1075687-base',
   '3121612-base',
   '321221-base',
   '511045-base']),
 ('2-query',
  ['1447819-base',
   '179726-base',
   '2456246-base',
   '142037-base',
   '3779305-base',
   '2027109-base',
   '4619736-base',
   '292904-base',
   '1149627-base',
   '1419149-base']),
 ('3-query',
  ['1472602-base',
   '1184684-base',
   '623886-base',
   '1625548-base',
   '1384447-base',
   '489064-base',
   '83039-base',
   '446431-base',
   '330427-base',
   '2752407-base']),
 ('4-query',
  ['717819-base',
   '839750-base',
   '4728419-base',
   '1663474-base',
   '624597-base',
   '377906-base',
   '1896066-base',
   '3590756-base',
   '254968-base',
   '1217803-base'])]

In [7]:
# Transform the dictionary into a flat DataFrame format
query_ids = []
candidate_ids = []

# Loop through the dictionary to populate the lists
for query, candidates in predictions_targets_dict.items():
    for candidate in candidates:
        query_ids.append(query)
        candidate_ids.append(candidate)

# Convert the lists into a DataFrame
faiss_results_df = pd.DataFrame({
    'Query_Id': query_ids,
    'Candidate_Id': candidate_ids
})

faiss_results_df.head()

Unnamed: 0,Query_Id,Candidate_Id
0,0-query,361564-base
1,0-query,1375561-base
2,0-query,2515747-base
3,0-query,530165-base
4,0-query,244376-base


In [8]:
faiss_results_df.shape

(917940, 2)

In [9]:
# Merge train with faiss_results_df to get feature vectors for queries
merged_train = train.merge(faiss_results_df, left_on="Id", right_on="Query_Id")

# Merge the above result with base to get feature vectors for candidates
merged_data = merged_train.merge(base, left_on="Candidate_Id", right_on="Id", suffixes=('_query', '_candidate'))

# Calculate the difference for all features
feature_columns = [col for col in train.columns if col not in ["Id", "Target"]]
for feature in feature_columns:
    merged_data[f"{feature}_diff"] = merged_data[f"{feature}_query"] - merged_data[f"{feature}_candidate"]

# Set the Target: 1 if Candidate_Id matches Target from train, 0 otherwise
merged_data["Target_L2R"] = (merged_data["Candidate_Id"] == merged_data["Target"]).astype(int)

# Final L2R dataset
l2r_dataset = merged_data[["Query_Id", "Candidate_Id"] + [f"{feature}_diff" for feature in feature_columns] + ["Target_L2R"]]

# Display the first few rows of the adapted dataset
l2r_dataset.head()

Unnamed: 0,Query_Id,Candidate_Id,0_diff,1_diff,2_diff,3_diff,4_diff,5_diff,6_diff,7_diff,...,63_diff,64_diff,65_diff,66_diff,67_diff,68_diff,69_diff,70_diff,71_diff,Target_L2R
0,0-query,361564-base,6.338712,0.555882,2.766986,3.35803,10.24057,-7.20804,412.265459,-11.900298,...,-12.97195,14.2261,-213.930144,0.11825,-4.633908,17.867024,7.92966,174.087874,18.063721,0
1,16747-query,361564-base,-15.324805,-2.823055,-19.54266,16.44458,16.49818,9.29644,680.426865,8.223496,...,7.34443,5.67448,-238.973847,0.63214,-4.824364,-22.379139,0.46874,-548.630964,7.378573,0
2,39372-query,361564-base,-20.371436,0.12221,-3.65419,2.93686,-1.39196,15.91954,776.451515,2.464833,...,8.833135,-15.84862,-112.315377,0.14332,12.426702,-20.876797,4.48402,352.492461,25.946973,0
3,51736-query,361564-base,12.316385,-1.236839,-19.69623,1.95479,-2.56882,9.0956,1.020728,-0.84453,...,1.41848,-7.5179,-47.994667,0.62228,13.850922,7.825867,8.65172,-484.62676,3.944576,0
4,78506-query,361564-base,-11.10026,0.285606,-4.704834,2.27825,-16.62448,1.762986,277.35781,-0.68402,...,7.78651,-6.99577,-210.755128,0.63065,2.905159,-30.589397,-4.6374,-484.62676,8.64666,0


In [10]:
l2r_dataset["Target_L2R"].value_counts()

0    850763
1     67177
Name: Target_L2R, dtype: int64

In [11]:
l2r_dataset.shape

(917940, 75)

# L2R Modeling

## XGBClassifier

In [12]:
drop_columns = ["6", "44", "70", "21", "25", "33", "59", "65"]

# Create a list of difference columns to drop based on the provided drop_columns list
drop_diff_columns = [f"{col}_diff" for col in drop_columns]

l2r_dataset = l2r_dataset.drop(columns=drop_diff_columns)

print(l2r_dataset.head())
print()
print(l2r_dataset.shape)

      Query_Id Candidate_Id     0_diff    1_diff     2_diff    3_diff  \
0      0-query  361564-base   6.338712  0.555882   2.766986   3.35803   
1  16747-query  361564-base -15.324805 -2.823055 -19.542660  16.44458   
2  39372-query  361564-base -20.371436  0.122210  -3.654190   2.93686   
3  51736-query  361564-base  12.316385 -1.236839 -19.696230   1.95479   
4  78506-query  361564-base -11.100260  0.285606  -4.704834   2.27825   

     4_diff     5_diff     7_diff   8_diff  ...  61_diff    62_diff  \
0  10.24057  -7.208040 -11.900298 -3.46740  ... -3.79131 -16.288866   
1  16.49818   9.296440   8.223496 -1.64476  ... -3.28058  -3.971426   
2  -1.39196  15.919540   2.464833  0.22253  ... -2.22205  -3.133946   
3  -2.56882   9.095600  -0.844530  2.84566  ... -2.68282 -10.134926   
4 -16.62448   1.762986  -0.684020  0.04751  ... -0.14190  -5.341596   

     63_diff   64_diff  66_diff    67_diff    68_diff  69_diff    71_diff  \
0 -12.971950  14.22610  0.11825  -4.633908  17.867024  7.

In [13]:
# Splitting the dataset into features (X) and target label (y)
X = l2r_dataset.drop(columns=['Query_Id', 'Candidate_Id', 'Target_L2R'])
y = l2r_dataset['Target_L2R']

# Splitting the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_val.shape

((734352, 64), (183588, 64))

In [14]:
# # Initialize and train the L2R model using XGBoost
# model_xgb = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss")
# model_xgb.fit(X_train, y_train)

# # Validate the model on the validation set
# y_pred_xgb = model_xgb.predict(X_val)
# accuracy_xgb = accuracy_score(y_val, y_pred_xgb)
# accuracy_xgb

## Catboost

In [15]:
# Initialize the CatBoostClassifier model
cat_model = catboost.CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=13,
    loss_function='Logloss',
    verbose=500,
    early_stopping_rounds=2, 
    devices="0:1",
    task_type="GPU"
)

# Train the model on the training data
cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=True)

y_pred_cat = cat_model.predict(X_val)
accuracy_cat = accuracy_score(y_val, y_pred_cat)
accuracy_cat

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.5609685	test: 0.5612298	best: 0.5612298 (0)	total: 97.7ms	remaining: 1m 37s
bestTest = 0.1721107095
bestIteration = 108
Shrink model to first 109 iterations.


0.9389611521450204

# Predicting Probabilities, Sorting/Re-Ranking, Top 5 Candidates Selection

In [16]:
validation = pd.read_csv("./data/validation.csv")
validation_answer = pd.read_csv("./data/validation_answer.csv")

In [17]:
validation = validation.drop(drop_columns, axis=1)

In [18]:
with open("./data/predictions_targets_valid.json", "r") as file:
    validation_predictions_targets = json.load(file)  
    file.close()
    
# Transform the dictionary into a flat DataFrame format
query_ids = []
candidate_ids = []

# Loop through the dictionary to populate the lists
for query, candidates in validation_predictions_targets.items():
    for candidate in candidates:
        query_ids.append(query)
        candidate_ids.append(candidate)

# Convert the lists into a DataFrame
validation_nearest_neighbors_df = pd.DataFrame({
    'Query_Id': query_ids,
    'Candidate_Id': candidate_ids
})

validation_nearest_neighbors_df.head()

Unnamed: 0,Query_Id,Candidate_Id
0,100000-query,2760762-base
1,100000-query,3209652-base
2,100000-query,1542803-base
3,100000-query,3839597-base
4,100000-query,508555-base


In [19]:
validation_nearest_neighbors_df.shape

(915020, 2)

In [22]:
# Merge with validation.csv to get feature vectors for queries
merged_validation = validation.merge(validation_nearest_neighbors_df, left_on="Id", right_on="Query_Id")
# merged_validation = merged_validation.drop(drop_columns, axis=1)

# Merge the above result with base_sample.csv to get feature vectors for candidates
merged_validation_data = merged_validation.merge(base, left_on="Candidate_Id", right_on="Id", suffixes=('_query', '_candidate'))

In [23]:
feature_columns = ['0', '1', '2', '3', '4', '5', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '22', '23', '24', '26',
       '27', '28', '29', '30', '31', '32', '34', '35', '36', '37', '38', '39',
       '40', '41', '42', '43', '45', '46', '47', '48', '49', '50', '51', '52',
       '53', '54', '55', '56', '57', '58', '60', '61', '62', '63', '64', '66',
       '67', '68', '69', '71']

In [24]:
feature_diff_columns = [f"{x}_diff" for x in feature_columns]
feature_query_columns = [f"{x}_query" for x in feature_columns]
feature_candidate_columns = [f"{x}_candidate" for x in feature_columns]

In [25]:
base = None
train = None
X = None
y = None
X_train = None
X_val = None
y_train = None 
y_val = None

In [26]:
# Calculate the difference for all features
feature_columns = [col for col in validation.columns if col not in ["Id", "Target"]]
for feature in feature_columns:
    merged_validation_data[f"{feature}_diff"] = merged_validation_data[f"{feature}_query"] - merged_validation_data[f"{feature}_candidate"]

In [27]:
#Use the trained CatBoost model to predict the relevance probabilities for each candidate
probs = cat_model.predict_proba(merged_validation_data[feature_diff_columns])[:, 1]

In [29]:
# For each query, sort its candidates based on the predicted probabilities
merged_validation_data["predicted_prob"] = probs
sorted_candidates = merged_validation_data.groupby("Query_Id").apply(lambda x: x.nlargest(5, 'predicted_prob')).reset_index(drop=True)

# Evaluation

In [35]:
# Evaluate using accuracy@5 metric

correct = 0
for _, row in tqdm.tqdm(validation_answer.iterrows()):
    top_5 = sorted_candidates[sorted_candidates["Query_Id"] == row["Id"]]["Candidate_Id"].tolist()
    if row["Expected"] in top_5:
        correct += 1

accuracy_at_5 = correct / len(validation_answer)
accuracy_at_5

100000it [37:24, 44.56it/s]


0.63303