In [19]:
import pandas as pd
import numpy as np

from catboost import CatBoostRanker, Pool

In [2]:
# Load the training split from a CSV path relative to the current working directory.
train_df = pd.read_csv('data/train_df.csv')

In [3]:
# Load the held-out split from a CSV path relative to the current working directory.
test_df = pd.read_csv('data/test_df.csv')

## Data overview

In [4]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,758,9,0,0,1,20,3,40,0,3,...,0.204682,0.271755,0.055623,0,0,0,0.38648,0.0,0.0,0
1,758,9,0,0,1,20,3,40,0,3,...,0.195531,0.188787,0.036914,0,0,0,0.10982,0.0,0.0,0
2,758,9,0,0,1,20,3,40,0,3,...,0.148609,0.186517,0.027718,0,0,0,0.03674,0.0,0.0,0
3,758,9,0,0,1,20,3,40,0,3,...,0.223748,0.229039,0.051247,0,0,0,0.0,0.0,0.0,0
4,758,9,0,0,1,20,3,40,0,3,...,0.170935,0.249031,0.042568,0,0,0,0.0,0.0,0.0,0


In [5]:
# (rows, columns) of the training frame.
train_df.shape

(15081, 81)

In [6]:
# Number of distinct search_id values; search_id is used further down as the ranking group id.
train_df['search_id'].nunique()

1000

In [7]:
# Count of rows per target value.
train_df['target'].value_counts()

0    14759
1      322
Name: target, dtype: int64

In [8]:
# Columns that must not be passed to the model as features:
#   - 'Unnamed: 0' looks like a saved row index (0, 1, 2, ... in train_df.head());
#     leaving it in lets the ranker fit on row order rather than real signal.
#   - 'search_id' is the query/group identifier (passed separately as group_id).
#   - 'target' is the label.
NON_FEATURE_COLUMNS = ['Unnamed: 0', 'search_id', 'target']

# errors='ignore' keeps this cell working if a CSV is re-exported without the
# index column; a missing 'search_id' or 'target' still fails loudly on the
# explicit column lookups below.
X_train = train_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values
y_train = train_df['target'].values
queries_train = train_df['search_id'].values

X_test = test_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values
y_test = test_df['target'].values
queries_test = test_df['search_id'].values

In [9]:
# Wrap each split in a CatBoost Pool: feature matrix, labels, and the
# search_id array as group_id so rows of one search are ranked together.
train_pool = Pool(data=X_train, label=y_train, group_id=queries_train)
test_pool = Pool(data=X_test, label=y_test, group_id=queries_test)

## Training pipeline

In [14]:
def fit_model(loss_function, train_pool=train_pool, test_pool=test_pool,
              iterations=2000, random_seed=None):
    """Train a CatBoostRanker with the given loss and return the fitted model.

    Parameters
    ----------
    loss_function : str
        Name of the CatBoost loss to optimise (e.g. 'YetiRank'). It is also
        used as ``train_dir``, so each loss gets its own output directory.
    train_pool : Pool
        Pool to fit on. NOTE: the default is bound when this cell is executed;
        re-run this cell after rebuilding the pools, or pass them explicitly.
    test_pool : Pool
        Pool passed as ``eval_set``; NDCG is tracked on it during training.
    iterations : int, default 2000
        Number of boosting iterations.
    random_seed : int or None, default None
        If given, forwarded to CatBoost as ``random_seed``; otherwise the
        library default is used.

    Returns
    -------
    CatBoostRanker
        The fitted model.
    """
    parameters = {
        'iterations': iterations,
        'custom_metric': ['NDCG'],
        'verbose': False,
        'loss_function': loss_function,
        'train_dir': loss_function,
    }
    # Only override the seed when the caller asked for one, so the default
    # call behaves exactly as before.
    if random_seed is not None:
        parameters['random_seed'] = random_seed

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=False)

    return model

In [15]:
# Fit a single ranker with the YetiRank loss, using fit_model's default pools.
model = fit_model('YetiRank')

In [16]:
# Metric history stored under the 'validation' key (a list of NDCG values, as the output shows).
model.get_evals_result()['validation']

{'NDCG:type=Base': [0.8828742799586826,
  0.8915951573185785,
  0.8982635513847069,
  0.8960550449646592,
  0.8959827047075974,
  0.8952083149211723,
  0.897521178425935,
  0.8927254265850756,
  0.8901140879926298,
  0.8980745129583374,
  0.8914053493804615,
  0.8883829810243415,
  0.8931716920143054,
  0.895179128203954,
  0.8978732263171685,
  0.8964386890152696,
  0.8979899354720031,
  0.9002052764254868,
  0.8977434142597732,
  0.901800511020169,
  0.8998854047138038,
  0.899817006805289,
  0.9005627390944739,
  0.904427853190377,
  0.902782559995648,
  0.8986231145453272,
  0.8996314022299591,
  0.9003587969442384,
  0.9005050695901248,
  0.9041653097907184,
  0.9022655749254115,
  0.9021662705154455,
  0.9039019119467487,
  0.9004224035564732,
  0.9046166363038659,
  0.9073688043450833,
  0.9121808418370496,
  0.9090266964722283,
  0.9126845273645798,
  0.908602756794506,
  0.9114893253378914,
  0.9021798863360841,
  0.9025468634385487,
  0.9034335587953973,
  0.9027128526296967,

In [24]:
def find_best_estimator(models_list=("YetiRank", "YetiRankPairwise", "PairLogit", "PairLogitPairwise", "RMSE")):
    """Fit one ranker per loss function and collect its validation NDCG.

    Parameters
    ----------
    models_list : iterable of str
        Loss function names; each is passed to ``fit_model``. The default is a
        tuple so the shared default value cannot be mutated between calls.

    Returns
    -------
    dict
        Maps each loss name to a dict with:
          - 'NDCG': list of validation NDCG values read from the model's
            eval results,
          - 'NDCG_mean': arithmetic mean of that list,
          - 'NDCG_best': maximum of that list,
          - 'NDCG_final': last value of that list,
          - 'estimator': the fitted model.
    """
    results = {}

    for model_name in models_list:
        model = fit_model(model_name)
        val_res = model.get_evals_result()['validation']['NDCG:type=Base']

        results[model_name] = {
            'NDCG': val_res,
            # The mean over all recorded values is kept for existing callers,
            # but it also averages in the early, under-trained iterations.
            'NDCG_mean': sum(val_res) / len(val_res),
            # Best and last recorded values say more about the trained model.
            'NDCG_best': max(val_res),
            'NDCG_final': val_res[-1],
            'estimator': model,
        }

    return results

In [25]:
# Fit one ranker per loss function in find_best_estimator's default list and keep the results.
training_results = find_best_estimator()



In [27]:
# Loss functions for which results were collected.
training_results.keys()

dict_keys(['YetiRank', 'YetiRankPairwise', 'PairLogit', 'PairLogitPairwise', 'RMSE'])

In [28]:
# Print the mean validation NDCG for each loss function, one per line.
for loss_name, summary in training_results.items():
    print(loss_name, f"NDCG_mean: {summary['NDCG_mean']}")

YetiRank NDCG_mean: 0.9014128336818806
YetiRankPairwise NDCG_mean: 0.9163690046467837
PairLogit NDCG_mean: 0.9111565081876303
PairLogitPairwise NDCG_mean: 0.9195243862861883
RMSE NDCG_mean: 0.907236109788757


In [29]:
# Full NDCG history for PairLogitPairwise, the loss with the highest NDCG_mean printed above.
training_results['PairLogitPairwise']['NDCG']

[0.8869915895318775,
 0.8881742929097416,
 0.8904670755687253,
 0.8919590591990472,
 0.8926667366450488,
 0.90125251116899,
 0.889045581002139,
 0.8926426175459306,
 0.895942365539621,
 0.897078568573655,
 0.8967511062808904,
 0.8944968202076649,
 0.895246060361774,
 0.903409953650349,
 0.9010822127981717,
 0.9052281023404686,
 0.9074329043420245,
 0.9094220003372188,
 0.9052787906580531,
 0.9047863076508468,
 0.90991515452694,
 0.9123004632635454,
 0.912724025250733,
 0.9143131766221728,
 0.913524741805308,
 0.923100454410417,
 0.9246438511043595,
 0.9271290434469057,
 0.9234407263270942,
 0.9169196314078286,
 0.9150365813090481,
 0.9173994366468117,
 0.9182907358237068,
 0.9227061166151218,
 0.9185003927333183,
 0.9203122867124766,
 0.9199837873311911,
 0.9159835550903398,
 0.9153057893079525,
 0.9146125548886864,
 0.9196125548886863,
 0.9200604322287844,
 0.916514989937032,
 0.915411333396803,
 0.9161456245021351,
 0.9161840775982414,
 0.9147429064264416,
 0.9163062128257222,
 0.915