In [1]:
import pandas as pd
import json
import numpy as np
from rankeval.dataset import Dataset
from sklearn.model_selection import GroupShuffleSplit 
from ilmart import Ilmart,IlmartDistill
from rankeval.metrics import NDCG
from collections import defaultdict

In [22]:
input_file = r'Related_files_L1_input.json'
output_file = r'ILMART_models.json'

## Algorithm 

Based on 
- https://arxiv.org/abs/2206.00473
- https://github.com/veneres/ilmart/tree/main/experiments/ilmart

1. Modify Light GBM and constrain it to  build trees with single or double features only
2. Build trees with LambdaMart as Loss function using a single featues. At this stage, based on the gain the model will reduce the number of features from d to R. So we will have R trees at this point. 
3. Build trees with GBMs constraining it to use pairs of features from 'p'. These trees are also constrained to use only 3 leaves. This step is identify and rank the feature pairs. This will generate S trees
4. Combine the top feature pairs in step 3 with R trees in step 2 to produce a forest with R+T trees. 
5. Convert the trees to a histogram format.
6. For prediction, discretely bin the features based on value and add the scores. The scores are the feature contributions

## Convert data to rank eval format

Load the dataset and split it into 10 record train and 5 record test set. Then we convert it into numpy arrays to load into rank_Eval_dataset

In [3]:
df=pd.read_json(input_file,lines=True)
df=df[:500]
splitter = GroupShuffleSplit(test_size=.30, n_splits=1, random_state = 7)
split = splitter.split(df, groups=df['topic_id'])
train_inds, test_inds = next(split)

train_df = df.iloc[train_inds]
vali_df = df.iloc[test_inds]

In [4]:
y_train = train_df['RelatedDocumentLabel'].to_numpy()
y_vali = vali_df['RelatedDocumentLabel'].to_numpy()
type(y_train),y_train.shape

(numpy.ndarray, (348,))

In [12]:
# Keeping it simple and using 7 features
train_features =['TitleQuery_TokenMatchScore','TitleQuery_FullMatchScore','EditorMatchScore','ViewsLifeTimeScore',
                 'Title_UnmatchedTokenCountScore','FileTypeScore','Title_GoodTitleKeywordsScore',]

In [13]:
X_train = train_df[train_features].to_numpy()
X_vali = vali_df[train_features].to_numpy()
type(X_train),X_train.shape

(numpy.ndarray, (348, 7))

In [14]:
train_qids=train_df['topic_id'].to_numpy()
vali_qids = vali_df['topic_id'].to_numpy()
L0_training_dataset = Dataset(X_train, y_train, train_qids, name="L0_training_dataset")
L0_vali_dataset = Dataset(X_vali, y_vali, vali_qids, name="L0_testing_dataset")

In [15]:
ilmart_ranker = Ilmart(verbose=True)

In [16]:
config_path = 'ilMart_config.json'
with open(config_path) as f:
    json_args = json.load(f)
    path_out = json_args["path_out"]
    common_params = json_args["common_params"]
    param_grid = json_args["param_grid"]
    boosting_rounds = json_args["boosting_rounds"]
    n_interactions = json_args["n_interactions"]
common_params['min_data_in_leaf'] =75
common_params['eval_at'] = 1
common_params['verbosity'] =1
common_params['lambdarank_truncation_level'] =  2
common_params['num_threads'] =1
boosting_rounds =3


In [17]:
ilmart_ranker.fit(common_params, boosting_rounds, L0_training_dataset, L0_vali_dataset) # small dataset so split req wont be met. thats ok!

{'objective': 'lambdarank', 'min_data_in_leaf': 75, 'min_sum_hessian_in_leaf': 0, 'lambdarank_truncation_level': 2, 'num_threads': 1, 'eval_at': 1, 'force_col_wise': True, 'verbosity': 1, 'interaction_constraints': [[0], [1], [2], [3], [4], [5], [6]]}
[LightGBM] [Info] Total Bins 44
[LightGBM] [Info] Number of data points in the train set: 348, number of used features: 4
[1]	valid_0's ndcg@1: 0.962585
Training until validation scores don't improve for 100 rounds
[2]	valid_0's ndcg@1: 0.950113
[3]	valid_0's ndcg@1: 0.945578
Did not meet early stopping. Best iteration is:
[1]	valid_0's ndcg@1: 0.962585
{'objective': 'lambdarank', 'min_data_in_leaf': 75, 'min_sum_hessian_in_leaf': 0, 'lambdarank_truncation_level': 2, 'num_threads': 1, 'eval_at': 1, 'force_col_wise': True, 'verbosity': 1, 'num_leaves': 3, 'learning_rate': 0.1}
[LightGBM] [Info] Total Bins 7
[LightGBM] [Info] Number of data points in the train set: 348, number of used features: 1






tree_interaction_constraints: []
{'objective': 'lambdarank', 'min_data_in_leaf': 75, 'min_sum_hessian_in_leaf': 0, 'lambdarank_truncation_level': 2, 'num_threads': 1, 'eval_at': 1, 'force_col_wise': True, 'verbosity': 1, 'tree_interaction_constraints': []}
[LightGBM] [Info] Total Bins 44
[LightGBM] [Info] Number of data points in the train set: 348, number of used features: 4
[2]	valid_0's ndcg@1: 0.950113
Training until validation scores don't improve for 100 rounds
[3]	valid_0's ndcg@1: 0.94898
[4]	valid_0's ndcg@1: 0.95805
Did not meet early stopping. Best iteration is:
[4]	valid_0's ndcg@1: 0.95805


## Save Required parameters from the model

In [18]:
distilled_ilmart_ranker = IlmartDistill(ilmart_ranker.get_model()) # default distill mode is full.  if we give anything else
# we need to also specific number of bins we need to put each feature in. It looks like then we have to give same number of bins
# to every feature

In [50]:
hist =  dict((''.join(str(k)), v) for k,v in distilled_ilmart_ranker.hist.items())

In [51]:
model_dict = dict()
model_dict["hist"] = hist
model_dict["splitting_values"] = distilled_ilmart_ranker.splitting_values

In [52]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

In [53]:
json_dump=json.dumps(model_dict,cls=NumpyEncoder)

In [54]:
with open(output_file,'w',encoding='utf-8') as outF:
    outF.write(json_dump )

## Predict using saved parameters

In [67]:
def __predict(row, hist,splitting_values, interactions_limit=-1):
    res = []
    res_ind = []
    interaction_to_exclude = []
    if interactions_limit != -1:
        inter_contrib = [(feats, value)for feats, value in expected_contribution(hist).items() if len(feats) > 1]
        inter_contrib.sort(key=lambda x: x[1], reverse=True)
        interaction_to_exclude = [feats for feats, value in inter_contrib[interactions_limit:]]
    for feats_hist, hist_feat in hist.items():
        if feats_hist in interaction_to_exclude:
            continue
        indices = []
        for feat in feats_hist:
            index_to_add = np.searchsorted(splitting_values[feat], row[feat])
            index_to_add -= 1
            index_to_add = max(0, index_to_add)
            index_to_add = min(len(splitting_values[feat]) - 2, index_to_add)
            indices.append(index_to_add)
        res.append( round(hist_feat[tuple(indices)],2)) # Each key in hist is a tree called hist_feat. the indices say which value to take
        res_ind.append(tuple(indices))
    return res,res_ind

In [68]:
def expected_contribution(hist):
        return {feats: np.abs(hist_item).mean() for feats, hist_item in hist.items()}

In [69]:
a,b=__predict(X_vali[0], distilled_ilmart_ranker.hist,model_dict["splitting_values"], interactions_limit=-1) # [(4,)[0],(3,4)[1][1],(4,6)[1][1],(0,4)[1][1]]
a

[-0.0, 0.07, -0.01, -0.01]

In [70]:
for i in range(len(a)):
    print("Contribution of feature ", str(b[i]) , " is : ", str(a[i]))

Contribution of feature  (0,)  is :  -0.0
Contribution of feature  (1, 0)  is :  0.07
Contribution of feature  (0, 0)  is :  -0.01
Contribution of feature  (1, 0)  is :  -0.01


In [71]:
print("Total score of the record", str(np.sum(a)))

Total score of the record 0.05
