In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost
from sklearn.datasets import fetch_openml
import pyltr
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split, cross_val_score

## Yahoo LTRC

In [3]:
def dcg(scores, k=10):
    score = 0
    for i in range(k):
        if (i==0):
            score=score+scores[i]
        else:
            score=score+(scores[i]/np.log2(i+1))
    return score

def dcg_exp(scores, k=10):
    score = 0
    for i in range(k):
        score = score + ((2**scores[i] - 1) / np.log2(i + 2))
    return score
'''
def ndcg(y_true, y_pred, ids, k=10):
    unique_ids=np.unique(ids)
    ndcg_scores=[]
    for qid in unique_ids:
        mask = ids == qid
        true_scores=y_true[mask]
        pred_scores=y_pred[mask]

        
        sorted_indices=np.argsort(pred_scores.reshape(1, -1)).reshape(-1, 1)[::-1]
        sorted_true_scores=true_scores[sorted_indices] #true scores sorted based on predicted
        if (len(sorted_true_scores < k)):
            k = len(sorted_true_scores)
        dcg_score=dcg(sorted_true_scores, k)
        idcg_score=dcg(sorted(np.array(true_scores), reverse=True), k)

        #print(sorted_true_scores)
        #print(sorted(np.array(true_scores), reverse=True))
        
        if (idcg_score>0):
            #print(dcg_score, idcg_score)
            ndcg_scores.append(dcg_score/idcg_score)
        
    return np.mean(ndcg_scores)
'''


def ndcg(y_true, y_pred, ids, k=10, handle_ties=True, exp=False):
    unique_ids=np.unique(ids)
    ndcg_scores=[]
    for qid in unique_ids:
        mask = ids == qid
        true_scores=y_true[mask]
        pred_scores=y_pred[mask]

        
        sorted_indices=np.argsort(pred_scores.reshape(1, -1)).reshape(-1, 1)[::-1]
        sorted_true_scores=true_scores[sorted_indices] #true scores sorted based on predicted
        sorted_pred_scores=pred_scores[sorted_indices]        
        if (len(sorted_true_scores < k)):
            k = len(sorted_true_scores)

        if handle_ties == True:
            for j in np.unique(pred_scores):
                ties = sorted_pred_scores == j
                #print(j, np.array(ties).flatten())
                #print(sorted_true_scores)
                mean_gain = np.mean(sorted_true_scores[np.array(ties).flatten()])
                for l in np.array(ties).flatten():
                    sorted_true_scores[l] = mean_gain
                

        if exp==True: dcg_score=dcg_exp(sorted_true_scores, k)
        else: dcg_score=dcg(sorted_true_scores, k)
        if exp==True: idcg_score=dcg_exp(sorted(np.array(true_scores), reverse=True), k)
        else: idcg_score=dcg(sorted(np.array(true_scores), reverse=True), k)

        dcg_score=dcg(sorted_true_scores, k)
        idcg_score=dcg(sorted(np.array(true_scores), reverse=True), k)
        
        #print(sorted_true_scores)
        #print(sorted(np.array(true_scores), reverse=True))
        
        if (idcg_score>0):
            #print(dcg_score, idcg_score)
            ndcg_scores.append(dcg_score/idcg_score)
        
    return np.mean(ndcg_scores)

In [4]:
def max_ndcg(y_true, y_pred, ids, k=10, exp=False):
    unique_ids=np.unique(ids)
    ndcg_scores=[]
    for qid in unique_ids:
        mask = ids == qid
        true_scores=y_true[mask]
        pred_scores=y_pred[mask]

        
        sorted_indices=np.argsort(pred_scores.reshape(1, -1)).reshape(-1, 1)[::-1]
        sorted_true_scores=true_scores[sorted_indices] #true scores sorted based on predicted
        sorted_pred_scores=pred_scores[sorted_indices]        
        if (len(sorted_true_scores < k)):
            k = len(sorted_true_scores)

        if exp==True: idcg_score=dcg_exp(sorted(np.array(true_scores), reverse=True), k)
        else: idcg_score=dcg(sorted(np.array(true_scores), reverse=True), k)
        if idcg_score > 0:
            if exp==True: dcg_score=dcg_exp(sorted_true_scores, k)
            else: dcg_score=dcg(sorted_true_scores, k)
            best_score=dcg_score/idcg_score


        ties = np.array([False]*len(sorted_true_scores))
        for j in range(len(sorted_pred_scores)-1):
            if sorted_pred_scores[j] == sorted_pred_scores[j+1]:
                ties[j] = True
        true_ties = np.where(ties==True)
        for j in np.array(true_ties).flatten():
            if sorted_pred_scores[j] == sorted_pred_scores[j+1]:
                placeholder = sorted_true_scores[j]
                sorted_true_scores[j] = sorted_true_scores[j+1]
                sorted_true_scores[j+1] = placeholder
                if exp==True: dcg_score=dcg_exp(sorted_true_scores, k)
                else: dcg_score=dcg(sorted_true_scores, k)
                if idcg_score > 0:
                    ndcg_score = dcg_score/idcg_score
                    if best_score < ndcg_score: best_score=ndcg_score

        #print(sorted_true_scores)
        #print(sorted(np.array(true_scores), reverse=True))
        
        if (idcg_score>0):
            #print(dcg_score, idcg_score)
            ndcg_scores.append(best_score)
        
    return np.mean(ndcg_scores)

def ndcg2(y_true, y_pred, ids, k=10):
    unique_ids=np.unique(ids)
    ndcg_list = []
    for qid in unique_ids:
        mask = ids == qid
        true_scores = y_true[mask]
        pred_scores = y_pred[mask]
        if (true_scores.shape[0] > 1):
            ndcg_now = ndcg_score(true_scores.reshape(1, -1), pred_scores.reshape(1, -1), k=k, ignore_ties=True)
        #else:
        #    ndcg_now = 0
            if ndcg_now > 0:
                ndcg_list.append(ndcg_now)
    return np.mean(ndcg_list)

In [6]:
with open('Learning_to_Rank_Challenge/ltrc_yahoo/set1.train.txt') as trainfile, \
        open('Learning_to_Rank_Challenge/ltrc_yahoo/set1.valid.txt') as valifile, \
        open('Learning_to_Rank_Challenge/ltrc_yahoo/set1.test.txt') as evalfile:
    TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
    VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
    EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

In [7]:
Tqids = np.array(Tqids, dtype=object)
Vqids = np.array(Vqids, dtype=object)
Eqids = np.array(Eqids, dtype=object)

#### Hist

In [20]:
n_estimators = [600]
max_leaves = [8, 12, 16, 20, 24, None]
max_depths = [4]#[4, 6, 8]
etas = [0.1, 0.2, 0.3]

min_split_loss = [0.5]
reg_alpha = [0, 0.1]
reg_lambda = [0.5, 1, 5, 10]
subsampling = [0.5, 0.7, 0.9, 1]

In [22]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0, 0]

#for i in n_estimators:
#    for j in max_depths:
#        for k in max_leaves:
#            for l in etas:

for i in min_split_loss:
    for j in reg_alpha:
        for k in reg_lambda:
            for l in subsampling:

                
                ranker = xgboost.XGBRanker(tree_method="hist", n_estimators=200, max_depth=6, max_leaves=12,  eta=0.2, random_state=42,
                                            objective="rank:ndcg", n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                            min_split_loss=i, reg_alpha=j, reg_lambda=k, subsample=l,
                                            #colsample_bytree=0.5
                                          )
                ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
            
                scores = ranker.predict(EX)
                sorted_idx = np.argsort(scores)[::-1]
                # Sort the relevance scores from most relevant to least relevant
                scores = scores[sorted_idx].reshape(1, -1)
    
                ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True, exp=False)
                #print('%s , %s , %s , %s' % (i, j, k, l))
                print('min_split_loss = %s , reg_alpha = %s , reg_lambda = %s , sample = %s' % (i, j, k, l))
                if (ndcg_now>=best_ndcg):
                    best_ndcg = ndcg_now
                    best_parameters = [i, j, k, l]
                    print(ndcg_now)

min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 0.5 , sample = 0.5
0.7829405468879008
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 0.5 , sample = 0.7
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 0.5 , sample = 0.9
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 0.5 , sample = 1
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 1 , sample = 0.5
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 1 , sample = 0.7
0.7830117848755901
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 1 , sample = 0.9
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 1 , sample = 1
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 5 , sample = 0.5
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 5 , sample = 0.7
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 5 , sample = 0.9
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 5 , sample = 1
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 10 , sample = 0.5
min_split_loss = 0.5 , reg_alpha = 0 , reg_lambda = 10 , sampl

200 , 6 , 12 , 0.2 , min_split_loss = 0 , reg_alpha = 0 , reg_lambda = 10 , sample = 0.9
0.7857732873715134

In [14]:
%%time

n_estimators = [500]
max_leaves = [8]
max_depths = [8]
etas = [0.1]

best_ndcg = 0
best_parameters = [0, 0, 0, 0]

for i in n_estimators:
    for j in max_depths:
        for k in max_leaves:
            for l in etas:
#    for j in colsample_bytree:
                ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=i, max_depth=j, max_leaves=k,  eta=l, random_state=42,
                                            objective="rank:ndcg", n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                            min_split_loss=0.6, reg_alpha=0.2, reg_lambda=11, 
                                            subsample=0.5,
                                            #colsample_bytree=0.5
                                          )
                ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
            
                scores = ranker.predict(EX)
                sorted_idx = np.argsort(scores)[::-1]
                # Sort the relevance scores from most relevant to least relevant
                scores = scores[sorted_idx].reshape(1, -1)
    
                ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True, exp=False)
                print('%s , %s , %s , %s' % (i, j, k, l))
                if (ndcg_now>=best_ndcg):
                    best_ndcg = ndcg_now
                    best_parameters = [i, j, k, l]
                    print(ndcg_now)

500 , 8 , 8 , 0.1
0.7805511857325435
CPU times: total: 2h 58min 55s
Wall time: 6min 1s


In [15]:
a = ranker.evals_result()

In [16]:
np.mean(a["validation_0"]['ndcg@10'])

0.7805432815718214

In [9]:
n_estimators = [400]
max_leaves = [20, 24, None]
max_depths = [8]
etas = [0.3, 0.2, 0.1]

min_split_loss = [0, 0.1, 0.5]
reg_alpha = [0, 0.1, 0.5]
reg_lambda = [0.5, 1, 5, 10]
subsampling = [0.5, 0.7, 0.9, 1]

In [None]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0, 0]

for i in min_split_loss:
    for j in reg_alpha:
        for k in reg_lambda:
            for l in subsampling:
#    for j in colsample_bytree:
                ranker = xgboost.XGBRanker(tree_method="approx", n_estimators=300, max_depth=6, max_leaves=12,  eta=0.3, random_state=42,
                                            objective="rank:ndcg", n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                            min_split_loss=i, reg_alpha=j, reg_lambda=k, 
                                            subsample=l,
                                            colsample_bytree=0.5
                                          )
                ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
            
                scores = ranker.predict(EX)
                sorted_idx = np.argsort(scores)[::-1]
                # Sort the relevance scores from most relevant to least relevant
                scores = scores[sorted_idx].reshape(1, -1)
    
                ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True, exp=False)
                print('min_split_loss = %s , reg_alpha = %s , reg_lambda = %s , sample = %s' % (i, j, k, l))
                if (ndcg_now>=best_ndcg):
                    best_ndcg = ndcg_now
                    best_parameters = [i, j, k, l]
                    print(ndcg_now)

min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 1 , sample = 0.5
0.7830074876948698
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 1 , sample = 0.7
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 1 , sample = 0.9
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 1 , sample = 1
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 5 , sample = 0.5
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 5 , sample = 0.7
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 5 , sample = 0.9
0.7837807395582723
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 5 , sample = 1
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 10 , sample = 0.5
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 10 , sample = 0.7
0.783827602983233
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 10 , sample = 0.9
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 10 , sample = 1
min_split_loss = 0.1 , reg_alpha = 0.1 , reg_lambda = 1 , sample = 0.5
min_split_loss = 0.1 , reg_alpha = 0.1 , reg_la

300 , 6 , 12 , 0.3 , min_split_loss = 0.1 , reg_alpha = 0.1 , reg_lambda = 5 , sample = 0.5
0.78499841721853687

In [19]:
%%time

# max_depth=8, shrinkage=0.1, no column subsampling

ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=500, max_depth=8, random_state=42, objective="rank:ndcg", n_jobs=-1, eta=0.1, eval_metric='ndcg@10',
                           #colsample_bytree=0.5,#, lambdarank_pair_method='topk', lambarank_num_pair_per_sample=10, 
                           #reg_alpha=0.1, reg_lambda=5, min_split_loss=0, subsample=0.75
                           ndcg_exp_gain=False
                           #max_leaves=8
                          )

CPU times: total: 0 ns
Wall time: 0 ns


In [21]:
%%time

ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])

CPU times: total: 2h 23min 22s
Wall time: 4min 33s


In [40]:
scores = ranker.predict(EX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [41]:
ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True)

0.782185165186294

In [None]:
0.7844600114272402

a = np.array([3, 3, 3, 3, 2, 2, 2, 1, 1, 1]).reshape(-1,1)
a_pred = np.array([2,3,2,3,3,3,2,1,1,1]).reshape(-1, 1)
b = [3]
c = [0]
d = [2, 1]
aids = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
bids = [2]
cids = [3]
dids = [4, 4]

ndcg(a, a_pred, ids = aids)

np.argsort(a_pred.reshape(1, -1)).reshape(-1, 1)[::-1]

0.7848127844739587

### RESULTS

MOLTIPLICA L'NDCG@10 PER 1.0056 PER TENERE CONTO DELLE TIES (DATO CHE IL NOSTRO NDCG NON LO FA), O AGGIUNGI 0.005 PIU' O MENO (PER IL CASO BASE HO AGGIUNTO 0.0046)

[500, 8, 0.1] -> 0.7846 + 0.0046 = 0.7892

[500, 8, 0.1] + [min_split_loss=1, reg_alpha=1, reg_lambda=5] -> 0.7848 + 0.0044 = 0.7892

PER IL COLSUBSAMPLE, SI OTTIENE:

[500, 8, 0.1] + [min_split_loss=1, reg_alpha=5, reg_lambda=0.5] -> 0.7847 + 0.0044 = 0.7891

RESULTS WITH THE CORRECT NDCG WITH TIES:

* rel / log(rank)

Exact     (max without any other hypertuning: 0.7931875331728091,   max: )

0.6 , 0.2 , 11, subsample = 0.5   ->   0.7848432411980334

1 , 0.1 , 10   ->   
0.785013441716267   (colsample_bytree = 0.5)
0.5 , 0.1 , 5, subsample = 0.6   ->   
0.7849270721302724


* 2^rel / log(ran+1+)

Exact     (max without any other hypertuning: 0.7319305264376245,   max: )
0 , 0.1 , 5, subsample = 0.75   ->   
0.719213501205309
7

#### Exact

In [21]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for j in max_depths:
        for k in etas:

#for i in subsampling:
#    for j in colsample_bytree:
            ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=i, max_depth=j,  eta=k, random_state=42, objective="rank:ndcg",
                                        n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                        #min_split_loss=i, reg_alpha=j, reg_lambda=k, 
                                        #subsample=i,# colsample_bytree=j
                                      )
            ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
        
            scores = ranker.predict(EX)
            sorted_idx = np.argsort(scores)[::-1]
            # Sort the relevance scores from most relevant to least relevant
            scores = scores[sorted_idx].reshape(1, -1)
        
            ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, exp=False)
            print('%s , %s , %s' % (i, j, k,))
            if (ndcg_now>=best_ndcg):
                best_ndcg = ndcg_now
                best_parameters = [i, j, k]
                print(ndcg_now)

KeyboardInterrupt: 

In [10]:
%%time

n_estimators = [200, 500, 750]#[1000]
subsampling = [0.7, 0.8, 0.9]
reg_alpha = [0.1, 0.3, 0.5]
reg_lambda = [5, 7, 8, 10]
min_split_loss = [0, 0.1, 0.5]

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for j in subsampling:
        for k in reg_alpha:
            for l in reg_lambda:

                for m in min_split_loss:
#    for j in colsample_bytree:
                    ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=i, max_depth=4,  eta=0.3, random_state=42, objective="rank:ndcg",
                                                n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                                reg_alpha=k, reg_lambda=l, min_split_loss=m, 
                                                subsample=j,# colsample_bytree=j
                                              )
                    ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
                
                    scores = ranker.predict(EX)
                    sorted_idx = np.argsort(scores)[::-1]
                    # Sort the relevance scores from most relevant to least relevant
                    scores = scores[sorted_idx].reshape(1, -1)
                
                    ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, exp=False)
                    print('%s , 4 , 0.3 , sample = %s , alpha = %s , lambda = %s , min_loss = %s' % (i, j, k, l, m))
                    if (ndcg_now>=best_ndcg):
                        best_ndcg = ndcg_now
                        best_parameters = [i, j, k, l, m]
                        print(ndcg_now)

200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 5 , min_loss = 0
0.780322888460122
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 5 , min_loss = 0.1
0.7811891912996477
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 5 , min_loss = 0.5
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 7 , min_loss = 0
0.782271619510142
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 7 , min_loss = 0.1
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 7 , min_loss = 0.5
0.7841766273747603
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 8 , min_loss = 0
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 8 , min_loss = 0.1
0.7843683018455087
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 8 , min_loss = 0.5
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 10 , min_loss = 0
0.7852955797502225
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 10 , min_loss = 0.1
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 10 , min_loss = 0.5
200 , 4 , 0.3 , s

In [None]:
1000 , 4 , 0.2 , sample = 0.9 , alpha = 0.5 , lambda = 0
0.7839232596919713

In [None]:
200 , 4 , 0.3 , sample = 0.7 , alpha = 0.1 , lambda = 10 , min_loss = 0
0.7852955797502225

In [119]:
%%time

ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=400, max_leaves=20, max_depth=6, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(TX, Ty, qid=Tqids)

CPU times: total: 1h 21min 13s
Wall time: 2min 35s


In [120]:
scores = ranker.predict(EX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [121]:
ndcg_test = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_test

0.9263475852631387

[400, 8, 6] :  0. 926 347 585 263 138 7   Wall Time: 2 min 34 s
[400, 12, 6] : 0. 926 347 585 263 138 7   Wall Time: 2 min 34 s
[400, 20, 6] : 0. 926 347 585 263 138 7   Wall Time: 2 min 35 s

Being that they are all the same settings except for the max leaves, we can conclude that they generated the same exact trees, so the same results. This is likely because ahving max depth 6 means generating at most 2^6=64 leaves, but if the algorithm ends up generating less leaves (e.g.:8), we get the same result even if we increase max_leaves. So probably everything from max_leaves=8 to max_leaves=20 (and also beyond 20) gives the same results.

765

In [None]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for j in max_depths:
        for k in max_leaves:
            for l in etas:

#for i in subsampling:
#    for j in colsample_bytree:
                ranker = xgboost.XGBRanker(tree_method="hist", n_estimators=i, max_depth=j, max_leaves=k, eta=l, random_state=42,
                                           objective="rank:ndcg", n_jobs=-1, #eval_metric='ndcg@10',
                                           #min_split_loss=i, reg_alpha=j, reg_lambda=k, 
                                           #subsample=i,# colsample_bytree=j
                                          )
                ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
        
                scores = ranker.predict(EX)
                sorted_idx = np.argsort(scores)[::-1]
                # Sort the relevance scores from most relevant to least relevant
                scores = scores[sorted_idx].reshape(1, -1)
        
                ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids)
                if (ndcg_now>best_ndcg):
                    best_ndcg = ndcg_now
                    best_parameters = [i, j, k, l]
                    print('%s , %s , %s , %s' % (i, j, k, l))
                    print(ndcg_now)

300 , 4 , 4 , 0.1
0.7837142753772979
300 , 4 , 4 , 0.2
0.7843192321614486
300 , 4 , 16 , 0.2
0.7843382369037932
300 , 6 , 64 , 0.1
0.7844426566934753


Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x0000028BBDD0CE30>>
Traceback (most recent call last):
  File "C:\Users\Utente\anaconda3\Lib\site-packages\xgboost\core.py", line 582, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 


In [1]:
[i,j,k,l]

NameError: name 'i' is not defined

Best iteration ([400, 16, 6]):       0. 917 848 267 915 044 1
New best iteration ([400, 20, 6]):   0. 917 899 452 217 445 4
Newer best iteration ([450, 20, 6]): 0. 917 903 662 354 776 4

In [125]:
%%time

# max_depth=8, shrinkage=0.1, no column subsampling

ranker = xgboost.XGBRanker(tree_method="approx", n_estimators=450, max_leaves=20, max_depth=6, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(TX, Ty, qid=Tqids)

CPU times: total: 2h 12min 41s
Wall time: 4min 31s


In [136]:
scores = ranker.predict(EX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [138]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263464638428349

Results with the same settings as "exact":
[400, 8, 6]:   vali: 0. 917 847 980 586 229 2   test: 0. 926 319 076 828 725 4
[400, 12, 6]:  vali: 0. 917 846 945 793 579 5   test: 0. 926 346 760 524 999 6
[450, 20, 6]:  vali: 0. 917 903 662 354 776 4   test: 0. 926 346 463 842 834 9

#### With Standard normalization

In [41]:
#Normalization

scaler = StandardScaler()
scaler.fit(TX)
nTX = scaler.transform(TX)
scaler.fit(VX)
nVX = scaler.transform(VX)
scaler.fit(EX)
nEX = scaler.transform(EX)

In [151]:
#n_estimators = [350, 400, 450, 500]
#max_leaves = [8, 12, 16, 20]
#max_depths = [4, 5, 6, 7, 8, 9, 10]

In [None]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in min_split_loss:
    for j in reg_alpha:
        for k in reg_lambda:

#for i in subsampling:
#    for j in colsample_bytree:
            ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=500, max_depth=8, random_state=42, objective="rank:ndcg", eta=0.1,
                                        n_jobs=-1, #colsample_bylevel=0.5, #eval_metric='ndcg@10',
                                        min_split_loss=i, reg_alpha=j, reg_lambda=k, 
                                        #subsample=i,# colsample_bytree=j
                                      )
            ranker.fit(nTX, Ty, qid=Tqids, eval_set=[(nVX, Vy)], eval_qid=[Vqids])
        
            scores = ranker.predict(nEX)
            sorted_idx = np.argsort(scores)[::-1]
            # Sort the relevance scores from most relevant to least relevant
            scores = scores[sorted_idx].reshape(1, -1)
        
            ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids)
            if (ndcg_now>best_ndcg):
                best_ndcg = ndcg_now
                best_parameters = [i, j, k]
                print('%s , %s , %s' % (i, j, k))
                print(ndcg_now)

0 , 0 , 0.5
0.7842471166376468
0 , 0 , 5
0.784372513020244
0 , 5 , 20
0.7843934023449157


Best iteration ([400, 8, 8]):       0. 917 873 348 102 320 9

In [155]:
%%time

ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=400, max_leaves=8, max_depth=8, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(nTX, Ty, qid=Tqids)

CPU times: total: 1h 47min 14s
Wall time: 3min 24s


In [156]:
scores = ranker.predict(nEX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [157]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263359652490694

Best iteration ([400, 8, 8]):       0. 926 335 965 249 069 4          wall time: 3 min 24 s

In [161]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for k in max_leaves:
        for l in max_depths:
            ranker = xgboost.XGBRanker(tree_method="approx", random_state=42, objective="rank:ndcg",
                                   n_estimators=i, max_leaves=k, max_depth=l, n_jobs=-1)
            ranker.fit(nTX, Ty, qid=Tqids)
            
            y_vali = ranker.predict(nVX)
            sorted_idx = np.argsort(y_vali)[::-1]
            # Sort the relevance scores from most relevant to least relevant
            y_vali = y_vali[sorted_idx].reshape(1, -1)

            ndcg_now = ndcg_score(Vy.reshape(1,-1), y_vali)
            if (ndcg_now>best_ndcg):
                best_ndcg = ndcg_now
                best_parameters = [i, k, l]
                print('%s , %s , %s' % (i, k, l))
                print(ndcg_now)

350 , 8 , 4
0.9178451812849276
350 , 16 , 5
0.9178463341783615
350 , 20 , 5
0.9178758403582332
500 , 20 , 5
0.9178758979709531
CPU times: total: 9d 53min 19s
Wall time: 7h 50min 51s


Best iteration ([500, 20, 5]):       0. 917 875 897 970 953 1

In [163]:
%%time

ranker = xgboost.XGBRanker(tree_method="approx", n_estimators=500, max_leaves=20, max_depth=5, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(nTX, Ty, qid=Tqids)

CPU times: total: 2h 17min 59s
Wall time: 5min


In [164]:
scores = ranker.predict(nEX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [165]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263363929639176

Best iteration ([400, 8, 8]):       0. 926 336 392 963 917 6          wall time: 5 min

Note that the dataset after normalization seems to be slower than the non-normalized one.

#### With MinMax normalization

In [7]:
#Normalization

scaler = MinMaxScaler()
scaler.fit(TX)
nTX = scaler.transform(TX)
scaler.fit(VX)
nVX = scaler.transform(VX)
scaler.fit(EX)
nEX = scaler.transform(EX)

In [8]:
n_estimators = [350, 400, 450, 500]
max_leaves = [8, 12, 16, 20]
max_depths = [4, 5, 6, 7, 8, 9, 10]

In [13]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for k in max_leaves:
        for l in max_depths:
            ranker = xgboost.XGBRanker(tree_method="exact", random_state=42, objective="rank:ndcg",
                                   n_estimators=i, max_leaves=k, max_depth=l, n_jobs=-1)
            ranker.fit(nTX, Ty, qid=Tqids)
            
            y_vali = ranker.predict(nVX)
            sorted_idx = np.argsort(y_vali)[::-1]
            # Sort the relevance scores from most relevant to least relevant
            y_vali = y_vali[sorted_idx].reshape(1, -1)

            ndcg_now = ndcg_score(Vy.reshape(1,-1), y_vali)
            if (ndcg_now>best_ndcg):
                best_ndcg = ndcg_now
                best_parameters = [i, k, l]
                print('%s , %s , %s' % (i, k, l))
                print(ndcg_now)

350 , 8 , 4
0.9178402249299604
350 , 8 , 5
0.9178792373087499
350 , 8 , 6
0.9179011949807576
400 , 8 , 6
0.9179013958105817
CPU times: total: 7d 21h 18min 33s
Wall time: 6h 1min 34s


[400, 8, 6] :   0. 917 901 395 810 581 7

In [20]:
%%time

ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=400, max_leaves=8, max_depth=6, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(nTX, Ty, qid=Tqids)

CPU times: total: 1h 20min 53s
Wall time: 2min 34s


In [21]:
scores = ranker.predict(nEX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [22]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263474700520594

[400, 8, 6] : 0.926 347 470 052 059 4   wall time: 2 min 34 s

In [11]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for k in max_leaves:
        for l in max_depths:
            ranker = xgboost.XGBRanker(tree_method="approx", random_state=42, objective="rank:ndcg",
                                   n_estimators=i, max_leaves=k, max_depth=l, n_jobs=-1)
            ranker.fit(nTX, Ty, qid=Tqids)
            
            y_vali = ranker.predict(nVX)
            sorted_idx = np.argsort(y_vali)[::-1]
            # Sort the relevance scores from most relevant to least relevant
            y_vali = y_vali[sorted_idx].reshape(1, -1)

            ndcg_now = ndcg_score(Vy.reshape(1,-1), y_vali)
            if (ndcg_now>best_ndcg):
                best_ndcg = ndcg_now
                best_parameters = [i, k, l]
                print('%s , %s , %s' % (i, k, l))
                print(ndcg_now)

350 , 8 , 4
0.9178458302440359
350 , 12 , 5
0.917847035906581
350 , 20 , 5
0.9178742937695091
400 , 20 , 5
0.9179036635454874
CPU times: total: 9d 18h 26min 32s
Wall time: 7h 48min 52s


In [13]:
%%time

ranker = xgboost.XGBRanker(tree_method="approx", n_estimators=400, max_leaves=20, max_depth=5, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(nTX, Ty, qid=Tqids)

CPU times: total: 2h 57s
Wall time: 4min


In [14]:
scores = ranker.predict(nEX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [15]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263380821451912

# Istella