In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost
from sklearn.datasets import fetch_openml
import pyltr
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
def dcg(scores, k=10):
    score = 0
    for i in range(k):
        if (i==0):
            score=score+scores[i]
        else:
            score=score+(scores[i]/np.log2(i+1))
    return score

def dcg_exp(scores, k=10):
    score = 0
    for i in range(k):
        score = score + ((2**scores[i] - 1) / np.log2(i + 2))
    return score
'''
def ndcg(y_true, y_pred, ids, k=10):
    unique_ids=np.unique(ids)
    ndcg_scores=[]
    for qid in unique_ids:
        mask = ids == qid
        true_scores=y_true[mask]
        pred_scores=y_pred[mask]

        
        sorted_indices=np.argsort(pred_scores.reshape(1, -1)).reshape(-1, 1)[::-1]
        sorted_true_scores=true_scores[sorted_indices] #true scores sorted based on predicted
        if (len(sorted_true_scores < k)):
            k = len(sorted_true_scores)
        dcg_score=dcg(sorted_true_scores, k)
        idcg_score=dcg(sorted(np.array(true_scores), reverse=True), k)

        #print(sorted_true_scores)
        #print(sorted(np.array(true_scores), reverse=True))
        
        if (idcg_score>0):
            #print(dcg_score, idcg_score)
            ndcg_scores.append(dcg_score/idcg_score)
        
    return np.mean(ndcg_scores)
'''


def ndcg(y_true, y_pred, ids, k=10, handle_ties=True, exp=False):
    unique_ids=np.unique(ids)
    ndcg_scores=[]
    for qid in unique_ids:
        mask = ids == qid
        true_scores=y_true[mask]
        pred_scores=y_pred[mask]

        
        sorted_indices=np.argsort(pred_scores.reshape(1, -1)).reshape(-1, 1)[::-1]
        sorted_true_scores=true_scores[sorted_indices] #true scores sorted based on predicted
        sorted_pred_scores=pred_scores[sorted_indices]        
        if (len(sorted_true_scores < k)):
            k = len(sorted_true_scores)

        if handle_ties == True:
            for j in np.unique(pred_scores):
                ties = sorted_pred_scores == j
                #print(j, np.array(ties).flatten())
                #print(sorted_true_scores)
                mean_gain = np.mean(sorted_true_scores[np.array(ties).flatten()])
                for l in np.array(ties).flatten():
                    sorted_true_scores[l] = mean_gain
                

        #if exp==True: dcg_score=dcg_exp(sorted_true_scores, k)
        #else: dcg_score=dcg(sorted_true_scores, k)
        #if exp==True: idcg_score=dcg_exp(sorted(np.array(true_scores), reverse=True), k)
        #else: idcg_score=dcg(sorted(np.array(true_scores), reverse=True), k)

        dcg_score=dcg(sorted_true_scores, k)
        idcg_score=dcg(sorted(np.array(true_scores), reverse=True), k)
        
        #print(sorted_true_scores)
        #print(sorted(np.array(true_scores), reverse=True))
        
        if (idcg_score>0):
            #print(dcg_score, idcg_score)
            ndcg_scores.append(dcg_score/idcg_score)
        
    return np.mean(ndcg_scores)

In [3]:
def max_ndcg(y_true, y_pred, ids, k=10, exp=False):
    unique_ids=np.unique(ids)
    ndcg_scores=[]
    for qid in unique_ids:
        mask = ids == qid
        true_scores=y_true[mask]
        pred_scores=y_pred[mask]

        
        sorted_indices=np.argsort(pred_scores.reshape(1, -1)).reshape(-1, 1)[::-1]
        sorted_true_scores=true_scores[sorted_indices] #true scores sorted based on predicted
        sorted_pred_scores=pred_scores[sorted_indices]        
        if (len(sorted_true_scores < k)):
            k = len(sorted_true_scores)

        if exp==True: idcg_score=dcg_exp(sorted(np.array(true_scores), reverse=True), k)
        else: idcg_score=dcg(sorted(np.array(true_scores), reverse=True), k)
        if idcg_score > 0:
            if exp==True: dcg_score=dcg_exp(sorted_true_scores, k)
            else: dcg_score=dcg(sorted_true_scores, k)
            best_score=dcg_score/idcg_score


        ties = np.array([False]*len(sorted_true_scores))
        for j in range(len(sorted_pred_scores)-1):
            if sorted_pred_scores[j] == sorted_pred_scores[j+1]:
                ties[j] = True
        true_ties = np.where(ties==True)
        for j in np.array(true_ties).flatten():
            if sorted_pred_scores[j] == sorted_pred_scores[j+1]:
                placeholder = sorted_true_scores[j]
                sorted_true_scores[j] = sorted_true_scores[j+1]
                sorted_true_scores[j+1] = placeholder
                if exp==True: dcg_score=dcg_exp(sorted_true_scores, k)
                else: dcg_score=dcg(sorted_true_scores, k)
                if idcg_score > 0:
                    ndcg_score = dcg_score/idcg_score
                    if best_score < ndcg_score: best_score=ndcg_score

        #print(sorted_true_scores)
        #print(sorted(np.array(true_scores), reverse=True))
        
        if (idcg_score>0):
            #print(dcg_score, idcg_score)
            ndcg_scores.append(best_score)
        
    return np.mean(ndcg_scores)

## Istella-s-letor

In [5]:
with open('istella-s-letor/istella-s-letor/sample/train.txt') as trainfile, \
        open('istella-s-letor/istella-s-letor/sample/vali.txt') as valifile, \
        open('istella-s-letor/istella-s-letor/sample/test.txt') as evalfile:
    TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
    VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
    EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

In [6]:
Tqids = np.array(Tqids, dtype=object)
Vqids = np.array(Vqids, dtype=object)
Eqids = np.array(Eqids, dtype=object)

In [7]:
logTX, logVX, logEX = np.log10(TX[:,193]), np.log10(VX[:,193]), np.log10(EX[:,193])

In [8]:
TX[:,193], VX[:,193], EX[:,193] = logTX, logVX, logEX

In [9]:
logTX, logVX, logEX = np.log10(TX[:,0]), np.log10(VX[:,0]), np.log10(EX[:,0])

In [10]:
TX[:,0], VX[:,0], EX[:,0] = logTX, logVX, logEX

In [11]:
inTqids = Tqids.astype(str).astype(int)
sort_ids = np.argsort(inTqids)
# Sort the relevance scores from most relevant to least relevant
TX, Ty, Tqids = TX[sort_ids], Ty[sort_ids], inTqids[sort_ids]

#### Histogram

In [13]:
n_estimators = [600, 800]
max_leaves = [8, 12, 16, 20, 24]
max_depths = [4, 6, 8]#, 11, 12, 13, 14, 15, 16]
etas = [0.1, 0.2, 0.3]

min_split_loss = [0, 0.5]
reg_alpha = [0.1, 0.5]#[0, 0.1, 0.5]
reg_lambda = [10, 5, 1]
subsampling = [1, 0.9, 0.7, 0.5]

In [None]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0, 0]

#for i in n_estimators:
#    for j in max_depths:
#        for k in max_leaves:
#            for l in etas:

for i in min_split_loss:
    for j in reg_alpha:
        for k in reg_lambda:
            for l in subsampling:

                ranker = xgboost.XGBRanker(tree_method="hist", n_estimators=400, max_depth=4, max_leaves=16,  eta=0.1, random_state=42,
                                            objective="rank:ndcg", n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                            min_split_loss=i, reg_alpha=j, reg_lambda=k, subsample=l,
                                            colsample_bytree=0.5
                                          )
                ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
            
                scores = ranker.predict(EX)
                sorted_idx = np.argsort(scores)[::-1]
                # Sort the relevance scores from most relevant to least relevant
                scores = scores[sorted_idx].reshape(1, -1)
    
                ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True, exp=False)

                #print('%s , %s , %s , %s' % (i, j, k, l))
                print('min_split_loss = %s , reg_alpha = %s , reg_lambda = %s , sample = %s' % (i, j, k, l))
                if (ndcg_now>=best_ndcg):
                    best_ndcg = ndcg_now
                    best_parameters = [i, j, k, l]
                    print(ndcg_now)

min_split_loss = 0 , reg_alpha = 0.1 , reg_lambda = 10 , sample = 1
0.5424065059313817
min_split_loss = 0 , reg_alpha = 0.1 , reg_lambda = 10 , sample = 0.9
0.5436246662125445
min_split_loss = 0 , reg_alpha = 0.1 , reg_lambda = 10 , sample = 0.7
min_split_loss = 0 , reg_alpha = 0.1 , reg_lambda = 10 , sample = 0.5
0.5445464899379366
min_split_loss = 0 , reg_alpha = 0.1 , reg_lambda = 5 , sample = 1


400 , 4 , 16 , 0.1
0.548050042811841

In [30]:
ranker = xgboost.XGBRanker(tree_method="hist", n_estimators=4, max_depth=6, max_leaves=16,  eta=0.3, random_state=42,
                                            objective="rank:ndcg", n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                            #min_split_loss=i, reg_alpha=j, reg_lambda=k, 
                                            #subsample=l,
                                            colsample_bytree=0.5)
ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
            
scores = ranker.predict(EX)
sorted_idx = np.argsort(scores)[::-1]
                # Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)
    
ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True, exp=False)

In [31]:
ndcg_now

1.0

#### Exact

In [32]:
#n_estimators = [800]
#max_depths = [4, 6, 8]
#etas = [0.1, 0.2, 0.3]
min_split_loss = [0, 0.1, 0.5]
reg_alpha = [0, 0.1, 0.5]
reg_lambda = [0.5, 1, 5, 10]
subsampling = [1]#[0.5, 0.7, 0.9]

In [34]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0, 0]

for i in min_split_loss:
    for j in reg_alpha:
        for k in reg_lambda:
            for l in subsampling:
#    for j in colsample_bytree:
                ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=400, max_depth=6,  eta=0.1, random_state=42, objective="rank:ndcg",
                                            n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                            min_split_loss=i, reg_alpha=j, reg_lambda=k, 
                                            subsample=l,
                                            colsample_bytree=0.5
                                          )
                ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
            
                scores = ranker.predict(EX)
                sorted_idx = np.argsort(scores)[::-1]
                # Sort the relevance scores from most relevant to least relevant
                scores = scores[sorted_idx].reshape(1, -1)
    
                print('%s , %s , %s' % (i, j, k,))
                ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True, exp=False)
                if (ndcg_now>=best_ndcg):
                    best_ndcg = ndcg_now
                    best_parameters = [i, j, k]
                    print(ndcg_now)
                print('.')

0 , 0 , 0.5
0.5445903442095613
.
0 , 0 , 1
0.5468657011384445
.
0 , 0 , 5
.
0 , 0 , 10
.
0 , 0.1 , 0.5
.
0 , 0.1 , 1
.
0 , 0.1 , 5
.
0 , 0.1 , 10
.
0 , 0.5 , 0.5
.
0 , 0.5 , 1
.
0 , 0.5 , 5
.
0 , 0.5 , 10
.
0.1 , 0 , 0.5
.
0.1 , 0 , 1
.
0.1 , 0 , 5
.
0.1 , 0 , 10
.
0.1 , 0.1 , 0.5
.
0.1 , 0.1 , 1
.
0.1 , 0.1 , 5
.
0.1 , 0.1 , 10
.
0.1 , 0.5 , 0.5
.
0.1 , 0.5 , 1
.
0.1 , 0.5 , 5
.
0.1 , 0.5 , 10
.
0.5 , 0 , 0.5
.
0.5 , 0 , 1
.
0.5 , 0 , 5
.
0.5 , 0 , 10
.
0.5 , 0.1 , 0.5
.
0.5 , 0.1 , 1
.
0.5 , 0.1 , 5
.
0.5 , 0.1 , 10
.
0.5 , 0.5 , 0.5
.
0.5 , 0.5 , 1
.
0.5 , 0.5 , 5
.
0.5 , 0.5 , 10
.
CPU times: total: 2d 10h 1min 25s
Wall time: 2h 55min 12s


Best iterations for now:6
400 , 6 , 0.1 , min_loss = 0.5 , reg_alpha = 0 , reg_lambda = 10, sample = 0.7
0.54786188126529434589


In [20]:
[i,k,l]

[400, 8, 12]

In [17]:
%%time

ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=400, eta=0.1, max_depth=6, random_state=42, objective="rank:ndcg", n_jobs=-1,
                           eval_metric='ndcg@10', min_split_loss=0.5, reg_alpha=0, reg_lambda=10, subsample=0.7)
ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])

CPU times: total: 2h 44min 21s
Wall time: 5min 18s


In [18]:
scores = ranker.predict(EX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [28]:
ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True, exp=False)

0.5432192363024712

[400, 8, 6] :  0. 

#### Approximate

In [40]:
n_estimators = [200, 400, 600, 800]
max_leaves = [8, 12, 16, 20, 24]
max_depths = [4, 6, 8]#, 11, 12, 13, 14, 15, 16]
etas = [0.1, 0.2, 0.3]

#min_split_loss = [0.1, 0.5]
#reg_alpha = [0.1, 0.5]#[0, 0.1, 0.5]
#reg_lambda = [1, 5, 10, 0.5]
#subsampling = [0.5, 0.7, 0.9, 1]

In [42]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0, 0]

for i in n_estimators:
    for j in max_depths:
        #for k in max_leaves:
            for l in etas:
#    for j in colsample_bytree:
                ranker = xgboost.XGBRanker(tree_method="approx", n_estimators=i, max_depth=j, eta=l, random_state=42, #max_leaves = k,  eta=l, random_state=42,
                                            objective="rank:ndcg", n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                            #min_split_loss=i, reg_alpha=j, reg_lambda=k, 
                                            #subsample=l,
                                            #colsample_bytree=0.5
                                          )
                ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
            
                scores = ranker.predict(EX)
                sorted_idx = np.argsort(scores)[::-1]
                # Sort the relevance scores from most relevant to least relevant
                scores = scores[sorted_idx].reshape(1, -1)
    
                ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True, exp=False)
                print('%s , %s , %s' % (i, j, l))
                if (ndcg_now>=best_ndcg):
                    best_ndcg = ndcg_now
                    best_parameters = [i, j, l]
                    print(ndcg_now)

200 , 4 , 0.1
0.5462033392354859
200 , 4 , 0.2
0.5468830244048251
200 , 4 , 0.3
200 , 6 , 0.1
200 , 6 , 0.2
200 , 6 , 0.3
200 , 8 , 0.1
200 , 8 , 0.2
200 , 8 , 0.3
400 , 4 , 0.1
400 , 4 , 0.2
400 , 4 , 0.3
400 , 6 , 0.1
400 , 6 , 0.2
400 , 6 , 0.3
400 , 8 , 0.1
400 , 8 , 0.2
400 , 8 , 0.3
600 , 4 , 0.1
600 , 4 , 0.2
0.5469608638612027
600 , 4 , 0.3
600 , 6 , 0.1
600 , 6 , 0.2
0.547001867137514
600 , 6 , 0.3
600 , 8 , 0.1
600 , 8 , 0.2
600 , 8 , 0.3
800 , 4 , 0.1
800 , 4 , 0.2
800 , 4 , 0.3
800 , 6 , 0.1
800 , 6 , 0.2
800 , 6 , 0.3
800 , 8 , 0.1
800 , 8 , 0.2
800 , 8 , 0.3
CPU times: total: 4d 4h 36min 31s
Wall time: 10h 20min 26s


In [30]:
n_estimators = [600, 800]
max_leaves = [8, 12, 16, 20, 24]
max_depths = [8]#[4, 6, 8]#, 11, 12, 13, 14, 15, 16]
etas = [0.1, 0.2, 0.3]

min_split_loss = [0.1, 0.5]
reg_alpha = [0, 0.1, 0.5]
reg_lambda = [1, 5, 10, 0.5]
subsampling = [0.5, 0.7, 0.9, 1]

In [35]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0, 0]

for i in min_split_loss:
    for j in reg_alpha:
        for k in reg_lambda:
            for l in subsampling:
#    for j in colsample_bytree:
                ranker = xgboost.XGBRanker(tree_method="approx", n_estimators=400, max_depth=8, max_leaves = 24,  eta=0.2, random_state=42,
                                            objective="rank:ndcg", n_jobs=-1, eval_metric='ndcg@10', ndcg_exp_gain=False,
                                            min_split_loss=i, reg_alpha=j, reg_lambda=k, 
                                            subsample=l,
                                            colsample_bytree=0.5
                                          )
                ranker.fit(TX, Ty, qid=Tqids, eval_set=[(VX, Vy)], eval_qid=[Vqids])
            
                scores = ranker.predict(EX)
                sorted_idx = np.argsort(scores)[::-1]
                # Sort the relevance scores from most relevant to least relevant
                scores = scores[sorted_idx].reshape(1, -1)
    
                ndcg_now = ndcg(Ey, scores.reshape(-1, 1), Eqids, handle_ties=True, exp=False)
                print('min_split_loss = %s , reg_alpha = %s , reg_lambda = %s , subsampling = %s' % (i, j, k, l))
                if (ndcg_now>=best_ndcg):
                    best_ndcg = ndcg_now
                    best_parameters = [i, j, k, l]
                    print(ndcg_now)

min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 1 , subsampling = 0.5
0.5426377090360153
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 1 , subsampling = 0.7
0.5456965967008265
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 1 , subsampling = 0.9
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 1 , subsampling = 1
0.5468371525950438
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 5 , subsampling = 0.5
0.5478127337912312
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 5 , subsampling = 0.7
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 5 , subsampling = 0.9
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 5 , subsampling = 1
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 10 , subsampling = 0.5
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 10 , subsampling = 0.7
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 10 , subsampling = 0.9
min_split_loss = 0.1 , reg_alpha = 0 , reg_lambda = 10 , subsampling = 1
min_split_loss = 0.1 , reg_alpha = 0 ,

Best iteration:

200 , 8 , 20 , 0.1 , min_loss = 0 , alpha = 0 , lambda = 1 , sample = 1
0.550770063380258

w/ colsampl):400 , 8 , 24 , 0.2
0.547489107213051189

In [125]:
%%time

# max_depth=8, shrinkage=0.1, no column subsampling

ranker = xgboost.XGBRanker(tree_method="approx", n_estimators=450, max_leaves=20, max_depth=6, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(TX, Ty, qid=Tqids)

CPU times: total: 2h 12min 41s
Wall time: 4min 31s


In [136]:
scores = ranker.predict(EX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [138]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263464638428349

Results with the same settings as "exact":
[400, 8, 6]:   vali: 0. 917 847 980 586 229 2   test: 0. 926 319 076 828 725 4
[400, 12, 6]:  vali: 0. 917 846 945 793 579 5   test: 0. 926 346 760 524 999 6
[450, 20, 6]:  vali: 0. 917 903 662 354 776 4   test: 0. 926 346 463 842 834 9

#### With Standard normalization

In [104]:
#Normalization

scaler = StandardScaler()
scaler.fit(TX)
nTX = scaler.transform(TX)
scaler.fit(VX)
nVX = scaler.transform(VX)
scaler.fit(EX)
nEX = scaler.transform(EX)

In [105]:
n_estimators = [350, 400, 450, 500]
max_leaves = [8, 12, 16, 20]
max_depths = [4, 5, 6, 7, 8, 9, 10]

In [106]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for k in max_leaves:
        for l in max_depths:
            ranker = xgboost.XGBRanker(tree_method="exact", random_state=42, objective="rank:ndcg",
                                   n_estimators=i, max_leaves=k, max_depth=l, n_jobs=-1)
            ranker.fit(nTX, Ty, qid=Tqids)
            
            y_vali = ranker.predict(nVX)
            sorted_idx = np.argsort(y_vali)[::-1]
            # Sort the relevance scores from most relevant to least relevant
            y_vali = y_vali[sorted_idx].reshape(1, -1)

            ndcg_now = ndcg_score(Vy.reshape(1,-1), y_vali)
            if (ndcg_now>best_ndcg):
                best_ndcg = ndcg_now
                best_parameters = [i, k, l]
                print('%s , %s , %s' % (i, k, l))
                print(ndcg_now)

350 , 8 , 4
0.7909134429520838
400 , 8 , 7
0.7909134433040513
CPU times: total: 13d 6h 4min 37s
Wall time: 10h 7min 42s


Best iteration ([400, 8, 8]):       0. 

In [155]:
%%time

ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=400, max_leaves=8, max_depth=8, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(nTX, Ty, qid=Tqids)

CPU times: total: 1h 47min 14s
Wall time: 3min 24s


In [156]:
scores = ranker.predict(nEX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [157]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263359652490694

Best iteration ([400, 8, 8]):       0.           wall time: 3 min 24 s

In [None]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for k in max_leaves:
        for l in max_depths:
            ranker = xgboost.XGBRanker(tree_method="approx", random_state=42, objective="rank:ndcg",
                                   n_estimators=i, max_leaves=k, max_depth=l, n_jobs=-1)
            ranker.fit(nTX, Ty, qid=Tqids)
            
            y_vali = ranker.predict(nVX)
            sorted_idx = np.argsort(y_vali)[::-1]
            # Sort the relevance scores from most relevant to least relevant
            y_vali = y_vali[sorted_idx].reshape(1, -1)

            ndcg_now = ndcg_score(Vy.reshape(1,-1), y_vali)
            if (ndcg_now>best_ndcg):
                best_ndcg = ndcg_now
                best_parameters = [i, k, l]
                print('%s , %s , %s' % (i, k, l))
                print(ndcg_now)

350 , 8 , 4
0.7909134436591327
400 , 12 , 5
0.7909134456269447


Best iteration ([500, 20, 5]):       0. 917 875 897 970 953 1

In [163]:
%%time

ranker = xgboost.XGBRanker(tree_method="approx", n_estimators=500, max_leaves=20, max_depth=5, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(nTX, Ty, qid=Tqids)

CPU times: total: 2h 17min 59s
Wall time: 5min


In [164]:
scores = ranker.predict(nEX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [165]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263363929639176

Best iteration ([400, 8, 8]):       0. 926 336 392 963 917 6          wall time: 5 min

Note that the dataset after normalization seems to be slower than the non-normalized one.

#### With MinMax normalization

In [7]:
#Normalization

scaler = MinMaxScaler()
scaler.fit(TX)
nTX = scaler.transform(TX)
scaler.fit(VX)
nVX = scaler.transform(VX)
scaler.fit(EX)
nEX = scaler.transform(EX)

In [8]:
n_estimators = [350, 400, 450, 500]
max_leaves = [8, 12, 16, 20]
max_depths = [4, 5, 6, 7, 8, 9, 10]

In [13]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for k in max_leaves:
        for l in max_depths:
            ranker = xgboost.XGBRanker(tree_method="exact", random_state=42, objective="rank:ndcg",
                                   n_estimators=i, max_leaves=k, max_depth=l, n_jobs=-1)
            ranker.fit(nTX, Ty, qid=Tqids)
            
            y_vali = ranker.predict(nVX)
            sorted_idx = np.argsort(y_vali)[::-1]
            # Sort the relevance scores from most relevant to least relevant
            y_vali = y_vali[sorted_idx].reshape(1, -1)

            ndcg_now = ndcg_score(Vy.reshape(1,-1), y_vali)
            if (ndcg_now>best_ndcg):
                best_ndcg = ndcg_now
                best_parameters = [i, k, l]
                print('%s , %s , %s' % (i, k, l))
                print(ndcg_now)

350 , 8 , 4
0.9178402249299604
350 , 8 , 5
0.9178792373087499
350 , 8 , 6
0.9179011949807576
400 , 8 , 6
0.9179013958105817
CPU times: total: 7d 21h 18min 33s
Wall time: 6h 1min 34s


[400, 8, 6] :   0. 917 901 395 810 581 7

In [20]:
%%time

ranker = xgboost.XGBRanker(tree_method="exact", n_estimators=400, max_leaves=8, max_depth=6, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(nTX, Ty, qid=Tqids)

CPU times: total: 1h 20min 53s
Wall time: 2min 34s


In [21]:
scores = ranker.predict(nEX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [22]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263474700520594

[400, 8, 6] : 0.926 347 470 052 059 4   wall time: 2 min 34 s

In [11]:
%%time

best_ndcg = 0
best_parameters = [0, 0, 0]

for i in n_estimators:
    for k in max_leaves:
        for l in max_depths:
            ranker = xgboost.XGBRanker(tree_method="approx", random_state=42, objective="rank:ndcg",
                                   n_estimators=i, max_leaves=k, max_depth=l, n_jobs=-1)
            ranker.fit(nTX, Ty, qid=Tqids)
            
            y_vali = ranker.predict(nVX)
            sorted_idx = np.argsort(y_vali)[::-1]
            # Sort the relevance scores from most relevant to least relevant
            y_vali = y_vali[sorted_idx].reshape(1, -1)

            ndcg_now = ndcg_score(Vy.reshape(1,-1), y_vali)
            if (ndcg_now>best_ndcg):
                best_ndcg = ndcg_now
                best_parameters = [i, k, l]
                print('%s , %s , %s' % (i, k, l))
                print(ndcg_now)

350 , 8 , 4
0.9178458302440359
350 , 12 , 5
0.917847035906581
350 , 20 , 5
0.9178742937695091
400 , 20 , 5
0.9179036635454874
CPU times: total: 9d 18h 26min 32s
Wall time: 7h 48min 52s


In [13]:
%%time

ranker = xgboost.XGBRanker(tree_method="approx", n_estimators=400, max_leaves=20, max_depth=5, random_state=42, objective="rank:ndcg", n_jobs=-1)
ranker.fit(nTX, Ty, qid=Tqids)

CPU times: total: 2h 57s
Wall time: 4min


In [14]:
scores = ranker.predict(nEX)
sorted_idx = np.argsort(scores)[::-1]
# Sort the relevance scores from most relevant to least relevant
scores = scores[sorted_idx].reshape(1, -1)

In [15]:
ndcg_now = ndcg_score(Ey.reshape(1,-1), scores)
ndcg_now

0.9263380821451912

# Istella