In [9]:
import os
import pandas as pd
import seaborn as sns
from IPython.display import display




In [4]:
from tqdm import tqdm_notebook as tqdm

In [37]:
# Directory of the original imat2009 data (not referenced by the cells visible in this notebook section).
DATASET_DIR = './dataset/imat2009-datasets/'
# Raw train / test files: these are the inputs handed to fix_dataset().
TRAIN_DATASET_PATH = './dataset/task3_train.txt'
TEST_DATASET_PATH = './dataset/task3_test.txt'
# Outputs of fix_dataset(). NOTE: the RankLib shell commands below hard-code these
# same paths as string literals, so keep them in sync when changing a value here.
FIXED_TRAIN_DATASET_PATH = './dataset/task3_train_fixed.txt'
FIXED_TEST_DATASET_PATH = './dataset/task3_test_fixed.txt'
# Model directory; the RankLib -save / -load arguments below hard-code paths under './models'.
MODELS_PATH = './models'
# Presumably the destination for final results -- not referenced by the visible cells
# (the RankLib -indri outputs below go to './score/...'). TODO confirm intended use.
OUTPUT_PATH = './dataset/results'

In [6]:
def is_float(token):
    """Return True if ``token`` can be converted by ``float()``, otherwise False.

    Note that anything ``float()`` accepts counts, including strings such as
    ``'nan'``, ``'inf'`` or ``'1e3'`` and surrounding whitespace.

    Args:
        token: Value to probe (normally a string taken from a dataset line).

    Returns:
        bool: Whether ``float(token)`` succeeds.
    """
    try:
        float(token)
    # Only conversion failures mean "not a float". A bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit and make long loops uninterruptible.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [32]:
def fix_dataset(input_path, output_path, num_features=245):
    """Rewrite a sparse ranking dataset so that every line lists all features.

    Each input line is split on single spaces and every non-empty token is
    classified as:

    * a query-id token   -- it starts with ``'q'``;
    * a relevance label  -- it parses as a float (see ``is_float``);
    * a feature          -- anything else, expected to look like ``index:value``.

    Each output line has the form::

        <relevance> <qid> 1:<v1> 2:<v2> ... <num_features>:<vN>#<line index>

    Features missing from the input line are written with value ``'0'``.
    The trailing ``#<line index>`` is the 0-based index of the line in the
    input file.

    Args:
        input_path: Path of the dataset to read.
        output_path: Path of the file to (over)write.
        num_features: Highest feature index to emit; indices ``1..num_features``
            are written for every line. Defaults to 245, the value that was
            previously hard-coded.

    Raises:
        ValueError: If a line has no query-id token or contains a token that
            is neither a query id, a float, nor ``index:value``.
    """
    # Open the input first so a missing input does not leave an empty output behind.
    with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file:
        # tqdm wraps the list (not the enumerate object) so the bar knows its total.
        for line_ind, line in enumerate(tqdm(input_file.readlines())):
            # Lines without a label (presumably the unlabeled test set -- verify)
            # fall back to '0'; a label found on the line is preserved.
            relevance = '0'
            qid = None
            features = {}
            for token in line.rstrip('\n').split(' '):
                if not token:
                    # Double or trailing spaces produce empty tokens; ignore them.
                    continue
                if token[0] == 'q':
                    qid = token
                elif is_float(token):
                    relevance = token
                else:
                    subtokens = token.split(':')
                    if len(subtokens) < 2:
                        raise ValueError(
                            'line {}: unrecognized token {!r}'.format(line_ind, token))
                    features[subtokens[0]] = subtokens[1]
            if qid is None:
                raise ValueError('line {}: no query id token found'.format(line_ind))

            tokens = [relevance, qid]
            for ind in range(1, num_features + 1):
                key = str(ind)
                tokens.append(key + ':' + features.get(key, '0'))
            # Text-mode files translate '\n' to the platform line ending on their
            # own; writing os.linesep here would yield '\r\r\n' on Windows.
            output_file.write(' '.join(tokens) + '#{}\n'.format(line_ind))

In [71]:
# Produce the fully-populated training file; the RankLib training commands
# below read it via the literal path './dataset/task3_train_fixed.txt'.
fix_dataset(TRAIN_DATASET_PATH, FIXED_TRAIN_DATASET_PATH)

HBox(children=(IntProgress(value=0, max=75057), HTML(value='')))




We will use an 85 / 15 train / validation split.

Let's try MART ranker (which is pairwise) with 1000 trees.

In [8]:
# Train MART (-ranker 0) on the fixed training file with an 85/15 train/validation
# split (-tvs 0.85), optimizing NDCG@20 (-metric2t); the model is saved to ./models/MART_TV.
!  java -jar RankLib-2.12.jar -tvs 0.85 -metric2t NDCG@20 -ranker 0 -train './dataset/task3_train_fixed.txt' -save './models/MART_TV'


Discard orig. features
Training data:	./dataset/task3_train_fixed.txt
Train-Validation split: 0.85
Feature vector representation: Dense.
Ranking method:	MART
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@20
Test metric:	NDCG@20
Feature normalization: No
Model file: ./models/MART_TV

[+] MART's Parameters:
No. of trees: 1000
No. of leaves: 10
No. of threshold candidates: 256
Min leaf support: 1
Learning rate: 0.1
Stop early: 100 rounds without performance gain on validation data

Reading feature file [./dataset/task3_train_fixed.txt]... [Done.]            
(7033 ranked lists, 75057 entries read)
Initializing... [Done]
---------------------------------
Training starts...
---------------------------------
#iter   | NDCG@20-T | NDCG@20-V | 
---------------------------------
1       | 0.7017    | 0.704     | 
2       | 0.7077    | 0.7127    | 
3       | 0.7109    | 0.7179    | 
4       | 0.7171    | 0.7258    | 
5       | 0.7183    | 0.7253    | 
6   

212     | 0.758     | 0.7539    | 
213     | 0.7579    | 0.754     | 
214     | 0.758     | 0.7541    | 
215     | 0.7579    | 0.7542    | 
216     | 0.7579    | 0.7537    | 
217     | 0.758     | 0.7536    | 
218     | 0.7579    | 0.7534    | 
219     | 0.7581    | 0.754     | 
220     | 0.7582    | 0.7544    | 
221     | 0.7583    | 0.7545    | 
222     | 0.7582    | 0.7543    | 
223     | 0.7583    | 0.7543    | 
224     | 0.7581    | 0.7545    | 
225     | 0.7582    | 0.7546    | 
226     | 0.7585    | 0.754     | 
227     | 0.7584    | 0.7539    | 
228     | 0.7584    | 0.7539    | 
229     | 0.7584    | 0.7539    | 
230     | 0.7585    | 0.7537    | 
231     | 0.7585    | 0.7537    | 
232     | 0.7584    | 0.7534    | 
233     | 0.7584    | 0.7532    | 
234     | 0.7584    | 0.7528    | 
235     | 0.7585    | 0.7532    | 
236     | 0.7584    | 0.7532    | 
237     | 0.7588    | 0.7531    | 
238     | 0.7589    | 0.7531    | 
239     | 0.7587    | 0.7526    | 
240     | 0.7586    

680     | 0.7727    | 0.7573    | 
681     | 0.7727    | 0.7573    | 
682     | 0.7729    | 0.7576    | 
683     | 0.7729    | 0.7579    | 
684     | 0.773     | 0.7582    | 
685     | 0.773     | 0.7582    | 
686     | 0.7729    | 0.7583    | 
687     | 0.7731    | 0.7581    | 
688     | 0.773     | 0.7581    | 
689     | 0.7729    | 0.758     | 
690     | 0.773     | 0.7583    | 
691     | 0.773     | 0.7583    | 
692     | 0.773     | 0.7584    | 
693     | 0.7731    | 0.7583    | 
694     | 0.7731    | 0.7582    | 
695     | 0.773     | 0.7586    | 
696     | 0.7731    | 0.7583    | 
697     | 0.7731    | 0.7583    | 
698     | 0.7732    | 0.7584    | 
699     | 0.7731    | 0.7587    | 
700     | 0.7732    | 0.7584    | 
701     | 0.7733    | 0.7584    | 
702     | 0.7733    | 0.7583    | 
703     | 0.7734    | 0.7583    | 
704     | 0.7733    | 0.7583    | 
705     | 0.7732    | 0.7579    | 
706     | 0.7732    | 0.7579    | 
707     | 0.7732    | 0.758     | 
708     | 0.7732    

Let's try RankNet ranker (which is pairwise).

In [12]:
# Train RankNet (-ranker 1) with the same 85/15 split and NDCG@20 metric;
# the model is saved to ./models/RankNet_TV.
!  java -jar RankLib-2.12.jar -tvs 0.85 -metric2t NDCG@20 -ranker 1 -train './dataset/task3_train_fixed.txt' -save './models/RankNet_TV'


Discard orig. features
Training data:	./dataset/task3_train_fixed.txt
Train-Validation split: 0.85
Feature vector representation: Dense.
Ranking method:	RankNet
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@20
Test metric:	NDCG@20
Feature normalization: No
Model file: ./models/RankNet_TV

[+] RankNet's Parameters:
No. of epochs: 100
No. of hidden layers: 1
No. of hidden nodes per layer: 10
Learning rate: 5.0E-5

Reading feature file [./dataset/task3_train_fixed.txt]... [Done.]            
(7033 ranked lists, 75057 entries read)
Initializing... [Done]
-----------------------------------------
Training starts...
--------------------------------------------------
#epoch  | % mis-ordered  | NDCG@20-T | NDCG@20-V | 
        |   pairs        |           |           | 
--------------------------------------------------
1       | 0.2291         | 0.6629    | 0.6703    | 
2       | 0.2251         | 0.6663    | 0.672     | 
3       | 0.2216         | 0.668

Let's try the RankBoost ranker (which is pairwise).

In [22]:
# Train RankBoost (-ranker 2) with the same 85/15 split and NDCG@20 metric;
# the model is saved to ./models/RankBoost_TV.
!  java -jar RankLib-2.12.jar -tvs 0.85 -metric2t NDCG@20 -ranker 2 -train './dataset/task3_train_fixed.txt' -save './models/RankBoost_TV'


Discard orig. features
Training data:	./dataset/task3_train_fixed.txt
Train-Validation split: 0.85
Feature vector representation: Dense.
Ranking method:	RankBoost
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@20
Test metric:	NDCG@20
Feature normalization: No
Model file: ./models/RankBoost_TV

[+] RankBoost's Parameters:
No. of rounds: 300
No. of threshold candidates: 10

Reading feature file [./dataset/task3_train_fixed.txt]... [Done.]            
(7033 ranked lists, 75057 entries read)
Initializing... [Done]
------------------------------------------
Training starts...
--------------------------------------------------------------------
#iter   | Sel. F.  | Threshold | Error     | NDCG@20-T | NDCG@20-V | 
--------------------------------------------------------------------
1       | 113      | 0.0827    | 0.1041    | 0.8575    | 0.6876    | 
2       | 181      | 0.0656    | 0.0986    | 0.8177    | 0.6979    | 
3       | 4        | 0.2       | 0.

223     | 122      | 0.2581    | 0.0084    | 0.7203    | 0.7277    | 
224     | 227      | 0.2       | 0.0084    | 0.7203    | 0.7278    | 
225     | 11       | 0.2       | 0.0083    | 0.7203    | 0.7278    | 
226     | 125      | 0.9       | 0.0083    | 0.7203    | 0.7275    | 
227     | 74       | 0.7       | 0.0083    | 0.7204    | 0.7275    | 
228     | 224      | 0.1882    | 0.0083    | 0.7205    | 0.7277    | 
229     | 5        | 0.3499    | 0.0083    | 0.7205    | 0.7277    | 
230     | 35       | 0.1       | 0.0082    | 0.7205    | 0.7278    | 
231     | 181      | 0.328     | 0.0082    | 0.7205    | 0.7277    | 
232     | 222      | 0.4       | 0.0081    | 0.7208    | 0.7279    | 
233     | 164      | 0.9       | 0.0082    | 0.7208    | 0.7279    | 
234     | 31       | 0.0969    | 0.0081    | 0.7204    | 0.7276    | 
235     | 165      | 0.8       | 0.0082    | 0.7204    | 0.728     | 
236     | 161      | 0.1       | 0.0081    | 0.7205    | 0.7277    | 
237     | 35       |

Let's try AdaRank.

In [23]:
# Train AdaRank (-ranker 3) with the same 85/15 split and NDCG@20 metric;
# the model is saved to ./models/AdaRank_TV.
!  java -jar RankLib-2.12.jar -tvs 0.85 -metric2t NDCG@20 -ranker 3 -train './dataset/task3_train_fixed.txt' -save './models/AdaRank_TV'


Discard orig. features
Training data:	./dataset/task3_train_fixed.txt
Train-Validation split: 0.85
Feature vector representation: Dense.
Ranking method:	AdaRank
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@20
Test metric:	NDCG@20
Feature normalization: No
Model file: ./models/AdaRank_TV

[+] AdaRank's Parameters:
No. of rounds: 500
Train with 'enequeue': Yes
Tolerance: 0.002
Max Sel. Count: 5

Reading feature file [./dataset/task3_train_fixed.txt]... [Done.]            
(7033 ranked lists, 75057 entries read)
Initializing... [Done]
---------------------------
Training starts...
--------------------------------------------------------
#iter   | Sel. F.  | NDCG@20-T | NDCG@20-V | Status    | 
--------------------------------------------------------
1       | 11       | 0.7028    | 0.7005    | OK        | 
2       | 227      | 0.7104    | 0.7098    | OK        | 
3       | 227      |           |           | ROLLBACK  | 
4       | 71       | 0.7083 

129     | 125      |           |           | ROLLBACK  | 
130     | 49       | 0.7064    | 0.7112    | OK        | 
131     | 152      | 0.7063    | 0.7118    | OK        | 
132     | 152      |           |           | ROLLBACK  | 
133     | 49       | 0.7065    | 0.7111    | OK        | 
134     | 49       |           |           | ROLLBACK  | 
135     | 124      | 0.7065    | 0.711     | OK        | 
136     | 124      |           |           | ROLLBACK  | 
137     | 84       | 0.7065    | 0.7119    | OK        | 
138     | 84       |           |           | ROLLBACK  | 
139     | 225      | 0.7066    | 0.7111    | OK        | 
140     | 225      |           |           | ROLLBACK  | 
141     | 181      | 0.7065    | 0.7111    | OK        | 
142     | 215      | 0.7066    | 0.7114    | OK        | 
143     | 215      |           |           | ROLLBACK  | 
144     | 181      | 0.7065    | 0.711     | OK        | 
145     | 181      |           |           | ROLLBACK  | 
146     | 189 

271     | 201      | 0.7057    | 0.7097    | OK        | 
272     | 201      |           |           | ROLLBACK  | 
273     | 224      | 0.7057    | 0.7097    | OK        | 
274     | 198      | 0.7056    | 0.7097    | OK        | 
275     | 198      |           |           | ROLLBACK  | 
276     | 224      | 0.7057    | 0.7098    | OK        | 
277     | 131      | 0.7058    | 0.7099    | OK        | 
278     | 102      | 0.7058    | 0.7098    | OK        | 
279     | 102      |           |           | ROLLBACK  | 
280     | 131      | 0.7058    | 0.7097    | OK        | 
281     | 131      |           |           | ROLLBACK  | 
282     | 143      | 0.7058    | 0.7097    | OK        | 
283     | 143      |           |           | ROLLBACK  | 
284     | 229      | 0.7057    | 0.7097    | OK        | 
285     | 229      |           |           | ROLLBACK  | 
286     | 78       | 0.7057    | 0.7095    | OK        | 
287     | 116      | 0.7053    | 0.7095    | OK        | 
288     | 116 

413     | 121      | 0.7056    | 0.7094    | OK        | 
414     | 121      |           |           | ROLLBACK  | 
415     | 120      | 0.7056    | 0.7093    | OK        | 
416     | 120      |           |           | ROLLBACK  | 
417     | 217      | 0.7055    | 0.7092    | OK        | 
418     | 217      |           |           | ROLLBACK  | 
419     | 159      | 0.7056    | 0.7094    | OK        | 
420     | 159      |           |           | ROLLBACK  | 
421     | 207      | 0.7054    | 0.7094    | OK        | 
422     | 207      |           |           | ROLLBACK  | 
423     | 92       | 0.7057    | 0.7092    | OK        | 
424     | 92       |           |           | ROLLBACK  | 
425     | 117      | 0.7054    | 0.7095    | OK        | 
426     | 117      |           |           | ROLLBACK  | 
427     | 241      | 0.7057    | 0.7095    | OK        | 
428     | 241      |           |           | ROLLBACK  | 
429     | 132      | 0.7052    | 0.7087    | OK        | 
430     | 132 

Let's try LambdaMART.

In [8]:
# Train LambdaMART (-ranker 6) with the same 85/15 split and NDCG@20 metric.
# Note: unlike the other models, this one is saved without the '_TV' suffix
# (./models/LambdaMART); the ranking cell further down loads it under that name.
!  java -jar RankLib-2.12.jar -tvs 0.85 -metric2t NDCG@20 -ranker 6 -train './dataset/task3_train_fixed.txt' -save './models/LambdaMART'


Discard orig. features
Training data:	./dataset/task3_train_fixed.txt
Train-Validation split: 0.85
Feature vector representation: Dense.
Ranking method:	LambdaMART
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@20
Test metric:	NDCG@20
Feature normalization: No
Model file: ./models/LambdaMART

[+] LambdaMART's Parameters:
No. of trees: 1000
No. of leaves: 10
No. of threshold candidates: 256
Min leaf support: 1
Learning rate: 0.1
Stop early: 100 rounds without performance gain on validation data

Reading feature file [./dataset/task3_train_fixed.txt]... [Done.]            
(7033 ranked lists, 75057 entries read)
Initializing... [Done]
---------------------------------
Training starts...
---------------------------------
#iter   | NDCG@20-T | NDCG@20-V | 
---------------------------------
1       | 0.7139    | 0.7176    | 
2       | 0.7167    | 0.7181    | 
3       | 0.715     | 0.7164    | 
4       | 0.718     | 0.7196    | 
5       | 0.7184    | 0.

445     | 0.7842    | 0.758     | 
446     | 0.7843    | 0.758     | 
447     | 0.7844    | 0.758     | 
448     | 0.7845    | 0.7576    | 
449     | 0.7845    | 0.7576    | 
450     | 0.7846    | 0.7573    | 
451     | 0.7847    | 0.7574    | 
452     | 0.7848    | 0.7575    | 
453     | 0.7847    | 0.7577    | 
454     | 0.7848    | 0.7578    | 
455     | 0.785     | 0.7577    | 
456     | 0.7851    | 0.7574    | 
457     | 0.7852    | 0.7576    | 
458     | 0.7851    | 0.7578    | 
459     | 0.7851    | 0.7576    | 
460     | 0.7853    | 0.7572    | 
461     | 0.7851    | 0.7572    | 
462     | 0.7852    | 0.7572    | 
463     | 0.7853    | 0.7572    | 
464     | 0.7854    | 0.7573    | 
465     | 0.7856    | 0.757     | 
466     | 0.7856    | 0.7571    | 
467     | 0.7855    | 0.7567    | 
468     | 0.7856    | 0.7567    | 
469     | 0.7856    | 0.7568    | 
470     | 0.7856    | 0.7566    | 
471     | 0.7857    | 0.7562    | 
472     | 0.7858    | 0.7565    | 
473     | 0.7857    

Let's try Coordinate Ascent.

In [4]:
# Train Coordinate Ascent (-ranker 4) with the same 85/15 split and NDCG@20 metric;
# the model is saved to ./models/Coordinate_Ascent_TV.
!  java -jar RankLib-2.12.jar -tvs 0.85 -metric2t NDCG@20 -ranker 4 -train './dataset/task3_train_fixed.txt' -save './models/Coordinate_Ascent_TV'


Discard orig. features
Training data:	./dataset/task3_train_fixed.txt
Train-Validation split: 0.85
Feature vector representation: Dense.
Ranking method:	Coordinate Ascent
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@20
Test metric:	NDCG@20
Feature normalization: No
Model file: ./models/Coordinate_Ascent_TV

[+] Coordinate Ascent's Parameters:
No. of random restarts: 5
No. of iterations to search in each direction: 25
Tolerance: 0.001
Regularization: No

Reading feature file [./dataset/task3_train_fixed.txt]... [Done.]            
(7033 ranked lists, 75057 entries read)
Initializing... [Done]
---------------------------
Training starts...
---------------------------
[+] Random restart #1/5...
Shuffling features' order... [Done.]
Optimizing weight vector... 
------------------------------
Feature | weight   | NDCG@20 | 
------------------------------
226     | +0.0051  | 0.694   | 
226     | +0.0071  | 0.6941  | 
226     | +0.0111  | 0.6945  | 
22

19      | +0.0173  | 0.7048  | 
19      | +0.0333  | 0.7048  | 
19      | +0.0653  | 0.7049  | 
20      | +0.0032  | 0.705   | 
129     | +0.0032  | 0.705   | 
129     | +0.0052  | 0.7054  | 
129     | +0.0092  | 0.7056  | 
129     | +0.0172  | 0.7061  | 
129     | +0.0332  | 0.7071  | 
129     | +0.0652  | 0.7076  | 
129     | +0.1292  | 0.7085  | 
201     | +0.3926  | 0.7085  | 
201     | +0.7834  | 0.7088  | 
201     | +1.565   | 0.7093  | 
201     | +3.1281  | 0.7095  | 
201     | +6.2544  | 0.7098  | 
12      | +3.0E-4  | 0.7098  | 
12      | +5.0E-4  | 0.7099  | 
12      | +0.0011  | 0.7099  | 
12      | +0.0019  | 0.71    | 
37      | +3.0E-4  | 0.71    | 
37      | +4.0E-4  | 0.71    | 
37      | +7.0E-4  | 0.71    | 
37      | +0.0011  | 0.71    | 
37      | +0.0019  | 0.7102  | 
37      | +0.0036  | 0.7104  | 
37      | +0.007   | 0.7104  | 
92      | +3.0E-4  | 0.7104  | 
92      | +3.0E-4  | 0.7104  | 
92      | +4.0E-4  | 0.7105  | 
99      | +3.0E-4  | 0.7105  | 
99      

182     | +0.0     | 0.7329  | 
182     | +0.0     | 0.7329  | 
2       | +0.0     | 0.7329  | 
2       | +0.0     | 0.7329  | 
2       | +0.0     | 0.7329  | 
2       | +0.0     | 0.7331  | 
2       | +0.0     | 0.7331  | 
2       | +1.0E-4  | 0.7333  | 
196     | +0.0     | 0.7333  | 
196     | +0.0     | 0.7333  | 
81      | +0.0     | 0.7333  | 
81      | +0.0     | 0.7333  | 
81      | +0.0     | 0.7333  | 
81      | +0.0     | 0.7334  | 
172     | +0.0     | 0.7334  | 
172     | +0.0     | 0.7334  | 
60      | +0.0     | 0.7334  | 
60      | +0.0     | 0.7334  | 
60      | +0.0     | 0.7334  | 
15      | +0.0     | 0.7334  | 
15      | +0.0     | 0.7334  | 
15      | +0.0     | 0.7335  | 
198     | +0.0     | 0.7335  | 
198     | +0.0     | 0.7335  | 
198     | +0.0     | 0.7335  | 
222     | +0.0     | 0.7335  | 
108     | +0.0     | 0.7335  | 
108     | +0.0     | 0.7335  | 
4       | +1.0E-4  | 0.7338  | 
4       | +1.0E-4  | 0.7339  | 
------------------------------
Shuffling

183     | +0.0     | 0.7277  | 
191     | +5.0E-4  | 0.7278  | 
215     | +0.0     | 0.7278  | 
114     | +0.0     | 0.7278  | 
114     | +0.0     | 0.7278  | 
114     | +0.0     | 0.7278  | 
25      | +0.0     | 0.7278  | 
25      | +0.0     | 0.7278  | 
25      | +0.0     | 0.7278  | 
157     | +0.0     | 0.7278  | 
157     | +0.0     | 0.7278  | 
157     | +0.0     | 0.7279  | 
157     | +1.0E-4  | 0.7279  | 
157     | +2.0E-4  | 0.728   | 
157     | +5.0E-4  | 0.7282  | 
157     | +0.001   | 0.7286  | 
157     | +0.0019  | 0.7287  | 
157     | +0.0038  | 0.7289  | 
74      | +0.0     | 0.7289  | 
74      | +0.0     | 0.7289  | 
74      | +0.0     | 0.7289  | 
74      | +0.0     | 0.7289  | 
74      | +0.0     | 0.7289  | 
136     | +0.0     | 0.7289  | 
136     | +0.0     | 0.7289  | 
239     | +0.0     | 0.729   | 
239     | +1.0E-4  | 0.729   | 
239     | +1.0E-4  | 0.7293  | 
239     | +2.0E-4  | 0.7296  | 
239     | +5.0E-4  | 0.7301  | 
239     | +0.001   | 0.7303  | 
239     

189     | +0.7467  | 0.7245  | 
35      | +0.0     | 0.7245  | 
35      | +0.0     | 0.7245  | 
35      | +0.0     | 0.7245  | 
35      | +0.0     | 0.7246  | 
244     | +0.0     | 0.7246  | 
244     | +0.0     | 0.7246  | 
173     | +0.0     | 0.7246  | 
173     | +0.0     | 0.7246  | 
136     | +0.0     | 0.7246  | 
136     | +0.0     | 0.7246  | 
136     | +0.0     | 0.7247  | 
136     | +0.0     | 0.7247  | 
136     | +0.0     | 0.7247  | 
136     | +0.0     | 0.7248  | 
46      | +0.0     | 0.7248  | 
46      | +0.0     | 0.7248  | 
46      | +0.0     | 0.7248  | 
123     | +1.0E-4  | 0.7248  | 
123     | +4.0E-4  | 0.7249  | 
123     | +0.0134  | 0.7249  | 
81      | +0.0     | 0.7249  | 
81      | +0.0     | 0.7249  | 
81      | +0.0     | 0.7249  | 
81      | +0.0     | 0.7249  | 
155     | +0.0     | 0.7249  | 
155     | +0.0     | 0.7249  | 
156     | +0.0     | 0.7249  | 
156     | +0.0     | 0.7249  | 
149     | +0.0     | 0.7249  | 
149     | +0.0     | 0.725   | 
149     

141     | +0.0     | 0.7327  | 
141     | +0.0     | 0.7327  | 
63      | +3.0E-4  | 0.7328  | 
169     | +0.0     | 0.7328  | 
177     | +0.0     | 0.7328  | 
119     | +0.0     | 0.7328  | 
181     | +0.0     | 0.7328  | 
181     | +0.0     | 0.7328  | 
181     | +0.0     | 0.7328  | 
181     | +0.0     | 0.7328  | 
181     | +0.0     | 0.7328  | 
181     | +0.0     | 0.7328  | 
181     | +1.0E-4  | 0.7328  | 
20      | +0.0     | 0.7328  | 
20      | +0.0     | 0.7328  | 
57      | +0.0     | 0.7328  | 
57      | +0.0     | 0.7328  | 
57      | +0.0     | 0.7328  | 
57      | +0.0     | 0.7328  | 
57      | +1.0E-4  | 0.7329  | 
192     | +0.0     | 0.7329  | 
192     | +0.0     | 0.7329  | 
13      | +0.0     | 0.733   | 
13      | +1.0E-4  | 0.7331  | 
159     | +0.0     | 0.7331  | 
159     | +0.0     | 0.7331  | 
159     | +0.0     | 0.7331  | 
159     | +1.0E-4  | 0.7331  | 
159     | +1.0E-4  | 0.7332  | 
159     | +3.0E-4  | 0.7332  | 
151     | +0.0     | 0.7332  | 
33      

129     | +1.0E-4  | 0.7267  | 
89      | +0.0     | 0.7267  | 
141     | +0.0     | 0.7268  | 
141     | +0.0     | 0.7268  | 
109     | +0.0     | 0.7269  | 
109     | +0.0     | 0.7269  | 
109     | +0.0     | 0.7269  | 
109     | +1.0E-4  | 0.7269  | 
109     | +9.0E-4  | 0.727   | 
29      | +0.0     | 0.727   | 
29      | +0.0     | 0.727   | 
29      | +0.0     | 0.727   | 
29      | +0.0     | 0.7272  | 
3       | +1.0E-4  | 0.7273  | 
3       | +2.0E-4  | 0.7275  | 
3       | +4.0E-4  | 0.7276  | 
139     | +0.0     | 0.7276  | 
139     | +0.0     | 0.7276  | 
124     | +0.0     | 0.7276  | 
124     | +0.0     | 0.7276  | 
124     | +0.0     | 0.7277  | 
124     | +0.0     | 0.7278  | 
32      | +0.0     | 0.7278  | 
32      | +0.0     | 0.7278  | 
32      | +0.0     | 0.7278  | 
32      | +0.0     | 0.7278  | 
32      | +0.0     | 0.7278  | 
32      | +1.0E-4  | 0.7279  | 
192     | +0.0     | 0.7279  | 
192     | +0.0     | 0.7279  | 
192     | +0.0     | 0.7279  | 
192     

67      | +0.0     | 0.7342  | 
67      | +0.0     | 0.7343  | 
67      | +0.0     | 0.7343  | 
156     | +0.0     | 0.7343  | 
39      | +0.0     | 0.7343  | 
161     | +1.0E-4  | 0.7344  | 
243     | +2.0E-4  | 0.7344  | 
243     | +9.0E-4  | 0.7344  | 
243     | +0.0018  | 0.7344  | 
158     | +0.0     | 0.7344  | 
158     | +0.0     | 0.7344  | 
230     | +0.0     | 0.7344  | 
123     | +0.0     | 0.7344  | 
121     | +0.0     | 0.7344  | 
133     | +0.0     | 0.7345  | 
224     | +1.0E-4  | 0.7346  | 
12      | +0.0     | 0.7346  | 
------------------------------
Shuffling features' order... [Done.]
Optimizing weight vector... 
------------------------------
Feature | weight   | NDCG@20 | 
------------------------------
------------------------------
---------------------------------
Finished sucessfully.
NDCG@20 on training data: 0.7339
NDCG@20 on validation data: 0.742
---------------------------------

Model saved to: ./models/Coordinate_Ascent_TV


Let's try ListNet ranker (which is listwise).

In [9]:
# Train ListNet (-ranker 7) with the same 85/15 split and NDCG@20 metric;
# the model is saved to ./models/ListNet_TV.
!  java -jar RankLib-2.12.jar -tvs 0.85 -metric2t NDCG@20 -ranker 7 -train './dataset/task3_train_fixed.txt' -save './models/ListNet_TV'


Discard orig. features
Training data:	./dataset/task3_train_fixed.txt
Train-Validation split: 0.85
Feature vector representation: Dense.
Ranking method:	ListNet
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@20
Test metric:	NDCG@20
Feature normalization: No
Model file: ./models/ListNet_TV

[+] ListNet's Parameters:
No. of epochs: 1500
Learning rate: 1.0E-5

Reading feature file [./dataset/task3_train_fixed.txt]... [Done.]            
(7033 ranked lists, 75057 entries read)
Initializing... [Done]
-----------------------------------------
Training starts...
--------------------------------------------------
#epoch  | C.E. Loss      | NDCG@20-T | NDCG@20-V | 
--------------------------------------------------
1       | 0.0            | 0.6345    | 0.6319    | 
2       | 0.0            | 0.636     | 0.6335    | 
3       | 0.0            | 0.6372    | 0.6352    | 
4       | 0.0            | 0.6388    | 0.6375    | 
5       | 0.0            | 0.6407    

302     | 0.0            | 0.7206    | 0.7277    | 
303     | 0.0            | 0.7206    | 0.7276    | 
304     | 0.0            | 0.7208    | 0.7276    | 
305     | 0.0            | 0.7208    | 0.7276    | 
306     | 0.0            | 0.7208    | 0.7277    | 
307     | 0.0            | 0.7208    | 0.7277    | 
308     | 0.0            | 0.7208    | 0.7277    | 
309     | 0.0            | 0.7208    | 0.7277    | 
310     | 0.0            | 0.7208    | 0.7277    | 
311     | 0.0            | 0.7208    | 0.7277    | 
312     | 0.0            | 0.7208    | 0.7277    | 
313     | 0.0            | 0.7208    | 0.7277    | 
314     | 0.0            | 0.7208    | 0.7277    | 
315     | 0.0            | 0.7208    | 0.7277    | 
316     | 0.0            | 0.7208    | 0.7277    | 
317     | 0.0            | 0.7208    | 0.7276    | 
318     | 0.0            | 0.7209    | 0.7277    | 
319     | 0.0            | 0.7209    | 0.7277    | 
320     | 0.0            | 0.7209    | 0.7277    | 
321     | 0.

460     | 0.0            | 0.7224    | 0.7296    | 
461     | 0.0            | 0.7224    | 0.7297    | 
462     | 0.0            | 0.7223    | 0.7297    | 
463     | 0.0            | 0.7223    | 0.7297    | 
464     | 0.0            | 0.7223    | 0.7296    | 
465     | 0.0            | 0.7224    | 0.7297    | 
466     | 0.0            | 0.7224    | 0.7297    | 
467     | 0.0            | 0.7223    | 0.7297    | 
468     | 0.0            | 0.7223    | 0.7298    | 
469     | 0.0            | 0.7223    | 0.7298    | 
470     | 0.0            | 0.7222    | 0.7298    | 
471     | 0.0            | 0.7223    | 0.7298    | 
472     | 0.0            | 0.7222    | 0.7298    | 
473     | 0.0            | 0.7222    | 0.7298    | 
474     | 0.0            | 0.7222    | 0.7298    | 
475     | 0.0            | 0.7222    | 0.73      | 
476     | 0.0            | 0.7222    | 0.73      | 
477     | 0.0            | 0.7222    | 0.7299    | 
478     | 0.0            | 0.7222    | 0.7299    | 
479     | 0.

618     | 0.0            | 0.7226    | 0.7314    | 
619     | 0.0            | 0.7226    | 0.7314    | 
620     | 0.0            | 0.7226    | 0.7314    | 
621     | 0.0            | 0.7226    | 0.7314    | 
622     | 0.0            | 0.7226    | 0.7314    | 
623     | 0.0            | 0.7226    | 0.7313    | 
624     | 0.0            | 0.7226    | 0.7313    | 
625     | 0.0            | 0.7226    | 0.7313    | 
626     | 0.0            | 0.7226    | 0.7314    | 
627     | 0.0            | 0.7226    | 0.7314    | 
628     | 0.0            | 0.7226    | 0.7314    | 
629     | 0.0            | 0.7226    | 0.7315    | 
630     | 0.0            | 0.7226    | 0.7315    | 
631     | 0.0            | 0.7225    | 0.7315    | 
632     | 0.0            | 0.7225    | 0.7316    | 
633     | 0.0            | 0.7226    | 0.7316    | 
634     | 0.0            | 0.7226    | 0.7316    | 
635     | 0.0            | 0.7225    | 0.7316    | 
636     | 0.0            | 0.7225    | 0.7316    | 
637     | 0.

776     | 0.0            | 0.723     | 0.7322    | 
777     | 0.0            | 0.723     | 0.7322    | 
778     | 0.0            | 0.723     | 0.7321    | 
779     | 0.0            | 0.7229    | 0.7321    | 
780     | 0.0            | 0.7229    | 0.7321    | 
781     | 0.0            | 0.723     | 0.7321    | 
782     | 0.0            | 0.7229    | 0.7319    | 
783     | 0.0            | 0.7229    | 0.7319    | 
784     | 0.0            | 0.7229    | 0.7319    | 
785     | 0.0            | 0.7229    | 0.7319    | 
786     | 0.0            | 0.7229    | 0.732     | 
787     | 0.0            | 0.7229    | 0.7319    | 
788     | 0.0            | 0.7229    | 0.7319    | 
789     | 0.0            | 0.7229    | 0.7319    | 
790     | 0.0            | 0.7229    | 0.732     | 
791     | 0.0            | 0.7229    | 0.732     | 
792     | 0.0            | 0.7229    | 0.732     | 
793     | 0.0            | 0.7228    | 0.732     | 
794     | 0.0            | 0.7228    | 0.732     | 
795     | 0.

934     | 0.0            | 0.7228    | 0.7316    | 
935     | 0.0            | 0.7228    | 0.7316    | 
936     | 0.0            | 0.7228    | 0.7316    | 
937     | 0.0            | 0.7228    | 0.7314    | 
938     | 0.0            | 0.7228    | 0.7314    | 
939     | 0.0            | 0.7228    | 0.7314    | 
940     | 0.0            | 0.7228    | 0.7313    | 
941     | 0.0            | 0.7228    | 0.7313    | 
942     | 0.0            | 0.7228    | 0.7313    | 
943     | 0.0            | 0.7228    | 0.7313    | 
944     | 0.0            | 0.7228    | 0.7313    | 
945     | 0.0            | 0.7228    | 0.7313    | 
946     | 0.0            | 0.7229    | 0.7312    | 
947     | 0.0            | 0.7229    | 0.7311    | 
948     | 0.0            | 0.7229    | 0.7311    | 
949     | 0.0            | 0.7229    | 0.7311    | 
950     | 0.0            | 0.7229    | 0.7311    | 
951     | 0.0            | 0.7229    | 0.7311    | 
952     | 0.0            | 0.7228    | 0.7311    | 
953     | 0.

1092    | 0.0            | 0.7226    | 0.7315    | 
1093    | 0.0            | 0.7226    | 0.7315    | 
1094    | 0.0            | 0.7226    | 0.7315    | 
1095    | 0.0            | 0.7226    | 0.7315    | 
1096    | 0.0            | 0.7226    | 0.7315    | 
1097    | 0.0            | 0.7226    | 0.7315    | 
1098    | 0.0            | 0.7226    | 0.7315    | 
1099    | 0.0            | 0.7226    | 0.7315    | 
1100    | 0.0            | 0.7226    | 0.7315    | 
1101    | 0.0            | 0.7226    | 0.7315    | 
1102    | 0.0            | 0.7226    | 0.7315    | 
1103    | 0.0            | 0.7226    | 0.7315    | 
1104    | 0.0            | 0.7226    | 0.7315    | 
1105    | 0.0            | 0.7226    | 0.7313    | 
1106    | 0.0            | 0.7226    | 0.7313    | 
1107    | 0.0            | 0.7226    | 0.7313    | 
1108    | 0.0            | 0.7226    | 0.7313    | 
1109    | 0.0            | 0.7226    | 0.7313    | 
1110    | 0.0            | 0.7226    | 0.7313    | 
1111    | 0.

1250    | 0.0            | 0.7224    | 0.7311    | 
1251    | 0.0            | 0.7224    | 0.7311    | 
1252    | 0.0            | 0.7224    | 0.7311    | 
1253    | 0.0            | 0.7224    | 0.7311    | 
1254    | 0.0            | 0.7224    | 0.7311    | 
1255    | 0.0            | 0.7224    | 0.7311    | 
1256    | 0.0            | 0.7224    | 0.7311    | 
1257    | 0.0            | 0.7224    | 0.7311    | 
1258    | 0.0            | 0.7224    | 0.7311    | 
1259    | 0.0            | 0.7224    | 0.7311    | 
1260    | 0.0            | 0.7224    | 0.7311    | 
1261    | 0.0            | 0.7225    | 0.7311    | 
1262    | 0.0            | 0.7225    | 0.7311    | 
1263    | 0.0            | 0.7225    | 0.7311    | 
1264    | 0.0            | 0.7225    | 0.7311    | 
1265    | 0.0            | 0.7225    | 0.7311    | 
1266    | 0.0            | 0.7224    | 0.7311    | 
1267    | 0.0            | 0.7224    | 0.7311    | 
1268    | 0.0            | 0.7224    | 0.7311    | 
1269    | 0.

1408    | 0.0            | 0.7223    | 0.7307    | 
1409    | 0.0            | 0.7223    | 0.7307    | 
1410    | 0.0            | 0.7223    | 0.7307    | 
1411    | 0.0            | 0.7222    | 0.7307    | 
1412    | 0.0            | 0.7223    | 0.7307    | 
1413    | 0.0            | 0.7223    | 0.7307    | 
1414    | 0.0            | 0.7223    | 0.7307    | 
1415    | 0.0            | 0.7223    | 0.7307    | 
1416    | 0.0            | 0.7223    | 0.7307    | 
1417    | 0.0            | 0.7223    | 0.7307    | 
1418    | 0.0            | 0.7223    | 0.7307    | 
1419    | 0.0            | 0.7223    | 0.7307    | 
1420    | 0.0            | 0.7223    | 0.7307    | 
1421    | 0.0            | 0.7223    | 0.7307    | 
1422    | 0.0            | 0.7223    | 0.7307    | 
1423    | 0.0            | 0.7223    | 0.7307    | 
1424    | 0.0            | 0.7223    | 0.7307    | 
1425    | 0.0            | 0.7223    | 0.7307    | 
1426    | 0.0            | 0.7223    | 0.7307    | 
1427    | 0.

Let's try linear regression ranker (which is pointwise).

In [11]:
# Train Linear Regression (-ranker 9) with the same 85/15 split and NDCG@20 metric;
# the model is saved to ./models/Linear_Regression_TV.
!  java -jar RankLib-2.12.jar -tvs 0.85 -metric2t NDCG@20 -ranker 9 -train './dataset/task3_train_fixed.txt' -save './models/Linear_Regression_TV'


Discard orig. features
Training data:	./dataset/task3_train_fixed.txt
Train-Validation split: 0.85
Feature vector representation: Dense.
Ranking method:	Linear Regression
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@20
Test metric:	NDCG@20
Feature normalization: No
Model file: ./models/Linear_Regression_TV

[+] Linear Regression's Parameters:
L2-norm regularization: lambda = 1.0E-10

Reading feature file [./dataset/task3_train_fixed.txt]... [Done.]            
(7033 ranked lists, 75057 entries read)
Initializing... [Done]
--------------------------------
Training starts...
--------------------------------
Learning the least square model... [Done]
---------------------------------
Finished sucessfully.
NDCG@20 on training data: 0.7026
NDCG@20 on validation data: 0.7026
---------------------------------

Model saved to: ./models/Linear_Regression_TV


As we can see, the pairwise rankers perform better than the listwise rankers, which in turn perform better than the pointwise ranker.

In [12]:
# Summary of the experiments above: one entry per ranker, with NDCG@20 on the
# training and validation parts. The numbers are transcribed by hand from the
# RankLib logs of the training cells (e.g. the "Finished sucessfully" summaries of
# Coordinate Ascent and Linear Regression), so update them after re-training.
results = {'Ranker' : ['MART', 'RankNet', 'RankBoost', 'ListNet', 'AdaRank', 'Coordinate Ascent', 'LambdaMART', 'Linear Regression'],
           'Method' : ['Pairwise', 'Pairwise', 'Pairwise', 'Listwise', 'Listwise', 'Listwise', 'Listwise', 'Pointwise'],
           'NDCG@20 train' : [0.7776, 0.7099, 0.721, 0.7229, 0.705, 0.7339, 0.7842, 0.7026],
           'NDCG@20 validation' : [0.7611, 0.7143, 0.7282, 0.7323, 0.7088, 0.742, 0.758, 0.7026]}
results_dataframe = pd.DataFrame(results)

# Shade the numeric columns with a green gradient; axis=0 scales each column
# independently, so the darkest cell in a column marks the best ranker for it.
cm = sns.light_palette("green", as_cmap=True)
display(results_dataframe.style.background_gradient(cmap=cm, axis=0))

Unnamed: 0,Ramker,Method,NDCG@20 train,NDCG@20 validation
0,MART,Pairwise,0.7776,0.7611
1,RankNet,Pairwise,0.7099,0.7143
2,RankBoost,Pairwise,0.721,0.7282
3,ListNet,Listwise,0.7229,0.7323
4,AdaRank,Listwise,0.705,0.7088
5,Coordinate Ascent,Listwise,0.7339,0.742
6,LambdaMART,Listwise,0.7842,0.758
7,Linear Regression,Pointwise,0.7026,0.7026


In [33]:
# Produce the fully-populated test file; the RankLib ranking commands below
# read it via the literal path './dataset/task3_test_fixed.txt'.
fix_dataset(TEST_DATASET_PATH, FIXED_TEST_DATASET_PATH)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [47]:
def add_relevance_to_file(relevance_path, input_path, output_path):
    """Replace the first token of every input line with a value from a ranking file.

    Every non-blank line of ``relevance_path`` is split on single spaces; its
    third token is used as the lookup key and its fourth token as the value
    (presumably the document id and the rank of RankLib's indri output -
    TODO confirm against the files in ``./score``).  Line ``i`` (0-based) of
    ``input_path`` is then written to ``output_path`` with its first
    space-separated token replaced by the value stored under ``str(i)``.

    Args:
        relevance_path: path of the ranking file to read the values from.
        input_path: path of the dataset whose first column is replaced.
        output_path: path of the file to write; it is overwritten.

    Raises:
        IndexError: if a non-blank ranking line has fewer than four tokens.
        KeyError: if the ranking file has no entry for an input line index.
    """
    rels = {}
    with open(relevance_path, 'r') as rel_file:
        for line in rel_file:
            # Tolerate blank lines (e.g. a trailing one) instead of failing on them.
            if not line.strip():
                continue
            # Text mode already normalises line endings to '\n', so os.linesep
            # must not be used for stripping here.
            tokens = line.rstrip('\r\n').split(' ')
            rels[tokens[2]] = tokens[3]

    with open(input_path, 'r') as input_file:
        input_lines = input_file.readlines()

    with open(output_path, 'w') as output_file:
        # Passing the total lets the progress bar show real progress.
        for ind, line in tqdm(enumerate(input_lines), total=len(input_lines)):
            remainder = ' '.join(line.rstrip('\r\n').split(' ')[1:])
            # Write '\n': text mode translates it to the platform line ending.
            output_file.write('{} '.format(rels[str(ind)]) + remainder + '\n')

Let's generate output files for all the models saved above.

In [59]:
# Load the saved AdaRank model, rank the fixed test set and write the ranking to ./score/AdaRank_TV.
!  java -jar RankLib-2.12.jar -load './models/AdaRank_TV' -rank './dataset/task3_test_fixed.txt' -indri './score/AdaRank_TV' -metric2t NDCG@20


Discard orig. features
Model file:	./models/AdaRank_TV
Feature normalization: No
Model:		AdaRank
Reading feature file [./dataset/task3_test_fixed.txt]... [Done.]            
(2091 ranked lists, 22233 entries read)


In [50]:
# Load the saved Coordinate Ascent model, rank the fixed test set and write the ranking to ./score/Coordinate_Ascent_TV.
!  java -jar RankLib-2.12.jar -load './models/Coordinate_Ascent_TV' -rank './dataset/task3_test_fixed.txt' -indri './score/Coordinate_Ascent_TV' -metric2t NDCG@20


Discard orig. features
Model file:	./models/Coordinate_Ascent_TV
Feature normalization: No
Model:		Coordinate Ascent
Reading feature file [./dataset/task3_test_fixed.txt]... [Done.]            
(2091 ranked lists, 22233 entries read)


In [51]:
# Load the saved LambdaMART model, rank the fixed test set and write the ranking to ./score/LambdaMART.
# NOTE(review): unlike the other models this path has no _TV suffix - confirm it matches the name the model was saved under.
!  java -jar RankLib-2.12.jar -load './models/LambdaMART' -rank './dataset/task3_test_fixed.txt' -indri './score/LambdaMART' -metric2t NDCG@20


Discard orig. features
Model file:	./models/LambdaMART
Feature normalization: No
Model:		LambdaMART
Reading feature file [./dataset/task3_test_fixed.txt]... [Done.]            
(2091 ranked lists, 22233 entries read)


In [52]:
# Load the saved Linear Regression model, rank the fixed test set and write the ranking to ./score/Linear_Regression_TV.
!  java -jar RankLib-2.12.jar -load './models/Linear_Regression_TV' -rank './dataset/task3_test_fixed.txt' -indri './score/Linear_Regression_TV' -metric2t NDCG@20


Discard orig. features
Model file:	./models/Linear_Regression_TV
Feature normalization: No
Model:		Linear Regression
Reading feature file [./dataset/task3_test_fixed.txt]... [Done.]            
(2091 ranked lists, 22233 entries read)


In [53]:
# Load the saved ListNet model, rank the fixed test set and write the ranking to ./score/ListNet_TV.
!  java -jar RankLib-2.12.jar -load './models/ListNet_TV' -rank './dataset/task3_test_fixed.txt' -indri './score/ListNet_TV' -metric2t NDCG@20


Discard orig. features
Model file:	./models/ListNet_TV
Feature normalization: No
Model:		ListNet
Reading feature file [./dataset/task3_test_fixed.txt]... [Done.]            
(2091 ranked lists, 22233 entries read)


In [54]:
# Load the saved MART model, rank the fixed test set and write the ranking to ./score/MART_TV.
!  java -jar RankLib-2.12.jar -load './models/MART_TV' -rank './dataset/task3_test_fixed.txt' -indri './score/MART_TV' -metric2t NDCG@20


Discard orig. features
Model file:	./models/MART_TV
Feature normalization: No
Model:		MART
Reading feature file [./dataset/task3_test_fixed.txt]... [Done.]            
(2091 ranked lists, 22233 entries read)


In [55]:
# Load the saved Random Forests model, rank the fixed test set and write the ranking to ./score/Random_Forests_TV.
!  java -jar RankLib-2.12.jar -load './models/Random_Forests_TV' -rank './dataset/task3_test_fixed.txt' -indri './score/Random_Forests_TV' -metric2t NDCG@20


Discard orig. features
Model file:	./models/Random_Forests_TV
Feature normalization: No
Model:		Random Forests
Reading feature file [./dataset/task3_test_fixed.txt]... [Done.]            
(2091 ranked lists, 22233 entries read)


In [56]:
# Load the saved RankBoost model, rank the fixed test set and write the ranking to ./score/RankBoost_TV.
!  java -jar RankLib-2.12.jar -load './models/RankBoost_TV' -rank './dataset/task3_test_fixed.txt' -indri './score/RankBoost_TV' -metric2t NDCG@20


Discard orig. features
Model file:	./models/RankBoost_TV
Feature normalization: No
Model:		RankBoost
Reading feature file [./dataset/task3_test_fixed.txt]... [Done.]            
(2091 ranked lists, 22233 entries read)


In [57]:
# Load the saved RankNet model, rank the fixed test set and write the ranking to ./score/RankNet_TV.
!  java -jar RankLib-2.12.jar -load './models/RankNet_TV' -rank './dataset/task3_test_fixed.txt' -indri './score/RankNet_TV' -metric2t NDCG@20


Discard orig. features
Model file:	./models/RankNet_TV
Feature normalization: No
Model:		RankNet
Reading feature file [./dataset/task3_test_fixed.txt]... [Done.]            
(2091 ranked lists, 22233 entries read)


In [63]:
# Make sure the results directory exists so that a fresh checkout does not
# fail when the first result file is opened for writing.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# For every ranking file in ./score, write a copy of the raw test set whose
# first column is replaced by the value taken from that ranking.
for model in ['AdaRank_TV', 'Coordinate_Ascent_TV', 'LambdaMART', 'Linear_Regression_TV', 'ListNet_TV', 'MART_TV', 'Random_Forests_TV', 'RankBoost_TV', 'RankNet_TV']:
    # Drop the '_TV' suffix when present; 'LambdaMART' has none and is kept as is.
    output_name = model[:-len('_TV')] if model.endswith('_TV') else model
    print('Saving results for model \'{}\''.format(output_name))
    add_relevance_to_file('./score/{}'.format(model), TEST_DATASET_PATH, os.path.join(OUTPUT_PATH, '{}.txt'.format(output_name)))

Saving results for model 'AdaRank'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Saving results for model 'Coordinate_Ascent'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Saving results for model 'LambdaMART'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Saving results for model 'Linear_Regression'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Saving results for model 'ListNet'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Saving results for model 'MART'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Saving results for model 'Random_Forests'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Saving results for model 'RankBoost'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Saving results for model 'RankNet'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


