LICENSE

This notebook uses Ruey-Cheng Chen's AdaRank library to train and test an AdaRank model:
https://github.com/rueycheng/AdaRank

The model is trained on a training file in svmlight format and evaluated on a separate test file in the same format.

In [22]:
import helper

from adarank_lib.adarank import AdaRank
from adarank_lib.metrics import NDCGScorer, APScorer, PScorer
from adarank_lib.utils import load_docno, print_ranking

import os

from sklearn.datasets import load_svmlight_file

In [2]:
# Paths to the two Excel workbooks used as input (original and extended dataset).
extended_excel_doc = "../data/Loinc/extended_loinc_dataset-v2.xlsx"
excel_doc = "../data/Loinc/loinc_dataset_labels-v2.xlsx"

In [3]:
# Read each workbook into a dataframe through the project helper
# (original dataset first, extended dataset second).
df, extended_df = (
    helper.excel_to_df(excel_doc),
    helper.excel_to_df(extended_excel_doc),
)

In [4]:
# Convert both dataframes to svmlight files. Each helper call yields three
# values, unpacked here as (full file, train file, test file).
(
    (file, train_file, test_file),
    (extended_file, extended_train_file, extended_test_file),
) = (
    helper.df_to_svmlight_files(df),
    helper.df_to_svmlight_files(extended_df),
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laura\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laura\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
def get_adarank_score_and_ranking(excel_doc, sc="NDCGScorer"):
    """Train AdaRank on one dataset, print its test scores and the predicted ranking.

    Parameters
    ----------
    excel_doc : str
        Workbook path identifying the dataset. When it equals the module-level
        ``extended_excel_doc`` the extended train/test svmlight files are used;
        any other value selects the original ``train_file`` / ``test_file``.
    sc : str, optional
        Name of the metric to optimise during training and to report on the
        test set: ``"NDCGScorer"`` (default), ``"APScorer"`` or ``"PScorer"``.

    Returns
    -------
    None
        The scores and the ranking are printed, not returned.

    Raises
    ------
    ValueError
        If ``sc`` is not one of the supported scorer names. Previously an
        unknown name fell through every branch and crashed later on the
        unassigned ``model`` variable.
    """
    # Scorer used as the training objective, keyed by its public name.
    # Lambdas defer construction so that only the requested scorer is built.
    training_scorers = {
        "NDCGScorer": lambda: NDCGScorer(k=10),
        "APScorer": lambda: APScorer(),
        "PScorer": lambda: PScorer(),
    }
    if sc not in training_scorers:
        raise ValueError(
            "Unknown scorer {!r}; expected one of: {}".format(
                sc, ", ".join(sorted(training_scorers))
            )
        )

    # Pick the svmlight files that belong to the requested dataset.
    if excel_doc == extended_excel_doc:
        tr_file = extended_train_file
        tst_file = extended_test_file
    else:
        tr_file = train_file
        tst_file = test_file

    X_train, y_train, qid_train = load_svmlight_file(tr_file, query_id=True)
    X_test, y_test, qid_test = load_svmlight_file(tst_file, query_id=True)

    # Run AdaRank for at most 100 iterations, optimising the chosen metric
    # (NDCG@10 for the NDCG scorer). Training stops early when no improvement
    # is made within the previous 10 iterations.
    model = AdaRank(max_iter=100, estop=10, scorer=training_scorers[sc]())
    model = model.fit(X_train, y_train, qid_train)
    pred = model.predict(X_test, qid_test)

    # Report the mean per-query score on the test set for the chosen metric.
    if sc == "NDCGScorer":
        # NDCG (Normalized Discounted Cumulative Gain) is reported at
        # several cut-offs.
        for k in (1, 2, 3, 4, 5, 10, 20):
            score = NDCGScorer(k=k)(y_test, pred, qid_test).mean()
            print('nDCG@{}\t{}'.format(k, score))
    elif sc == "APScorer":
        score = APScorer()(y_test, pred, qid_test).mean()
        print('AP\t{}'.format(score))
    elif sc == "PScorer":
        score = PScorer()(y_test, pred, qid_test).mean()
        print('P\t{}'.format(score))

    # Print the predicted ranking of the test documents.
    docno = load_docno(tst_file, letor=False)
    print_ranking(qid_test, docno, pred)

In [30]:
# Train and evaluate with the default nDCG scorer on both datasets,
# original first and extended second.
for heading, workbook in (
    ('nDCG scores for original dataset:', excel_doc),
    ('\nnDCG scores for extended dataset:', extended_excel_doc),
):
    print(heading)
    get_adarank_score_and_ranking(workbook)



nDCG scores for original dataset:
nDCG@1	0.0
nDCG@2	0.1289509357448472
nDCG@3	0.1289509357448472
nDCG@4	0.2169736432690442
nDCG@5	0.2481896027351573
nDCG@10	0.34795917795457837
nDCG@20	0.34795917795457837
+-----+-------+------+-------+
| qid | docno | rank | score |
+-----+-------+------+-------+
|  1  |  53   |  1   | 0.568 |
|  1  |   5   |  2   | 0.292 |
|  1  |  27   |  3   | 0.192 |
|  1  |  38   |  4   |  0.0  |
|  1  |  32   |  5   |  0.0  |
|  1  |  33   |  6   |  0.0  |
|  1  |  23   |  7   |  0.0  |
|  1  |  64   |  8   |  0.0  |
|  1  |  44   |  9   |  0.0  |
|  1  |  39   |  10  |  0.0  |
|  1  |  34   |  11  |  0.0  |
|  1  |   8   |  12  |  0.0  |
|  1  |  59   |  13  |  0.0  |
|  1  |  40   |  14  |  0.0  |
|  1  |  50   |  15  |  0.0  |
|  1  |  48   |  16  |  0.0  |
|  1  |  16   |  17  |  0.0  |
|  1  |  51   |  18  |  0.0  |
|  1  |  52   |  19  |  0.0  |
|  1  |  47   |  20  |  0.0  |
|  2  |  50   |  1   | 0.202 |
|  2  |  17   |  2   | 0.192 |
|  2  |  44   |  3  

In [31]:
# Train and evaluate with the average-precision scorer on both datasets,
# original first and extended second.
for heading, workbook in (
    ('AP scores for original dataset:', excel_doc),
    ('\nAP scores for extended dataset:', extended_excel_doc),
):
    print(heading)
    get_adarank_score_and_ranking(workbook, "APScorer")

AP scores for original dataset:
AP	0.25767195767195766
+-----+-------+------+-------+
| qid | docno | rank | score |
+-----+-------+------+-------+
|  1  |   5   |  1   | 0.561 |
|  1  |  27   |  2   | 0.368 |
|  1  |  53   |  3   |  0.0  |
|  1  |  38   |  4   |  0.0  |
|  1  |  32   |  5   |  0.0  |
|  1  |  33   |  6   |  0.0  |
|  1  |  23   |  7   |  0.0  |
|  1  |  64   |  8   |  0.0  |
|  1  |  44   |  9   |  0.0  |
|  1  |  39   |  10  |  0.0  |
|  1  |  34   |  11  |  0.0  |
|  1  |   8   |  12  |  0.0  |
|  1  |  59   |  13  |  0.0  |
|  1  |  40   |  14  |  0.0  |
|  1  |  50   |  15  |  0.0  |
|  1  |  48   |  16  |  0.0  |
|  1  |  16   |  17  |  0.0  |
|  1  |  51   |  18  |  0.0  |
|  1  |  52   |  19  |  0.0  |
|  1  |  47   |  20  |  0.0  |
|  2  |  50   |  1   | 0.419 |
|  2  |  17   |  2   | 0.368 |
|  2  |  44   |  3   | 0.338 |
|  2  |  14   |  4   |  0.0  |
|  2  |  66   |  5   |  0.0  |
|  2  |  38   |  6   |  0.0  |
|  2  |  58   |  7   |  0.0  |
|  2  |  55   |

In [32]:
# Train and evaluate with the precision scorer on both datasets,
# original first and extended second.
for heading, workbook in (
    ('Precision scores for original dataset:', excel_doc),
    ('\nPrecision scores for extended dataset:', extended_excel_doc),
):
    print(heading)
    get_adarank_score_and_ranking(workbook, "PScorer")

Precision scores for original dataset:
P	0.08293460925039872
+-----+-------+------+-------+
| qid | docno | rank | score |
+-----+-------+------+-------+
|  1  |  16   |  1   | 0.043 |
|  1  |  53   |  2   |  0.0  |
|  1  |  38   |  3   |  0.0  |
|  1  |  32   |  4   |  0.0  |
|  1  |  27   |  5   |  0.0  |
|  1  |  33   |  6   |  0.0  |
|  1  |  23   |  7   |  0.0  |
|  1  |   5   |  8   |  0.0  |
|  1  |  64   |  9   |  0.0  |
|  1  |  44   |  10  |  0.0  |
|  1  |  39   |  11  |  0.0  |
|  1  |  34   |  12  |  0.0  |
|  1  |   8   |  13  |  0.0  |
|  1  |  59   |  14  |  0.0  |
|  1  |  40   |  15  |  0.0  |
|  1  |  50   |  16  |  0.0  |
|  1  |  48   |  17  |  0.0  |
|  1  |  51   |  18  |  0.0  |
|  1  |  52   |  19  |  0.0  |
|  1  |  47   |  20  |  0.0  |
|  2  |  10   |  1   | 0.049 |
|  2  |   7   |  2   | 0.041 |
|  2  |  37   |  3   |  0.0  |
|  2  |  14   |  4   |  0.0  |
|  2  |  50   |  5   |  0.0  |
|  2  |  66   |  6   |  0.0  |
|  2  |  38   |  7   |  0.0  |
|  2  |  