In [3]:
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import joblib

In [4]:
def generate_ranklib_commands_low_level(year, file_path=None):
    '''
    Write one RankLib 5-fold cross-validation shell command per hyper-parameter
    combination for the given year.

    Each written line runs RankLib on ``./data/training.for.<year>.txt``, appends
    the last line of its output to ``./cv.results/<year>.low.results.txt`` and then
    appends the ``tree leaf shrinkage mls`` values used, so every result line in
    that file is followed by the parameters that produced it.

    Parameters
    ----------
    year : int or str
        Interpolated into the training-data path, the results path and the
        default command-file path.
    file_path : str, optional
        Where to write the commands. Defaults to
        ``../../commands.for.<year>.low.txt``. The file is overwritten.

    Returns
    -------
    None
    '''
    if file_path is None:
        file_path = "../../commands.for.{}.low.txt".format(year)

    common_command_1 = "java -jar RankLib-2.12.jar -train ./data/training.for.{}.txt -kcv 5 ".format(year)
    common_command_1 += "-ranker 6 -metric2t MAP -metric2T P@10 "
    common_command_2 = " -gmax 2 | tail -n 1 >> ./cv.results/{}.low.results.txt; ".format(year)
    echo_template = '''echo "{} {} {} {}" >> ./cv.results/{}.low.results.txt'''

    # hyper-parameter grid: 18 tree counts x 5 leaf counts x 2 shrinkages x 3 mls values
    tree = range(300, 2100, 100)
    leaf = range(10, 20, 2)
    shrinkage = [0.001, 0.005]
    mls = range(5, 11, 2)

    # same iteration order as four nested loops: tree outermost, mls innermost
    grid = [(a, b, c, d) for a in tree for b in leaf for c in shrinkage for d in mls]

    # the with-block closes the file; no explicit close() is needed afterwards
    with open(file_path, "w") as f:
        for a, b, c, d in grid:
            parameters = "-tree {} -leaf {} -shrinkage {} -mls {} ".format(a, b, c, d)
            command = common_command_1 + parameters + common_command_2
            command += echo_template.format(a, b, c, d, year)
            f.write(command + "\n")

In [5]:
# write the hyper-parameter sweep commands for 2018 to ../../commands.for.2018.low.txt
# (the file is opened in "w" mode, so re-running this cell overwrites it)
generate_ranklib_commands_low_level(2018)

In [6]:
def convert_ranklib_style_low(input_file_path, output_file_path, year):
    '''
    Convert the low level feature CSV to ranklib style.

    Every input row is appended to ``output_file_path`` as one line of the form
    ``<rel> qid:<topicid> 1:<v1> 2:<v2> ... #<docid>``, where the features are
    numbered from 1 in alphabetical order of their column names so that the
    numbering is consistent over years.

    Parameters
    ----------
    input_file_path : str
        CSV file with ``topicid``, ``docid`` and ``year`` columns plus one column
        per feature. A ``rel`` column is required as well, except for
        ``year == 2018`` where it is merged in from the qrel file.
    output_file_path : str
        File to append the converted rows to (opened in append mode, so calling
        the function twice with the same path duplicates the rows).
    year : int
        ``2017``: fit a ``MinMaxScaler`` on the features and save it to
        ``low.features.scaler.pkl``.
        ``2018``: add ``rel`` from the qrel file and ``bm25`` from the basic query
        result file, then scale with the scaler saved by the 2017 run.
        Any other value: features are written unscaled.

    Raises
    ------
    ValueError
        If one of ``topicid``, ``docid``, ``year``, ``rel`` is not a column of the
        frame once the year-specific columns have been added.
    '''
    df = pd.read_csv(input_file_path)

    if year == 2018:

        # in 2018 retrieval result, there is no relevance score
        # we add relevance score to evaluate in Ranklib
        qrel = pd.read_csv("../data/topics/"+str(year)+"qrel.txt",sep=" ",header=None)
        qrel.columns = ['topicid','q0','docid','rel']
        qrel = qrel[['topicid','docid','rel']].astype({'topicid': int, 'docid': str})

        # merge and fill na: pairs without a judgement get rel = 0
        # (fillna(0) is applied to every column, not only to rel)
        df = df.astype({'topicid': int, 'docid': str})
        df = df.merge(qrel, on=["topicid","docid"], how="left")
        df = df.fillna(0)
        df['rel'] = df['rel'].astype(int)

        bm25_score_2018 = pd.read_csv("../scripts/searching/2018.searching/2018.basic.query.result.txt", sep="\t")
        # NOTE(review): the assignment aligns on the row index, i.e. it assumes the
        # result file lists the same (topic, doc) pairs in the same order as the
        # feature CSV and that the merge above added no rows -- confirm
        df['bm25'] = bm25_score_2018['SCORE']

    # extract features: everything that is not a metadata column
    meta_columns = ["topicid", "docid", "year", "rel"]
    missing = [name for name in meta_columns if name not in df.columns]
    if missing:
        raise ValueError("input is missing required column(s): {}".format(", ".join(missing)))
    feature_names = [name for name in df.columns if name not in meta_columns]

    # min-max scaling
    if year == 2017:
        scaler = MinMaxScaler()
        df[feature_names] = scaler.fit_transform(df[feature_names])
        joblib.dump(scaler, "low.features.scaler.pkl")

    if year == 2018:
        # joblib.load unpickles the file: only load a scaler written by the 2017 run
        scaler = joblib.load("low.features.scaler.pkl")
        # The scaler is positional. The 2018 frame gets bm25 appended last, so its
        # column order may differ from the order seen at fit time; when the scaler
        # recorded the fitted column names, present the columns in that order.
        fitted_names = getattr(scaler, "feature_names_in_", None)
        if fitted_names is not None and set(fitted_names) == set(feature_names):
            scale_columns = list(fitted_names)
        else:
            scale_columns = feature_names
        df[scale_columns] = scaler.transform(df[scale_columns])

    n_rows = len(df)
    if n_rows == 0:
        # nothing to append; leave the output file untouched
        return

    # sort the feature names (alphabetic order) once to make sure the feature
    # numbering is consistent over years
    ordered_features = sorted(feature_names)
    feature_columns = [df[name].to_numpy() for name in ordered_features]
    topic_ids = df["topicid"].to_numpy()
    doc_ids = df["docid"].to_numpy()
    rels = df["rel"].to_numpy()

    # open the output once instead of once per row; the with-block closes it
    with open(output_file_path, 'a') as f:
        for pos in range(n_rows):
            # rel, topic, features, docid
            record = "{} ".format(int(rels[pos]))
            record += "qid:{} ".format(int(topic_ids[pos]))
            for feature_no, column in enumerate(feature_columns, start=1):
                record += "{}:{} ".format(feature_no, column[pos])
            record += "#{}".format(str(doc_ids[pos]))
            f.write(record + "\n")

In [7]:
# generate training data from 2017
# passing year=2017 also fits the MinMaxScaler and saves it to low.features.scaler.pkl;
# the output file is opened in append mode, so re-running this cell adds the rows again
convert_ranklib_style_low("./ltr.low/2017.low.features.from.retrieval.csv", "../../training.for.2018.txt", 2017)

In [8]:
# generate testing data from 2018
# passing year=2018 loads low.features.scaler.pkl, so the 2017 cell must have been run first;
# it also reads the 2018 qrel file and the 2018 basic query result file from hard-coded paths
convert_ranklib_style_low("../scripts/reranking/2018.low.features.csv", "../../testing.for.2018.txt", 2018)