<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import sys
import os
import subprocess
import argparse
import math
import numpy as np
import pandas as pd
import multiprocessing
from sklearn.model_selection import KFold
from scipy.stats.mstats import ttest_rel
from scipy.stats import ttest_ind, wilcoxon
from imblearn.over_sampling import RandomOverSampler, SMOTE
from collections import namedtuple
import random 

In [2]:
# One evaluation job:
#   qrelFileName - qrel file handed to trec_eval
#   datasetid    - label used to key the dataset in the result tables
#   resultsFiles - run files to compare, in order
#   nameApproach - set to [] by this notebook; not read by the code shown here
Result = namedtuple(
    'Result',
    'qrelFileName datasetid resultsFiles nameApproach',
)

In [22]:
# NOTE(review): exploratory leftover - prints the RandomOverSampler documentation
# (see the output below). No other cell shown here appears to depend on it, so it
# can probably be dropped before the notebook is shared.
help(RandomOverSampler())

Help on RandomOverSampler in module imblearn.over_sampling.random_over_sampler object:

class RandomOverSampler(imblearn.over_sampling.base.BaseOverSampler)
 |  Class to perform random over-sampling.
 |  
 |  Object to over-sample the minority class(es) by picking samples at random
 |  with replacement.
 |  
 |  Read more in the :ref:`User Guide <random_over_sampler>`.
 |  
 |  Parameters
 |  ----------
 |  ratio : str, dict, or callable, optional (default='auto')
 |      Ratio to use for resampling the data set.
 |  
 |      - If ``str``, has to be one of: (i) ``'minority'``: resample the
 |        minority class; (ii) ``'majority'``: resample the majority class,
 |        (iii) ``'not minority'``: resample all classes apart of the minority
 |        class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``:
 |        correspond to ``'all'`` with for over-sampling methods and ``'not
 |        minority'`` for under-sampling methods. The classes targeted will be
 |        over-sam

In [3]:
def getFileName(qrelFile):
    """Return the last component of the path ``qrelFile`` (its directory part removed)."""
    _directory, file_name = os.path.split(qrelFile)
    return file_name

In [13]:
class SIGTREC_Eval():
    """Score retrieval runs with ``trec_eval`` and compare them with significance tests.

    Typical use in this notebook: ``build_df`` collects the per-query scores,
    ``build_printable`` turns them into one summary table per dataset, and
    ``build_over_sample`` optionally rebalances the score table first.

    Attributes:
        nameApp: dict mapping a dataset id to the list of run-file base names,
            indexed by the position of the run in ``Result.resultsFiles``.
        cv: number of folds assigned by ``build_df`` (0 keeps a single fold).
        seed: seed used for the fold shuffling and for samplers from ``get_sampler``.
        trec_eval: command used to launch ``trec_eval``.
        measures: measure names passed to ``trec_eval -m`` or ``None``.
    """

    def __init__(self, cv=0, seed=42, trec_eval='./trec_eval', measures=None):
        """Store the configuration and seed Python's global ``random`` generator.

        Args:
            cv: number of folds; 0 disables the fold assignment.
            seed: random seed.
            trec_eval: path of (or command for) the ``trec_eval`` executable.
            measures: iterable of measure names for ``trec_eval -m``. When ``None``
                ``build_df`` falls back to a module-level ``measures`` variable,
                which is how the notebook configured it originally.
        """
        self.nameApp = {}
        self.cv = cv
        self.seed = seed
        self.trec_eval = trec_eval
        self.measures = measures
        random.seed(seed)

    def _resolve_measures(self):
        """Return the measures to evaluate, preferring the instance configuration."""
        configured = getattr(self, 'measures', None)
        if configured is None:
            # Backward compatibility: the notebook defines `measures` as a global.
            configured = globals().get('measures')
        if configured is None:
            raise ValueError("no measures configured: pass measures=[...] to SIGTREC_Eval")
        return list(configured)

    def _make_folds(self, n_rows):
        """Return a shuffled list of ``n_rows`` fold ids drawn evenly from ``range(self.cv)``."""
        folds = (list(range(self.cv)) * math.ceil(n_rows / self.cv))[:n_rows]
        random.shuffle(folds)
        return folds

    def build_df(self, results):
        """Run ``trec_eval -q`` for every (dataset, measure, run) and pivot the scores.

        Args:
            results: iterable of ``Result`` tuples.

        Returns:
            DataFrame indexed by ``(qrel, docid)`` whose columns are
            ``(idx_approach, measure)`` pairs holding float scores, plus a ``fold``
            column (all 0 unless ``self.cv > 0``).

        Raises:
            RuntimeError: if ``trec_eval`` exits with a non-zero status.
            ValueError: if no measures are configured.
        """
        import shlex

        measures = self._resolve_measures()
        # Argument list instead of a shell string, so paths containing spaces work.
        executable = shlex.split(self.trec_eval, posix=(os.name != 'nt'))
        raw = []
        for input_result in results:
            approach_names = [getFileName(run_file) for run_file in input_result.resultsFiles]
            self.nameApp[input_result.datasetid] = approach_names
            for m in measures:
                for (idx, to_compare) in enumerate(input_result.resultsFiles):
                    command = executable + [input_result.qrelFileName, to_compare, '-q', '-m', m]
                    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    if completed.returncode != 0:
                        raise RuntimeError('trec_eval failed (exit %d) for %s: %s' % (
                            completed.returncode, to_compare,
                            completed.stderr.decode('utf-8', errors='replace').strip()))
                    # NOTE(review): with -q trec_eval presumably also prints an aggregate
                    # row; it is kept as a regular row here, as before - confirm intent.
                    for line in completed.stdout.decode('utf-8', errors='replace').splitlines():
                        if not line.strip():
                            continue
                        fields = [w.strip() for w in line.split('\t')]
                        raw.append((input_result.datasetid, idx, approach_names[idx], *fields))
        df_raw = pd.DataFrame(raw, columns=['qrel', 'idx_approach', 'approach', 'measure', 'docid', 'result'])
        df_finale = pd.pivot_table(df_raw, index=['qrel', 'docid'], columns=['idx_approach', 'measure'], values='result', aggfunc='first')
        # Scores arrive as text; the (qrel, docid) index is kept because the fold
        # assignment below addresses rows by qrel.
        df_finale = df_finale.astype(np.float64)
        df_finale['fold'] = [0] * len(df_finale)
        if self.cv > 0:
            for (qrel, qrel_group) in df_finale.groupby('qrel'):
                df_finale.loc[qrel, 'fold'] = self._make_folds(len(qrel_group))
        return df_finale

    def get_test(self, test, pbase, pcomp, multi_test=False):
        """Return a two-character marker for the significance of ``pcomp`` vs ``pbase``.

        Args:
            test: ``'student'`` (``ttest_rel``), ``'wilcoxon'`` or ``'welcht'``
                (``ttest_ind`` with ``equal_var=False``).
            pbase: baseline scores (object exposing ``.values`` and ``.mean()``).
            pcomp: scores of the compared approach.
            multi_test: when True, a non-significant result is rendered as a dot
                instead of blanks.

        Returns:
            A filled triangle for p < 0.01, a hollow triangle for p < 0.05 (pointing
            down when the baseline mean is higher, up otherwise), else the
            "not significant" marker.

        Raises:
            ValueError: if the scores differ and ``test`` is not a supported name.
        """
        if np.array_equal(pbase.values, pcomp.values):
            # Identical samples: skip the test and treat as not significant.
            pvalue = 1.
        elif test == 'student':
            (_, pvalue) = ttest_rel(pbase, pcomp)
        elif test == 'wilcoxon':
            (_, pvalue) = wilcoxon(pbase, pcomp)
        elif test == 'welcht':
            (_, pvalue) = ttest_ind(pbase, pcomp, equal_var=False)
        else:
            raise ValueError("unknown significance test %r; expected 'student', 'wilcoxon' or 'welcht'" % (test,))

        if pvalue < 0.05:
            baseline_is_higher = pbase.mean() > pcomp.mean()
            if pvalue < 0.01:
                return '▼ ' if baseline_is_higher else '▲ '
            return 'ᐁ ' if baseline_is_higher else 'ᐃ '
        return '⏺ ' if multi_test else '  '

    def build_printable(self, table, significance_tests):
        """Build one summary table per dataset.

        Each cell holds the mean over folds of the per-fold mean score, followed by
        one marker per significance test (``'bl '`` for the baseline, approach 0).

        Args:
            table: frame produced by ``build_df`` or ``build_over_sample``.
            significance_tests: list of test names accepted by ``get_test``.

        Returns:
            dict mapping each ``qrel`` value to a DataFrame with an ``app`` column
            followed by one column per measure found in ``table``.
        """
        printable = {}
        multi_test = len(significance_tests) > 1
        for qrel, qrel_group in table.groupby('qrel'):
            base = qrel_group.loc[:, 0]
            # Computed once per dataset; numeric_only skips label columns such as
            # the 'qrel' column added by build_over_sample.
            fold_means = qrel_group.groupby('fold').mean(numeric_only=True)
            # Integer top-level column labels are approaches; 'fold'/'qrel' are not.
            approach_ids = [idx for idx in qrel_group.columns.levels[0]
                            if isinstance(idx, (int, np.integer))]
            rows = []
            for idx_app in approach_ids:
                instance = [self.nameApp[qrel][idx_app]]
                for m in qrel_group[idx_app].columns:
                    array_results = qrel_group[idx_app][m]
                    mean_measure_folds = fold_means[idx_app][m].mean()
                    if idx_app > 0:
                        test_result = ''.join(
                            self.get_test(test, base[m], array_results, multi_test)
                            for test in significance_tests)
                    else:
                        test_result = 'bl ' * len(significance_tests)
                    instance.append('%f %s' % (mean_measure_folds, test_result))
                rows.append(instance)
            printable[qrel] = pd.DataFrame(rows, columns=['app'] + list(base.columns))
        return printable

    def get_sampler(self, sampler_name):
        """Return an over-sampler seeded with ``self.seed``.

        Args:
            sampler_name: ``'ros'`` / ``'RandomOverSampler'`` or ``'SMOTE'``.

        Raises:
            ValueError: for any other name.
        """
        if sampler_name in ('ros', 'RandomOverSampler'):
            return RandomOverSampler(random_state=self.seed)
        if sampler_name == 'SMOTE':
            return SMOTE(random_state=self.seed)
        raise ValueError("unknown sampler %r; expected 'ros', 'RandomOverSampler' or 'SMOTE'" % (sampler_name,))

    def build_over_sample(self, df, sampler):
        """Over-sample ``df`` fold by fold, using the dataset (``qrel``) as class label.

        Args:
            df: frame produced by ``build_df``.
            sampler: object exposing ``fit_resample(X, y)`` or the older
                ``fit_sample(X, y)``.

        Returns:
            DataFrame with the columns of ``df`` plus a ``qrel`` column set to
            ``str(sampler)`` on every row. ``self.nameApp[str(sampler)]`` is
            pointed at the approach names of the first registered dataset.
        """
        resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
        raw = []
        for fold, fold_group in df.groupby('fold'):
            y = pd.factorize(fold_group.index.get_level_values('qrel'))[0]
            # Plain array in, so the rows come back as an array whatever the sampler version.
            X_sampled, _ = resample(fold_group.values, y)
            raw.extend(np.asarray(X_sampled))
        df_sampled = pd.DataFrame(raw, columns=df.columns)
        df_sampled['qrel'] = [str(sampler)] * len(df_sampled)
        self.nameApp[str(sampler)] = self.nameApp[list(self.nameApp.keys())[0]]
        return df_sampled

In [14]:
# --- Experiment configuration ---------------------------------------------
trec_eval = './trec_eval'                     # command used to launch trec_eval
measures = ['P.10', 'recall.10']              # measure names passed to trec_eval via -m
significance_tests = ['student', 'wilcoxon']  # test names handled by SIGTREC_Eval.get_test
cv = 5                                        # number of folds assigned per dataset
seed = 42                                     # random seed

In [15]:
# Inspec: qrel file and the two runs to compare (the first run is the baseline).
qrel_file = '../conversor/output/Inspec.qrel'
result_file = [
    '../conversor/output/Inspec_HLT_SR.out',
    '../conversor/output/Inspec_HLT_SR_without.out',
]
result1 = Result(qrel_file, getFileName(qrel_file), result_file, [])

# SemEval2010: same layout.
qrel_file = '../conversor/output/SemEval2010.qrel'
result_file = [
    '../conversor/output/SemEval2010_HLT_SR.out',
    '../conversor/output/SemEval2010_HLT_SR_without.out',
]
result2 = Result(qrel_file, getFileName(qrel_file), result_file, [])

# Every dataset that build_df should evaluate.
results = [result1, result2]


In [16]:
# Evaluator built from the notebook-level settings (fold count, seed, trec_eval command).
sig = SIGTREC_Eval(
    cv=cv,
    seed=seed,
    trec_eval=trec_eval,
)

In [17]:
# Run trec_eval for every dataset/run in `results` and collect the per-query
# scores (plus the fold assignment) in a single frame.
df_finale = sig.build_df(results)

In [18]:
# Summary tables keyed by dataset id; each score is followed by one marker per
# significance test ('bl' marks the baseline run).
printable = sig.build_printable(df_finale, significance_tests)
printable['Inspec.qrel']

Unnamed: 0,app,P_10,recall_10
0,Inspec_HLT_SR.out,0.250256 bl bl,0.198677 bl bl
1,Inspec_HLT_SR_without.out,0.070702 ▼ ▼,0.066269 ▼ ▼


In [19]:
# Over-sample the score table fold by fold; the resulting rows are all labelled
# with str(sampler) in their 'qrel' column.
df_sampled = sig.build_over_sample(df_finale, RandomOverSampler(random_state=seed))

In [20]:
# Summary of the over-sampled table. build_over_sample labels every row with
# str(sampler), so the result holds exactly one table; look its key up instead of
# hard-coding the sampler's repr, which changes with the seed and library version.
printable_sampled = sig.build_printable(df_sampled, significance_tests)
sampled_key = next(iter(printable_sampled))
printable_sampled[sampled_key]

Unnamed: 0,app,P_10,recall_10
0,Inspec_HLT_SR.out,0.138567 bl bl,0.107937 bl bl
1,Inspec_HLT_SR_without.out,0.035351 ▼ ▼,0.033135 ▼ ▼


In [21]:
# Same summary without significance markers (empty test list).
# Note: this rebinds `printable`, replacing the table built with the tests above.
printable = sig.build_printable(df_finale, [])
printable['Inspec.qrel']

Unnamed: 0,app,P_10,recall_10
0,Inspec_HLT_SR.out,0.250256,0.198677
1,Inspec_HLT_SR_without.out,0.070702,0.066269
