In [None]:
import numpy as np
import os
import pandas as pd
import subprocess 
import sys
from IPython.display import display, HTML

In [None]:
'''
Compute SNC, repeated 20 times for each collection.

This cell first declares the collections to process together with the
per-collection mu and sigma parameters, all keyed by collection name.
'''
# Names of the test collections, in processing order.
COLLECTIONS = [
    'TREC3',
    'TREC5',
    'TREC6',
    'TREC7',
    'TREC8',
    'TREC2001',
    'R04',
    'TB06',
    'TB06M',
    'WEB14',
]

# Per-collection mu parameter; presumably the mean that pairs with
# SIGMA_VALUES for the normal distribution used when sampling.
MU_VALUES = {
    'TREC3': 14.9,
    'TREC5': 3.9,
    'TREC6': 6.32,
    'TREC7': 5.78,
    'TREC8': 5.35,
    'TREC2001': 3.916,
    'R04': 5.189,
    'TB06': 17.486,
    'TB06M': 17.797,
    'WEB14': 39.196,
}

# Per-collection sigma parameter: standard deviation passed to
# np.random.normal when the per-topic sampling percentages are drawn.
SIGMA_VALUES = {
    'TREC3': 0.123,
    'TREC5': 0.043,
    'TREC6': 0.067,
    'TREC7': 0.047,
    'TREC8': 0.048,
    'TREC2001': 0.049,
    'R04': 0.051,
    'TB06': 0.139,
    'TB06M': 0.14,
    'WEB14': 0.234,
}
# For every collection, generate NUM_REP artificial qrels files. In each
# repetition a per-topic percentage is drawn from N(mu, sigma); that share of
# the topic's judged documents is picked at random and marked relevant (1),
# all other judged documents are marked non-relevant (0).
for COLLECTION in COLLECTIONS:
    print('Working on {}'.format(COLLECTION))

    # Paths of the real systems/topics table and of the real qrels
    CSV_TABLE = '../../src/Tables/{}.csv'.format(COLLECTION)
    REAL_QRELS = '../../src/qrels/qrels.{}.txt'.format(COLLECTION)

    # Table index holds the systems, table columns hold the topic ids
    real_table = pd.read_csv(CSV_TABLE, sep=',', header=0, index_col=0)
    systems = real_table.index.values
    topics = real_table.columns.values

    # The real qrels and the per-topic judged-document counts do not depend on
    # the repetition, so load and aggregate them once per collection.
    qrels = pd.read_csv(REAL_QRELS, sep=' ', header=None)
    qrels.columns = ['topic', 'zero', 'doc', 'relevant']
    # agg(['count']) yields a one-column DataFrame named 'count'; the
    # dict-renaming form agg({'count': 'count'}) was removed in pandas 1.0.
    topic_counts = qrels.groupby('topic')['relevant'].agg(['count'])

    # Parameters of the normal distribution for this collection
    mu = MU_VALUES[COLLECTION]
    sigma = SIGMA_VALUES[COLLECTION]

    NUM_REP = 20
    for rep in range(NUM_REP):
        # Seed with the repetition index so every repetition is reproducible
        np.random.seed(seed=rep)

        # Create artificial qrels using the information coming from the real ones.
        # Draw one sampling percentage per judged topic; the draw is sized by
        # the table it is stored in so the assignment can never mismatch.
        qrels_agg = topic_counts.copy()
        s = np.random.normal(mu, sigma, size=len(qrels_agg))
        qrels_agg['perc_of_sampling'] = s

        # Sample the documents with respect to the normal distribution
        sampled_frames = []
        for t in topics:
            topic_id = int(t)  # table column labels are cast to match the qrels topic ids
            # Work on an explicit copy so the assignments below never touch `qrels`
            sub = qrels[qrels['topic'] == topic_id].copy()
            sub['relevant'] = 0
            # perc_of_sampling is a percentage of the topic's judged documents
            perc = int(qrels_agg.at[topic_id, 'perc_of_sampling']
                       * qrels_agg.at[topic_id, 'count'] / 100)
            # Guard against an extreme draw asking for a negative number of
            # documents or for more documents than the topic has.
            perc = max(0, min(perc, len(sub)))
            sample = np.random.choice(sub['doc'], size=perc, replace=False)
            sub.loc[sub['doc'].isin(sample), 'relevant'] = 1
            sampled_frames.append(sub)

        # Single concatenation instead of growing a DataFrame inside the loop
        df_sampled_docs = pd.concat(sampled_frames)

        for int_column in ('topic', 'relevant', 'zero'):
            df_sampled_docs[int_column] = df_sampled_docs[int_column].astype(int)

        df_sampled_docs = df_sampled_docs[['topic', 'zero', 'doc', 'relevant']]
        df_sampled_docs.sort_values(by='topic', inplace=True)

        # Artificial qrels path (one file per collection and repetition)
        ARTIFICIAL_QRELS = '../../pickles/SNC_estimate_orig_qrels/sampled_qrels/{}_qrels_in_{}.csv'.format(COLLECTION, rep)

        # save qrels to file
        df_sampled_docs.to_csv(ARTIFICIAL_QRELS, sep=' ', header=False, index=False)
        