## data loading and imports

In [1]:
from clef.utils.data_loading import load_datasets, task5_dir
from clef.utils.data_loading import write_trec_format_output
from clef.retrieval.retrieve import retrieve_evidence
import os

root_path = '../../'
out_dir = './data-out/train-setup2'
golden_labels_file = os.path.join(root_path, 'clef', 'data', 'train_qrels.txt')

train, dev = load_datasets(preprocess=True,
                           add_author_name=True,
                           add_author_bio=False,
                           root_path= root_path,)

# ensure out_dir directories exist for later
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
    if not os.path.exists(os.path.join(out_dir, 'eval')):
        os.makedirs(os.path.join(out_dir, 'eval'))

loaded 96 training json lines and 32 dev json lines.


In [2]:
import json

print(json.dumps(train[0], indent=4))

{
    "id": "AuRED_014",
    "rumor": "Urgent Ramallah Ministry of Health spokesman Kamal Al-Shakhra: We received 2 000 doses of the American Moderna Corona vaccine and this batch will be designated for President Abbas the Fatah Central Committee and VIPs",
    "label": "REFUTES",
    "timeline": [
        [
            "https://twitter.com/ibrahimmilhim",
            "1357270458756960257",
            "Account: Ibrahim Melhem\n Ibraim Milhim\nText: Qalqilya (5) Bethlehem (10) Nablus (24) Jericho and the Jordan Valley (9) Ramallah and Al-Bireh (0) Tulkarm (28) Hebron (23) Jenin (19) Gaza Strip ( 275) The Ministry of Health noted that there were 55 patients in intensive care rooms including 20 patients on ventilators"
        ],
        [
            "https://twitter.com/ibrahimmilhim",
            "1357270456915685377",
            "Account: Ibrahim Melhem\n Ibraim Milhim\nText: Salfit (49) Jerusalem suburbs (61) Tubas (18) Qalqilya (12) Bethlehem (42) Nablus (53) Jericho and Al-Aghwar

## pyserini

In [6]:
method = 'LUCENE'
data = retrieve_evidence(train, method, kwargs={})
write_trec_format_output(f'{out_dir}/{method}-train.trec.txt', data, method)

  0%|          | 0/96 [00:00<?, ?it/s]

wrote 449 lines to ./data-out/train-setup2/LUCENE-train.trec.txt


## naive tfidf

In [7]:
method = 'TFIDF'
data = retrieve_evidence(train, method, kwargs={})
write_trec_format_output(f'{out_dir}/{method}-train.trec.txt', data, method)

  0%|          | 0/96 [00:00<?, ?it/s]

wrote 469 lines to ./data-out/train-setup2/TFIDF-train.trec.txt


## sentence_transformers

In [8]:
method = 'SBERT'
data = retrieve_evidence(train, method, kwargs={})
write_trec_format_output(f'{out_dir}/{method}-train.trec.txt', data, method)

  0%|          | 0/96 [00:00<?, ?it/s]

wrote 469 lines to ./data-out/train-setup2/SBERT-train.trec.txt


## openai embeddings

In [9]:
method = 'OPENAI'
data = retrieve_evidence(train, method, kwargs={})
write_trec_format_output(f'{out_dir}/{method}-train.trec.txt', data, method)

  0%|          | 0/96 [00:00<?, ?it/s]

wrote 469 lines to ./data-out/train-setup2/OPENAI-train.trec.txt


## terrier

RESTART THE KERNEL HERE

In [3]:
import pandas as pd

def jsons_to_pandas(jsons):
    data = []
    for entry in jsons:
        rumor_id = entry['id']
        query = entry['rumor']
        timeline = entry['timeline']

        for author, tw_id, tw in timeline:
            data += [
                [rumor_id, "".join([x if x.isalnum() else " " for x in query]), tw_id, tw]
            ]

    df = pd.DataFrame(data,
                      columns=["qid", "query", "docno", "text"],)
    return df

df = jsons_to_pandas(train)

In [6]:
# generally best so far: BM25 or DPH with qe off

import pyterrier as pt

if not pt.started():
    pt.init()

import pyterrier.io as ptio
import pyterrier.pipelines as ptpipelines
from ir_measures import P, R, MAP

from pyterrier.batchretrieve import TextScorer

wmodel = 'BM25'

textscorer = TextScorer(takes="docs",
                        returns="queries",
                        body_attr="text",
                        wmodel=wmodel,
                        controls={"qe":"off"})

rtr = textscorer.transform(df)

method = 'TERRIER'
tag = wmodel
fn = f'{out_dir}/{method}-{wmodel}-train.trec.txt'

ptio._write_results_trec(rtr.query('rank < 5'), fn, run_name=wmodel)



## evaluation

In [11]:
import datetime
from clef.utils.scoring import eval_run_retrieval
from clef.utils.data_loading import task5_dir

import pandas as pd
from IPython.core.display import display_html

# sample_submission_file =  f'{root_path}/{task5_dir}/submission_samples/KGAT_zeroShot_evidence_English_dev.txt'

lucene_submission_file = f'{out_dir}/LUCENE-train.trec.txt'
tfidf_submission_file = f'{out_dir}/TFIDF-train.trec.txt'
terrier_submission_file = f'{out_dir}/TERRIER-DPH-train.trec.txt'
sbert_submission_file = f'{out_dir}/SBERT-train.trec.txt'
openai_submission_file = f'{out_dir}/OPENAI-train.trec.txt'


time_now  = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S') 
eval_out_file = f'{out_dir}/eval/RQ1-{time_now}.csv'

eval_data = [
    # ['baseline',    *[v for v in eval_run_retrieval(sample_submission_file,  golden_labels_file).values()]],
    ['lucence',     *[v for v in eval_run_retrieval(lucene_submission_file,  golden_labels_file).values()]],
    ['tfidf',       *[v for v in eval_run_retrieval(tfidf_submission_file,   golden_labels_file).values()]],
    ['terrier',     *[v for v in eval_run_retrieval(terrier_submission_file, golden_labels_file).values()]],
    ['sbert',       *[v for v in eval_run_retrieval(sbert_submission_file,   golden_labels_file).values()]],
    ['openai',      *[v for v in eval_run_retrieval(openai_submission_file,  golden_labels_file).values()]],
]

eval_df = pd.DataFrame(eval_data)
eval_df.columns = ['method', 'R@5', 'MAP']
df_r5  = eval_df[['method', 'R@5']].sort_values('R@5', axis=0, ascending=False)
df_map = eval_df[['method', 'MAP']].sort_values('MAP', axis=0, ascending=False)

map_styler = df_map.style.set_table_attributes("style='display:inline'").set_caption('Mean Average Precision')
r5_styler = df_r5.style.set_table_attributes("style='display:inline'").set_caption('Recall @ 5')

eval_df.to_csv(eval_out_file)

display_html(map_styler._repr_html_()+r5_styler._repr_html_(), raw=True)

Unnamed: 0,method,MAP
3,sbert,0.564958
2,terrier,0.564514
4,openai,0.561422
0,lucence,0.553322
1,tfidf,0.480119

Unnamed: 0,method,R@5
2,terrier,0.605036
4,openai,0.595489
3,sbert,0.594947
0,lucence,0.583495
1,tfidf,0.573954
