In [4]:
import os
import shutil
import pandas as pd
from glob import glob
pd.set_option('display.max_colwidth', 100)


In [5]:

import pyterrier as pt
if not pt.started():
    pt.init()


In [6]:
docno_to_text = pd.read_csv('../CHS-2021/documents/Webdoc/crawl/txt_over_50.tsv', sep='\t')
#rename docid column to docno
docno_to_text = docno_to_text.rename(columns={'docid':'docno'})

In [7]:
def get_text_from_docno(docno):
    print(docno)
    if docno_to_text[docno_to_text['docno'] == docno].empty:
        return ''
    return docno_to_text[docno_to_text['docno'] == docno]['text'].values[0]

In [8]:
import xml.etree.ElementTree as ET
import pandas as pd

def load_topics(path, clean_queries=False):
    with open(path) as f:
        root = ET.fromstring(f.read())
    topic_dict = {}
    for topic in root.findall("topic"):
        topic_id = topic.findtext("id")
        topic_query = topic.findtext("query")
        if topic_id and topic_query:
            topic_dict[topic_id] = topic_query.strip()
    topics = pd.DataFrame(topic_dict.items(), columns=["qid", "query"]) 
    if clean_queries:
        topics["query"] = topics["query"].str.lower().replace(r'\W+', ' ', regex=True)
    return topics

In [9]:
topics = load_topics("../data/topics/topics.txt", clean_queries=True)
qrels = pt.io.read_qrels("../data/assessments/qrels.txt") # type: ignore
qcred = pt.io.read_qrels("../data/assessments/qcredibility.txt") # type: ignore
qread = pt.io.read_qrels("../data/assessments/qreadability.txt") # type: ignore

all_qs = [("qrels", qrels), ("qcred", qcred), ("qread", qread)]

# remove non alphanumeric characters from queries

In [10]:
# add qid to docno_to_text based on join with qrels . If multiple qids, add all of them
docno_to_text['qid'] = docno_to_text['docno'].apply(lambda x: qrels[qrels['docno'] == x]['qid'].values)

In [11]:
all_llms = ['', 'chatgpt_clean_queries', 'falcon7b_prompt', 'falcon40b_prompt', 'OA_LLama']

In [12]:
from tqdm import tqdm
use_colbert = True
retrieval_results = pd.DataFrame(columns=['judgement', 'qid', 'query', 'ndcg@10', 'map', 'bpref', 'name', 'num_docs', 'num_results'])

def yield_passages_from_df(df):
    for index, row in df.iterrows():
        yield {'docno': row['docno'], 'text': row['text']}


for index, row in tqdm(topics.iterrows(), total=topics.shape[0], position=0, leave=True, unit='queries'):
    qid = row['qid']
    query = row['query']
    # make copy of docno_to_text for all rows where qid is in qid
    passages_for_query = docno_to_text[docno_to_text['qid'].apply(lambda x: qid in x)].copy()
    num_docs = passages_for_query.shape[0]
    # create index for this query
    # delete index if it exists
    index_path = "./indexes/" + 'base' + "/query_" + qid
    if os.path.exists(index_path):
        shutil.rmtree(index_path)
    index = pt.DFIndexer(index_path)
    index.index(passages_for_query['text'], passages_for_query['docno'])
    dph = pt.BatchRetrieve(index, wmodel="DPH")
    # tfidf = pt.BatchRetrieve(indexref, wmodel="TF_IDF", metadata=["docno", "text"])
    bo1 = pt.rewrite.Bo1QueryExpansion(index)
    pipeline = dph >> bo1 >> dph # >> readability_rerank
    simple_name = 'QE_dph_over_50_base'
    
    # run pipeline and calculate ndcg@10
    res = pipeline.search(query)
    res['qid'] = qid
    num_results = res.shape[0]
    judgements = {'qrels': qrels, 'qcred': qcred, 'qread': qread}
    for name, q in judgements.items():
        exp = pt.Experiment([res], topics[topics['qid'] == qid], q[q['qid'] == qid], eval_metrics=['ndcg_cut_10', 'map', 'bpref'], names=[simple_name])
        ndcg = exp['ndcg_cut_10'][0]
        result_df = pd.DataFrame({'judgement': name, 'qid': qid, 'query': query, 'ndcg@10': ndcg, 'map': exp['map'][0], 'bpref': exp['bpref'][0], 'name': simple_name, 'num_docs': num_docs, 'num_results': num_results}, index=[0])
        retrieval_results = pd.concat([retrieval_results, result_df], ignore_index=True)
   
retrieval_results

  0%|          | 0/50 [00:00<?, ?queries/s]

100%|██████████| 50/50 [00:30<00:00,  1.64queries/s]


Unnamed: 0,judgement,qid,query,ndcg@10,map,bpref,name,num_docs,num_results
0,qrels,1,what are the most common chronic diseases what effects do chronic diseases have for the society ...,0.895853,0.735476,0.539382,QE_dph_over_50_base,244,244
1,qcred,1,what are the most common chronic diseases what effects do chronic diseases have for the society ...,0.764162,0.764478,0.506195,QE_dph_over_50_base,244,244
2,qread,1,what are the most common chronic diseases what effects do chronic diseases have for the society ...,0.963318,0.959591,0.689566,QE_dph_over_50_base,244,244
3,qrels,8,best apps daily activity exercise diabetes,0.347912,0.204938,0.176860,QE_dph_over_50_base,235,233
4,qcred,8,best apps daily activity exercise diabetes,0.502905,0.895949,0.678797,QE_dph_over_50_base,235,233
...,...,...,...,...,...,...,...,...,...
145,qcred,131,exercises for better posture,0.576404,0.501183,0.511529,QE_dph_over_50_base,84,84
146,qread,131,exercises for better posture,0.836347,0.318078,0.280833,QE_dph_over_50_base,84,84
147,qrels,132,headpats scalp psoriasis,0.726305,0.454830,0.520657,QE_dph_over_50_base,124,112
148,qcred,132,headpats scalp psoriasis,0.714093,0.774452,0.796364,QE_dph_over_50_base,124,112


In [20]:
#get average ndcg@10, map, bpref for each judgement
average_scores = retrieval_results.groupby(['judgement', 'name'])['ndcg@10', 'map', 'bpref'].mean()
# sort in order [qrels, qcred, qread]
average_scores = average_scores.reindex(['qrels', 'qcred', 'qread'], level=0)
average_scores

  average_scores = retrieval_results.groupby(['judgement', 'name'])['ndcg@10', 'map', 'bpref'].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,ndcg@10,map,bpref
judgement,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
qrels,QE_dph_over_50_base,0.509754,0.4694,0.432712
qcred,QE_dph_over_50_base,0.637488,0.773065,0.729664
qread,QE_dph_over_50_base,0.751416,0.643351,0.465262


In [14]:
from tqdm import tqdm
results = pd.DataFrame(columns=['qid', 'query', 'llm', 'docno', 'rank', 'score', 'run_name', 'total_results', 'total_num_docs'])
for llm in all_llms:
    try:
        llm_answers = pd.read_csv('data/all_answers_' + llm + '.csv', header='infer', quotechar='"', skipinitialspace=True)
    except:
        print('No answer file for ' + llm)
        print('Skipping...')
        continue
    llm_answers['qid'] = llm_answers['qid'].astype(str)
    for index, row in tqdm(topics.iterrows(), total=topics.shape[0], desc=llm, position=0, leave=True, unit='queries'):
        qid = row['qid']
        if qid not in llm_answers['qid'].values:
           continue
        # get result rows for this qid
        qid_results = llm_answers[llm_answers['qid'] == qid]
        #add qid_results to docno_to_text
        docid = llm + '_' + qid
        # make copy of docno_to_text for all rows where qid is in qid
        passages_for_query = docno_to_text[docno_to_text['qid'].apply(lambda x: qid in x)].copy()
        # add new row
        new_row ={'docno': docid, 'text': qid_results['answer'], 'qid': [[qid]]} 
        passages_for_query = pd.concat([passages_for_query, pd.DataFrame(new_row)])
        
        # create index for this query
        # delete index if it exists
        if os.path.exists("./data/indexes/" + llm + "/query_" + qid):
            shutil.rmtree("./data/indexes/" + llm + "/query_" + qid)
        index = pt.DFIndexer("./data/indexes/" + llm + "/query_" + qid)
        index.index(passages_for_query['text'], passages_for_query['docno'])

        dph = pt.BatchRetrieve(index, wmodel="DPH")
        # tfidf = pt.BatchRetrieve(indexref, wmodel="TF_IDF", metadata=["docno", "text"])
        bo1 = pt.rewrite.Bo1QueryExpansion(index)
        pipelineQE_dph = dph >> bo1 >> dph # >> readability_rerank
        simple_name = 'QE_dph_over_50_' + llm
        
        # run pipeline
        res = pipelineQE_dph.search(row['query'])
        total_results = len(res['docno'].tolist())
        total_num_docs = passages_for_query.shape[0] - 1
        
        # get rank of docid
        if docid not in res['docno'].tolist():
            rank = total_results + 1
            score = 0
        else:
            rank = res['docno'].tolist().index(docid) + 1
            # get score of docid
            score = res['score'].tolist()[rank - 1]
        # add row to results
        results = pd.concat([results, pd.DataFrame({'qid': [qid], 'query': [row['query']], 'llm': [llm], 'docno': [docid], 'rank': [rank], 'score': [score], 'run_name': [simple_name], 'total_results': [total_results], 'total_num_docs': [total_num_docs]})])

No answer file for 
Skipping...
No answer file for chatgpt_clean_queries
Skipping...
No answer file for falcon7b_prompt
Skipping...
No answer file for falcon40b_prompt
Skipping...
No answer file for OA_LLama
Skipping...


In [15]:
# for each llm, get average rank and total amount of number 1s
for llm in all_llms:
    print(llm)
    llm_res = results[results['llm'] == llm]
    average_rank = llm_res['rank'].mean()
    print('average rank: ' + str(average_rank))
    num_1s = llm_res[llm_res['rank'] == 1].shape[0]
    print('number of 1s: ' + str(num_1s))
    print()


average rank: nan
number of 1s: 0

chatgpt_clean_queries
average rank: nan
number of 1s: 0

falcon7b_prompt
average rank: nan
number of 1s: 0

falcon40b_prompt
average rank: nan
number of 1s: 0

OA_LLama
average rank: nan
number of 1s: 0



In [16]:
results

Unnamed: 0,qid,query,llm,docno,rank,score,run_name,total_results,total_num_docs


In [17]:
import pyterrier as pt

def run_experiment(pipeline, simple_name, topics, qrels, eval_metrics=["map", "bpref", "ndcg_cut_10"]):
    experiments = []
    for name, q in qrels:
        # change pipeline name to include the name of the query

        exp = pt.Experiment([pipeline], topics, q, eval_metrics, names=[name + '_' + simple_name])
        experiments.append(exp)
    return pd.concat(experiments, axis=0)

In [18]:
credibility_scores = pd.read_csv('data/all_passages_credibility_scores_bert.tsv', sep='\t')

FileNotFoundError: [Errno 2] No such file or directory: 'data/all_passages_credibility_scores_bert.tsv'

In [None]:
#get the credibility score for a given docid
def get_credibility_score(docid):
    if credibility_scores[credibility_scores['docid'] == docid].empty:
        return 0
    return credibility_scores[credibility_scores['docid'] == docid]['credibility_score'].values[0]


In [None]:
import textstat
# rank documents with custom function that evaluates readability of the document
def readability_score(text):
    score = textstat.flesch_reading_ease(text)
    print(score)
    return score 

In [None]:
all_results = []
if use_colbert:
    retrieval = dense_e2e
    simple_name = 'colbert_msmarco_over_50'
else:
    for indexref in indexrefs:
        identifier = indexref['identifier']
        indexref = indexref['indexref']
        dph = pt.BatchRetrieve(indexref, wmodel="DPH", metadata=["docno", "text"])
        # tfidf = pt.BatchRetrieve(indexref, wmodel="TF_IDF", metadata=["docno", "text"])
        bo1 = pt.rewrite.Bo1QueryExpansion(indexref)
        # readability_rerank = pt.apply.doc_score(lambda row: get_credibility_score(row['docid']))
        pipelineQE_dph = dph >> bo1 >> dph # >> readability_rerank
        simple_name = 'QE_dph_over_50_' + identifier
        retrieval = pipelineQE_dph
        llm_answers = run_experiment(retrieval, simple_name, topics, all_qs, ["map", "bpref", "ndcg_cut_10"])
        all_results.append({'identifier': identifier, 'results': llm_answers, 'retrieval': retrieval})



KeyboardInterrupt: 

In [None]:
append_to_file = ''
if use_chatgpt:
    append_to_file = '_chatgpt'
else:
    append_to_file = '_without_chatgpt'

llm_answers.to_csv('data/results/results_' + simple_name + append_to_file + '_clean_queries.csv', index=False)

NameError: name 'use_chatgpt' is not defined

In [None]:
all_results[0]['results']

Unnamed: 0,name,map,bpref,ndcg_cut_10
0,qrels_QE_tfidf_over_50_IterDict,0.357313,0.447128,0.569338
0,qcred_QE_tfidf_over_50_IterDict,0.505827,0.679313,0.623856
0,qread_QE_tfidf_over_50_IterDict,0.403736,0.452294,0.722833


In [None]:
# iterate over topics and save the positions of chatgpt answers
# if use_colbert:
#     retrieval_model = dense_e2e
# else:
#     retrieval_model = pipelineQE_dph
out_dfs = []
for llm_number, result in enumerate(all_results[1:]):
    identifier = result['identifier'].replace('IterDict_', '')
    llm_answers = result['results']
    retrieval_model = result['retrieval']
    out_df = []
    for idx, row in topics.iterrows():
        res = retrieval_model.search(row["query"])
        best_answer_text = get_text_from_docno(res['docno'].tolist()[0])
        llm_answer = all_llm_answers[llm_number]['results'][all_llm_answers[llm_number]['results']['topic_id'] == str(row['qid'])]['answer'].values[0]
        docno = identifier + str(row['qid'])
        if docno not in res['docno'].values:
            out_df.append({'qid': row['qid'], 'query': row['query'], 'llm': identifier, 'docno': docno, 'position': -1, 'llm_answer': llm_answer[:2000], 'best_answer_if_not_llm': best_answer_text})
            continue
        position = res['docno'].tolist().index(docno)
        out_df.append({'qid': row['qid'], 'query': row['query'], 'llm': identifier, 'docno': docno, 'position': position, 'llm_answer': llm_answer[:2000], 'best_answer_if_not_llm': best_answer_text})
        
    out_dfs.append(pd.DataFrame(out_df))

chatgpt_clean_queries1
chatgpt_clean_queries8
381d63ae-ec5f-4fa0-b331-7df8b0f63e38
05d7038c-0ea2-41f7-b439-f1a80b6ae343
1cc2d142-9499-4770-82ae-816108994dd9
5c615aac-2089-4755-85cc-e43cd6d25f3f
ac64710f-5be6-4338-81bf-f449341dbfc7
c7087ca7-07b1-4a83-8b4f-750f42e2b248
401363ff-141a-4cc9-8b47-484a277e102f
94227313-076b-46c2-8aa9-ace1c7a6c19f
30c124f4-0535-41d8-8afe-54d8b8ab9c54
812e0336-7233-4286-908e-fd00e35724f5
ac5e10e1-2da7-41a9-94a7-098461398997
chatgpt_clean_queries62
a1b0e7c2-2b4d-4430-9a08-a0e2a5ae99ba
8c07ded1-c889-4259-ae03-c7d5a9817183
cb5586bc-6eed-4739-be79-c232a507c20f
chatgpt_clean_queries77
ae5f0ad2-193c-491a-a61d-d058b7934607
0378a4b6-930f-45d5-b518-ca90d8c2384f
chatgpt_clean_queries83
6b9c9f03-cb37-4e0f-9ac1-35b14551f750
5e829a57-b1e6-4d23-aa53-db5834c8af30
a9f343d9-958d-4d58-a76b-6a42b8442e7e
chatgpt_clean_queries93
92a08e89-6397-4ed4-a708-2ea1ce5e94fe
cc99940b-32b0-4364-a184-7b80bbf6ee7c
chatgpt_clean_queries96
f9d4fc76-1b16-4c31-b707-d25ec46d977d
57f56d68-7e40-4e3c-9

In [None]:
from datetime import datetime
for out_df in out_dfs:
    date = datetime.now().strftime("%d%m%Y")
    identifier = out_df['docno'].tolist()[0][:-2]
    if use_colbert:
        out_file = f'data/{identifier}-{date}-positions-{simple_name}.csv'
    else:
        out_file = f'data/{identifier}-{date}-positions-{simple_name}.csv'
    out_df.to_csv(out_file, index=False)

In [None]:
for out_df in out_dfs:
    print(out_df['position'].mean())
    # print number 1 positions
    print(out_df[out_df['position'] == 0]['qid'])

40.2
0       1
1       8
13     62
17     77
20     83
24     93
27     96
34    108
45    126
Name: qid, dtype: object
54.66
0       1
9      55
17     77
21     85
34    108
49    132
Name: qid, dtype: object
112.92
0       1
30    101
34    108
49    132
Name: qid, dtype: object
144.54
0    1
Name: qid, dtype: object
