In [2]:
import os
import shutil
import pandas as pd
pd.set_option('display.max_colwidth', 100)


In [3]:
import faiss
assert faiss.get_num_gpus() > 0

In [4]:
# load topics from topics.txt with format
# <topics>
# 	<topic>
# 		<id>1</id>
# 		<query> What are the most common chronic diseases? What effects do chronic diseases have for the society and the individual?
# 		</query>
# 	</topic>
# 	<topic>
# 		<id>8</id>
# 		<query> best apps daily activity exercise diabetes
# 		</query>
# 	</topic>
# 	<topic>
# 		<id>22</id>
# 		<query> my risk for developing type 2 diabetes
# 		</query>
# 	</topic>
# </topics>

file_path = 'data/topics/topics.txt'

with open(file_path, 'r') as f:
    topics = f.read()

topics = topics.split('<topic>')
topics = topics[1:]

topic_dict = {}
for topic in topics:
    topic_id = topic.split('<id>')[1].split('</id>')[0]
    topic_query = topic.split('<query>')[1].split('</query>')[0]
    topic_dict[topic_id] = topic_query

In [5]:

import pyterrier as pt
if not pt.started():
    pt.init()


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [6]:
docno_to_text = pd.read_csv('CHS-2021/documents/Webdoc/crawl/txt.tsv', sep='\t')

In [7]:
def get_text_from_docno(docno):
    print(docno)
    if docno_to_text[docno_to_text['docid'] == docno].empty:
        return ''
    return docno_to_text[docno_to_text['docid'] == docno]['text'].values[0]

In [8]:
def ds_generate(use_chatgpt=True):
    with open('CHS-2021/documents/Webdoc/crawl/txt.tsv', 'r') as corpusfile:
        for l in corpusfile:
            docno, passage = l.split("\t")
            yield {'docno' : docno, 'text' : passage}
    if use_chatgpt:
        for idx, row in chat_gpt_results.iterrows():
            docno = 'chatgpt' + str(row['topic_id'])
            yield {'docno' : docno, 'text' : row['answer']}

use_colbert = False
use_chatgpt = False

if use_chatgpt:
    chat_gpt_results_infile = 'answers/chatgpt-16052023.csv'
    chat_gpt_results = pd.read_csv(chat_gpt_results_infile, header=None)
    chat_gpt_results.columns = ['topic_id', 'query', 'answer']
    chat_gpt_results['topic_id'] = chat_gpt_results['topic_id'].astype(str)
    chat_gpt_results['answer'] = chat_gpt_results['answer'].str.replace('\n', ' ').str.replace(',', ' ').str.replace('"', '')
    chat_gpt_results = chat_gpt_results[chat_gpt_results['answer'].notna()]

if use_colbert:
    index_path = './colbert_index'
    if use_chatgpt:
        index_path += '_chatgpt'
    if os.path.exists(index_path):  
        shutil.rmtree(index_path)
    from pyterrier_colbert.indexing import ColBERTIndexer
    # checkpoint="http://www.dcs.gla.ac.uk/~craigm/ecir2021-tutorial/colbert_model_checkpoint.zip"
    checkpoint="./colbertv2.0/pytorch_model.bin"

    indexer = ColBERTIndexer(checkpoint, index_path, "colbertindex", chunksize=3)
    indexref = indexer.index(ds_generate(use_chatgpt))
else:
    index_path = './IterDict_index'
    if use_chatgpt:
        index_path += '_chatgpt'
    if os.path.exists(index_path):  
        shutil.rmtree(index_path)
    iter_indexer = pt.IterDictIndexer(index_path)
    indexref = iter_indexer.index(ds_generate(use_chatgpt), meta={'docno' : 36, 'text': 2000})

  indexref = iter_indexer.index(ds_generate(use_chatgpt), meta={'docno' : 36, 'text': 2000})


In [9]:
if use_colbert:
    from pyterrier_colbert.ranking import ColBERTFactory
    index=(index_path, "colbertindex")
    pytcolbert = ColBERTFactory(checkpoint, *index)
    # rename the ivfpq file
    os.rename(index_path + '/colbertindex/ivfpq.100.faiss', index_path + '/colbertindex/ivfpq.faiss')
    dense_e2e = pytcolbert.end_to_end()

In [10]:
import xml.etree.ElementTree as ET
import pandas as pd

def load_topics(path):
    with open(path) as f:
        root = ET.fromstring(f.read())
    topic_dict = {}
    for topic in root.findall("topic"):
        topic_id = topic.findtext("id")
        topic_query = topic.findtext("query")
        if topic_id and topic_query:
            topic_dict[topic_id] = topic_query.strip().lower()
    topics = pd.DataFrame(topic_dict.items(), columns=["qid", "query"]) 
    topics["query"] = topics["query"].str.replace(r'\W+', ' ', regex=True)
    return topics

In [11]:

topics = load_topics("data/topics/topics.txt")
qrels = pt.io.read_qrels("data/assessments/qrels.txt") # type: ignore
qcred = pt.io.read_qrels("data/assessments/qcredibility.txt") # type: ignore
qread = pt.io.read_qrels("data/assessments/qreadability.txt") # type: ignore

all_qs = [("qrels", qrels), ("qcred", qcred), ("qread", qread)]

# remove non alphanumeric characters from queries

In [12]:
import pyterrier as pt

def run_experiment(pipeline, simple_name, topics, qrels, eval_metrics=["map", "bpref", "ndcg_cut_10"]):
    experiments = []
    for name, q in qrels:
        # change pipeline name to include the name of the query

        exp = pt.Experiment([pipeline], topics, q, eval_metrics, names=[name + '_' + simple_name])
        experiments.append(exp)
    return pd.concat(experiments, axis=0)

In [13]:
credibility_scores = pd.read_csv('data/all_passages_credibility_scores_bert.tsv', sep='\t')

In [18]:
credibility_scores.head()

Unnamed: 0,docid,credibility_score_bert
0,44f906a1-f818-4d48-a3fb-1673ebdeff13,0.004992
1,334ed241-6337-41ce-884c-2755648e14ea,0.004397
2,391b9de2-26dc-4187-b7ea-461153352e12,0.007133
3,461b6eb2-2b79-4fec-bf6b-63e302a093d9,0.007133
4,4e151d9e-a3fc-4e21-b5db-7b8ad4ea9d23,0.004555


In [19]:
#get the credibility score for a given docid
def get_credibility_score(docid):
    if credibility_scores[credibility_scores['docid'] == docid].empty:
        return 0
    return credibility_scores[credibility_scores['docid'] == docid]['credibility_score'].values[0]


In [52]:
import textstat
# rank documents with custom function that evaluates readability of the document
def readability_score(text):
    score = textstat.flesch_reading_ease(text)
    print(score)
    return score 

In [20]:
if use_colbert:
    retrieval = dense_e2e
    simple_name = 'colbert_msmarco'
else:
    dph = pt.BatchRetrieve(indexref, wmodel="DPH", metadata=["docno", "text"])
    # tfidf = pt.BatchRetrieve(indexref, wmodel="TF_IDF", metadata=["docno", "text"])
    bo1 = pt.rewrite.Bo1QueryExpansion(indexref)
    readability_rerank = pt.apply.doc_score(lambda row: get_credibility_score(row['docid']))
    pipelineQE_dph = dph >> bo1 >> dph >> readability_rerank
    simple_name = 'QE_dph_credibility'
    retrieval = pipelineQE_dph
results = run_experiment(retrieval,simple_name, topics, all_qs, ["map", "bpref", "ndcg_cut_10"])

In [50]:
append_to_file = ''
if use_chatgpt:
    append_to_file = '_chatgpt'
else:
    append_to_file = '_without_chatgpt'

results.to_csv('data/results/results_' + simple_name + append_to_file + '_credibility_bert.csv', index=False)

In [51]:
results

Unnamed: 0,name,map,bpref,ndcg_cut_10
0,qrels_QE_dph_readability,0.234132,0.42507,0.480908
0,qcred_QE_dph_readability,0.328263,0.653579,0.490473
0,qread_QE_dph_readability,0.276747,0.479719,0.651485


In [14]:
# iterate over topics and save the positions of chatgpt answers
if use_colbert:
    retrieval_model = dense_e2e
else:
    retrieval_model = pipelineQE_dph
out_df = []
for idx, row in topics.iterrows():
    if str(row['qid']) not in chat_gpt_results['topic_id'].values:
        continue
    res = retrieval_model.search(row["query"])
    best_answer_text = get_text_from_docno(res['docno'].tolist()[0])
    docno = 'chatgpt' + str(row['qid'])
    if docno not in res['docno'].values:
        out_df.append({'qid': row['qid'], 'query': row['query'], 'docno': docno, 'position': -1, 'chatgpt_answer': chat_gpt_results[chat_gpt_results['topic_id'] == str(row['qid'])]['answer'].values[0][:2000], 'best_answer': best_answer_text})
        continue
    position = res['docno'].tolist().index(docno)
    out_df.append({'qid': row['qid'], 'query': row['query'], 'docno': docno, 'position': position, 'chatgpt_answer': chat_gpt_results[chat_gpt_results['topic_id'] == str(row['qid'])]['answer'].values[0][:2000], 'best_answer': best_answer_text})
    
out_df = pd.DataFrame(out_df)

NameError: name 'chat_gpt_results' is not defined

In [None]:
if use_colbert:
    out_file = 'answers/chatgpt-16052023-positions-colbert.csv'
else:
    out_file = 'answers/chatgpt-16052023-positions-IterDict.csv'
out_df.to_csv(out_file, index=False)