In [1]:
import os
import shutil
import pandas as pd
from glob import glob
pd.set_option('display.max_colwidth', 100)


In [2]:
import faiss
assert faiss.get_num_gpus() > 0

In [3]:
print(os.environ['CUDA_HOME'])

/home/jonas/miniconda3/envs/colbert


In [4]:

import pyterrier as pt
if not pt.started():
    pt.init()


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
docno_to_text = pd.read_csv('CHS-2021/documents/Webdoc/crawl/txt_over_50.tsv', sep='\t')

In [6]:
def get_text_from_docno(docno):
    print(docno)
    if docno_to_text[docno_to_text['docid'] == docno].empty:
        return ''
    return docno_to_text[docno_to_text['docid'] == docno]['text'].values[0]

In [7]:
def ds_generate(llms_to_use):
    with open('CHS-2021/documents/Webdoc/crawl/txt_over_50.tsv', 'r') as corpusfile:
        for l in corpusfile:
            docno, passage = l.split("\t")
            yield {'docno' : docno, 'text' : passage}
        for llm in llms_to_use:
            results = llm['results']
            llm = llm['identifier']
            for idx, row in results.iterrows():
                docno = llm + str(row['topic_id'])
                yield {'docno' : docno, 'text' : row['answer']}

use_colbert = False
llms_to_use = ['chatgpt_clean_queries', 'falcon7b_prompt', 'falcon40b_prompt', 'OA_LLama']

all_llm_answers = []
for file_name in llms_to_use:
    file = 'data/all_answers_' + file_name + '.csv'
    results = pd.read_csv(file, header=None, quotechar='"', skipinitialspace=True)
    results.columns = ['topic_id', 'query', 'answer', 'no_answer']
    results['topic_id'] = results['topic_id'].astype(str)
    results['answer'] = results['answer'].str.replace('\n', ' ').str.replace(',', ' ').str.replace('"', '')
    results = results[results['answer'].notna()]
    print(results.head())
    llm_identifier = '_'.join(file.split('_')[2:]).split('.')[0]
    print(llm_identifier)
    all_llm_answers.append({'identifier': llm_identifier, 'results': results})

indexrefs = []
if use_colbert:
    index_path = './colbert_index'
    if use_chatgpt:
        index_path += '_chatgpt'
    if os.path.exists(index_path):  
        shutil.rmtree(index_path)
    from pyterrier_colbert.indexing import ColBERTIndexer
    # checkpoint="http://www.dcs.gla.ac.uk/~craigm/ecir2021-tutorial/colbert_model_checkpoint.zip"
    checkpoint="./colbert_model_checkpoint/colbert.dnn"

    indexer = ColBERTIndexer(checkpoint, index_path, "colbertindex", chunksize=3)
    indexref = indexer.index(ds_generate(use_chatgpt))
else:
    index_path = './IterDict_index_over_50'
    if os.path.exists(index_path):  
        shutil.rmtree(index_path)
    iter_indexer = pt.IterDictIndexer(index_path)
    indexref = iter_indexer.index(ds_generate([]), meta={'docno' : 36, 'text': 2000})
    indexrefs.append({'identifier': 'IterDict', 'indexref': indexref})
    for llm in all_llm_answers:
        index_path = './IterDict_index_over_50_' + llm['identifier']
        if os.path.exists(index_path):  
            shutil.rmtree(index_path)
        iter_indexer = pt.IterDictIndexer(index_path)
        indexref = iter_indexer.index(ds_generate([llm]), meta={'docno' : 36, 'text': 2000})
        indexrefs.append({'identifier': 'IterDict_' + llm['identifier'], 'indexref': indexref})

  topic_id  \
0      qid   
1        1   
2        8   
3       22   
4       35   

                                                                                                 query  \
0                                                                                                query   
1  What are the most common chronic diseases? What effects do chronic diseases have for the society...   
2                                                           best apps daily activity exercise diabetes   
3                                                               my risk for developing type 2 diabetes   
4                                        Is a ketogenic / keto diet suitable for people with diabetes?   

                                                                                                answer  \
0                                                                                               answer   
1  The most common chronic diseases are:  1. Heart disease 2. Diab

  indexref = iter_indexer.index(ds_generate([]), meta={'docno' : 36, 'text': 2000})
  indexref = iter_indexer.index(ds_generate([llm]), meta={'docno' : 36, 'text': 2000})


In [8]:
if use_colbert:
    from pyterrier_colbert.ranking import ColBERTFactory
    index=(index_path, "colbertindex")
    pytcolbert = ColBERTFactory(checkpoint, *index)
    # rename the ivfpq file
    os.rename(index_path + '/colbertindex/ivfpq.100.faiss', index_path + '/colbertindex/ivfpq.faiss')
    dense_e2e = pytcolbert.end_to_end()

In [9]:
import xml.etree.ElementTree as ET
import pandas as pd

def load_topics(path, clean_queries=False):
    with open(path) as f:
        root = ET.fromstring(f.read())
    topic_dict = {}
    for topic in root.findall("topic"):
        topic_id = topic.findtext("id")
        topic_query = topic.findtext("query")
        if topic_id and topic_query:
            topic_dict[topic_id] = topic_query.strip()
    topics = pd.DataFrame(topic_dict.items(), columns=["qid", "query"]) 
    if clean_queries:
        topics["query"] = topics["query"].str.lower().replace(r'\W+', ' ', regex=True)
    return topics

In [10]:
if use_colbert:
    clean_queries = False
else:
    clean_queries = True
topics = load_topics("data/topics/topics.txt", clean_queries=clean_queries)
qrels = pt.io.read_qrels("data/assessments/qrels.txt") # type: ignore
qcred = pt.io.read_qrels("data/assessments/qcredibility.txt") # type: ignore
qread = pt.io.read_qrels("data/assessments/qreadability.txt") # type: ignore

all_qs = [("qrels", qrels), ("qcred", qcred), ("qread", qread)]

# remove non alphanumeric characters from queries

In [11]:
import pyterrier as pt

def run_experiment(pipeline, simple_name, topics, qrels, eval_metrics=["map", "bpref", "ndcg_cut_10"]):
    experiments = []
    for name, q in qrels:
        # change pipeline name to include the name of the query

        exp = pt.Experiment([pipeline], topics, q, eval_metrics, names=[name + '_' + simple_name])
        experiments.append(exp)
    return pd.concat(experiments, axis=0)

In [12]:
credibility_scores = pd.read_csv('data/all_passages_credibility_scores_bert.tsv', sep='\t')

In [13]:
#get the credibility score for a given docid
def get_credibility_score(docid):
    if credibility_scores[credibility_scores['docid'] == docid].empty:
        return 0
    return credibility_scores[credibility_scores['docid'] == docid]['credibility_score'].values[0]


In [14]:
import textstat
# rank documents with custom function that evaluates readability of the document
def readability_score(text):
    score = textstat.flesch_reading_ease(text)
    print(score)
    return score 

In [15]:
all_results = []
if use_colbert:
    retrieval = dense_e2e
    simple_name = 'colbert_msmarco_over_50'
else:
    for indexref in indexrefs:
        identifier = indexref['identifier']
        indexref = indexref['indexref']
        dph = pt.BatchRetrieve(indexref, wmodel="DPH", metadata=["docno", "text"])
        # tfidf = pt.BatchRetrieve(indexref, wmodel="TF_IDF", metadata=["docno", "text"])
        bo1 = pt.rewrite.Bo1QueryExpansion(indexref)
        # readability_rerank = pt.apply.doc_score(lambda row: get_credibility_score(row['docid']))
        pipelineQE_dph = dph >> bo1 >> dph # >> readability_rerank
        simple_name = 'QE_dph_over_50_' + identifier
        retrieval = pipelineQE_dph
        results = run_experiment(retrieval, simple_name, topics, all_qs, ["map", "bpref", "ndcg_cut_10"])
        all_results.append({'identifier': identifier, 'results': results, 'retrieval': retrieval})



In [16]:
append_to_file = ''
if use_chatgpt:
    append_to_file = '_chatgpt'
else:
    append_to_file = '_without_chatgpt'

results.to_csv('data/results/results_' + simple_name + append_to_file + '_clean_queries.csv', index=False)

NameError: name 'use_chatgpt' is not defined

In [20]:
all_results[0]['results']

Unnamed: 0,name,map,bpref,ndcg_cut_10
0,qrels_QE_tfidf_over_50_IterDict,0.357313,0.447128,0.569338
0,qcred_QE_tfidf_over_50_IterDict,0.505827,0.679313,0.623856
0,qread_QE_tfidf_over_50_IterDict,0.403736,0.452294,0.722833


In [21]:
# iterate over topics and save the positions of chatgpt answers
# if use_colbert:
#     retrieval_model = dense_e2e
# else:
#     retrieval_model = pipelineQE_dph
out_dfs = []
for llm_number, result in enumerate(all_results[1:]):
    identifier = result['identifier'].replace('IterDict_', '')
    results = result['results']
    retrieval_model = result['retrieval']
    out_df = []
    for idx, row in topics.iterrows():
        res = retrieval_model.search(row["query"])
        best_answer_text = get_text_from_docno(res['docno'].tolist()[0])
        llm_answer = all_llm_answers[llm_number]['results'][all_llm_answers[llm_number]['results']['topic_id'] == str(row['qid'])]['answer'].values[0]
        docno = identifier + str(row['qid'])
        if docno not in res['docno'].values:
            out_df.append({'qid': row['qid'], 'query': row['query'], 'llm': identifier, 'docno': docno, 'position': -1, 'llm_answer': llm_answer[:2000], 'best_answer_if_not_llm': best_answer_text})
            continue
        position = res['docno'].tolist().index(docno)
        out_df.append({'qid': row['qid'], 'query': row['query'], 'llm': identifier, 'docno': docno, 'position': position, 'llm_answer': llm_answer[:2000], 'best_answer_if_not_llm': best_answer_text})
        
    out_dfs.append(pd.DataFrame(out_df))

chatgpt_clean_queries1
chatgpt_clean_queries8
381d63ae-ec5f-4fa0-b331-7df8b0f63e38
05d7038c-0ea2-41f7-b439-f1a80b6ae343
1cc2d142-9499-4770-82ae-816108994dd9
5c615aac-2089-4755-85cc-e43cd6d25f3f
ac64710f-5be6-4338-81bf-f449341dbfc7
c7087ca7-07b1-4a83-8b4f-750f42e2b248
401363ff-141a-4cc9-8b47-484a277e102f
94227313-076b-46c2-8aa9-ace1c7a6c19f
30c124f4-0535-41d8-8afe-54d8b8ab9c54
812e0336-7233-4286-908e-fd00e35724f5
ac5e10e1-2da7-41a9-94a7-098461398997
chatgpt_clean_queries62
a1b0e7c2-2b4d-4430-9a08-a0e2a5ae99ba
8c07ded1-c889-4259-ae03-c7d5a9817183
cb5586bc-6eed-4739-be79-c232a507c20f
chatgpt_clean_queries77
ae5f0ad2-193c-491a-a61d-d058b7934607
0378a4b6-930f-45d5-b518-ca90d8c2384f
chatgpt_clean_queries83
6b9c9f03-cb37-4e0f-9ac1-35b14551f750
5e829a57-b1e6-4d23-aa53-db5834c8af30
a9f343d9-958d-4d58-a76b-6a42b8442e7e
chatgpt_clean_queries93
92a08e89-6397-4ed4-a708-2ea1ce5e94fe
cc99940b-32b0-4364-a184-7b80bbf6ee7c
chatgpt_clean_queries96
f9d4fc76-1b16-4c31-b707-d25ec46d977d
57f56d68-7e40-4e3c-9

In [22]:
from datetime import datetime
for out_df in out_dfs:
    date = datetime.now().strftime("%d%m%Y")
    identifier = out_df['docno'].tolist()[0][:-2]
    if use_colbert:
        out_file = f'data/{identifier}-{date}-positions-{simple_name}.csv'
    else:
        out_file = f'data/{identifier}-{date}-positions-{simple_name}.csv'
    out_df.to_csv(out_file, index=False)

In [23]:
for out_df in out_dfs:
    print(out_df['position'].mean())
    # print number 1 positions
    print(out_df[out_df['position'] == 0]['qid'])

40.2
0       1
1       8
13     62
17     77
20     83
24     93
27     96
34    108
45    126
Name: qid, dtype: object
54.66
0       1
9      55
17     77
21     85
34    108
49    132
Name: qid, dtype: object
112.92
0       1
30    101
34    108
49    132
Name: qid, dtype: object
144.54
0    1
Name: qid, dtype: object
