In [1]:
import os
import shutil
import pandas as pd
from glob import glob
pd.set_option('display.max_colwidth', 100)


In [2]:
import faiss
assert faiss.get_num_gpus() > 0

In [3]:
print(os.environ['CUDA_HOME'])

/home/jonas/miniconda3/envs/faiss


In [4]:

import pyterrier as pt
if not pt.started():
    pt.init()


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
docno_to_text = pd.read_csv('CHS-2021/documents/Webdoc/crawl/txt_over_50.tsv', sep='\t')

In [6]:
def get_text_from_docno(docno):
    print(docno)
    if docno_to_text[docno_to_text['docid'] == docno].empty:
        return ''
    return docno_to_text[docno_to_text['docid'] == docno]['text'].values[0]

In [7]:
def ds_generate(use_chatgpt=True):
    with open('CHS-2021/documents/Webdoc/crawl/txt_over_50.tsv', 'r') as corpusfile:
        for l in corpusfile:
            docno, passage = l.split("\t")
            yield {'docno' : docno, 'text' : passage}
    if use_chatgpt:
        for llm, results in all_llm_answers.items():
            for idx, row in results.iterrows():
                docno = llm + str(row['topic_id'])
                yield {'docno' : docno, 'text' : row['answer']}

use_colbert = False
use_chatgpt = False

if use_chatgpt:
    all_llm_answers = {}
    for file in glob('data/all_answers*.csv'):
        chat_gpt_results_infile = file
        print(chat_gpt_results_infile)
        chat_gpt_results = pd.read_csv(chat_gpt_results_infile, header=None, quotechar='"', skipinitialspace=True)
        chat_gpt_results.columns = ['topic_id', 'query', 'answer']
        chat_gpt_results['topic_id'] = chat_gpt_results['topic_id'].astype(str)
        chat_gpt_results['answer'] = chat_gpt_results['answer'].str.replace('\n', ' ').str.replace(',', ' ').str.replace('"', '')
        chat_gpt_results = chat_gpt_results[chat_gpt_results['answer'].notna()]
        print(chat_gpt_results.head())
        llm_identifier = '_'.join(file.split('_')[2:]).split('.')[0]
        print(llm_identifier)
        all_llm_answers[llm_identifier] = chat_gpt_results

if use_colbert:
    index_path = './colbert_index'
    if use_chatgpt:
        index_path += '_chatgpt'
    if os.path.exists(index_path):  
        shutil.rmtree(index_path)
    from pyterrier_colbert.indexing import ColBERTIndexer
    # checkpoint="http://www.dcs.gla.ac.uk/~craigm/ecir2021-tutorial/colbert_model_checkpoint.zip"
    checkpoint="./colbert_model_checkpoint/colbert.dnn"

    indexer = ColBERTIndexer(checkpoint, index_path, "colbertindex", chunksize=3)
    indexref = indexer.index(ds_generate(use_chatgpt))
else:
    index_path = './IterDict_index_over_50'
    if use_chatgpt:
        index_path += '_chatgpt'
    if os.path.exists(index_path):  
        shutil.rmtree(index_path)
    iter_indexer = pt.IterDictIndexer(index_path)
    indexref = iter_indexer.index(ds_generate(use_chatgpt), meta={'docno' : 36, 'text': 2000})

  indexref = iter_indexer.index(ds_generate(use_chatgpt), meta={'docno' : 36, 'text': 2000})


In [8]:
if use_colbert:
    from pyterrier_colbert.ranking import ColBERTFactory
    index=(index_path, "colbertindex")
    pytcolbert = ColBERTFactory(checkpoint, *index)
    # rename the ivfpq file
    os.rename(index_path + '/colbertindex/ivfpq.100.faiss', index_path + '/colbertindex/ivfpq.faiss')
    dense_e2e = pytcolbert.end_to_end()

In [9]:
import xml.etree.ElementTree as ET
import pandas as pd

def load_topics(path, clean_queries=False):
    with open(path) as f:
        root = ET.fromstring(f.read())
    topic_dict = {}
    for topic in root.findall("topic"):
        topic_id = topic.findtext("id")
        topic_query = topic.findtext("query")
        if topic_id and topic_query:
            topic_dict[topic_id] = topic_query.strip()
    topics = pd.DataFrame(topic_dict.items(), columns=["qid", "query"]) 
    if clean_queries:
        topics["query"] = topics["query"].str.lower().replace(r'\W+', ' ', regex=True)
    return topics

In [10]:
if use_colbert:
    clean_queries = False
else:
    clean_queries = True
topics = load_topics("data/topics/topics.txt", clean_queries=clean_queries)
qrels = pt.io.read_qrels("data/assessments/qrels.txt") # type: ignore
qcred = pt.io.read_qrels("data/assessments/qcredibility.txt") # type: ignore
qread = pt.io.read_qrels("data/assessments/qreadability.txt") # type: ignore

all_qs = [("qrels", qrels), ("qcred", qcred), ("qread", qread)]

# remove non alphanumeric characters from queries

In [11]:
import pyterrier as pt

def run_experiment(pipeline, simple_name, topics, qrels, eval_metrics=["map", "bpref", "ndcg_cut_10"]):
    experiments = []
    for name, q in qrels:
        # change pipeline name to include the name of the query

        exp = pt.Experiment([pipeline], topics, q, eval_metrics, names=[name + '_' + simple_name])
        experiments.append(exp)
    return pd.concat(experiments, axis=0)

In [12]:
credibility_scores = pd.read_csv('data/all_passages_credibility_scores_bert.tsv', sep='\t')

In [13]:
#get the credibility score for a given docid
def get_credibility_score(docid):
    if credibility_scores[credibility_scores['docid'] == docid].empty:
        return 0
    return credibility_scores[credibility_scores['docid'] == docid]['credibility_score'].values[0]


In [14]:
import textstat
# rank documents with custom function that evaluates readability of the document
def readability_score(text):
    score = textstat.flesch_reading_ease(text)
    print(score)
    return score 

In [15]:
if use_colbert:
    retrieval = dense_e2e
    simple_name = 'colbert_msmarco_over_50'
else:
    dph = pt.BatchRetrieve(indexref, wmodel="DPH", metadata=["docno", "text"])
    tfidf = pt.BatchRetrieve(indexref, wmodel="TF_IDF", metadata=["docno", "text"])
    bo1 = pt.rewrite.Bo1QueryExpansion(indexref)
    # readability_rerank = pt.apply.doc_score(lambda row: get_credibility_score(row['docid']))
    pipelineQE_dph = tfidf >> bo1 >> tfidf # >> readability_rerank
    simple_name = 'QE_tfidf_over_50'
    retrieval = pipelineQE_dph
results = run_experiment(retrieval,simple_name, topics, all_qs, ["map", "bpref", "ndcg_cut_10"])



In [16]:
append_to_file = ''
if use_chatgpt:
    append_to_file = '_chatgpt'
else:
    append_to_file = '_without_chatgpt'

results.to_csv('data/results/results_' + simple_name + append_to_file + '_clean_queries.csv', index=False)

In [17]:
results

Unnamed: 0,name,map,bpref,ndcg_cut_10
0,qrels_QE_tfidf_over_50,0.328905,0.430822,0.528759
0,qcred_QE_tfidf_over_50,0.492266,0.676894,0.605469
0,qread_QE_tfidf_over_50,0.394445,0.452956,0.691979


In [18]:
# iterate over topics and save the positions of chatgpt answers
if use_colbert:
    retrieval_model = dense_e2e
else:
    retrieval_model = pipelineQE_dph
out_df = []
for idx, row in topics.iterrows():
    for llm, chat_gpt_results in all_llm_answers.items():
        if str(row['qid']) not in chat_gpt_results['topic_id'].values:
            continue
        res = retrieval_model.search(row["query"])
        best_answer_text = get_text_from_docno(res['docno'].tolist()[0])
        docno = llm + str(row['qid'])
        if docno not in res['docno'].values:
            out_df.append({'qid': row['qid'], 'query': row['query'], 'llm': llm, 'docno': docno, 'position': -1, 'chatgpt_answer': chat_gpt_results[chat_gpt_results['topic_id'] == str(row['qid'])]['answer'].values[0][:2000], 'best_answer_if_not_llm': best_answer_text})
            continue
        position = res['docno'].tolist().index(docno)
        out_df.append({'qid': row['qid'], 'query': row['query'], 'llm': llm, 'docno': docno, 'position': position, 'chatgpt_answer': chat_gpt_results[chat_gpt_results['topic_id'] == str(row['qid'])]['answer'].values[0][:2000], 'best_answer_if_not_llm': best_answer_text})
    
out_df = pd.DataFrame(out_df)

NameError: name 'all_llm_answers' is not defined

In [None]:
from datetime import datetime
date = datetime.now().strftime("%d%m%Y")
if use_colbert:
    out_file = f'data/chatgpt-{date}-positions-{simple_name}.csv'
else:
    out_file = f'data/chatgpt-{date}-positions-{simple_name}.csv'
out_df.to_csv(out_file, index=False)

In [None]:
out_df

Unnamed: 0,qid,query,llm,docno,position,chatgpt_answer,best_answer_if_not_llm
0,1,what are the most common chronic diseases what effects do chronic diseases have for the society ...,Falcon-7b-instruct_long,Falcon-7b-instruct_long1,35,Diabetes heart disease cancer and Alzheimer's disease are some of the most common chronic di...,
1,1,what are the most common chronic diseases what effects do chronic diseases have for the society ...,OA_SFT_Llama_30B_7_clean_queries,OA_SFT_Llama_30B_7_clean_queries1,0,Chronic diseases also known as non-communicable diseases (NCDs) are long-lasting conditions th...,
2,1,what are the most common chronic diseases what effects do chronic diseases have for the society ...,falcon40b_prompt,falcon40b_prompt1,161,Some of the most common chronic diseases include heart disease cancer diabetes and chronic r...,
3,1,what are the most common chronic diseases what effects do chronic diseases have for the society ...,falcon7b_prompt,falcon7b_prompt1,10,The most common chronic diseases are heart disease diabetes stroke cancer and respiratory d...,
4,1,what are the most common chronic diseases what effects do chronic diseases have for the society ...,falcon40b_instruct,falcon40b_instruct1,57,Chronic diseases are long-term illnesses that require ongoing medical attention and management. ...,
...,...,...,...,...,...,...,...
485,150,fish oil supplement dosage,falcon40b_instruct,falcon40b_instruct150,40,A typical dosage of fish oil is 1-2 grams per day. However it's important to consult with a hea...,Contact Us Additional Product(s) in Your Cart. Order By Item# Find My Order View My Cart Check O...
486,150,fish oil supplement dosage,open-llama-13b,open-llama-13b150,83,The answer depends on the fish oil supplement the person taking it his or her weight and the ...,Contact Us Additional Product(s) in Your Cart. Order By Item# Find My Order View My Cart Check O...
487,150,fish oil supplement dosage,open-llama-13b_k50,open-llama-13b_k50150,24,My doctor told me that I can get enough omega-3 from a diet that contains a little fish. So I d...,Contact Us Additional Product(s) in Your Cart. Order By Item# Find My Order View My Cart Check O...
488,150,fish oil supplement dosage,chatgpt_clean_queries,chatgpt_clean_queries150,2,The dosage of fish oil supplements can vary depending on the specific product and the intended u...,Contact Us Additional Product(s) in Your Cart. Order By Item# Find My Order View My Cart Check O...
