Find and exclude missing docs from preprocessed dataset

In [1]:
import csv
import os
docs_path = '../Dataset/docs/'
docs_list = sorted(os.listdir(docs_path))
queries = '../Dataset/Queries_20'
relevant = '../Dataset/cfquery_detailed'

previous_doc_name = 0
count = 0
missing = []
for doc_name in docs_list:
    if((int(doc_name) - int(previous_doc_name)) != 1):
        for i in range(1,int(doc_name) - int(previous_doc_name)):
            missing.append(int(doc_name)-i)
        count += 1
    previous_doc_name = doc_name
print(f"Missing count: {count}")
print(f"Missing {missing}")

Missing count: 28
Missing [129, 132, 200, 213, 219, 282, 296, 343, 397, 465, 489, 492, 491, 508, 507, 513, 537, 627, 724, 902, 926, 940, 1013, 1033, 1103, 1134, 1138, 1184, 1190, 1225]


Because ColBERT needs incremental cids, I create a dictionary mapping incremental cids to doc ids

In [2]:
cid_dictionary = {}
i = 0
for doc_name in docs_list:
    cid_dictionary[doc_name.lstrip('0')] = i 
    i += 1

inverted_cid_dictionary = {v: k for k, v in cid_dictionary.items()}

Preprocess the data into Tab Separated Values (TSV)

In [3]:
import operator

def processRelevantIntoRanking(relevant_string, query_id):
    relevant_string = relevant_string.split()
    relevant_order = {}
    flag = False
    for text in relevant_string:
        if(flag == False):
            if (text in cid_dictionary):
                doc_id = cid_dictionary[text]
                flag = True
            else:
                continue
        else:
            score = 0
            for digit in text:
                score += int(digit)
            score /= len(text)
            relevant_order[doc_id] = score
            flag = False
    relevant_order = dict(sorted(relevant_order.items(), key=operator.itemgetter(1),reverse=True))
    i = 1
    for doc_id in relevant_order:
        relevant_order[doc_id] = i
        i += 1
    return relevant_order

In [4]:
import csv
import os
docs_path = '../Dataset/docs/'
docs_list = sorted(os.listdir(docs_path))
queries = '../Dataset/Queries_20'
relevant = '../Dataset/cfquery_detailed'

with open('../Dataset/TSVs/collection.tsv', 'w', newline='') as collectiontsv:
    for doc_name in docs_list:
        file_path = os.path.join(docs_path, doc_name)
        doc = open(file_path)
        collectiontsv.write(str(cid_dictionary[doc_name.lstrip('0')])+'\t')
        for token in doc:
            collectiontsv.write(token[:-1].lower()+' ')
        collectiontsv.write('\n')

#Correct missing query (Queries_20 has only 19 out of queries)
queries = open('../Dataset/Queries_20','r')
line_count = len(queries.readlines())
if(line_count < 20):
    write_missing_query = open('../Dataset/Queries_20', "a")
    write_missing_query.write("What is the effect of treatment of CF patients with essential fatty acid supplements?\n") 
    write_missing_query.close()
queries.close()
    
with open('../Dataset/TSVs/queries.tsv', 'w', newline='') as queriestsv:
    i = 1
    queries = open('../Dataset/Queries_20','r')
    for query in queries:
        queriestsv.write(str(i)+'\t')
        queriestsv.write(query)
        i+=1

with open('../Dataset/TSVs/relevant.tsv', 'w', newline='') as relevanttsv:
    relevant = open(relevant)
    relevant_per_query = ''
    flag = False
    i = 1
    for line in relevant:
        if(flag == False and line[0:2] == 'RD'):
            flag = True
            relevant_per_query += line[3:]
        elif(flag == True and line[0:2] == 'QN'):
            flag = False
            relevant_order = processRelevantIntoRanking(relevant_per_query,i)
            for doc_id in relevant_order:
                relevanttsv.write(str(i) + '\t' + str(doc_id) + '\t' + str(relevant_order[doc_id]) + '\n')
            i+=1
            relevant_per_query = ''
        elif(flag == True):
            relevant_per_query += line

Import ColBERT libraries

In [5]:
import os
import sys
sys.path.insert(0, '../ColBERT/')
import colbert
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher
sys.path.insert(0,'../Code/')

Compute Indexes

In [6]:
nbits=8

In [7]:
with Run().context(RunConfig(nranks=1, experiment="../../ColBERT/experiments/cystic_fibrosis")):
    config = ColBERTConfig (
        nbits=nbits,
        root="../ColBERT/experiments",
    )
    indexer = Indexer(checkpoint="../ColBERT/models/colbertv2.0", config=config)
    indexer.index(name="cystic_fibrosis.nbits="+str(nbits), collection="../Dataset/TSVs/collection.tsv", overwrite=True)



[Oct 22, 15:59:34] #> Creating directory /home/greg/University-Information-Retrieval/Code/experiments/../../ColBERT/experiments/cystic_fibrosis/indexes/cystic_fibrosis.nbits=8 


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 8,
    "kmeans_niters": 20,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 400000,
    "save_every": null,
    "warmup": 20000,
    "warmup_bert": null,
    "relu": false,
    "nway": 64,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 1

0it [00:00, ?it/s]

[Oct 22, 16:03:43] [0] 		 #> Saving chunk 0: 	 1,209 passages and 185,540 embeddings. From #0 onward.


1it [02:01, 121.22s/it]
100%|██████████| 1/1 [00:00<00:00, 1721.09it/s]
100%|██████████| 4096/4096 [00:00<00:00, 145939.65it/s]


[Oct 22, 16:03:44] [0] 		 #> Checking all files were saved...
[Oct 22, 16:03:44] [0] 		 Found all files!
[Oct 22, 16:03:44] [0] 		 #> Building IVF...
[Oct 22, 16:03:44] [0] 		 #> Loading codes...
[Oct 22, 16:03:44] [0] 		 Sorting codes...
[Oct 22, 16:03:44] [0] 		 Getting unique codes...
[Oct 22, 16:03:44] #> Optimizing IVF to store map from centroids to list of pids..
[Oct 22, 16:03:44] #> Building the emb2pid mapping..
[Oct 22, 16:03:44] len(emb2pid) = 185540
[Oct 22, 16:03:44] #> Saved optimized IVF to /home/greg/University-Information-Retrieval/Code/experiments/../../ColBERT/experiments/cystic_fibrosis/indexes/cystic_fibrosis.nbits=8/ivf.pid.pt
[Oct 22, 16:03:44] [0] 		 #> Saving the indexing metadata to /home/greg/University-Information-Retrieval/Code/experiments/../../ColBERT/experiments/cystic_fibrosis/indexes/cystic_fibrosis.nbits=8/metadata.json ..
#> Joined...


Retrieve information

In [8]:
with Run().context(RunConfig(nranks=1, experiment="../../ColBERT/experiments/cystic_fibrosis")):

        config = ColBERTConfig(
            root="../ColBERT/experiments",
        )
        searcher = Searcher(index="cystic_fibrosis.nbits="+str(nbits), config=config)
        queries = Queries("../Dataset/TSVs/queries.tsv")
        ranking = searcher.search_all(queries, k=100)
        ranking.save("cystic_fibrosis.nbits="+str(nbits)+".ranking.tsv")

[Oct 22, 16:03:44] #> Loading collection...
0M 
[Oct 22, 16:03:46] #> Loading codec...
[Oct 22, 16:03:46] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Oct 22, 16:03:46] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Oct 22, 16:03:46] #> Loading IVF...
[Oct 22, 16:03:46] #> Loading doclens...


100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3228.87it/s]

[Oct 22, 16:03:46] #> Loading codes and residuals...



100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 70.47it/s]

[Oct 22, 16:03:46] #> Loading the queries from ../Dataset/TSVs/queries.tsv ...
[Oct 22, 16:03:46] #> Got 20 queries. All QIDs are unique.


#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What are the effects of calcium on the physical properties of mucus from CF patients, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  2024,  1996,  3896,  1997, 13853,  2006,  1996,
         3558,  5144,  1997, 14163,  7874,  2013, 12935,  5022,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])




20it [00:00, 90.07it/s]




[Oct 22, 16:03:47] #> Creating directory /home/greg/University-Information-Retrieval/Code/experiments/../../ColBERT/experiments/cystic_fibrosis/none/2023-10/22/15.59.34 


[Oct 22, 16:03:47] #> Saved ranking of 20 queries and 2000 lines to /home/greg/University-Information-Retrieval/Code/experiments/../../ColBERT/experiments/cystic_fibrosis/none/2023-10/22/15.59.34/cystic_fibrosis.nbits=8.ranking.tsv


Print and evaluate results

In [9]:
import collections

queries = open('../Dataset/Relevant_20','r')
foundAnswers = []
realAnswers = []

i = 0
for query in queries:
    j = 0
    answers = query.split()
    realAnswers.append([])
    for answer in answers:
        realAnswers[i].append(int(answer))
    i += 1
queries.close()

i = 0
for query in ranking.data:
    print(f"Query: {query}:\nRetrieved Answers: ", end='')
    foundAnswers.append([])
    for answer in ranking.data[query]:
        foundAnswers[i].append(int(inverted_cid_dictionary[answer[0]]))
    print(foundAnswers[i])
    print(f"Known Answers: ", end='')
    print(realAnswers[i])
    intersection = []
    print("Model found: ", end='')
    for answer in realAnswers[i]:
        if answer in foundAnswers[i]:
            intersection.append(answer)
    print(intersection)
    print(str(len(intersection)) + '/' + str(len(realAnswers[i])) + ' of the correct results')
    print('\n')
    i+=1

Query: 1:
Retrieved Answers: [533, 957, 741, 827, 1201, 960, 484, 754, 499, 867, 139, 1185, 147, 742, 1107, 437, 52, 568, 564, 1145, 481, 265, 451, 559, 975, 967, 441, 636, 151, 189, 501, 496, 711, 1167, 11, 454, 527, 1238, 850, 439, 246, 302, 303, 201, 690, 108, 589, 386, 676, 969, 1173, 950, 588, 435, 878, 895, 766, 85, 1019, 398, 461, 987, 765, 988, 266, 1200, 392, 562, 1016, 1125, 592, 790, 190, 648, 563, 543, 1171, 693, 901, 1203, 748, 1169, 505, 288, 776, 526, 982, 382, 40, 973, 1188, 1076, 1227, 751, 925, 538, 912, 1163, 466, 794]
Known Answers: [139, 151, 166, 311, 370, 392, 439, 440, 441, 454, 461, 502, 503, 505, 520, 522, 526, 527, 533, 593, 619, 737, 742, 789, 827, 835, 861, 875, 891, 921, 922, 1175, 1185, 1222]
Model found: [139, 151, 392, 439, 441, 454, 461, 505, 526, 527, 533, 742, 827, 1185]
14/34 of the correct results


Query: 2:
Retrieved Answers: [437, 592, 589, 499, 754, 151, 1173, 980, 265, 451, 7, 533, 455, 912, 172, 382, 481, 439, 1019, 867, 733, 790, 297, 805, 4