In [1]:
import json
import sys
import random

sys.path.append("../")

from tqdm import tqdm
from preprocessors.preprocessor import Preprocessor
from preprocessors.synonym_expander import SynonymExpander
from preprocessors.llm_expander import LLMExpander

import pandas as pd
import numpy as np

from models.bm25 import bm25
from models.vector_model import vector_model

import IR_utils

random.seed(0)
np.random.seed(0)

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manoschatzakis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manoschatzakis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/manoschatzakis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
preprocessor = Preprocessor(expander=SynonymExpander(add_synonym_prob=1))

docs = preprocessor.load_docs("../../data/dataset/tokenized_corpus.jsonl")

test_queries_t1 = IR_utils.load_test_queries_t1(
    "../../data/dataset/queries.jsonl", "../../data/task1_test.tsv"
)[0]

test_queries_t2 = IR_utils.load_test_queries_t2(
    "../../data/dataset/queries.jsonl", "../../data/task2_test.tsv"
)[0]


print("Number of docs:", len(docs))
print("Number of queries (t1):", len(test_queries_t1))
print("Number of queries (t2):", len(test_queries_t2))

Number of docs: 1471406
Number of queries (t1): 7437
Number of queries (t2): 33


In [3]:
model = bm25(list(docs.keys()), list(docs.values()))
model.fit()

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

llmExpander = LLMExpander(flan_model, flan_tokenizer, max_new_tokens=20)

In [20]:
csv_string = "id,corpus-id,score\n"

for query_data in tqdm(test_queries_t1, desc="Query Preprocessing and Expansion", unit=" queries"):
    query_text = query_data["text"]
    query_terms = preprocessor.preprocess_query(query_text)
    
    if len(query_terms) <= 2:
        old_query_terms = query_terms
        query_terms = preprocessor.preprocess_query(llmExpander.expand(query_text))
        #print(old_query_terms)
        #print(query_terms)
        #print("----")
        
    top_results = model.get_top_k_documents(query_terms, 30)
    top_docs = {}
    for r in top_results:
        top_docs[r[1]] = r[2]
    
    if(len(top_results) < 10):
        print(len(top_results))
        print(query_text)
        print(query_terms)    
        top_10_res = []
        for d in top_results:
            top_10_res.append((d[1], 1))
    else:
        vm = vector_model.create_model(top_docs, min_df=1)
        vm.fit()
        top_10_res = vm.find_similar(query_terms, 10)
        if len(top_10_res) == 0:
            print("NO")
            top_10_res = []
            for d in top_results[0:10]:
                top_10_res.append((d[1], 1))

    #print(top_10_res)
    
    query_index = query_data["id"]
    doc_ids = [int(r[0]) for r in top_10_res]
    
    #print(doc_ids)

    csv_string += f"{query_index},\"{doc_ids}\",-1\n"
    
with open("../../submissions/bm25_tfidf_submission.csv", "w") as f:
    f.write(csv_string)

Query Preprocessing and Expansion:   1%|▏         | 105/7437 [00:07<10:10, 12.01 queries/s]

NO


Query Preprocessing and Expansion:   3%|▎         | 233/7437 [00:21<07:41, 15.59 queries/s]

NO


Query Preprocessing and Expansion:   3%|▎         | 248/7437 [00:22<06:32, 18.33 queries/s]

NO


Query Preprocessing and Expansion:   4%|▍         | 289/7437 [00:26<12:11,  9.77 queries/s]

NO


Query Preprocessing and Expansion:   4%|▍         | 305/7437 [00:28<15:09,  7.84 queries/s]

NO


Query Preprocessing and Expansion:   5%|▍         | 341/7437 [00:32<16:31,  7.16 queries/s]

NO


Query Preprocessing and Expansion:   5%|▍         | 351/7437 [00:33<09:38, 12.25 queries/s]

NO


Query Preprocessing and Expansion:   5%|▍         | 359/7437 [00:34<13:34,  8.69 queries/s]

NO


Query Preprocessing and Expansion:   7%|▋         | 494/7437 [00:43<05:28, 21.12 queries/s]

NO


Query Preprocessing and Expansion:   7%|▋         | 497/7437 [00:43<05:42, 20.27 queries/s]

NO


Query Preprocessing and Expansion:   7%|▋         | 529/7437 [00:46<11:40,  9.86 queries/s]

NO


Query Preprocessing and Expansion:  11%|█         | 795/7437 [01:18<11:44,  9.42 queries/s]

NO


Query Preprocessing and Expansion:  11%|█         | 809/7437 [01:20<15:09,  7.29 queries/s]

NO


Query Preprocessing and Expansion:  11%|█▏        | 846/7437 [01:26<12:16,  8.95 queries/s]

NO


Query Preprocessing and Expansion:  11%|█▏        | 848/7437 [01:26<12:53,  8.52 queries/s]

NO


Query Preprocessing and Expansion:  12%|█▏        | 875/7437 [01:32<19:04,  5.73 queries/s]

NO


Query Preprocessing and Expansion:  12%|█▏        | 898/7437 [01:34<10:02, 10.84 queries/s]

NO


Query Preprocessing and Expansion:  13%|█▎        | 977/7437 [01:48<24:56,  4.32 queries/s]

NO


Query Preprocessing and Expansion:  14%|█▍        | 1058/7437 [02:03<25:35,  4.15 queries/s]

NO


Query Preprocessing and Expansion:  15%|█▍        | 1082/7437 [02:07<19:25,  5.45 queries/s]

NO


Query Preprocessing and Expansion:  16%|█▌        | 1182/7437 [02:16<08:34, 12.16 queries/s]

NO


Query Preprocessing and Expansion:  17%|█▋        | 1264/7437 [02:23<04:35, 22.42 queries/s]

NO


Query Preprocessing and Expansion:  17%|█▋        | 1291/7437 [02:25<04:36, 22.26 queries/s]

NO
NO


Query Preprocessing and Expansion:  17%|█▋        | 1295/7437 [02:25<06:49, 15.02 queries/s]

NO


Query Preprocessing and Expansion:  18%|█▊        | 1374/7437 [02:33<05:04, 19.94 queries/s]

NO


Query Preprocessing and Expansion:  19%|█▉        | 1411/7437 [02:35<09:22, 10.71 queries/s]

NO


Query Preprocessing and Expansion:  19%|█▉        | 1450/7437 [02:38<05:14, 19.02 queries/s]

NO


Query Preprocessing and Expansion:  20%|██        | 1516/7437 [02:44<05:31, 17.88 queries/s]

NO


Query Preprocessing and Expansion:  22%|██▏       | 1610/7437 [02:51<07:39, 12.69 queries/s]

NO
NO


Query Preprocessing and Expansion:  22%|██▏       | 1616/7437 [02:52<06:35, 14.73 queries/s]

NO


Query Preprocessing and Expansion:  22%|██▏       | 1624/7437 [02:52<05:08, 18.82 queries/s]

NO


Query Preprocessing and Expansion:  22%|██▏       | 1645/7437 [02:53<04:57, 19.50 queries/s]

NO


Query Preprocessing and Expansion:  24%|██▎       | 1750/7437 [03:07<10:26,  9.07 queries/s]

NO


Query Preprocessing and Expansion:  24%|██▍       | 1807/7437 [03:13<06:46, 13.86 queries/s]

NO


Query Preprocessing and Expansion:  25%|██▍       | 1858/7437 [03:17<10:09,  9.15 queries/s]

NO


Query Preprocessing and Expansion:  25%|██▌       | 1876/7437 [03:19<15:50,  5.85 queries/s]

NO


Query Preprocessing and Expansion:  26%|██▌       | 1927/7437 [03:25<10:07,  9.06 queries/s]

NO


Query Preprocessing and Expansion:  29%|██▊       | 2132/7437 [03:38<04:19, 20.47 queries/s]

NO
NO


Query Preprocessing and Expansion:  31%|███       | 2279/7437 [03:48<06:35, 13.04 queries/s]

NO
NO


Query Preprocessing and Expansion:  31%|███       | 2293/7437 [03:50<08:59,  9.54 queries/s]

NO
NO


Query Preprocessing and Expansion:  35%|███▍      | 2568/7437 [04:10<06:12, 13.08 queries/s]

NO


Query Preprocessing and Expansion:  35%|███▍      | 2573/7437 [04:10<06:21, 12.76 queries/s]

NO


Query Preprocessing and Expansion:  37%|███▋      | 2735/7437 [04:20<04:48, 16.31 queries/s]

NO


Query Preprocessing and Expansion:  37%|███▋      | 2750/7437 [04:22<10:01,  7.80 queries/s]

NO


Query Preprocessing and Expansion:  38%|███▊      | 2813/7437 [04:29<06:33, 11.74 queries/s]

NO
NO


Query Preprocessing and Expansion:  38%|███▊      | 2839/7437 [04:32<07:22, 10.40 queries/s]

NO


Query Preprocessing and Expansion:  39%|███▊      | 2865/7437 [04:34<05:45, 13.25 queries/s]

NO


Query Preprocessing and Expansion:  40%|███▉      | 2973/7437 [04:42<04:55, 15.12 queries/s]

NO
NO
NO
NO


Query Preprocessing and Expansion:  40%|████      | 3005/7437 [04:44<06:33, 11.26 queries/s]

NO


Query Preprocessing and Expansion:  41%|████      | 3030/7437 [04:47<09:24,  7.81 queries/s]

NO


Query Preprocessing and Expansion:  41%|████      | 3031/7437 [04:47<10:28,  7.01 queries/s]

NO


Query Preprocessing and Expansion:  41%|████      | 3036/7437 [04:48<07:08, 10.28 queries/s]

NO


Query Preprocessing and Expansion:  41%|████      | 3052/7437 [04:50<07:58,  9.16 queries/s]

NO


Query Preprocessing and Expansion:  42%|████▏     | 3135/7437 [04:57<07:36,  9.42 queries/s]

NO


Query Preprocessing and Expansion:  42%|████▏     | 3140/7437 [04:57<06:46, 10.58 queries/s]

NO


Query Preprocessing and Expansion:  43%|████▎     | 3191/7437 [05:01<03:56, 17.94 queries/s]

NO


Query Preprocessing and Expansion:  43%|████▎     | 3225/7437 [05:05<06:49, 10.30 queries/s]

NO


Query Preprocessing and Expansion:  44%|████▎     | 3247/7437 [05:06<02:46, 25.24 queries/s]

NO


Query Preprocessing and Expansion:  44%|████▍     | 3272/7437 [05:08<04:44, 14.62 queries/s]

NO


Query Preprocessing and Expansion:  44%|████▍     | 3275/7437 [05:08<04:17, 16.17 queries/s]

NO


Query Preprocessing and Expansion:  44%|████▍     | 3294/7437 [05:10<07:22,  9.37 queries/s]

NO


Query Preprocessing and Expansion:  45%|████▍     | 3334/7437 [05:13<03:29, 19.62 queries/s]

NO


Query Preprocessing and Expansion:  45%|████▌     | 3356/7437 [05:14<03:34, 18.99 queries/s]

NO


Query Preprocessing and Expansion:  45%|████▌     | 3373/7437 [05:16<05:19, 12.72 queries/s]

NO


Query Preprocessing and Expansion:  46%|████▌     | 3392/7437 [05:17<04:30, 14.95 queries/s]

NO


Query Preprocessing and Expansion:  46%|████▌     | 3435/7437 [05:20<04:36, 14.48 queries/s]

NO


Query Preprocessing and Expansion:  46%|████▌     | 3437/7437 [05:21<07:10,  9.28 queries/s]

NO


Query Preprocessing and Expansion:  47%|████▋     | 3473/7437 [05:23<03:37, 18.24 queries/s]

NO


Query Preprocessing and Expansion:  47%|████▋     | 3507/7437 [05:26<05:21, 12.23 queries/s]

NO


Query Preprocessing and Expansion:  48%|████▊     | 3601/7437 [05:36<07:17,  8.77 queries/s]

NO


Query Preprocessing and Expansion:  49%|████▊     | 3615/7437 [05:38<10:11,  6.25 queries/s]

NO


Query Preprocessing and Expansion:  49%|████▊     | 3623/7437 [05:38<06:36,  9.62 queries/s]

NO


Query Preprocessing and Expansion:  49%|████▉     | 3659/7437 [05:41<03:38, 17.30 queries/s]

NO


Query Preprocessing and Expansion:  49%|████▉     | 3674/7437 [05:42<05:15, 11.93 queries/s]

8
what is a cincture
['cinctur', 'cinctur', 'cinctur', 'cinc']


Query Preprocessing and Expansion:  49%|████▉     | 3681/7437 [05:43<04:27, 14.03 queries/s]

NO
NO


Query Preprocessing and Expansion:  50%|█████     | 3727/7437 [05:47<06:17,  9.82 queries/s]

NO


Query Preprocessing and Expansion:  51%|█████     | 3759/7437 [05:51<04:58, 12.32 queries/s]

NO


Query Preprocessing and Expansion:  51%|█████     | 3811/7437 [05:55<06:18,  9.59 queries/s]

NO


Query Preprocessing and Expansion:  51%|█████▏    | 3826/7437 [05:56<03:25, 17.59 queries/s]

NO


Query Preprocessing and Expansion:  52%|█████▏    | 3837/7437 [05:56<02:26, 24.54 queries/s]

NO


Query Preprocessing and Expansion:  52%|█████▏    | 3875/7437 [05:59<04:15, 13.95 queries/s]

NO


Query Preprocessing and Expansion:  52%|█████▏    | 3892/7437 [06:01<05:01, 11.76 queries/s]

NO


Query Preprocessing and Expansion:  53%|█████▎    | 3975/7437 [06:07<04:20, 13.30 queries/s]

NO


Query Preprocessing and Expansion:  54%|█████▎    | 3997/7437 [06:08<02:20, 24.51 queries/s]

NO


Query Preprocessing and Expansion:  54%|█████▍    | 4023/7437 [06:10<04:24, 12.92 queries/s]

NO
NO


Query Preprocessing and Expansion:  54%|█████▍    | 4045/7437 [06:13<06:17,  8.99 queries/s]

NO


Query Preprocessing and Expansion:  54%|█████▍    | 4047/7437 [06:13<06:32,  8.64 queries/s]

NO


Query Preprocessing and Expansion:  55%|█████▌    | 4092/7437 [06:17<04:45, 11.70 queries/s]

NO


Query Preprocessing and Expansion:  56%|█████▌    | 4155/7437 [06:23<10:37,  5.15 queries/s]

NO


Query Preprocessing and Expansion:  56%|█████▌    | 4171/7437 [06:24<05:09, 10.56 queries/s]

NO


Query Preprocessing and Expansion:  56%|█████▋    | 4200/7437 [06:27<03:46, 14.26 queries/s]

NO


Query Preprocessing and Expansion:  57%|█████▋    | 4218/7437 [06:28<02:51, 18.81 queries/s]

NO


Query Preprocessing and Expansion:  57%|█████▋    | 4239/7437 [06:29<03:21, 15.88 queries/s]

NO


Query Preprocessing and Expansion:  58%|█████▊    | 4277/7437 [06:33<05:14, 10.06 queries/s]

NO


Query Preprocessing and Expansion:  59%|█████▊    | 4352/7437 [06:39<05:03, 10.15 queries/s]

NO


Query Preprocessing and Expansion:  59%|█████▊    | 4358/7437 [06:39<04:16, 12.00 queries/s]

NO


Query Preprocessing and Expansion:  59%|█████▉    | 4404/7437 [06:44<06:11,  8.15 queries/s]

NO


Query Preprocessing and Expansion:  59%|█████▉    | 4418/7437 [06:45<04:40, 10.78 queries/s]

NO


Query Preprocessing and Expansion:  60%|█████▉    | 4458/7437 [06:50<07:32,  6.58 queries/s]

NO


Query Preprocessing and Expansion:  60%|█████▉    | 4461/7437 [06:51<06:52,  7.22 queries/s]

NO


Query Preprocessing and Expansion:  60%|██████    | 4478/7437 [06:52<04:03, 12.17 queries/s]

NO


Query Preprocessing and Expansion:  60%|██████    | 4491/7437 [06:54<05:09,  9.51 queries/s]

NO


Query Preprocessing and Expansion:  61%|██████    | 4505/7437 [06:55<03:47, 12.86 queries/s]

NO


Query Preprocessing and Expansion:  61%|██████▏   | 4562/7437 [07:00<06:19,  7.57 queries/s]

NO


Query Preprocessing and Expansion:  62%|██████▏   | 4579/7437 [07:03<05:09,  9.23 queries/s]

NO


Query Preprocessing and Expansion:  63%|██████▎   | 4667/7437 [07:14<07:11,  6.41 queries/s]

NO


Query Preprocessing and Expansion:  63%|██████▎   | 4714/7437 [07:18<03:24, 13.30 queries/s]

NO


Query Preprocessing and Expansion:  65%|██████▍   | 4814/7437 [07:26<02:25, 18.05 queries/s]

NO


Query Preprocessing and Expansion:  65%|██████▌   | 4863/7437 [07:31<06:04,  7.07 queries/s]

NO


Query Preprocessing and Expansion:  65%|██████▌   | 4868/7437 [07:32<07:27,  5.74 queries/s]

NO


Query Preprocessing and Expansion:  66%|██████▌   | 4890/7437 [07:35<04:29,  9.46 queries/s]

NO


Query Preprocessing and Expansion:  66%|██████▋   | 4944/7437 [07:40<04:16,  9.72 queries/s]

NO


Query Preprocessing and Expansion:  67%|██████▋   | 5016/7437 [07:46<02:08, 18.81 queries/s]

NO


Query Preprocessing and Expansion:  68%|██████▊   | 5057/7437 [07:51<03:16, 12.14 queries/s]

NO


Query Preprocessing and Expansion:  68%|██████▊   | 5063/7437 [07:52<05:42,  6.93 queries/s]

NO


Query Preprocessing and Expansion:  68%|██████▊   | 5070/7437 [07:53<03:37, 10.86 queries/s]

NO
NO


Query Preprocessing and Expansion:  69%|██████▊   | 5102/7437 [07:56<04:26,  8.76 queries/s]

NO


Query Preprocessing and Expansion:  69%|██████▊   | 5112/7437 [07:57<04:41,  8.25 queries/s]

NO


Query Preprocessing and Expansion:  69%|██████▉   | 5155/7437 [08:02<01:55, 19.69 queries/s]

NO


Query Preprocessing and Expansion:  69%|██████▉   | 5165/7437 [08:03<04:27,  8.50 queries/s]

NO


Query Preprocessing and Expansion:  70%|██████▉   | 5181/7437 [08:04<02:24, 15.57 queries/s]

NO


Query Preprocessing and Expansion:  73%|███████▎  | 5432/7437 [08:28<03:55,  8.50 queries/s]

NO


Query Preprocessing and Expansion:  74%|███████▍  | 5493/7437 [08:34<02:16, 14.22 queries/s]

NO


Query Preprocessing and Expansion:  74%|███████▍  | 5496/7437 [08:35<03:49,  8.46 queries/s]

NO


Query Preprocessing and Expansion:  74%|███████▍  | 5500/7437 [08:35<03:22,  9.56 queries/s]

NO


Query Preprocessing and Expansion:  74%|███████▍  | 5529/7437 [08:37<01:29, 21.40 queries/s]

NO


Query Preprocessing and Expansion:  75%|███████▍  | 5569/7437 [08:42<03:01, 10.27 queries/s]

NO


Query Preprocessing and Expansion:  76%|███████▌  | 5633/7437 [08:47<02:13, 13.52 queries/s]

NO


Query Preprocessing and Expansion:  76%|███████▌  | 5669/7437 [08:51<03:42,  7.94 queries/s]

NO


Query Preprocessing and Expansion:  77%|███████▋  | 5691/7437 [08:53<02:18, 12.63 queries/s]

NO


Query Preprocessing and Expansion:  77%|███████▋  | 5704/7437 [08:53<01:45, 16.42 queries/s]

NO


Query Preprocessing and Expansion:  77%|███████▋  | 5719/7437 [08:54<02:16, 12.57 queries/s]

NO


Query Preprocessing and Expansion:  77%|███████▋  | 5742/7437 [08:56<02:15, 12.55 queries/s]

NO


Query Preprocessing and Expansion:  77%|███████▋  | 5761/7437 [08:57<01:43, 16.26 queries/s]

NO


Query Preprocessing and Expansion:  79%|███████▉  | 5876/7437 [09:10<01:51, 14.00 queries/s]

NO


Query Preprocessing and Expansion:  80%|███████▉  | 5939/7437 [09:18<02:16, 11.00 queries/s]

NO
NO


Query Preprocessing and Expansion:  80%|████████  | 5985/7437 [09:21<01:04, 22.44 queries/s]

NO


Query Preprocessing and Expansion:  81%|████████  | 5994/7437 [09:22<01:06, 21.58 queries/s]

NO


Query Preprocessing and Expansion:  81%|████████  | 6040/7437 [09:25<01:12, 19.20 queries/s]

NO


Query Preprocessing and Expansion:  81%|████████▏ | 6053/7437 [09:26<01:29, 15.54 queries/s]

NO


Query Preprocessing and Expansion:  83%|████████▎ | 6186/7437 [09:39<02:46,  7.52 queries/s]

NO


Query Preprocessing and Expansion:  83%|████████▎ | 6190/7437 [09:40<02:21,  8.81 queries/s]

NO


Query Preprocessing and Expansion:  85%|████████▍ | 6293/7437 [09:49<01:30, 12.70 queries/s]

NO


Query Preprocessing and Expansion:  86%|████████▌ | 6363/7437 [09:56<01:21, 13.14 queries/s]

NO


Query Preprocessing and Expansion:  86%|████████▋ | 6420/7437 [10:01<00:47, 21.22 queries/s]

NO
NO


Query Preprocessing and Expansion:  87%|████████▋ | 6467/7437 [10:08<01:57,  8.28 queries/s]

NO


Query Preprocessing and Expansion:  87%|████████▋ | 6469/7437 [10:08<01:59,  8.11 queries/s]

NO


Query Preprocessing and Expansion:  87%|████████▋ | 6481/7437 [10:10<02:12,  7.23 queries/s]

NO


Query Preprocessing and Expansion:  88%|████████▊ | 6522/7437 [10:13<00:51, 17.64 queries/s]

NO


Query Preprocessing and Expansion:  88%|████████▊ | 6530/7437 [10:14<02:04,  7.31 queries/s]

NO


Query Preprocessing and Expansion:  88%|████████▊ | 6543/7437 [10:15<01:00, 14.66 queries/s]

NO


Query Preprocessing and Expansion:  88%|████████▊ | 6552/7437 [10:15<00:48, 18.37 queries/s]

NO
NO


Query Preprocessing and Expansion:  89%|████████▊ | 6597/7437 [10:19<01:19, 10.54 queries/s]

NO


Query Preprocessing and Expansion:  89%|████████▉ | 6609/7437 [10:20<01:26,  9.60 queries/s]

NO


Query Preprocessing and Expansion:  89%|████████▉ | 6641/7437 [10:24<01:40,  7.95 queries/s]

NO


Query Preprocessing and Expansion:  90%|█████████ | 6723/7437 [10:30<00:53, 13.45 queries/s]

NO
NO


Query Preprocessing and Expansion:  91%|█████████ | 6739/7437 [10:32<01:14,  9.32 queries/s]

NO


Query Preprocessing and Expansion:  91%|█████████ | 6750/7437 [10:33<01:04, 10.62 queries/s]

NO
NO


Query Preprocessing and Expansion:  91%|█████████ | 6755/7437 [10:34<00:59, 11.52 queries/s]

NO


Query Preprocessing and Expansion:  92%|█████████▏| 6842/7437 [10:43<00:45, 13.21 queries/s]

NO


Query Preprocessing and Expansion:  92%|█████████▏| 6867/7437 [10:45<00:43, 13.20 queries/s]

NO


Query Preprocessing and Expansion:  93%|█████████▎| 6902/7437 [10:49<00:49, 10.76 queries/s]

NO


Query Preprocessing and Expansion:  93%|█████████▎| 6934/7437 [10:51<00:32, 15.64 queries/s]

NO


Query Preprocessing and Expansion:  94%|█████████▍| 7009/7437 [10:59<01:03,  6.75 queries/s]

NO


Query Preprocessing and Expansion:  95%|█████████▍| 7036/7437 [11:02<00:55,  7.28 queries/s]

NO


Query Preprocessing and Expansion:  96%|█████████▌| 7130/7437 [11:10<00:19, 15.97 queries/s]

NO


Query Preprocessing and Expansion:  96%|█████████▋| 7162/7437 [11:12<00:30,  8.88 queries/s]

NO


Query Preprocessing and Expansion:  97%|█████████▋| 7201/7437 [11:16<00:19, 11.98 queries/s]

NO


Query Preprocessing and Expansion:  98%|█████████▊| 7282/7437 [11:22<00:10, 15.46 queries/s]

NO


Query Preprocessing and Expansion:  98%|█████████▊| 7292/7437 [11:23<00:06, 23.17 queries/s]

NO


Query Preprocessing and Expansion:  98%|█████████▊| 7310/7437 [11:24<00:08, 15.39 queries/s]

NO


Query Preprocessing and Expansion:  98%|█████████▊| 7313/7437 [11:24<00:09, 13.64 queries/s]

NO


Query Preprocessing and Expansion:  99%|█████████▊| 7343/7437 [11:26<00:06, 13.72 queries/s]

NO


Query Preprocessing and Expansion:  99%|█████████▉| 7359/7437 [11:27<00:03, 19.73 queries/s]

NO
NO
NO


Query Preprocessing and Expansion: 100%|██████████| 7437/7437 [11:33<00:00, 10.73 queries/s]


In [21]:
with open("../../submissions/bm25_tfidf_submission.csv", "w") as f:
    f.write(csv_string)

In [None]:
for query_data in tqdm(test_queries_t2, desc="Query Preprocessing and Expansion", unit=" queries"):
    query_text = query_data["text"]
    query_terms = preprocessor.preprocess_query(query_text)