### Defining the Dataset

In [5]:
from data.dataloader import Data
import configparser
# Load settings from config.ini (resolved relative to the current working directory).
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means no settings were loaded at all.
# Fail here with a clear message instead of handing an empty config to Data.
config = configparser.ConfigParser()
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found in the current working directory")

# Data is the project's dataset loader, built from the parsed config.
# get_dataset('fiqa') yields two values, unpacked here as the corpus and the queries.
dat = Data(config)
corpus, queries = dat.get_dataset('fiqa')


Loading dataset from data/datasets\fiqa
data/datasets\fiqa\corpus.jsonl


In [7]:
# Peek at the first record of each collection to see the field layout.
for records in (corpus, queries):
    print(records[0])

{'_id': '3', 'title': '', 'text': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education, rather than getting out with thousands in student debt and then complaining that they aren't qualified to do anything.", 'metadata': {}}
{'_id': '0', 'text': 'What is considered a business expense on a business trip?', 'metadata': {}}


In [30]:
# Hand-written toy corpus for quick manual checks (currently disabled).
# NOTE(review): the `_id` values below repeat (1 is used twice, 5 three times) —
# confirm whether the retrievers need unique ids before re-enabling this.
# documents = [
#     {"text": "A cat is an animal.", "_id": 1},
#     {"text": "A dog is a house animal.", "_id": 1},
#     {"text": "The city of new york is big.", "_id": 2},
#     {"text": "The city of chicago is big.", "_id": 3},
#     {"text": "The city of gentofte is big.", "_id": 4},
#     {"text": "The city of copenhagen is big.", "_id": 5},
#     {"text": "During the cold war, many comunists were crazy.", "_id": 5},
#     {"text": "Are there any better movie than The Godfather?", "_id": 5}
#     ]

# Just for testing: work on a small prefix of the corpus instead of all of it.
# Change N_TEST_DOCUMENTS to resize the sample.
N_TEST_DOCUMENTS = 100
documents = corpus[:N_TEST_DOCUMENTS]

### Defining the Models

In [31]:
from models.builers.retriever import Retriever
from models.CURE import CURE
from models.k_means import KMeans
from models.TFIDF import TFIDF
from models.BM25 import BM25
from models.DPR import DPR
from models.DPR_crossencoder import DPRCrossencoder

# Registry of the retrievers to compare, keyed by the display name that the
# experiment cells print. Every retriever is built over the same `documents`.
# Comment entries in or out to choose which models take part in a run.
# (The annotation uses `dict[str, Retriever]`; `dict[str: Retriever]` is slice
# syntax and not a valid generic type.)
models: dict[str, Retriever] = {
    "TF-IDF": TFIDF(documents=documents),
    "BM25": BM25(documents=documents),
    # "DPR": DPR(documents=documents),
    # "Crossencoder": DPRCrossencoder(documents=documents, n=25),
    # "KMeans": KMeans(documents=documents, k = 4),
    # "CURE": CURE(documents=documents, k = 2, n=2, shrinkage_fraction=0.2),
}


### Performing the Experiment

In [32]:
from data.document import Document

# Use the text of the first query and ask every registered retriever
# for its k best matches.
query = queries[0]["text"]
k = 5
print("Query:", query)

model: Retriever
for model_type, model in models.items():
    # Print the registry name first, then the texts of the returned documents.
    print(model_type)
    result: list[Document] = model.Lookup(query=query, k=k)
    print([hit.GetText() for hit in result])

Query: What is considered a business expense on a business trip?
TF-IDF
['"So you\'re basically saying that average market fluctuations have an affect on individual stocks, because individual stocks are often priced in relation to the growth of the market as a whole?  Also, what kinds of investments would be considered ""risk free"" in this nomenclature?"', '"As long as the losing business is not considered ""passive activity"" or ""hobby"", then yes. Passive Activity is an activity where you do not have to actively do anything to generate income. For example - royalties or rentals. Hobby is an activity that doesn\'t generate profit. Generally, if your business doesn\'t consistently generate profit (the IRS looks at 3 out of the last 5 years), it may be characterized as hobby. For hobby, loss deduction is limited by the hobby income and the 2% AGI threshold."', "&gt; monopoly  &gt; names 3 giant companies who compete with each other in almost every industry possible  What happened to /

In [42]:
from data.document import Document

# Free-text query for a manual check.
# NOTE(review): the wording looks aimed at the toy city/animal documents rather
# than the corpus sample — confirm which `documents` this cell is meant to run on.
query = "Cities are pretty cool, especially war"
k = 5

# "CURE" is an optional registry entry (it may be commented out where `models`
# is built), so look it up with .get() instead of indexing: a plain
# models["CURE"] raises KeyError on a fresh kernel when the entry is disabled.
cure_model = models.get("CURE")
if cure_model is None:
    print('No "CURE" entry in `models`; skipping cluster inspection.')
else:
    # Show the document texts of each cluster held by the CURE model.
    for cluster in cure_model.clusters.clusters:
        print([doc.GetText() for doc in cluster.GetDocuments()])

# Run the query against every registered retriever and print the returned texts.
model: Retriever
for model_type, model in models.items():
    print(model_type)
    result: list[Document] = model.Lookup(query=query, k=k)
    print([d.GetText() for d in result])

['A cat is an animal.', 'A dog is a house animal.']
['The city of gentofte is big.', 'The city of new york is big.', 'The city of chicago is big.', 'The city of copenhagen is big.', 'Are there any better movie than The Godfather?', 'During the cold war, many comunists were crazy.']
CURE
['The city of copenhagen is big.', 'The city of chicago is big.', 'The city of new york is big.', 'The city of gentofte is big.', 'Are there any better movie than The Godfather?']
