In [39]:
# Adds the current notebook path to the sys.path just in case your venv is in a different location
import os
import sys

# Directory used as the import root for this notebook.
# Jupyter does not define `__file__`, and the previous `abspath('__file__')` call was
# given a string literal, so it only ever resolved to the kernel's working directory.
# Ask for that directory explicitly instead.
notebook_dir = os.getcwd()
# Put it first on the import path so packages located there (presumably `models`
# and `data` -- confirm against the repo layout) can be imported. Skip the insert
# when it is already first so re-running the cell does not stack duplicate entries.
if sys.path[:1] != [notebook_dir]:
    sys.path.insert(0, notebook_dir)

### Defining the Dataset

In [40]:
# Toy corpus handed to every retriever below. Each record carries the raw text and
# an integer id. The ids were previously duplicated (1 twice, 5 three times); they
# are now unique so records can be told apart by id.
documents = [
    {"text": "A cat is an animal.", "id": 1},
    {"text": "A dog is a house animal.", "id": 2},
    {"text": "The city of new york is big.", "id": 3},
    {"text": "The city of chicago is big.", "id": 4},
    {"text": "The city of gentofte is big.", "id": 5},
    {"text": "The city of copenhagen is big.", "id": 6},
    {"text": "During the cold war, many comunists were crazy.", "id": 7},
    {"text": "Are there any better movie than The Godfather?", "id": 8},
]

# Guard against the ids drifting out of sync again when the corpus is edited.
assert len({doc["id"] for doc in documents}) == len(documents), "document ids must be unique"

### Defining the Models

In [None]:
from models.builers.retriever import Retriever
from models.CURE import CURE
from models.k_means import KMeans
from models.TFIDF import TFIDF
from models.DPR import DPR
from models.DPR_crossencoder import DPRCrossencoder

# Retrievers to run in the experiment, keyed by the label printed next to their results.
# Un-comment an entry to include that model; every entry is built from the same `documents`.
# The annotation uses `dict[str, Retriever]`; the earlier `dict[str: Retriever]` was a
# slice subscript rather than a key/value type pair.
models: dict[str, Retriever] = {
    # "TF-IDF": TFIDF(documents=documents),
    # "DPR": DPR(documents=documents),
    # "Crossencoder": DPRCrossencoder(documents=documents, n=25),
    # "KMeans": KMeans(documents=documents, k = 4),
    "CURE": CURE(documents=documents, k = 2, n=2, shrinkage_fraction=0.2),
}

### Perform Experiment

In [None]:
from data.document import Document

# Query text sent to every enabled retriever.
query = "Cities are pretty cool, especially war"
# Passed as `k` to each `Lookup` call below.
k = 5

# Print the documents of each CURE cluster for inspection. Looked up with `.get`
# so the cell still runs when the "CURE" entry is disabled in `models`.
cure_model = models.get("CURE")
if cure_model is not None:
    for cluster in cure_model.clusters.clusters:
        print([doc.GetText() for doc in cluster.GetDocuments()])

# Run the same query through every enabled retriever and print the returned texts.
for model_type, model in models.items():
    print(model_type)
    result: list[Document] = model.Lookup(query=query, k=k)
    print([d.GetText() for d in result])