In [1]:
import os

import pandas as pd

from src.code.io import LoadDocumentText
from src.code.populate import PopulateDocuments
from src.code.querier import Querier
from src.code.vectorizer import Vectorize

In [2]:
# Data directory, expressed relative to the notebook's working directory.
base_path = os.path.join("..", "..", "data")

# Passed to PopulateDocuments as its second argument; presumably an upper
# bound on how many documents get loaded -- TODO confirm against the helper.
doc_amount = 50_000

dataset, tokenized_docs = PopulateDocuments(
    base_path,
    doc_amount,
)

In [3]:
# Build a lookup keyed by the first element of each pair produced by
# LoadDocumentText, with the second element (the text) as the value.
doc_text_raw_path = os.path.join(base_path, "documents.txt")
docs_text_raw = {
    doc_id: text
    for doc_id, text in LoadDocumentText(doc_text_raw_path)
}

In [4]:
# Split each tokenized document into two parallel lists: element 0 goes to
# doc_ids and element 1 to doc_words, so position i in both lists refers to
# the same document.
doc_ids = [doc[0] for doc in tokenized_docs]
doc_words = [doc[1] for doc in tokenized_docs]

# Derive the pickle location from base_path instead of repeating the
# "..", "..", "data" components, so changing the data directory in one
# place keeps this path in sync.
vector_path = os.path.join(base_path, "vectorized_data.pkl")

# NOTE(review): vector_path presumably acts as an on-disk cache for the
# vectorization results -- confirm against Vectorize.
dictionary, corpus, tfidf, index = Vectorize(doc_words, vector_path)

In [5]:
# Construct the query object from the parallel id / token lists and the
# data directory.
model = Querier(
    doc_ids,
    doc_words,
    base_path,
)

In [8]:
# Free-text query to run through the model.
query = "not flu"
query_results = model.Query(query, dictionary, tfidf, index, doc_ids)

# Attach the raw document text to every (id, score) hit so the table can be
# read without a separate lookup.
results = [
    (doc_id, score, docs_text_raw[doc_id])
    for doc_id, score in query_results
]

df = pd.DataFrame(results, columns=['ID', 'Score', 'Text'])

# Leave the frame as the cell's last expression so it renders as a table.
df



Unnamed: 0,ID,Score,Text
0,aclzp3iy,1221.438765,The pandemic of swine flu (H1N1) influenza sp...
1,1ykji0c8,1201.598453,The introduction of polymerase chain reaction...
2,73mqgofa,910.792196,We have produced a new Ebola virus pseudotype...
3,zkudc8ww,874.190315,"In this work, we study the consequences of se..."
4,ozigndov,810.286361,BACKGROUND: Recent reports have described the...
...,...,...,...
6440,ylv3w9wz,0.562817,Un Grupo de Expertos de la Sociedad de Enferm...
6441,hhoa32i6,0.548746,Aan de GGD worden twee gevallen van hepatitis...
6442,5b7ktvdm,0.466908,Infecties in het hart en de bloedbaan worden ...
6443,f9wsi3jb,0.304795,OBJETIVO: Valorar la efectividad de las inter...
