In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from dataclasses import dataclass, field
from colorama import Fore, Style
import pandas as pd
import json
import os
from Constants.constants import INVERTED_INDEX_PATH, MAPPING_PATH, DOCUMENTS_PATH

In [5]:
with open(INVERTED_INDEX_PATH, "r") as f:
    inverted_index = json.load(f)
with open(MAPPING_PATH, "r") as f:
    mapping = json.load(f)
reversed_mapping = {v: k for k, v in mapping.items()}

In [6]:
def preprocess_query(query):
    df = pd.DataFrame({"text": [query]})
    tokenizer = RegexpTokenizer(r"\w+")
    stop_words = set(stopwords.words("english"))
    df["text"] = df["text"].apply(lambda x: tokenizer.tokenize(x.lower()))
    df["text"] = df["text"].apply(lambda x: [w for w in x if not w in stop_words])
    df["text"] = df["text"].apply(lambda x: " ".join(x))
    return df["text"]

In [7]:
query = "european space agency"
query = preprocess_query(query)
query

0    european space agency
Name: text, dtype: object

In [8]:
def get_docs(query_df, inverted_index):
    docs = []
    for word in query_df[0].split():
        if word in inverted_index:
            docs.append(inverted_index[word])
    return docs
docs = get_docs(query, inverted_index)
docs

[{'soundex': 'E615',
  'occurences': [['H2', 1, 'Astronomy'],
   ['H7', 1, 'Astronomy'],
   ['H9', 1, 'Astronomy'],
   ['H10', 2, 'Astronomy'],
   ['H18', 1, 'Astronomy'],
   ['H23', 1, 'Astronomy'],
   ['H29', 1, 'Astronomy'],
   ['H33', 1, 'Astronomy'],
   ['H34', 117, 'Astronomy'],
   ['H53', 1, 'Astronomy'],
   ['H71', 1, 'Astronomy'],
   ['H78', 1, 'Astronomy'],
   ['H80', 1, 'Astronomy'],
   ['H86', 1, 'Astronomy'],
   ['H90', 1, 'Astronomy'],
   ['H91', 1, 'Astronomy'],
   ['H92', 1, 'Astronomy'],
   ['H93', 1, 'Astronomy'],
   ['H94', 1, 'Astronomy'],
   ['H95', 1, 'Astronomy'],
   ['H96', 1, 'Astronomy'],
   ['H98', 1, 'Astronomy'],
   ['H102', 1, 'Astronomy'],
   ['H103', 3, 'Astronomy'],
   ['H108', 1, 'Astronomy'],
   ['H113', 1, 'Astronomy'],
   ['H118', 21, 'Astronomy'],
   ['H120', 1, 'Astronomy'],
   ['H122', 1, 'Astronomy'],
   ['H123', 1, 'Astronomy'],
   ['H130', 1, 'Astronomy'],
   ['H131', 1, 'Astronomy'],
   ['H136', 1, 'Astronomy'],
   ['H138', 1, 'Astronomy'],
 

In [9]:
#Make a class to hold document path and similarity score
@dataclass
class Document:
    path: str = field(default=None)
    score: float = field(default=None)
    hash_: str = field(default=None)

In [17]:
def initialize_documents(docs, reversed_mapping):
    seen = set()
    documents = []
    extension = ".txt"
    for doc in docs:
        for occurence in doc["occurences"]:
            if occurence[0] not in seen:
                document = Document()
                path = os.path.join(DOCUMENTS_PATH,occurence[2], reversed_mapping[occurence[0]] + extension)
                document.path = path
                document.hash_ = occurence[0]
                documents.append(document)
                seen.add(occurence[0])
    return documents
documents= initialize_documents(docs, reversed_mapping)
documents

[Document(path='data\\Astronomy\\08d0fcf3b9263b5184eb5e3070365fe89fcda3055070126dd7e4f61045f9a449.txt', score=None, hash_='H2'),
 Document(path='data\\Astronomy\\0fec84f3a3c7ce5499f3c51206b058400502e683b9faedbe68df776c9d109f19.txt', score=None, hash_='H7'),
 Document(path='data\\Astronomy\\11c5c2aecda100076ea725bcb1b25d5b4beca83394dd72e88a6b43790c3738fb.txt', score=None, hash_='H9'),
 Document(path='data\\Astronomy\\137fa312a32cd2dcd7efe910f6b242c989b4e991b00dfda304c98d5512a928f9.txt', score=None, hash_='H10'),
 Document(path='data\\Astronomy\\1c16fc172a3e1805405f4ff561abc56bab6a12c797be398458a71e728a8c4e47.txt', score=None, hash_='H18'),
 Document(path='data\\Astronomy\\248997b047dc82a0a0740874c7997899a7011e81d7393d09f6464de093b947b9.txt', score=None, hash_='H23'),
 Document(path='data\\Astronomy\\31d24f38c3f669540b95636b022866769b924e0d26cabd8a85c41b4b0d22b80c.txt', score=None, hash_='H29'),
 Document(path='data\\Astronomy\\3788e4e990058bcf0256b9382138994f19a83f12a718b2b3d0b5583432bc

In [19]:
for document in documents:
    if document.path =="data\\Astronomy\\3a25055c972bf51e4b15979c83a71880b02bb022edc7e3d89240227055fbd445.txt":
        print(document.path)
        print(document.hash_)

data\Astronomy\3a25055c972bf51e4b15979c83a71880b02bb022edc7e3d89240227055fbd445.txt
H34


In [20]:
def compute_similarity(documents, query_df):
    for document in documents:
        with open(document.path, "r", encoding="utf-8") as f:
            text = f.read()
        df = pd.DataFrame({"text": [text]})
        tokenizer = RegexpTokenizer(r"\w+")
        # stop_words = set(stopwords.words("english"))
        df["text"] = df["text"].apply(lambda x: tokenizer.tokenize(x.lower()))
        # df["text"] = df["text"].apply(lambda x: [w for w in x if not w in stop_words])
        df["text"] = df["text"].apply(lambda x: " ".join(x))
        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(df["text"])
        X_test_tfidf = vectorizer.transform(query_df)
        score = cosine_similarity(X_train_tfidf, X_test_tfidf)
        document.score = score[0][0]
    documents.sort(key=lambda x: x.score, reverse=True)
    return documents


In [21]:
document_scores = compute_similarity(documents, query)
document_scores

[Document(path='data\\Astronomy\\e33d0929afeaaea4976942f670f2a18a0db6037ac31dbad8991f829011a6d4bd.txt', score=0.6559719296597916, hash_='H154'),
 Document(path='data\\Astronomy\\268dd67541f7f4606b429abc8b41ec7aa38ef903667aa8f74d4dcc3dcd587be2.txt', score=0.6298897789321869, hash_='H24'),
 Document(path='data\\Astronomy\\de3da53966068646718943463a7741e1c4b26e73fe888609893058a8c27dfbd7.txt', score=0.6000991981489792, hash_='H146'),
 Document(path='data\\Astronomy\\cf469efa1928ff893a268316745c75aedcc7ff18ed74c8436472acb780a82723.txt', score=0.5679076368296748, hash_='H134'),
 Document(path='data\\Astronomy\\111ab5bb8e4daf91448b849d23e91da843ff026e4803b18d35d0efb81c657ada.txt', score=0.5622598279801457, hash_='H8'),
 Document(path='data\\Astronomy\\bbc951d068df26e16fa06d741f59b263d13a5cbef0a1919d696a07812102e561.txt', score=0.561707213306982, hash_='H117'),
 Document(path='data\\Astronomy\\351afafcfd101e1f1e31d2d76e90e341cb049852638ecb7f34b92049355825c0.txt', score=0.5313947654360066, hash

In [23]:
def get_snippet(text, query_words):
    # Split text into sentences
    sentences = sent_tokenize(text)
    # Find sentences containing query words
    relevant_sentences = [s for s in sentences if any(q.lower() in s.lower() for q in query_words)]
    # Join relevant sentences to form snippet
    snippet = "\n".join(relevant_sentences)
    return snippet


def display_highlighted_terms(documents,query):
    for document in documents:
        with open(document.path, "r", encoding="utf-8") as f:
            text = f.read()
        query_words = query.split()
        text = get_snippet(text, query_words)
        highlighted_document = text
        for term in query_words:
            highlighted_document = highlighted_document.replace(
                term, f"{Fore.GREEN}{term}{Style.RESET_ALL}")
        print(f'Document: {document.hash_}, Path: {document.path}')
        print(f"{highlighted_document}\n")


In [24]:
display_highlighted_terms(documents, query[0])

Document: H154, Path: data\Astronomy\e33d0929afeaaea4976942f670f2a18a0db6037ac31dbad8991f829011a6d4bd.txt
nasa astronaut , collins , retired colonel u.s. air force , achieved childhood dream flying [32mspace[0m , journey easy .
second woman go air force 's test pilot school selected astronaut 1990 .1995 , piloted shuttle discovery mission rendezvous russian [32mspace[0m station mir , 1999 commanded sts-93 mission -- first woman -- sts-114 2005 , shuttle 's return flight columbia accident ; , flew four shuttle missions .
eileen retired nasa 2006 released new memoir `` glass ceiling stars : story first woman command [32mspace[0m mission '' jonathan ward .
continues inspire young women worldwide .former nasa astronaut eileen collins made history [32mspace[0m missions first woman ever pilot command nasa [32mspace[0m shuttle .
new memoir jonathan ward details long road [32mspace[0m .week [32mspace[0mweek [32mspace[0m ( opens new tab ) covers new [32mspace[0m age .
[32mspa