In [1]:
import json
import os
import re
from dataclasses import dataclass, field

import pandas as pd
from colorama import Fore, Style
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the persisted inverted index and the mapping file from the working directory.
with open("inverted_index.json", "r") as index_file:
    inverted_index = json.load(index_file)
with open("mapping.json", "r") as mapping_file:
    mapping = json.load(mapping_file)

# Flip the mapping so its values become the lookup keys; later cells use this
# to turn a short document id (e.g. 'H54') back into a file base name.
reversed_mapping = dict(zip(mapping.values(), mapping.keys()))

In [3]:
def preprocess_query(query):
    """Normalize a raw query string for lookup in the inverted index.

    The query is lower-cased, split into ``\\w+`` tokens, stripped of English
    stop words, and re-joined with single spaces.

    Args:
        query: The raw query string.

    Returns:
        A one-row pandas Series named ``"text"`` holding the cleaned query.
    """
    word_tokenizer = RegexpTokenizer(r"\w+")
    english_stop_words = set(stopwords.words("english"))

    def _clean(raw_text):
        # Tokenize, drop stop words, and glue the remaining tokens together.
        tokens = word_tokenizer.tokenize(raw_text.lower())
        kept = [token for token in tokens if token not in english_stop_words]
        return " ".join(kept)

    frame = pd.DataFrame({"text": [query]})
    frame["text"] = frame["text"].apply(_clean)
    return frame["text"]

In [4]:
# Normalize the raw search string; `query` ends up as a one-row pandas Series.
query = preprocess_query("pollen allergy")
query

0    pollen allergy
Name: text, dtype: object

In [5]:
def get_docs(query_df, inverted_index):
    """Collect the inverted-index entries for the query terms present in the index.

    Args:
        query_df: Indexable container whose element ``0`` is the preprocessed,
            whitespace-separated query string (the Series returned by
            ``preprocess_query``).
        inverted_index: Mapping from term to its index entry.

    Returns:
        A list with one index entry per query term found in ``inverted_index``,
        in query order. Terms missing from the index are skipped; a repeated
        term yields a repeated entry.
    """
    terms = query_df[0].split()
    return [inverted_index[term] for term in terms if term in inverted_index]
# Fetch the index entries for the preprocessed query terms and display them.
docs = get_docs(query_df=query, inverted_index=inverted_index)
docs

[{'soundex': 'P450', 'occurences': [['H54', 1, 'Health']]},
 {'soundex': 'A462',
  'occurences': [['H50', 1, 'Health'],
   ['H54', 17, 'Health'],
   ['H55', 1, 'Health']]}]

In [6]:
# Record for one candidate document: where it lives, its id, and its score.
@dataclass
class Document:
    """A candidate search result.

    Every field defaults to ``None`` so an instance can be created empty and
    filled in afterwards.
    """

    path: str = None  # file path of the document text
    score: float = None  # similarity to the query; None until it is computed
    hash_: str = None  # short document id taken from the index (e.g. 'H54')

In [7]:
def initialize_documents(docs, reversed_mapping):
    """Build one unscored ``Document`` per distinct document id in the index entries.

    Args:
        docs: Index entries as returned by ``get_docs``; each has an
            ``"occurences"`` list whose items carry the document id at
            position 0 and the folder name at position 2.
        reversed_mapping: Mapping from document id to file base name
            (without extension).

    Returns:
        A list of ``Document`` objects in first-seen order, each with ``path``
        set to ``<folder>/<base name>.txt`` and ``hash_`` set to the id.

    Raises:
        KeyError: If a document id is missing from ``reversed_mapping``.
    """
    suffix = ".txt"
    known_ids = set()
    results = []
    for entry in docs:
        for occurrence in entry["occurences"]:
            doc_id = occurrence[0]
            if doc_id in known_ids:
                # The same document can be listed under several query terms.
                continue
            folder = occurrence[2]
            file_path = os.path.join(folder, reversed_mapping[doc_id] + suffix)
            results.append(Document(path=file_path, hash_=doc_id))
            known_ids.add(doc_id)
    return results
# Turn the index entries into unscored Document records and display them.
documents = initialize_documents(docs=docs, reversed_mapping=reversed_mapping)
documents

[Document(path='Health\\1bcab2cec7df78495c885f2f817db59d6948908db9d757e1d6bbfe56aa5bfa08.txt', score=None, hash_='H54'),
 Document(path='Health\\0c7426f34b20aa3219e483267affbf36127119eea59f76a6408a05327f78e909.txt', score=None, hash_='H50'),
 Document(path='Health\\2902c166af57cdc4e524f4462e7df211960cb66a846c6747196b80b9ed5f7746.txt', score=None, hash_='H55')]

In [8]:
def compute_similarity(documents, query_df):
    """Score each document against the query with TF-IDF cosine similarity.

    Each document file is read, lower-cased and re-tokenized with ``\\w+``; a
    fresh ``TfidfVectorizer`` is fit on that single document and the query is
    projected into the same vocabulary.

    Args:
        documents: List of ``Document`` objects whose ``path`` points at a
            UTF-8 text file. Their ``score`` attribute is overwritten.
        query_df: Iterable of query strings (the one-row Series returned by
            ``preprocess_query``); only the similarity to its first entry is
            stored.

    Returns:
        The same ``documents`` list, sorted in place by descending score.

    Raises:
        OSError: If a document file cannot be opened.
    """
    # The tokenizer does not depend on the document, so build it once.
    tokenizer = RegexpTokenizer(r"\w+")

    for document in documents:
        with open(document.path, "r", encoding="utf-8") as handle:
            text = handle.read()
        normalized_text = " ".join(tokenizer.tokenize(text.lower()))

        # NOTE(review): fitting on one document at a time makes the IDF weight
        # identical for every term, so this is effectively a term-frequency
        # cosine score. Kept as-is to preserve the existing ranking.
        vectorizer = TfidfVectorizer()
        try:
            doc_tfidf = vectorizer.fit_transform([normalized_text])
        except ValueError:
            # An empty or token-less file yields an empty vocabulary; treat it
            # as "no similarity" instead of aborting the whole ranking.
            document.score = 0.0
            continue

        query_tfidf = vectorizer.transform(query_df)
        similarity = cosine_similarity(doc_tfidf, query_tfidf)
        # Store a plain float, matching the Document.score annotation.
        document.score = float(similarity[0][0])

    documents.sort(key=lambda doc: doc.score, reverse=True)
    return documents


In [9]:
# compute_similarity sorts `documents` in place and returns that same list,
# so `document_scores` and `documents` refer to one object.
document_scores = compute_similarity(documents=documents, query_df=query)
document_scores

[Document(path='Health\\1bcab2cec7df78495c885f2f817db59d6948908db9d757e1d6bbfe56aa5bfa08.txt', score=0.1778260285444014, hash_='H54'),
 Document(path='Health\\0c7426f34b20aa3219e483267affbf36127119eea59f76a6408a05327f78e909.txt', score=0.016934564257464445, hash_='H50'),
 Document(path='Health\\2902c166af57cdc4e524f4462e7df211960cb66a846c6747196b80b9ed5f7746.txt', score=0.014558647529524711, hash_='H55')]

In [10]:
def get_snippet(text, query_words):
    """Return the sentences of ``text`` that mention at least one query word.

    Matching is a case-insensitive substring test, so a query word also
    matches inside longer words.

    Args:
        text: The document text to search.
        query_words: The words to look for.

    Returns:
        The matching sentences in their original order, joined by newlines;
        an empty string when nothing matches.
    """
    matching = []
    for sentence in sent_tokenize(text):
        lowered = sentence.lower()
        if any(word.lower() in lowered for word in query_words):
            matching.append(sentence)
    return "\n".join(matching)


def display_highlighted_terms(documents, query):
    """Print a snippet of each document with the query terms highlighted in green.

    For every document the file is read, reduced to the sentences that mention
    a query term (via ``get_snippet``), and printed below a header line showing
    the document id and path.

    Highlighting is case-insensitive, matching how ``get_snippet`` selects
    sentences, and is done in a single pass so that one term can never be
    re-matched inside text that has already been wrapped in colour codes.

    Args:
        documents: Iterable of ``Document`` objects with ``path`` and ``hash_`` set.
        query: The preprocessed query as one whitespace-separated string.

    Raises:
        OSError: If a document file cannot be opened.
    """
    query_words = query.split()

    # Longest terms first, so that of two overlapping terms the longer one wins.
    unique_terms = sorted(set(query_words), key=lambda term: (-len(term), term))
    pattern = None
    if unique_terms:
        alternation = "|".join(re.escape(term) for term in unique_terms)
        pattern = re.compile(alternation, re.IGNORECASE)

    def _highlight(match):
        # Re-emit the matched text itself so the document's casing is preserved.
        return f"{Fore.GREEN}{match.group(0)}{Style.RESET_ALL}"

    for document in documents:
        with open(document.path, "r", encoding="utf-8") as f:
            text = f.read()
        snippet = get_snippet(text, query_words)
        highlighted_document = pattern.sub(_highlight, snippet) if pattern else snippet
        print(f'Document: {document.hash_}, Path: {document.path}')
        print(f"{highlighted_document}\n")


In [11]:
# Pass the cleaned query string itself (the single row of the Series).
display_highlighted_terms(documents, query=query.iloc[0])

Document: H54, Path: Health\1bcab2cec7df78495c885f2f817db59d6948908db9d757e1d6bbfe56aa5bfa08.txt
[32mallergy[0m immune system response foreign substance ’ typically harmful body .
include certain foods , [32mpollen[0m , pet dander .immune system ’ job keep healthy fighting harmful pathogens .
include type [32mallergy[0m severe [32mallergy[0m .take medication anticipated allergic response , may still experience symptoms , may reduced .food allergiesfood allergies trigger swelling , hives , nausea , fatigue , .
may take person realize food [32mallergy[0m .
’ possible , treatment options available .medication[32mallergy[0m treatment often includes medications like antihistamines control symptoms .
involves several injections course years help body get used [32mallergy[0m .
successful immunotherapy prevent [32mallergy[0m symptoms returning .emergency epinephrinesevere , life-threatening [32mallergy[0m , carry emergency epinephrine shot .
blood tested presence [32mallergy