In [67]:
import pandas as pd
import numpy as np
import os
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('float_format', '{:f}'.format)

from sklearn.metrics.pairwise import cosine_similarity

In [68]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/admin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [69]:
txt_directory = "../Data"

<H1>Turn PDFs into rows of a DataFrame</H1>

In [70]:
def extract_text(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return None

In [71]:
def process_texts_in_directory(directory_path):
    result = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):  # Process only .txt files
            file_path = os.path.join(directory_path, filename)
            content = extract_text(file_path)
            if content is not None:
                # Append file name (without extension) and content to the data list
                result.append({"title": os.path.splitext(filename)[0], "content": content})
    
    # Convert the list to a DataFrame
    return pd.DataFrame(result)

In [72]:
data = process_texts_in_directory(txt_directory)
data.head()

Unnamed: 0,title,content
0,Predisposing factors for incomplete spontaneou...,﻿RESEARCH ARTICLE Incidence of Neuralgic Amyot...
1,Gastrointestinal LCH a rare manifestation of L...,Gastrointestinal LCH: a rare manifestation of ...
2,A rare case report of omental synovial sarcoma...,Synovial Sarcoma: A Clinical Review\n\nby Aaro...
3,Chronic brucellosis with sacroiliitis A case r...,Interactions of the Human Pathogenic Brucella ...
4,Unilateral adrenal metastases as an unusual pr...,Unilateral adrenal metastases as an unusual pr...


In [73]:
print(data.iloc[0]['content'])

RESEARCH ARTICLE Incidence of Neuralgic Amyotrophy (Parsonage Turner Syndrome) in a Primary Care Setting - A Prospective Cohort Study Nens van Alfen1* Jeroen J J van Eijk2 Tessa Ennik3 Sean O Flynn4 Inge E G Nobacht5 Jan T Groothuis6 Sigrid Pillen7 Floris A van de Laar8 Department of Neurology Donders Centre for Neuroscience Radboud university medical centre Nijmegen The Netherlands Department of Neurology Jeroen Bosch Hospital Den Bosch The Netherlands Department of Neurology Erasmus Medical Center Rotterdam The Netherlands Royal College of Surgeons in Ireland Dublin Ireland Primary Health Centre Oosterhout Nijmegen The Netherlands Department of Rehabilitation Donders Centre for Neuroscience Radboud university medical centre Nijmegen The Netherlands Department of Neurology Canisius Wilhelmina Hospital Nijmegen The Netherlands Department of Primary and Community Care Radboud Institute for Health Sciences Radboud university medical centre The Netherlands & Academic Health Centre Thermio

<H1>Tokenisation with Stop Word Removal</H1>

In [74]:
def tokenize_text(text):
    # Convert to lowercase (ASCII is different)
    text = text.lower()
    # Remove newlines
    text = text.replace('\n', ' ')
    # Remove special characters like © and non-breaking spaces (shows up as NBSP)
    text = text.replace('©', '').replace('\u00A0', ' ')
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove references in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # Tokenise
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    
    filtered_words = [
        word for word in words
        if word not in stop_words  # Remove stopwords
        and len(word) > 2         # Remove two-letter words
        and not any(char in word for char in ['β', 'α', 'µ', 'δ', 'γ'])  # Remove words that include these characters
        and not (word.isdigit() and len(word) > 4)  # Remove numbers longer than 4 digits
        and not (word.isdigit() and len(word) == 3)  # Remove numbers that are 3 digits long
        and not (len(word) == 4 and word.isdigit() and word[0] != '2')  # Remove 4-digit numbers not starting with '2' i.e. not recent dates, usually page numbers
        and not re.search(r'^[a-zA-Z]*\d+[a-zA-Z]+|[a-zA-Z]+\d+[a-zA-Z]*$', word)  # Remove words that mix letters and digits
        ]
    
    return filtered_words

In [75]:
tokenized_data = data.copy()
tokenized_data["tokens"] = tokenized_data['content'].astype(str).apply(tokenize_text)
tokenized_data.drop(columns=['content'], inplace=True)

In [76]:
tokenized_data.head()

Unnamed: 0,title,tokens
0,Predisposing factors for incomplete spontaneou...,"[research, article, incidence, neuralgic, amyo..."
1,Gastrointestinal LCH a rare manifestation of L...,"[gastrointestinal, lch, rare, manifestation, l..."
2,A rare case report of omental synovial sarcoma...,"[synovial, sarcoma, clinical, review, aaron, g..."
3,Chronic brucellosis with sacroiliitis A case r...,"[interactions, human, pathogenic, brucella, sp..."
4,Unilateral adrenal metastases as an unusual pr...,"[unilateral, adrenal, metastases, unusual, pri..."


In [77]:
print(tokenized_data['tokens'].iloc[0])

['research', 'article', 'incidence', 'neuralgic', 'amyotrophy', 'parsonage', 'turner', 'syndrome', 'primary', 'care', 'setting', 'prospective', 'cohort', 'study', 'nens', 'van', 'jeroen', 'van', 'tessa', 'sean', 'inge', 'jan', 'sigrid', 'floris', 'van', 'department', 'neurology', 'donders', 'centre', 'neuroscience', 'radboud', 'university', 'medical', 'centre', 'nijmegen', 'netherlands', 'department', 'neurology', 'jeroen', 'bosch', 'hospital', 'den', 'bosch', 'netherlands', 'department', 'neurology', 'erasmus', 'medical', 'center', 'rotterdam', 'netherlands', 'royal', 'college', 'surgeons', 'ireland', 'dublin', 'ireland', 'primary', 'health', 'centre', 'oosterhout', 'nijmegen', 'netherlands', 'department', 'rehabilitation', 'donders', 'centre', 'neuroscience', 'radboud', 'university', 'medical', 'centre', 'nijmegen', 'netherlands', 'department', 'neurology', 'canisius', 'wilhelmina', 'hospital', 'nijmegen', 'netherlands', 'department', 'primary', 'community', 'care', 'radboud', 'insti

<H1>Lemmatize</H1>

In [78]:
def lemmatize_words(words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='n') for word in words]  # Lemmatize as nouns
    return lemmatized_words

In [79]:
lemmatized_data = tokenized_data.copy()
lemmatized_data["tokens_lemmatized"] = lemmatized_data["tokens"].apply(lambda x: lemmatize_words(x))
lemmatized_data["text_for_tfidf"] = lemmatized_data["tokens_lemmatized"].apply(lambda x: ' '.join(x))

In [80]:
lemmatized_data.head()

Unnamed: 0,title,tokens,tokens_lemmatized,text_for_tfidf
0,Predisposing factors for incomplete spontaneou...,"[research, article, incidence, neuralgic, amyo...","[research, article, incidence, neuralgic, amyo...",research article incidence neuralgic amyotroph...
1,Gastrointestinal LCH a rare manifestation of L...,"[gastrointestinal, lch, rare, manifestation, l...","[gastrointestinal, lch, rare, manifestation, l...",gastrointestinal lch rare manifestation langer...
2,A rare case report of omental synovial sarcoma...,"[synovial, sarcoma, clinical, review, aaron, g...","[synovial, sarcoma, clinical, review, aaron, g...",synovial sarcoma clinical review aaron gazenda...
3,Chronic brucellosis with sacroiliitis A case r...,"[interactions, human, pathogenic, brucella, sp...","[interaction, human, pathogenic, brucella, spe...",interaction human pathogenic brucella specie h...
4,Unilateral adrenal metastases as an unusual pr...,"[unilateral, adrenal, metastases, unusual, pri...","[unilateral, adrenal, metastasis, unusual, pri...",unilateral adrenal metastasis unusual primary ...


In [81]:
print(lemmatized_data['tokens_lemmatized'].iloc[0])

['research', 'article', 'incidence', 'neuralgic', 'amyotrophy', 'parsonage', 'turner', 'syndrome', 'primary', 'care', 'setting', 'prospective', 'cohort', 'study', 'nens', 'van', 'jeroen', 'van', 'tessa', 'sean', 'inge', 'jan', 'sigrid', 'floris', 'van', 'department', 'neurology', 'donders', 'centre', 'neuroscience', 'radboud', 'university', 'medical', 'centre', 'nijmegen', 'netherlands', 'department', 'neurology', 'jeroen', 'bosch', 'hospital', 'den', 'bosch', 'netherlands', 'department', 'neurology', 'erasmus', 'medical', 'center', 'rotterdam', 'netherlands', 'royal', 'college', 'surgeon', 'ireland', 'dublin', 'ireland', 'primary', 'health', 'centre', 'oosterhout', 'nijmegen', 'netherlands', 'department', 'rehabilitation', 'donders', 'centre', 'neuroscience', 'radboud', 'university', 'medical', 'centre', 'nijmegen', 'netherlands', 'department', 'neurology', 'canisius', 'wilhelmina', 'hospital', 'nijmegen', 'netherlands', 'department', 'primary', 'community', 'care', 'radboud', 'instit

<H1>Apply TF-IDF</H1>

In [82]:
def apply_tfidf(dataframe):
    # Initialize TF-IDF Vectorizer with tokens as input
    vectorizer = TfidfVectorizer(analyzer=lambda x: x)
    # Fit and transform the tokenized content to a TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(dataframe['tokens_lemmatized'])
    # Convert the matrix to a DataFrame for easier viewing
    result = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    return result, vectorizer

In [83]:
tfidf_df, tfidf_vectorizer = apply_tfidf(lemmatized_data)

In [84]:
tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Columns: 11622 entries, 2000 to μgμl
dtypes: float64(11622)
memory usage: 4.3 MB


<H1>Create the Inverted Index</H1>

In [85]:
def create_inverted_index_from_tfidf(tfidf_df, tokenized_data):
    # Step 1: Create a term dictionary (unique terms to termID mapping)
    terms = tfidf_df.columns
    term_dict = {idx: term for idx, term in enumerate(terms)}  # termID -> term

    # Initialize list to build the inverted index
    inverted_index = []

    # Step 2: Build the inverted index (termID, docID, freq, tf-idf, positions)
    for docID, tokens in enumerate(tokenized_data['tokens']):
        term_freq = {}  # term -> frequency in the document
        positions = {}  # term -> positions list
        
        for pos, term in enumerate(tokens):
            termID = next((k for k, v in term_dict.items() if v == term), None)  # Get termID for the term
            if termID is not None:  # If the term exists in the term_dict
                if termID not in term_freq:
                    term_freq[termID] = 0
                    positions[termID] = []
                term_freq[termID] += 1
                positions[termID].append(pos)

        # Add entries to the inverted index
        for termID, freq in term_freq.items():
            tfidf_score = tfidf_df.iloc[docID, termID]  # Get the TF-IDF score for the term in this document
            inverted_index.append({
                'termID': termID,
                'docID': docID,
                'freq': freq,
                'tf-idf': tfidf_score,
                'positions': positions[termID]
            })

    # Step 3: Create DataFrames for the results
    result_term_dict = pd.DataFrame(list(term_dict.items()), columns=['termID', 'term'])
    result_inverted_index = pd.DataFrame(inverted_index)

    return result_term_dict, result_inverted_index

In [86]:
# Usage:
term_dict_df, inverted_index_df = create_inverted_index_from_tfidf(tfidf_df, tokenized_data)

In [87]:
# Viewing the results
print("Term Dictionary:")
print(term_dict_df.head())
print("\nInverted Index:")
print(inverted_index_df.head())

Term Dictionary:
   termID  term
0       0  2000
1       1  2001
2       2  2002
3       3  2007
4       4  2008

Inverted Index:
   termID  docID  freq   tf-idf  \
0    8914      0     4 0.013553   
1     821      0     2 0.008380   
2    5147      0    29 0.133620   
3    6941      0    71 0.589395   
4     528      0    68 0.564491   

                                           positions  
0                               [0, 587, 1203, 1240]  
1                                           [1, 180]  
2  [2, 115, 161, 325, 338, 366, 539, 575, 597, 66...  
3  [3, 116, 136, 164, 225, 227, 254, 306, 328, 33...  
4  [4, 117, 137, 226, 228, 255, 307, 331, 340, 35...  


<H1>Information Retrieval Model</H1>

## Query processing

In [None]:
query = "tracheal rupture"

ps this is only to show, skip and  use the function directly

In [89]:
tokenized_query = tokenize_text(query)
print(tokenized_query)

['tracheal', 'rupture']


In [90]:
lemmatized_query = lemmatize_words(tokenized_query)
print(lemmatized_query)

['tracheal', 'rupture']


In [91]:
def match_query_to_documents(query_string, tfidf_data, vectorizer, top_n=5):
    # Step 1: Preprocess the query
    processed_query = tokenize_text(query_string)
    processed_query = lemmatize_words(processed_query)
    
    # Step 2: Transform the query into the same TF-IDF space as documents
    query_tfidf = vectorizer.transform([processed_query])  # Query as a 1xN vector    
    if np.count_nonzero(query_tfidf.toarray()) == 0:
        print("Query vector is zero. Check query preprocessing.")
        return
    
    # Step 3: Compute cosine similarity between query and documents
    cosine_sim = cosine_similarity(query_tfidf, tfidf_data)
    
    # Step 4: Get the most similar document(s)
    similarity_scores = cosine_sim.flatten()
    top_indices = similarity_scores.argsort()[-top_n:][::-1]  # Get indices of top N scores (descending order)
    top_scores = similarity_scores[top_indices]  # Corresponding scores
    
    return top_indices, top_scores

In [92]:
top_indices, top_scores = match_query_to_documents(query, tfidf_df, tfidf_vectorizer, top_n=5)

In [93]:
print("Top 5 matching documents:")
for idx, score in zip(top_indices, top_scores):
    print(f"Document Title: {data.iloc[idx]['title']}, Similarity Score: {score}")

Top 5 matching documents:
Document Title: Post-intubation tracheal rupture treated surgically using a muscle graft from the Latissimus Dorsi a rare disorder and review of literature., Similarity Score: 0.4574559477073631
Document Title: False-Negative Motor-Evoked Potential Due to Contrast-Induced Encephalopathy During Coil Embolization for Intracranial Aneurysm A Case Report, Similarity Score: 0.012575161053957735
Document Title: Repair of an iatrogenic lacrimal fistula a case report, Similarity Score: 0.009683871731696266
Document Title: Endocrine disorders in Rett syndrome a systematic review of the literature, Similarity Score: 0.0
Document Title: Report of one case with de novo mutation in TLK2 and literature review, Similarity Score: 0.0


In [94]:
term_dict_df.to_csv("term_dict.csv", index=False)
inverted_index_df.to_csv("inverted_index.csv", index=False)

In [95]:
doc_list = tokenized_data.drop(columns=['tokens'])
doc_list.to_csv("doc_list.csv", index=True)

# Final Matching Function

In [101]:
def match_query_to_documents(query_string, term_dict_df, inverted_index_df):
    # Step 1: Preprocess the query
    processed_query = tokenize_text(query_string)
    processed_query = lemmatize_words(processed_query)

    # Step 2: Map query terms to termIDs
    term_to_id = dict(zip(term_dict_df['term'], term_dict_df['termID']))
    query_term_ids = [term_to_id[term] for term in processed_query if term in term_to_id]

    if not query_term_ids:
        print("No matching terms found in query.")
        return

    # Step 3: Build the query vector
    query_vector = {}
    for term_id in query_term_ids:
        # Aggregate tf-idf scores for terms in the query
        tfidf_scores = inverted_index_df.loc[inverted_index_df['termID'] == term_id, 'tf-idf']
        query_vector[term_id] = tfidf_scores.sum()  # Summing scores if term appears multiple times in the query

    # Step 4: Normalize the query vector
    query_norm = np.sqrt(sum(value ** 2 for value in query_vector.values()))
    query_vector = {term_id: value / query_norm for term_id, value in query_vector.items()}

    # Step 5: Retrieve document vectors
    doc_vectors = {}
    for term_id, tfidf_score in query_vector.items():
        term_docs = inverted_index_df.loc[inverted_index_df['termID'] == term_id]
        for _, row in term_docs.iterrows():
            doc_id = row['docID']
            doc_vectors.setdefault(doc_id, {}).setdefault(term_id, 0)
            doc_vectors[doc_id][term_id] += row['tf-idf']

    # Step 6: Normalize document vectors
    for doc_id, vector in doc_vectors.items():
        norm = np.sqrt(sum(value ** 2 for value in vector.values()))
        doc_vectors[doc_id] = {term_id: value / norm for term_id, value in vector.items()}

    # Step 7: Compute cosine similarity
    similarities = []
    for doc_id, vector in doc_vectors.items():
        similarity = sum(query_vector.get(term_id, 0) * vector.get(term_id, 0) for term_id in query_vector)
        similarities.append((doc_id, similarity))

    # Step 8: Rank documents by similarity
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    top_documents = similarities

    return [doc_id for doc_id, _ in top_documents], [score for _, score in top_documents]

In [102]:
top_docs, scores = match_query_to_documents("Case Study", term_dict_df, inverted_index_df)

print("Top Documents:", top_docs)
print("Scores:", scores)

Top Documents: [2, 1, 25, 4, 30, 23, 12, 33, 16, 13, 21, 17, 28, 18, 36, 45, 26, 5, 10, 43, 20, 44, 7, 37, 32, 3, 6, 8, 11, 15, 22, 27, 29, 41, 39, 14, 0, 38, 24, 47, 34, 9, 19, 31, 35, 40, 42, 46]
Scores: [0.9999494947797662, 0.9999494947797661, 0.9996987916956555, 0.9993365027775888, 0.9991006525514899, 0.9987687719948736, 0.9983858347943285, 0.9961103991522352, 0.9959985180033731, 0.9943788631143926, 0.9918528249443123, 0.9873119190151366, 0.9844782975556747, 0.9806005631943187, 0.9763763267712083, 0.9759130413099508, 0.9730263500382952, 0.9707832019114784, 0.9651820717913533, 0.9596443560294764, 0.9574585604141326, 0.9574585604141326, 0.9539039429715663, 0.9503543749991653, 0.9323363665940317, 0.9295549401760824, 0.9295549401760824, 0.9295549401760824, 0.9295549401760824, 0.9295549401760824, 0.9295549401760824, 0.9295549401760824, 0.9295549401760824, 0.9295549401760824, 0.8475022913403659, 0.768195283272004, 0.7643286468916102, 0.692640106122395, 0.5459921177007541, 0.5204547636660

In [103]:
for doc_id, score in zip(top_docs, scores):
    title = doc_list.loc[doc_id, 'title']
    print(f"Document ID: {doc_id}, Title: {title}, Similarity Score: {score:.4f}")

Document ID: 2, Title: A rare case report of omental synovial sarcoma complicated hemoperitoneum and literature review, Similarity Score: 0.9999
Document ID: 1, Title: Gastrointestinal LCH a rare manifestation of Langerhans cell histiocytosis, Similarity Score: 0.9999
Document ID: 25, Title: Case report Pediatric hepatopulmonary syndrome despite strict weight control after craniopharyngioma surgery, Similarity Score: 0.9997
Document ID: 4, Title: Unilateral adrenal metastases as an unusual primary presentation of hepatocellular carcinoma a case report and literature review, Similarity Score: 0.9993
Document ID: 30, Title: Genome-wide meta-analysis of myasthenia gravis uncovers new loci and provides insights into polygenic prediction, Similarity Score: 0.9991
Document ID: 23, Title: Brain abscess caused by Streptococcus pyogenes with atypical symptoms a case report and literature review, Similarity Score: 0.9988
Document ID: 12, Title: Report of one case with de novo mutation in TLK2 an