In [None]:
import pandas as pd
import numpy as np
import os
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer


# Notebook-wide pandas display settings: never truncate columns or rows when a
# frame is rendered, and print floats in fixed-point notation (six decimals)
# instead of scientific notation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the fully-qualified option key. Abbreviated keys are resolved by pattern
# matching and stop working as soon as another option with a similar name exists.
pd.set_option('display.float_format', '{:f}'.format)

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import nltk
# Download the NLTK resources this notebook relies on: the Punkt tokenizer data
# ('punkt' / 'punkt_tab') for word_tokenize, the stop-word lists, and the
# WordNet corpus for WordNetLemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

In [None]:
# Folder that holds the input documents; the path is relative to the notebook's working directory.
txt_directory = "../Data"

<H1>Turn text files into rows of a DataFrame</H1>

In [None]:
def extract_text(file_path):
    """Read ``file_path`` as UTF-8 text and return its whole content.

    Any exception raised while opening, reading or decoding the file is
    reported on stdout and swallowed; in that case ``None`` is returned, so
    callers must check for ``None``.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            text = handle.read()
    except Exception as err:
        print(f"Error reading file {file_path}: {err}")
        return None
    else:
        return text

In [None]:
def process_texts_in_directory(directory_path):
    """Load every ``.txt`` file found directly in ``directory_path``.

    Parameters
    ----------
    directory_path : str
        Directory to scan (not recursive). Entries without a ``.txt`` suffix
        are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per readable file with the columns ``title`` (file name
        without its extension) and ``content`` (the text returned by
        ``extract_text``). Files for which ``extract_text`` returns ``None``
        are skipped. Rows are in sorted file-name order, and both columns are
        present even when no file was loaded.
    """
    records = []
    # os.listdir returns entries in arbitrary order. Sorting makes the row
    # positions reproducible across runs and machines, which matters because
    # later cells reuse the row position as the document ID.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):  # Process only .txt files
            continue
        file_path = os.path.join(directory_path, filename)
        content = extract_text(file_path)
        if content is not None:
            # File name (without extension) becomes the title
            records.append({"title": os.path.splitext(filename)[0], "content": content})

    # Name the columns explicitly so an empty result still has the expected
    # schema and downstream lookups such as data['content'] do not raise KeyError.
    return pd.DataFrame(records, columns=["title", "content"])

In [None]:
# Load all .txt documents from txt_directory into a DataFrame and preview the first rows.
data = process_texts_in_directory(txt_directory)
data.head()

In [None]:
# Show the raw text of the first document, before any preprocessing.
print(data.iloc[0]['content'])

<H1>Tokenisation with Stop Word Removal</H1>

In [None]:
def tokenize_text(text):
    """Normalise ``text`` and return its list of filtered word tokens.

    The text is lower-cased, newlines and non-breaking spaces become ordinary
    spaces, ``©`` is dropped, spans in square brackets (e.g. ``[12]``) are
    removed, remaining punctuation is stripped and whitespace is collapsed.
    The result is tokenised with NLTK's ``word_tokenize``.

    A token is kept only when it

    * is not an NLTK English stop word,
    * is longer than two characters,
    * contains none of the characters β, α, µ, δ, γ,
    * is not purely numeric, except for four-digit numbers starting with
      ``'2'`` (treated as recent years; other numbers are usually page numbers),
    * does not match the letters-mixed-with-digits pattern.

    Parameters
    ----------
    text : str
        Raw document or query text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove newlines
    text = text.replace('\n', ' ')
    # Remove special characters like © and non-breaking spaces (shows up as NBSP)
    text = text.replace('©', '').replace('\u00A0', ' ')
    # Remove references in square brackets. This must run BEFORE punctuation is
    # stripped: once the brackets themselves are gone the pattern can never
    # match. Non-greedy, so each match ends at the nearest closing bracket.
    # A space is substituted so the neighbouring words are not fused together.
    text = re.sub(r'\[.*?\]', ' ', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenise
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    excluded_chars = ('β', 'α', 'µ', 'δ', 'γ')
    # Words that mix letters and digits (compiled once instead of per token)
    mixed_alnum = re.compile(r'^[a-zA-Z]*\d+[a-zA-Z]+|[a-zA-Z]+\d+[a-zA-Z]*$')

    filtered_words = [
        word for word in words
        if word not in stop_words  # Remove stopwords
        and len(word) > 2          # Remove one- and two-character tokens
        and not any(char in word for char in excluded_chars)  # Remove words that include these characters
        # Numbers: tokens of 1-2 digits are already gone; of the rest keep only
        # four-digit numbers starting with '2' (recent dates). This drops
        # 3-digit numbers, 5+-digit numbers and other 4-digit numbers.
        and (not word.isdigit() or (len(word) == 4 and word[0] == '2'))
        and not mixed_alnum.search(word)  # Remove words that mix letters and digits
    ]

    return filtered_words

In [None]:
# Tokenise every document (content is coerced to str first) and keep only the
# title and token list; `data` itself is left untouched.
tokenized_data = (
    data
    .assign(tokens=data['content'].astype(str).apply(tokenize_text))
    .drop(columns=['content'])
)

In [None]:
# Preview the tokenised frame (title + tokens).
tokenized_data.head()

In [None]:
# Inspect the full token list of the first document.
print(tokenized_data['tokens'].iloc[0])

<H1>Lemmatize</H1>

In [None]:
def lemmatize_words(words):
    """Lemmatize every token in ``words`` as a noun with NLTK's WordNet lemmatizer.

    Returns a new list with one lemma per input token, in the same order.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in words:
        # pos='n': treat every token as a noun
        lemmas.append(wordnet.lemmatize(token, pos='n'))
    return lemmas

In [None]:
# Add the lemmatized token list and a space-joined string version of it.
# assign() evaluates the keyword arguments in order, so the second column can
# be derived from the first.
lemmatized_data = tokenized_data.assign(
    tokens_lemmatized=lambda frame: frame["tokens"].apply(lemmatize_words),
    text_for_tfidf=lambda frame: frame["tokens_lemmatized"].apply(' '.join),
)

In [None]:
# Preview the frame with the lemmatized columns added.
lemmatized_data.head()

In [None]:
# Inspect the lemmatized tokens of the first document.
print(lemmatized_data['tokens_lemmatized'].iloc[0])

<H1>Apply TF-IDF</H1>

In [None]:
def apply_tfidf(dataframe):
    """Fit a TF-IDF model on the ``tokens_lemmatized`` column of ``dataframe``.

    Each row is expected to hold an already-tokenised list; the vectorizer is
    given an identity analyzer so the lists are used as they are.

    Returns
    -------
    (pandas.DataFrame, TfidfVectorizer)
        The dense document-term matrix (one row per document, one column per
        vocabulary term) and the fitted vectorizer.
    """
    def passthrough(tokens):
        # Documents are pre-tokenised, so hand the token list back unchanged
        return tokens

    tfidf_model = TfidfVectorizer(analyzer=passthrough)
    # Learn vocabulary + IDF weights and compute the document vectors
    weights = tfidf_model.fit_transform(dataframe['tokens_lemmatized'])
    # Dense DataFrame for easier viewing, labelled with the vocabulary terms
    dense_weights = pd.DataFrame(weights.toarray(), columns=tfidf_model.get_feature_names_out())
    return dense_weights, tfidf_model

In [None]:
# Fit TF-IDF on the lemmatized tokens; keep the fitted vectorizer to transform queries later.
tfidf_df, tfidf_vectorizer = apply_tfidf(lemmatized_data)

In [None]:
# Summary of the TF-IDF frame (row/column counts, dtypes, memory usage).
tfidf_df.info()

<H1>Create the Inverted Index</H1>

In [None]:
def create_inverted_index_from_tfidf(tfidf_df, tokenized_data):
    """Build a term dictionary and a positional inverted index.

    Parameters
    ----------
    tfidf_df : pandas.DataFrame
        Document-term matrix: row ``i`` belongs to document ``i`` and every
        column label is a term. A term's column position is its ``termID``.
    tokenized_data : pandas.DataFrame
        Needs a ``tokens`` column with one token list per document, in the
        same row order as ``tfidf_df``. Tokens that are not column labels of
        ``tfidf_df`` are ignored, so the tokens must be normalised the same
        way as the terms for counts and positions to be complete.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``term_dict`` with columns ``termID`` and ``term``; and the inverted
        index with one row per (term, document) pair and the columns
        ``termID``, ``docID``, ``freq``, ``tf-idf`` and ``positions`` (token
        offsets inside the document). The index columns are present even
        when no posting was produced.
    """
    index_columns = ['termID', 'docID', 'freq', 'tf-idf', 'positions']

    # Step 1: Create a term dictionary (termID -> term)
    terms = tfidf_df.columns
    term_dict = {idx: term for idx, term in enumerate(terms)}

    # Reverse lookup (term -> termID) so each token is resolved in O(1) instead
    # of scanning the whole dictionary. setdefault keeps the first termID if a
    # label were ever duplicated, matching a first-match scan.
    term_to_id = {}
    for idx, term in term_dict.items():
        term_to_id.setdefault(term, idx)

    # Step 2: Build the inverted index (termID, docID, freq, tf-idf, positions)
    inverted_index = []
    for docID, tokens in enumerate(tokenized_data['tokens']):
        positions = {}  # termID -> list of token offsets in this document

        for pos, term in enumerate(tokens):
            termID = term_to_id.get(term)
            if termID is not None:  # Skip tokens that are not in the vocabulary
                positions.setdefault(termID, []).append(pos)

        # One posting per distinct term, in order of first appearance
        for termID, term_positions in positions.items():
            inverted_index.append({
                'termID': termID,
                'docID': docID,
                'freq': len(term_positions),  # Occurrences == number of recorded positions
                'tf-idf': tfidf_df.iloc[docID, termID],  # TF-IDF score of the term in this document
                'positions': term_positions
            })

    # Step 3: Create DataFrames for the results
    result_term_dict = pd.DataFrame(list(term_dict.items()), columns=['termID', 'term'])
    # Explicit columns keep the schema stable when the index is empty
    result_inverted_index = pd.DataFrame(inverted_index, columns=index_columns)

    return result_term_dict, result_inverted_index

In [None]:
# Usage:
# The index has to be built from the same tokens TF-IDF was fitted on.
# tfidf_df's columns are lemmatized terms, while tokenized_data['tokens'] holds
# the raw tokens; passing the raw tokens silently drops every inflected form
# (its surface form is not a vocabulary term), so freq and positions would
# disagree with the tf-idf scores. Lemmatization maps tokens one-to-one, so the
# positions are the same offsets as in the raw token list.
lemmatized_tokens = pd.DataFrame({'tokens': lemmatized_data['tokens_lemmatized']})
term_dict_df, inverted_index_df = create_inverted_index_from_tfidf(tfidf_df, lemmatized_tokens)

In [None]:
# Viewing the results: first rows of the term dictionary (termID -> term) and
# of the inverted index (one row per term/document pair).
print("Term Dictionary:")
print(term_dict_df.head())
print("\nInverted Index:")
print(inverted_index_df.head())

<H1>Information Retrieval Model</H1>

## Query processing

In [None]:
# Example free-text query used in the retrieval cells below.
query = "tracheal rupture"

Note: the next two cells only illustrate the query preprocessing steps. You can skip them and call `match_query_to_documents` directly.

In [None]:
# Illustration only: run the query through the same tokenisation as the documents.
tokenized_query = tokenize_text(query)
print(tokenized_query)

In [None]:
# Illustration only: lemmatize the query tokens the same way as the documents.
lemmatized_query = lemmatize_words(tokenized_query)
print(lemmatized_query)

In [None]:
def match_query_to_documents(query_string, tfidf_data, vectorizer, top_n=5):
    """Rank documents against a free-text query by cosine similarity in TF-IDF space.

    Parameters
    ----------
    query_string : str
        Raw query; it is preprocessed with ``tokenize_text`` and
        ``lemmatize_words``, like the documents.
    tfidf_data : array-like or DataFrame, shape (n_documents, n_terms)
        Document TF-IDF vectors produced with ``vectorizer``.
    vectorizer : fitted vectorizer
        Must accept a list of token lists in ``transform``.
    top_n : int, default 5
        Maximum number of documents to return. Values <= 0 yield no results.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Row positions of the best-matching documents (highest score first)
        and their similarity scores. Both arrays are empty when none of the
        query terms is in the vocabulary (a message is printed) or when
        ``top_n`` <= 0, so callers can always unpack the result.
    """
    # Step 1: Preprocess the query
    processed_query = lemmatize_words(tokenize_text(query_string))

    # Step 2: Transform the query into the same TF-IDF space as documents
    query_tfidf = vectorizer.transform([processed_query])  # Query as a 1xN vector
    if np.count_nonzero(query_tfidf.toarray()) == 0:
        print("Query vector is zero. Check query preprocessing.")
        # Return an empty result instead of None: callers unpack two values,
        # and unpacking None raises TypeError.
        return np.array([], dtype=int), np.array([], dtype=float)

    # Step 3: Compute cosine similarity between query and documents
    similarity_scores = cosine_similarity(query_tfidf, tfidf_data).flatten()

    # Step 4: Get the most similar document(s), best first.
    # Reverse first and slice from the front: the equivalent [-top_n:] slice
    # returns EVERY document when top_n == 0, because -0 == 0.
    limit = max(int(top_n), 0)
    top_indices = similarity_scores.argsort()[::-1][:limit]
    top_scores = similarity_scores[top_indices]  # Corresponding scores

    return top_indices, top_scores

In [None]:
# Retrieve the 5 documents most similar to the example query.
top_indices, top_scores = match_query_to_documents(query, tfidf_df, tfidf_vectorizer, top_n=5)

In [None]:
# Report the title and similarity score of each returned document.
print("Top 5 matching documents:")
for idx, score in zip(top_indices, top_scores):
    # idx is a row position, so it is looked up in `data` with iloc
    print(f"Document Title: {data.iloc[idx]['title']}, Similarity Score: {score}")

In [None]:
# Persist the term dictionary and the inverted index as CSV files in the working directory (without the row index).
term_dict_df.to_csv("term_dict.csv", index=False)
inverted_index_df.to_csv("inverted_index.csv", index=False)

In [None]:
# Document list: everything in tokenized_data except the token lists. The row
# index is written too (index=True), so each row keeps its position number in the CSV.
doc_list = tokenized_data.drop(columns=['tokens'])
doc_list.to_csv("doc_list.csv", index=True)