In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK data this notebook relies on: the stop-word corpus read just
# below, and "punkt_tab" (presumably the model behind word_tokenize -- confirm).
for resource_name in ("stopwords", "punkt_tab"):
    nltk.download(resource_name)

# English stop words held in a set for membership checks in remove_stop_words.
stop_words = set(stopwords.words("english"))


def remove_stop_words(text: str, *, verbose: bool = True) -> str:
    """Strip English stop words from ``text``.

    The text is tokenized with ``word_tokenize`` and every token whose
    lower-cased form is found in the module-level ``stop_words`` set is
    dropped. Matching is case-insensitive so that capitalized stop words
    (for example a sentence-initial "The") are removed too; the tokens that
    survive keep their original casing.

    Args:
        text: The raw text to filter.
        verbose: When true (the default), print the token list before and
            after filtering. Pass ``False`` to silence this output.

    Returns:
        The remaining tokens joined by single spaces.
    """
    word_tokens = word_tokenize(text)

    # Compare the lower-cased token against the stop-word set. A case-sensitive
    # check would let capitalized stop words slip through, because the NLTK
    # English stop-word list is lower case.
    filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

    if verbose:
        print(word_tokens)
        print(filtered_sentence)
    return " ".join(filtered_sentence)

In [None]:
from nltk.stem import PorterStemmer

# Single Porter stemmer instance, reused by lemmatize_text for every token.
porter_stemmer = PorterStemmer()


def lemmatize_text(text: str) -> str:
    """Reduce every token of ``text`` to its Porter stem.

    Despite the function's name, this performs stemming with the module-level
    ``porter_stemmer`` rather than dictionary-based lemmatization.

    Args:
        text: Text to tokenize with ``word_tokenize`` and stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stems = []
    for token in word_tokenize(text):
        stems.append(porter_stemmer.stem(token))
    return " ".join(stems)

In [None]:
from typing import List
from parser.model import Section
from parser.parse import main
from pydantic import BaseModel

# Parse the exam PDF into its sections (each section exposes `.questions`).
sections: List[Section] = main("../fe_files/exams/FE-Aug23.pdf")


# Enrich every question's metadata, in place, with two derived forms of its
# text: first with stop words removed, then that result run through the stemmer.
for section in sections:
    for question in section.questions:
        question_text = question.text
        print(question_text)

        without_stop_words: str = remove_stop_words(question_text)
        question.metadata.removed_stop_words = without_stop_words

        stemmed_text: str = lemmatize_text(without_stop_words)
        question.metadata.lemmatized_text = stemmed_text


class Document(BaseModel):
    """Top-level pydantic container wrapping the list of parsed sections.

    Used so the whole collection of sections can be serialized as one JSON
    document.
    """

    sections: List[Section]


# Wrap the enriched sections in a single model so they serialize as one object.
document = Document(sections=sections)

# Write the pydantic model to a JSON file. The encoding is pinned to UTF-8:
# without it open() falls back to the platform's locale encoding, which can
# fail on (or mis-encode) non-ASCII characters coming from the parsed PDF text.
with open(
    "document_preprocessed_removed_stop_words.json", "w", encoding="utf-8"
) as json_file:
    json_file.write(document.model_dump_json())