In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from extract_pdf import extract_text_from_pdf

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/space/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [5]:
text = extract_text_from_pdf("/home/space/Downloads/test2.pdf")

In [6]:
def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)



In [7]:
# Step 3: Convert text to Document-Term Matrix
def convert_to_dtm(text_data):
    vectorizer = CountVectorizer(max_df=1.0, min_df=1, stop_words='english')
    dtm = vectorizer.fit_transform(text_data)
    return dtm, vectorizer



In [8]:
# Step 4: Perform LDA Topic Modeling
def perform_lda(dtm, num_topics=5):
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(dtm)
    return lda


In [9]:

# Step 5: Interpret the Results
def print_topics(model, vectorizer, num_words=10):
    words = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx + 1}:")
        print(" ".join([words[i] for i in topic.argsort()[:-num_words - 1:-1]]))

In [20]:
def main(num_topics=5, num_words=50):
    text = extract_text_from_pdf("/home/space/Downloads/test2.pdf")
    preprocessed_text = preprocess_text(text)
    dtm, vectorizer = convert_to_dtm([preprocessed_text])
    lda_model = perform_lda(dtm, num_topics)
    print_topics(lda_model, vectorizer, num_words)

In [21]:
main()

Topic #1:
rescue lake elliot emergency operation time incident staff response insp evidence plan neadles collapse team action chief ucrt member responder commission communication effort command information organization commander event general inquiry planning decision deployment department officer capability crane medical task training experience opp left expertise ministry section heard ontario june clearly
Topic #2:
winter ability absence year wrong absent writing worthwhile woman affected advisory worker workplace whereabouts whim adoption algo albeit aimed world achievable accumulates accumulated accountability correct accordingly accorded accident accessible controlled courage correctional contacted coroner cordon copy coping convoy trained tragedy tower troop trite took tony told today timmins timing timely
Topic #3:
winter ability absence year wrong absent writing worthwhile woman affected advisory worker workplace whereabouts whim adoption algo albeit aimed world achievable acc