In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from gensim import corpora, models
from bertopic import BERTopic
import warnings
warnings.filterwarnings("ignore")

# Download the NLTK resources this notebook relies on: the English stop-word
# list and the tokenizer models used by nltk.word_tokenize.
# Recent NLTK releases load the tokenizer from "punkt_tab" rather than the
# older pickled "punkt" package, so both are requested to keep the notebook
# runnable across NLTK versions.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)


# Small in-memory sample of forum posts used as the corpus for the topic
# models below. The text is stored as proper Unicode: two entries previously
# contained mis-decoded bytes in place of the apostrophe and the em dash.
data = {
    "posts": [
        "Looking for a mentor to guide me in deep learning projects.",
        "Anyone up for a study group for GRE preparation?",
        "Hosting a workshop on React and Web Development!",
        "How can I improve my resume for data science internships?",
        "Is anyone teaching graphic design or Photoshop?",
        "Let’s collaborate on a blockchain-based project.",
        "Discussion on mental health and academic pressure.",
        "Career opportunities in AI and ML are booming lately.",
        "Organizing a mock interview session for placements.",
        "Python for beginners — starting from basics this week!"
    ]
}

# One row per post, in a single "posts" column.
df = pd.DataFrame(data)
print("Sample Forum Data:\n", df.head())


# English stop words, built once instead of on every clean_text call.
_STOP_WORDS = frozenset(stopwords.words("english"))


def clean_text(text):
    """Normalize a post into a space-joined string of content words.

    Steps: lowercase, replace every character that is not a-z or whitespace
    with a space, tokenize with NLTK, then drop English stop words and tokens
    of two characters or fewer.

    Args:
        text: The raw post. Non-string values (e.g. NaN from a missing cell)
            are treated as empty.

    Returns:
        The remaining tokens joined by single spaces; "" when nothing is left.
    """
    if not isinstance(text, str):
        return ""
    # Substitute a space rather than deleting, so words separated only by
    # punctuation ("blockchain-based", "AI/ML") are not fused into one token.
    text = re.sub(r"[^a-z\s]", " ", text.lower())
    tokens = nltk.word_tokenize(text)
    # Note: the length filter also removes short acronyms such as "ai"/"ml".
    return " ".join(
        word for word in tokens if len(word) > 2 and word not in _STOP_WORDS
    )

df["clean_text"] = df["posts"].apply(clean_text)


# Tokenize for gensim: each cleaned post becomes a list of tokens, the
# dictionary maps every distinct token to an integer id, and the corpus holds
# one bag-of-words (token id, count) list per post.
texts = [text.split() for text in df["clean_text"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# LDA Model: 4 topics, 15 passes over the corpus, fixed random_state so
# repeated runs use the same seed.
lda_model = models.LdaModel(
    corpus,
    num_topics=4,
    id2word=dictionary,
    passes=15,
    random_state=42
)

# Print Topics (the header marker was previously mis-decoded bytes).
# Topic ids are zero-based in gensim; they are shown 1-based here.
print("\n🔹 LDA Topics:")
for idx, topic in lda_model.print_topics(num_words=5):
    print(f"Topic {idx + 1}: {topic}")


# Draw one word cloud per LDA topic, sizing each of the topic's ten
# highest-weighted words by its weight in that topic.
for topic_id, word_weights in lda_model.show_topics(formatted=False, num_words=10):
    cloud = WordCloud(
        background_color="white", width=600, height=400
    ).generate_from_frequencies(dict(word_weights))

    fig, ax = plt.subplots()
    ax.imshow(cloud)
    ax.axis("off")
    # Topic ids are zero-based; label them 1-based to match the printed list.
    ax.set_title(f"Topic {topic_id + 1}")
    plt.show()


# Initialize and Fit BERTopic on the cleaned posts.
# fit_transform is documented as taking a list of strings, so the Series is
# converted explicitly instead of relying on it being accepted as an iterable.
# NOTE(review): BERTopic's defaults are presumably tuned for far larger
# corpora than these 10 posts, and no seed is set here, so results may vary
# between runs -- confirm the clustering is meaningful on this sample.
topic_model = BERTopic(language="english", verbose=False)
topics, probs = topic_model.fit_transform(df["clean_text"].tolist())

# Display Discovered Topics (the header marker was previously mis-decoded bytes).
print("\n🔹 BERTopic Discovered Topics:")
topic_info = topic_model.get_topic_info()
print(topic_info.head())


# Horizontal bar chart of post counts for up to ten BERTopic topics.
# BERTopic assigns outlier documents to topic -1; that bucket is not a real
# topic, so it is excluded before taking the first ten rows.
# NOTE(review): relies on topic_info already being ordered most-frequent
# first (BERTopic numbers topics by descending size) -- confirm.
topic_freq = topic_info.loc[topic_info["Topic"] != -1, ["Topic", "Count"]].head(10)

if topic_freq.empty:
    # Every post was treated as an outlier; an empty chart would be misleading.
    print("No topics besides the outlier bucket (-1) were found; nothing to plot.")
else:
    plt.figure(figsize=(8, 4))
    plt.barh(topic_freq["Topic"].astype(str), topic_freq["Count"])
    # Put the first (largest) topic at the top of the chart.
    plt.gca().invert_yaxis()
    plt.xlabel("Number of Posts")
    plt.ylabel("Topic ID")
    plt.title("Top Trending Topics on the Platform")
    plt.show()

In [None]:
# Predict the topic of a new, unseen post with the fitted BERTopic model.
sample_post = "Anyone organizing a machine learning workshop?"
# The model was fitted on clean_text output, so apply the same preprocessing
# at inference time to keep training and prediction inputs consistent.
topic_pred, prob = topic_model.transform([clean_text(sample_post)])
# get_topic returns the topic's (word, score) pairs, or False when the
# predicted id has no stored representation.
topic_label = topic_model.get_topic(topic_pred[0])

print("\nSample Post:", sample_post)
if topic_label:
    print("Predicted Topic Keywords:", topic_label)
else:
    print("Predicted Topic Keywords: none (no keywords for topic", topic_pred[0], ")")