<a href="https://colab.research.google.com/github/Joshuajee/AI-ML-PROJECTS/blob/master/Topic%20Modelling%20on%20Financial%20Posts%20from%20Redit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyLDAvis
!pip install bertopic
!pip install flair
!apt-get -qq install -y libfluidsynth1



** **
## Step 1: Loading the Data
** **
The data was collected manually from twenty-two financial subreddits and saved in CSV format to my GitHub repo.

In [None]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_distances
from scipy.cluster.hierarchy import dendrogram, linkage

# BERTopic model
from bertopic import BERTopic
# Dimension reduction
from umap import UMAP
# Clustering
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
# Count vectorization
from sklearn.feature_extraction.text import CountVectorizer
# Sentence transformer
from sentence_transformers import SentenceTransformer
# Flair
from transformers.pipelines import pipeline
from flair.embeddings import TransformerDocumentEmbeddings


In [None]:
def get_redit_data_from_github(timeout=30):
  """Download the Reddit financial-posts CSV from GitHub and load it with pandas.

  The response body is written to ``reddit_financial_data.csv`` in the current
  working directory and that file is then parsed into a DataFrame.

  Args:
    timeout: Seconds to wait for the server before giving up. Without a timeout
      ``requests.get`` can block indefinitely on a stalled connection.

  Returns:
    pandas.DataFrame parsed from the downloaded CSV.

  Raises:
    Exception: If the server responds with a status code other than 200.
    requests.exceptions.RequestException: On connection failure or timeout.
  """
  file_path = "https://raw.githubusercontent.com/Joshuajee/AI-ML-PROJECTS/master/data/reddit/reddit_financial_data.csv"
  local_csv = "reddit_financial_data.csv"

  response = requests.get(file_path, timeout=timeout)
  # Fail early so a partial or error page is never written to disk
  if response.status_code != 200:
    raise Exception("Error downloading", response.status_code)

  with open(local_csv, "wb") as f:
    f.write(response.content)
  return pd.read_csv(local_csv, sep=",")


In [None]:
# Download the dataset (also saved locally as reddit_financial_data.csv) and load it into a DataFrame
reddit_data = get_redit_data_from_github()
# Bare last expression so the notebook renders the DataFrame
reddit_data

** **
## Step 2: Data Cleaning
** **

The Reddit post data contains multiple columns, but since this is an NLP task, only the title and text columns are useful for topic modeling; the other columns will be ignored.

1. Join the title and the text columns
2. Remove punctuation and special characters.




In [None]:
# Join the title and text columns in a new content column.
# Missing values are replaced with '' first: concatenating NaN with a string
# yields NaN, and a NaN content value would later make the regex-based
# preprocess() fail because it is not a string.
reddit_data['content'] = reddit_data['title'].fillna('') + ' ' + reddit_data['text'].fillna('')
reddit_data

In [None]:
# Keep only the combined text column; .copy() gives content_df its own data,
# so columns added to it later do not touch reddit_data
content_df = reddit_data.loc[:, ['content']].copy()
content_df

In [None]:
def preprocess(text):
    """Normalise one post into lowercase alphanumeric words separated by single spaces.

    Steps, in order: drop URLs, drop every character that is not a letter,
    digit or whitespace, lowercase, collapse runs of whitespace and trim.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string ('' for missing input).
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN
        if text is None or text != text:
            return ''
        text = str(text)

    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)  # Remove special characters
    text = text.lower()  # Lowercase text
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [None]:
# Clean the raw Reddit text: preprocess() strips URLs and special characters,
# lowercases the text and collapses extra whitespace
content_df['cleaned_content'] = content_df['content'].apply(preprocess)
content_df

In [None]:
# Removing stop words
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Extra corpus-specific words to drop in addition to NLTK's English list
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
nltk.download('punkt_tab')
nltk.download('omw-1.4')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

# A set gives constant-time membership tests; `stop_words` stays a list for
# any other cell that uses it.
# NOTE(review): preprocess() strips apostrophes, so any stop word that contains
# one can no longer match the cleaned text - confirm whether that matters here.
stop_word_set = set(stop_words)

# Removing stop words from the cleaned text
content_df['cleaned_content_no_sw'] = content_df['cleaned_content'].apply(
    lambda x: ' '.join(w for w in x.split() if w not in stop_word_set)
)
# Lemmatization (the input is already stop-word free, so no second filter is needed)
content_df['cleaned_content_lm_no_sw'] = content_df['cleaned_content_no_sw'].apply(
    lambda x: ' '.join(wn.lemmatize(w) for w in x.split())
)
content_df

** **
## Step 3: Exploratory Analysis <a class="anchor" id="eda"></a>
** **
To better understand our data, I will plot a histogram showing the distribution of words per post.

To verify whether the preprocessing worked, we’ll make a simple word cloud using the `wordcloud` package to get a visual representation of the most common words. This is key to understanding the data, ensuring we are on the right track, and deciding whether any more preprocessing is necessary before training the model.



In [None]:
# Report how the vocabulary size changes after each preprocessing stage.
vectorizer = CountVectorizer()

# (label printed in the report, column holding that stage's text)
vocabulary_stages = [
    ("raw               ", 'cleaned_content'),
    ("without stop words", 'cleaned_content_no_sw'),
    ("with lemmatization", 'cleaned_content_lm_no_sw'),
]

for stage_label, stage_column in vocabulary_stages:
    # The same vectorizer is re-fitted each time, so it ends up fitted on the last stage
    X = vectorizer.fit_transform(content_df[stage_column])
    print(f"Total unique words {stage_label}: {len(vectorizer.get_feature_names_out())}")

In [None]:
# Number of non-stop-word tokens in each lemmatized post
text_lengths = [
    sum(1 for w in doc.split() if w not in stop_words)
    for doc in content_df['cleaned_content_lm_no_sw']
]

# One 12x6 figure holding the histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(text_lengths, bins=50, kde=True, color="steelblue", ax=ax)

# Labels and title
ax.set_title("Distribution of Text Lengths")
ax.set_xlabel("Text Length (number of words)")
ax.set_ylabel("Frequency")

# Render the figure
plt.show()

In [None]:
# Import the wordcloud library
from wordcloud import WordCloud

# One entry per post.
# NOTE(review): the column holds plain strings, so explode() presumably leaves it unchanged - confirm.
data_words = content_df['cleaned_content_lm_no_sw'].explode().to_list()

# Concatenate every post into one big chunk of text
big_chunck_text = " ".join(data_words)

# Appearance settings for the word cloud
cloud_options = dict(
    background_color="white",
    max_words=1000,
    contour_width=3,
    width=1000,
    height=600,
    contour_color='steelblue',
)

# Build the WordCloud object and generate the cloud from the combined text
wordcloud = WordCloud(**cloud_options)
wordcloud.generate(big_chunck_text)

# Visualize the word cloud (last expression, so the notebook renders the image)
wordcloud.to_image()

** **
## Step 4: Building the Models <a class="anchor" id="models"></a>
** **

For this task, I will be using LDA and BERTopic.


In [None]:
# Hyperparameters shared by the LDA and BERTopic sections
# Number of topics: passed to LDA (n_components) and KMeans (n_clusters);
# the BERTopic cell also reuses it as UMAP's n_components
n_components = 10
# Words shown per topic in the LDA bar charts
n_top_words = 10
# Words per topic used for the word clouds (also BERTopic's top_n_words)
max_top_words = 100

** **
### Latent Dirichlet Allocation (LDA) <a class="anchor" id="lda"></a>
** **


In [None]:
# Hyperparameters for LDA: document-frequency bounds and vocabulary cap for the vectorizer
max_df, min_df, max_features = 0.9, 4, 10000

# Term-count matrix built from the lemmatized, stop-word-free posts
tf_vectorizer = CountVectorizer(
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    stop_words="english",
)
tf = tf_vectorizer.fit_transform(content_df['cleaned_content_lm_no_sw'])

# Fit LDA with n_components topics; random_state is fixed for reproducibility
lda_model = LatentDirichletAllocation(
    n_components=n_components,
    learning_method="online",
    random_state=100,
)

lda_model.fit(tf)

** **
###  BERTopic Model <a class="anchor" id="bert_model"></a>
** **

In [None]:
# Use KMeans with n_clusters=n_components i.e number of topics
kmeans_model = KMeans(n_clusters=n_components, random_state=100)

# Use UMAP to reduce the dimension of the embeddings before clustering.
# UMAP's `n_components` argument is the dimension to reduce the vectors to, NOT
# the number of topics; this notebook simply reuses the same value for both.
# min_dist=0.5 controls how tightly UMAP packs points together in the low-dimensional space.
umap_model = UMAP(n_components=n_components, min_dist=0.5, metric='cosine', random_state=100)

# Using TransformerDocumentEmbeddings and "roberta-base" pretrained model
model = TransformerDocumentEmbeddings('roberta-base')

# The KMeans model is passed in BERTopic's clustering slot (hdbscan_model)
topic_model = BERTopic(embedding_model=model, umap_model=umap_model, hdbscan_model=kmeans_model, top_n_words=max_top_words)

# BERTopic documents fit_transform as taking a list of strings, so hand it a
# plain list instead of a pandas Series
documents = content_df['cleaned_content_lm_no_sw'].tolist()
topics, probs = topic_model.fit_transform(documents)

In [None]:
# Get the list of topics found by the fitted BERTopic model (rendered as the cell output)
topic_model.get_topic_info()

In [None]:
# Render BERTopic's built-in topic visualisation for the fitted model
topic_model.visualize_topics()

** **
## Step 5: Results and Visualization <a class="anchor" id="results"></a>
** **


In [None]:
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its highest-weighted words.

    The grid is five charts wide and gets as many rows as the model has topics
    (10 topics give the 2x5, 30x15 inch layout). Unused cells in the last row
    are hidden.

    Args:
        model: Fitted model exposing ``components_``, one row of word weights per topic.
        feature_names: Sequence mapping a column index of ``components_`` to its word.
        n_top_words: Number of words to show for each topic.
        title: Title drawn above the whole figure.
    """
    n_topics = len(model.components_)
    n_cols = 5
    n_rows = max(1, -(-n_topics // n_cols))  # ceiling division, at least one row

    # squeeze=False keeps `axes` two-dimensional even when there is a single row
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()  # heaviest word at the top
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # Hide grid cells that have no topic. With a shared x axis only the bottom
    # row shows tick labels, so re-enable them on the chart directly above.
    for spare_idx in range(n_topics, len(axes)):
        axes[spare_idx].set_visible(False)
        above_idx = spare_idx - n_cols
        if above_idx >= 0:
            axes[above_idx].tick_params(axis="x", labelbottom=True)

    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()


def plot_dendrogram(topic_word_matrix, name):
    """Plot a hierarchical clustering of topics based on pairwise cosine distance.

    Args:
        topic_word_matrix: 2-D array-like with one row of word weights per topic.
        name: Model name inserted into the plot title.

    Raises:
        ValueError: If fewer than two topics are supplied (nothing to cluster).
    """
    # Local import: squareform is not among the notebook's top-level imports
    from scipy.spatial.distance import squareform

    dist_matrix = cosine_distances(topic_word_matrix)
    n_topics = dist_matrix.shape[0]
    if n_topics < 2:
        raise ValueError("At least two topics are required to build a dendrogram")

    # linkage() expects a condensed distance vector; a square matrix would be
    # read as raw observation vectors. checks=False tolerates tiny floating
    # point asymmetries in the computed distances.
    condensed = squareform(dist_matrix, checks=False)
    # Ward linkage is only defined for Euclidean distances, so average linkage
    # is used; it also keeps the merge heights in cosine-distance units.
    linkage_matrix = linkage(condensed, method='average')

    # Plot dendrogram; labels come from the matrix itself, not a global topic count
    plt.figure(figsize=(12, 6))
    dendrogram(linkage_matrix, labels=[f"Topic {i + 1}" for i in range(n_topics)])
    plt.title(f"Hierarchical Clustering of {name} Topics")
    plt.xlabel("Topic")
    plt.ylabel("Cosine Distance")
    plt.tight_layout()
    plt.show()

### Latent Dirichlet Allocation (LDA)

In [None]:
# Vocabulary of the LDA count vectorizer; used to turn column indices of lda_model.components_ into words
tf_feature_names = tf_vectorizer.get_feature_names_out()
# Bar charts of the n_top_words highest-weighted words for each LDA topic
plot_top_words(lda_model, tf_feature_names, n_top_words, "Topics in LDA model")

In [None]:
# One word cloud per LDA topic, sized by each word's weight in that topic
for topic_idx, topic in enumerate(lda_model.components_):
    # Indices of the max_top_words heaviest words, heaviest first
    top_features_ind = np.argsort(topic)[::-1][:max_top_words]
    top_features = [tf_feature_names[i] for i in top_features_ind]
    weights = topic[top_features_ind]

    # word -> weight mapping consumed by WordCloud
    word_freq = dict(zip(top_features, weights))
    wordcloud = WordCloud().generate_from_frequencies(word_freq)

    plt.clf()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"LDA Topic {topic_idx + 1}", fontsize=14)
    plt.show()
    print("")

In [None]:
# Scale each topic's word weights so every row sums to 1, then cluster the topics
topic_totals = lda_model.components_.sum(axis=1, keepdims=True)
topic_word_matrix = lda_model.components_ / topic_totals
plot_dendrogram(topic_word_matrix, "Latent Dirichlet Allocation (LDA)")

### BERTopic

In [None]:
# Topic id -> (word, weight) pairs from the fitted BERTopic model (topic_model)
topics = topic_model.get_topics()

# One word cloud per topic, sized by each word's weight
for topic_num, word_weights in topics.items():
    # Skip the outlier/no-topic category
    if topic_num == -1:
        continue

    plt.figure(figsize=(8, 6))
    word_freq = dict(word_weights)
    print("")
    wordcloud = WordCloud().generate_from_frequencies(word_freq)

    plt.clf()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"BERTopic Topic {topic_num + 1}", fontsize=14)
    plt.show()