# comments topic modelling using LDA

## setup

In [None]:
# NUM_KIDS = 100 # per story

# HN_STORIES_JSON = '../../data/hn_stories_dataset_final.json' # raw stories data, used to fetch comments from kid ids
# HN_COMMENTS_JSON = '../../data/hn_comments_dataset_final.json' # store raw stories + comments
# HN_COMMENTS_CSV = '../../data/hn_comments_dataset_final.csv' # converted from json to csv
# HN_COMMENTS_SAMPLED_CSV = '../../data/sampled_400_hn_comments_ground_truth.csv' # uniformly sampled comments for ground truth

# # Define the base URL for the Hacker News API
# BASE_URL = 'https://hacker-news.firebaseio.com/v0'

# DEPTH = 1 # comments depth

# Notebook-wide configuration. Of these constants, only HN_COMMENTS_GH_CSV is
# read by the code visible in this notebook; the rest are presumably shared
# with the data-collection notebooks -- TODO confirm before pruning.
NUM_KIDS = 100 # per story ("kids" are the comment ids attached to a story)

# JSON files
HN_STORIES_JSON = '../../data/hn_stories_dataset_final.json' # contains the raw fetched HN stories and comment ids
HN_STORIES_GH_JSON = '../../data/hn_stories_dataset_gh_final.json' # presumably the GitHub-related ("gh") subset of stories -- verify
HN_COMMENTS_JSON = '../../data/hn_comments_dataset_final.json'
HN_COMMENTS_GH_JSON = '../../data/hn_comments_dataset_gh_final.json' # presumably comments of the "gh" stories -- verify

# CSV files
HN_STORIES_CSV = '../../data/hn_stories_dataset_final.csv' # after converting raw json to raw csv
HN_STORIES_GH_CSV = '../../data/hn_stories_dataset_gh_final.csv' # after converting raw json to raw csv
HN_COMMENTS_CSV = '../../data/hn_comments_dataset_final.csv' # after converting raw json to raw csv
HN_COMMENTS_GH_CSV = '../../data/hn_comments_dataset_gh_final.csv' # after converting raw json to raw csv; input of the LDA analysis below
HN_COMMENTS_SAMPLED_CSV = '../../data/hn_comments_dataset_stratified_final.csv' # NOTE(review): file name suggests a stratified sample of the comments, not a raw json->csv conversion -- confirm

# Text files
KEYWORDS_TXT = 'ai_keywords.txt' # used to match relevant HN story titles
HN_GITHUB_URLS_TXT = 'hn_github_urls.txt'

# Define the base URL for the Hacker News API
BASE_URL = 'https://hacker-news.firebaseio.com/v0'

# Hacker News item ids delimiting the collection window
CHATGPT_RELEASE_ID = 33804874 # nov 30th 2022
START_ID = 31300000 # may 8th 2022
END_ID = 40300000 # may 9th 2024

# dataset retrieval parameters
INCREMENT = 1 # step between fetched stories -- 1: fetch every story, 2: fetch every other story, 3: fetch every third story, etc.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import warnings
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Suppress warnings (applies to every warning category for the rest of the session)
warnings.filterwarnings('ignore')

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize() looks up on
# NLTK >= 3.8.2; 'punkt' is kept for older releases. nltk.download() reports a
# failed download by printing and returning False rather than raising, so
# requesting both is safe on either version.
print("Downloading NLTK data...")
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Load the comments dataset that the rest of the notebook analyses
print("Loading data...")
df = pd.read_csv(HN_COMMENTS_GH_CSV)

In [None]:

# Text preprocessing function

# Compiled once: matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily filled cache of the NLTK resources used by preprocess_text(), so the
# stop-word corpus is read and the lemmatizer is built once instead of once
# per comment.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise one comment into a space-separated string of lemmas.

    Steps, in order: lowercase, delete every character that is not an ASCII
    letter or whitespace, tokenize with NLTK's ``word_tokenize``, drop English
    stop words, lemmatize with ``WordNetLemmatizer``.

    Args:
        text: The raw comment. Any non-``str`` value (e.g. ``None`` or a NaN
            float) is treated as an empty comment.

    Returns:
        The processed tokens joined by single spaces; ``""`` when the input is
        not a string or no token survives.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _preprocess_resources()

    # Non-letters are deleted (not replaced by a space), so "don't" becomes
    # "dont" and "a,b" becomes "ab" -- same as before this refactor.
    letters_only = _NON_LETTER_RE.sub('', text.lower())

    # Stop words are filtered before lemmatization, matching the original order.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    )

# Preprocess all comments
print("Preprocessing comments...")
# progress_apply only exists on pandas objects after tqdm.pandas() has
# registered it; without this call the next line raises AttributeError.
tqdm.pandas()
df['processed_text'] = df['comment_text'].progress_apply(preprocess_text)

# Remove comments that are empty after preprocessing and renumber the rows so
# positional indices line up with the document-term matrix built from them
df = df[df['processed_text'].str.len() > 0].reset_index(drop=True)

# Build the bag-of-words (document-term) matrix that LDA is fitted on
print("Creating document-term matrix...")
vectorizer = CountVectorizer(
    max_df=0.95,        # drop terms present in more than 95% of the documents
    min_df=2,           # drop terms present in fewer than 2 documents
    max_features=5000,  # cap on the vocabulary size
)

doc_term_matrix = vectorizer.fit_transform(df['processed_text'])

# Function to find optimal number of topics
def evaluate_topics(doc_term_matrix, max_topics, random_state=42):
    """Fit one LDA model per topic count and collect its perplexity.

    Topic counts run from 2 up to and including ``max_topics``. Each model is
    fitted on ``doc_term_matrix`` and its perplexity is computed on that same
    matrix (there is no held-out split).

    Args:
        doc_term_matrix: Document-term matrix passed to ``fit`` and ``perplexity``.
        max_topics: Largest number of topics to try.
        random_state: Seed forwarded to every ``LatentDirichletAllocation``.

    Returns:
        A list of perplexity values, one per topic count, in increasing order
        of topic count (empty when ``max_topics`` is below 2).
    """
    def _perplexity_for(topic_count):
        model = LatentDirichletAllocation(
            n_components=topic_count,
            random_state=random_state,
            n_jobs=-1,
        )
        model.fit(doc_term_matrix)
        return model.perplexity(doc_term_matrix)

    return [_perplexity_for(k) for k in tqdm(range(2, max_topics + 1))]

# Sweep topic counts 2..max_topics and record the perplexity of each model
print("Finding optimal number of topics...")
max_topics = 20
perplexities = evaluate_topics(doc_term_matrix, max_topics)

# Draw perplexity against the number of topics
perplexity_fig, perplexity_ax = plt.subplots(figsize=(10, 6))
perplexity_ax.plot(range(2, max_topics + 1), perplexities, marker='o')
perplexity_ax.set_xlabel('Number of Topics')
perplexity_ax.set_ylabel('Perplexity Score')
perplexity_ax.set_title('Optimal Number of Topics')
plt.show()

# Fit the final LDA model.
# The topic count is a manual choice: pick it from the perplexity plot together
# with domain knowledge. 10 is only an example value.
n_topics = 10
print(f"Training final LDA model with {n_topics} topics...")

lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    n_jobs=-1,
)
# lda_output holds one row per document and one column per topic
lda_output = lda_model.fit_transform(doc_term_matrix)

# List the highest-weighted vocabulary terms of every topic
feature_names = vectorizer.get_feature_names_out()
n_top_words = 10

print("\nTop terms per topic:")
for topic_number, term_weights in enumerate(lda_model.components_, start=1):
    # argsort is ascending, so reverse it and keep the first n_top_words indices
    strongest_term_ids = term_weights.argsort()[::-1][:n_top_words]
    topic_terms = ', '.join(feature_names[term_id] for term_id in strongest_term_ids)
    print(f"Topic {topic_number}: {topic_terms}")

# Create interactive visualization
print("\nGenerating interactive visualization...")
pyLDAvis.enable_notebook()
# NOTE(review): pyLDAvis >= 3.4 is reported to expose this module as
# pyLDAvis.lda_model instead of pyLDAvis.sklearn -- confirm against the
# installed version.
vis = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix, vectorizer)
# pyLDAvis.display() only *returns* an HTML object. This statement is not the
# last expression of its cell, so the notebook would silently discard that
# value; pass it to display() (available without import inside IPython/Jupyter)
# to actually render the visualization.
display(pyLDAvis.display(vis))

# Label every comment with its highest-probability topic
# (0-based column index into lda_output)
df['dominant_topic'] = np.argmax(lda_output, axis=1)

# Derive a monthly period from the comment timestamp
# NOTE(review): assumes comment_date holds values pd.to_datetime parses as
# calendar dates; if it stores Unix seconds, unit='s' would be required -- confirm.
df['comment_date'] = pd.to_datetime(df['comment_date'])
df['comment_month'] = df['comment_date'].dt.to_period('M')

# Count comments per (month, dominant topic), then turn each month's row into
# proportions that sum to 1
topic_time_dist = pd.crosstab(index=df['comment_month'], columns=df['dominant_topic'])
topic_time_dist_norm = topic_time_dist.divide(
    topic_time_dist.sum(axis='columns'),
    axis='index',
)

# Plot topic distribution over time.
# DataFrame.plot() opens its own figure unless it is given an Axes, which left
# the previously created 15x8 figure blank and drew the chart at the default
# size. Create the Axes explicitly and draw into it.
topic_fig, topic_ax = plt.subplots(figsize=(15, 8))
# Columns are 0-based topic indices; shift them by one for display only, so
# the legend matches the "Topic 1..n" numbering of the printed top terms.
topic_time_dist_norm.rename(columns=lambda topic_id: topic_id + 1).plot(
    kind='area',
    stacked=True,
    ax=topic_ax,
)
topic_ax.set_title('Topic Distribution Over Time')
topic_ax.set_xlabel('Date')
topic_ax.set_ylabel('Proportion of Topics')
# Legend is anchored outside the plotting area, to the right
topic_ax.legend(title='Topic', bbox_to_anchor=(1.05, 1), loc='upper left')
topic_fig.tight_layout()
plt.show()

print("Analysis complete!")