# BERTopic Analysis & Sentiment

This notebook uses **BERTopic** (Transformer-based topic modeling) instead of LDA. 
It requires installing the `bertopic` library.

**Note:** BERTopic runs best on a GPU. If running locally on CPU, it may be slower than LDA.

In [None]:
# # Install necessary packages if not present
# !pip install bertopic nltk pandas pyarrow seaborn matplotlib
!pip install validate_data

Collecting bertopic
  Obtaining dependency information for bertopic from https://files.pythonhosted.org/packages/98/05/2d6b305391efff89c2b4cf19cf847f971ca163eb5c149d0d2ffac0a9c7ed/bertopic-0.17.3-py3-none-any.whl.metadata
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Obtaining dependency information for hdbscan>=0.8.29 from https://files.pythonhosted.org/packages/26/6b/88b8c8023c0c0b27589ad83c82084a1b751917a3e09bdf7fcacf7e6bd523/hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl.metadata
  Downloading hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Obtaining dependency information for umap-learn>=0.5.0 from https://files.pythonhosted.org/packages/6b/b1/c24deeda9baf1fd491aaad941ed89e0fed6c583a117fd7b79e0a33a1e6c0/umap_learn-0.5.9.post2-py3-none-any.whl.metadata
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting sentence-tr

In [6]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from bertopic import BERTopic
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Make sure the VADER lexicon is available; download it only when the lookup fails.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# ==========================================
# ⚙️ CONFIGURATION
# ==========================================
# Directory holding the processed Parquet inputs. Set the CMPUT660_PROCESSED_DIR
# environment variable to run the notebook on another machine; when it is unset the
# original author's local path is used, so existing runs behave as before.
BASE = Path(
    os.environ.get(
        "CMPUT660_PROCESSED_DIR",
        "/Users/lukas./Desktop/CMPUT660Project/inputs/processed",
    )
)
# Plots live two levels above BASE, under outputs/pokemon/plots; create it if needed.
PLOTS_DIR = BASE.parent.parent / "outputs" / "pokemon" / "plots"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)


ImportError: cannot import name 'validate_data' from 'sklearn.utils.validation' (/Users/lukas./anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py)

## 1. Load Data
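Load the commit-message and PR-body Parquet files, label each row as `Before` or `After` based on the filename, and drop texts of 10 characters or fewer.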

In [None]:
def load_and_prep(filename, base=None):
    """Load one Parquet file and reduce it to ``date`` / ``text`` / ``period`` columns.

    Parameters
    ----------
    filename : str
        Name of the Parquet file, relative to the data directory. A name containing
        ``"before"`` is labelled ``'Before'``; anything else is labelled ``'After'``.
    base : path-like, optional
        Directory to read from. Defaults to the notebook-level ``BASE``.

    Returns
    -------
    pandas.DataFrame
        Rows with non-null ``date`` and ``text`` plus a constant ``period`` column.
        An empty DataFrame is returned (with a printed notice) when the file is
        missing or lacks a ``text`` column or a ``date``/``created_at`` column.
    """
    root = BASE if base is None else Path(base)
    fpath = root / filename
    if not fpath.exists():
        print(f"[load_and_prep] Skipping missing file: {fpath}")
        return pd.DataFrame()

    df = pd.read_parquet(fpath)

    # Prefer an existing 'date' column and fall back to 'created_at'.
    date_col = next((col for col in ('date', 'created_at') if col in df.columns), None)
    if date_col is None or 'text' not in df.columns:
        # Without both columns the frame cannot be used; previously a missing date
        # column surfaced as a KeyError when selecting ['date', 'text'].
        print(f"[load_and_prep] Skipping {fpath}: needs 'text' and 'date'/'created_at' columns")
        return pd.DataFrame()

    period = 'Before' if "before" in str(filename) else 'After'
    return (
        df.assign(date=pd.to_datetime(df[date_col]))[['date', 'text']]
        .dropna()
        .assign(period=period)
    )

# Parquet exports to combine; each filename determines the Before/After label.
files = [
    "commit_messages_before.parquet", "commit_messages_after.parquet",
    "pr_bodies_before.parquet", "pr_bodies_after.parquet"
]

# Texts must be strictly longer than this many characters to be kept.
MIN_TEXT_LEN = 10

dfs = [load_and_prep(f) for f in files]

# Fail early with a readable message: concatenating only empty frames would
# otherwise surface as an obscure KeyError: 'date' in sort_values below.
loaded = [df for df in dfs if not df.empty]
if not loaded:
    raise ValueError(f"No documents could be loaded from {BASE}; check that the Parquet files exist.")

# ignore_index avoids duplicate row labels carried over from the individual files.
full_df = pd.concat(loaded, ignore_index=True)
full_df = full_df.sort_values(by="date")

print(f"Loaded {len(full_df):,} documents.")

# Filter out very short texts which confuse embeddings
full_df = full_df[full_df['text'].str.len() > MIN_TEXT_LEN].reset_index(drop=True)
print(f"Kept {len(full_df):,} documents longer than {MIN_TEXT_LEN} characters.")

# Parallel lists: position i in each list refers to row i of full_df.
documents = full_df['text'].astype(str).tolist()
periods = full_df['period'].tolist()
timestamps = full_df['date'].tolist()

## 2. Train BERTopic Model
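Unlike LDA, BERTopic embeds the raw documents with a sentence-transformer model, so manual tokenization and stopword removal are not required before fitting. Stopwords can still appear in the topic keywords; to filter them, pass a `CountVectorizer(stop_words="english")` as `vectorizer_model` rather than stripping them from the input text.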

In [None]:
print("Training BERTopic Model... (This may take a while)")

# Model settings gathered in one place:
#   nr_topics      - 'auto' or a specific number (e.g., 20); 20 keeps the topic
#                    count manageable, comparable to an LDA run
#   min_topic_size - minimum documents per topic
bertopic_settings = dict(
    language="english",
    calculate_probabilities=True,
    verbose=True,
    nr_topics=20,
    min_topic_size=20,
)
topic_model = BERTopic(**bertopic_settings)

# Fit on the documents; returns a topic per document and the probabilities.
topics, probs = topic_model.fit_transform(documents)

# Summary table of the discovered topics
topic_info = topic_model.get_topic_info()
print(topic_info.head(10))

# Map topic id -> BERTopic's generated name (e.g. '0_fix_bug_issue')
TOPIC_LABELS = dict(zip(topic_info['Topic'], topic_info['Name']))

print("\n--- Discovered Topics ---")
first_ten = list(TOPIC_LABELS.items())[:10]
for topic_id, topic_name in first_ten:
    print(f"{topic_id}: {topic_name}")

## 3. Sentiment Analysis
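We score every document with VADER and compare the average compound sentiment per BERTopic topic, before vs. after. Documents assigned to the outlier topic (`-1`) are excluded.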

In [None]:
print("Calculating Sentiment...")
sid = SentimentIntensityAnalyzer()

# Clean up topic names once per topic instead of once per document
# (remove the leading id, e.g. "0_fix_bug" -> "fix_bug").
clean_names = {
    topic_id: "_".join(raw_name.split("_")[1:])
    for topic_id, raw_name in TOPIC_LABELS.items()
}

# One record per document with its topic, period and VADER compound score.
# Topic -1 is "Outlier" in BERTopic (noise), so those documents are skipped.
sentiment_data = [
    {
        "Topic_ID": topic,
        "Topic_Name": clean_names[topic],
        "Period": period,
        "Sentiment": sid.polarity_scores(doc)['compound'],
    }
    for doc, topic, period in zip(documents, topics, periods)
    if topic != -1
]

df_sentiment = pd.DataFrame(sentiment_data)
if df_sentiment.empty:
    # An empty frame has no columns, so seaborn would fail with a confusing message.
    raise ValueError("No documents were assigned to a non-outlier topic; nothing to plot.")

# --- Visualization ---
# Pin both the hue order and the colour per period. A positional palette follows
# the order in which periods first appear in the data, so the colours could swap
# between runs or datasets.
PERIOD_ORDER = ["Before", "After"]
PERIOD_COLORS = {"Before": "#E74C3C", "After": "#3498DB"}

fig, ax = plt.subplots(figsize=(16, 10))

sns.barplot(
    x="Topic_Name",
    y="Sentiment",
    hue="Period",
    hue_order=PERIOD_ORDER,
    data=df_sentiment,
    palette=PERIOD_COLORS,
    errorbar=('ci', 95),
    ax=ax,
)

ax.set_title("Sentiment Shifts by BERTopic (Before vs After)", fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel("Average Sentiment", fontsize=12)
ax.set_xlabel("Topic (Top Keywords)", fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
ax.legend(title="Time Period")
# Zero line separates net-negative from net-positive topics.
ax.axhline(0, color='black', linestyle='--', linewidth=1)
ax.grid(axis='y', linestyle='--', alpha=0.3)

fig.tight_layout()
plt.show()