# Step 1: Data Scraping 
## Using the PRAW API

In [None]:
# Import libraries
import praw
import json
import time
from datetime import datetime

# Set up Reddit instance
# Credentials are taken from the environment when the variables are set, so
# real secrets do not have to be written into the notebook. The original
# placeholder literals are kept as fallbacks.
import os

client_id = os.environ.get('REDDIT_CLIENT_ID', 'xxxxxxxxxxxxxxxxxxx')
user_agent = os.environ.get('REDDIT_USER_AGENT', 'xxxxxxxxxxxxx')
client_secret = os.environ.get('REDDIT_CLIENT_SECRET', 'xxxxxxxxxxxxxxxx')

reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)

# Save data
def save_data(posts, filename):
    """Write ``posts`` to ``filename`` as indented JSON without losing the old file.

    The data is serialised in memory first and written to ``<filename>.tmp``,
    which is then renamed over ``filename``. A serialisation error or an
    interrupted write therefore leaves the previous checkpoint untouched.

    Args:
        posts: JSON-serialisable object (here: the list of post dicts).
        filename: Path of the JSON file to create or replace.

    Raises:
        TypeError: If ``posts`` contains a value ``json`` cannot serialise.
        OSError: If the temporary file cannot be written or renamed.
    """
    import os  # local import: this cell does not import os at the top

    # Serialise before touching the disk so a failure cannot truncate anything.
    payload = json.dumps(posts, indent=4)

    tmp_name = f"{filename}.tmp"
    with open(tmp_name, 'w', encoding='utf-8') as file:
        file.write(payload)

    # Replace the target in a single rename so it is never half-written.
    os.replace(tmp_name, filename)

# List to store post data
post_data = []

# Generate file name with timestamp
filename = f"reddit_data_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"

# Get 1000 hot posts
hot_posts = reddit.subreddit('OCD').hot(limit=1000)

# Number of stored posts whose content is no longer available
missing_posts = 0

# The loop sits inside the try block so that an error raised while the listing
# itself is being iterated is handled like an error inside the loop body.
try:
    # enumerate() provides the running index used for the 100-post checkpoints
    for index, post in enumerate(hot_posts):
        print('title:', post.title)
        print('text:', post.selftext)
        print('number of comments:', post.num_comments)

        # define deleted posts
        is_deleted = post.selftext == "[deleted]"

        # get all comments for the posts (not printed, just saved)
        post.comments.replace_more(limit=None)
        comments = [comment.body for comment in post.comments.list()]

        removed_by_category = post.__dict__.get("removed_by_category", None)

        # Structure data for each post -> store it in a file
        post_info = {
            'title': post.title,
            'selftext': post.selftext,
            'comments': comments,
            'num_comments': post.num_comments,
            'stickied': post.stickied,
            'deleted': is_deleted,
            'upvote_ratio': post.upvote_ratio,
            'score': post.score,
            'created_utc': post.created_utc,
            'removed_by_category': removed_by_category,
        }
        # Add to the list of post data
        post_data.append(post_info)

        # Count deleted/removed posts for the final report. A "[removed]" body
        # is presumably the placeholder for moderator-removed posts (TODO confirm).
        if is_deleted or post.selftext == "[removed]" or removed_by_category is not None:
            missing_posts += 1

        # Save progress every 100 posts for case of error
        if (index + 1) % 100 == 0:
            save_data(post_data, filename)
            print(f"✅ Progress saved after {index + 1} posts")

        # Add a 4-second-delay between requests (to avoid hitting the rate limit)
        time.sleep(4)

# in case of error: stop scraping; the save below writes what was collected
except Exception as e:
    print(f"Error encountered: {e}. Saving progress...") # (optional)

# Save the data to a JSON file
save_data(post_data, filename)

# Report the number of posts scraped (optional)
print(f"Total posts scraped: {len(post_data)}")
print(f"Total missing (deleted/removed) posts: {missing_posts}")

## Combine all scraped data into one file

In [None]:
# Import libraries
import json
import os
from datetime import datetime

folder_path = "/Users/leonaweise/PycharmProjects/Thesis/LeonaThesisProject"
# Sorted so the merge result does not depend on the directory listing order
files = sorted(file for file in os.listdir(folder_path) if file.endswith(".json"))

# Maps (title, body) -> post. A tuple key avoids the collisions that plain
# string concatenation allows ("ab" + "c" equals "a" + "bc").
combined_posts = {}
skipped_files = []

for filename in files:
    file_path = os.path.join(folder_path, filename)

    # Only the read/parse step is guarded, so one malformed post can no longer
    # discard the remaining posts of an otherwise valid file.
    try:
        with open(file_path, encoding="utf-8") as file:
            data = json.load(file)
    except (OSError, ValueError) as e:  # ValueError covers JSON and decoding errors
        skipped_files.append((filename, str(e)))
        continue

    if not isinstance(data, list):
        skipped_files.append((filename, "top-level JSON value is not a list of posts"))
        continue

    invalid_entries = 0
    for post in data:
        if not isinstance(post, dict):
            invalid_entries += 1
            continue

        # `or ""` / `or []` also cover keys that are present but null
        title = str(post.get("title") or "").strip()
        body = str(post.get("selftext") or "").strip()
        comments = post.get("comments") or []
        key = (title, body)

        if key in combined_posts:
            # If new version has more comments, replace it
            stored_comments = combined_posts[key].get("comments") or []
            if len(comments) > len(stored_comments):
                combined_posts[key] = post
        else:
            combined_posts[key] = post

    if invalid_entries:
        skipped_files.append((filename, f"{invalid_entries} entries are not post objects"))

# Prepare output file
timestamp = datetime.now().strftime("%Y-%m-%d")
output_file = f"combined_reddit_data_{timestamp}.json"

# Save
with open(os.path.join(folder_path, output_file), "w", encoding="utf-8") as file:
    json.dump(list(combined_posts.values()), file, indent=4, ensure_ascii=False)

print(f"Combined and saved {len(combined_posts)} unique posts to: {output_file}")
if skipped_files:
    print("Skipped files due to errors:")
    for name, reason in skipped_files:
        print(f" - {name}: {reason}")

# Step 2: Data Preprocessing 

In [None]:
What was removed?
1. Stickied posts (`stickied` is true)
2. Two types of bot-generated comments, identified by their opening sentence:
    "If you or someone you know is contemplating suicide, please do not hesitate to talk to someone. The wonderful u/froidinslip has \nwritten an invaluable post to help you navigate this time: \nhttps://www.reddit.com/r/OCD/comments/q4zeo1/please_read_this_before_posting_about_feeling/ \nYou are not alone, and you have options. However, we are not able to help with suicide on an internet forum. \nPLEASE USE THE RESOURCES. You matter and deserve help.\n\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/OCD) if you have any questions or concerns.*"
    "This post has been automatically marked as \"spoiler\" and \"NSFW\", due to the nature of the content (and in accordance with subreddit rule number 4 if this post has been flaired as \"Crisis\").\n\n(This subreddit uses the \"spoiler\" and \"NSFW\" markers to hide a post's content behind an expandable/collapsible wall. It does not imply that the content contains actual spoiler or NSFW content, and the post will remain publicly-visible.)\n\n**Do not remove the \"spoiler\" and \"NSFW\" markers without permission from the moderators.** Failure to comply can and will result in this post being removed.\n\nThe cooperation in making this subreddit an accessible community for all will be appreciated.\n\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/OCD) if you have any questions or concerns.*"
3. Comments whose entire text is `[removed]`

In [None]:
# Import library
import json

# Read the combined scrape
source_path = "/Users/leonaweise/PycharmProjects/Thesis/LeonaThesisProject/combined_reddit_data_2025-04-13.json"
with open(source_path, encoding="utf-8") as file:
    data = json.load(file)

# Drop stickied posts and remember how many were dropped
non_stickied_data = []
for post in data:
    if post.get('stickied', False):
        continue
    non_stickied_data.append(post)
stickied_removed = len(data) - len(non_stickied_data)

# Opening sentences that identify the two bot comment types
SPOILER_BOT_PREFIX = "This post has been automatically marked as \"spoiler\" and \"NSFW\""
SUICIDE_BOT_PREFIX = "If you or someone you know is contemplating suicide"

# One counter per kind of removed comment
spoiler_bot_comments = 0
suicide_bot_comments = 0
removed_comments = 0

# Posts with filtered comment lists
cleaned_data = []

for post in non_stickied_data:
    cleaned_post = dict(post)
    raw_comments = cleaned_post.get('comments')

    # Posts without a comment list are passed through unchanged
    if isinstance(raw_comments, list):
        kept_comments = []

        for comment in raw_comments:
            # Non-string entries are never filtered
            if isinstance(comment, str):
                if comment.startswith(SPOILER_BOT_PREFIX):
                    spoiler_bot_comments += 1
                    continue
                if comment.startswith(SUICIDE_BOT_PREFIX):
                    suicide_bot_comments += 1
                    continue
                if comment == "[removed]":
                    removed_comments += 1
                    continue
            kept_comments.append(comment)

        cleaned_post['comments'] = kept_comments

    cleaned_data.append(cleaned_post)

# Write the cleaned posts
with open("final_cleaned_reddit_data_2025_04_13.json", "w", encoding="utf-8") as file:
    json.dump(cleaned_data, file)

# Sum of all removed comments
total_comments_removed = sum((spoiler_bot_comments, suicide_bot_comments, removed_comments))

# Summary (optional)
print(f"Done. Processing results:")
print(f"- Removed {stickied_removed} stickied posts")
print(f"- Removed {spoiler_bot_comments} spoiler/NSFW bot comments")
print(f"- Removed {suicide_bot_comments} suicide prevention bot comments")
print(f"- Removed {removed_comments} [removed] comments")
print(f"- Total comments removed: {total_comments_removed}")

# Step 3: Topic Modelling
## Using BERTopic

In [None]:
# Import libraries
from datetime import datetime
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

# 1. Load data as Dataframe
# (path starts with a single "/"; the previous "//Users" was a typo)
with open("/Users/leonaweise/PycharmProjects/Thesis/LeonaThesisProject/final_cleaned_reddit_data_2025_04_13.json", encoding="utf-8") as file:
    data = json.load(file)
df = pd.DataFrame(data)


def _join_comments(comments):
    """Return the string comments of one post joined by single spaces.

    Non-list values give an empty string. Non-string entries are skipped,
    because the preprocessing step keeps them in the comment lists and
    ``str.join`` would raise on them.
    """
    if not isinstance(comments, list):
        return ""
    return " ".join(comment for comment in comments if isinstance(comment, str))


# 2. Prepare documents: Combining title, body and comments
df["whole_post"] = df['title'].fillna("") + " " + df["selftext"].fillna("") + " " + df['comments'].apply(_join_comments)
docs = df["whole_post"].tolist()

# 3. BERTopic Pipeline
# Each component is built from an explicit settings dict and then handed to
# BERTopic in one go.

# Sentence embedding model
embedding_model = SentenceTransformer("all-MiniLM-L12-v2")

# Dimensionality reduction
umap_settings = {
    "n_neighbors": 10,
    "n_components": 3,
    "min_dist": 0.0,
    "metric": 'cosine',
    "random_state": 42,
}
umap_model = UMAP(**umap_settings)

# Clustering
hdbscan_settings = {
    "min_cluster_size": 15,
    "metric": 'euclidean',
    "cluster_selection_method": 'eom',
    "prediction_data": True,
    "min_samples": 10,
}
hdbscan_model = HDBSCAN(**hdbscan_settings)

# Bag-of-words vectorizer (unigrams and bigrams, English stop words removed)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# c-TF-IDF weighting
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Topic representation model (optional, Grootendorst sometimes uses this)
representation_model = KeyBERTInspired()

# 4. Initialize BERTopic with components
bertopic_components = {
    "embedding_model": embedding_model,
    "umap_model": umap_model,
    "hdbscan_model": hdbscan_model,
    "vectorizer_model": vectorizer_model,
    "ctfidf_model": ctfidf_model,
    "representation_model": representation_model,
}
topic_model = BERTopic(
    **bertopic_components,
    top_n_words=10,  # Show more words per topic
    verbose=True,
)

# 5. Fit model
topics, probs = topic_model.fit_transform(docs)

# Save raw topics and probabilities for reuse (before reduction)
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
with open(f"raw_topics_{timestamp}.json", "w", encoding="utf-8") as f:
    json.dump(topics.tolist() if isinstance(topics, np.ndarray) else topics, f)
np.save(f"raw_probs_{timestamp}.npy", probs)

# 6. Reduce final topics
topic_model.reduce_topics(docs, nr_topics=35)
# Per the BERTopic documentation, reduce_topics() updates the fitted
# assignments, which are then available as topics_ / probabilities_.
# Re-running transform(docs) predicts the training documents again and can
# give assignments that disagree with the counts in get_topic_info().
reduced_topics = list(topic_model.topics_)
# NOTE(review): assumed to be one probability per document here, since
# calculate_probabilities is not enabled. Confirm for the installed version.
reduced_probs = topic_model.probabilities_

# Create topic-term matrix
topic_term_matrix = topic_model.get_topics()
with open(f"topic_term_matrix_{timestamp}.pkl", "wb") as f:
    pickle.dump(topic_term_matrix, f)

# Save term ranks
term_ranks = {}
for topic_id in sorted(set(reduced_topics)):  # sorted for a stable JSON key order
    if topic_id != -1:  # Skip outlier topic
        # Get topic terms with weights
        terms_with_weights = topic_model.get_topic(topic_id)
        # Convert to dictionary format for easier analysis
        term_ranks[int(topic_id)] = {word: float(weight) for word, weight in terms_with_weights}

# Save to JSON for easy access
with open(f"topic_term_ranks_{timestamp}.json", "w", encoding="utf-8") as f:
    json.dump(term_ranks, f)

# 6.1 Calculate topic coherence for reduced topics
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora

# Prepare text data for coherence calculation
print("Preparing data for coherence calculation...")
# All docs are turned into lowercase strings for consistency
string_docs = [str(doc).lower() for doc in docs]
# Simple whitespace tokenization (punctuation stays attached to words, so
# some topic terms may not be found in the dictionary below)
processed_docs = [doc.split() for doc in string_docs]
# Create a Dictionary from processed docs
id2word = corpora.Dictionary(processed_docs)
# Drop tokens that occur in fewer than 3 documents or in more than 50% of them
id2word.filter_extremes(no_below=3, no_above=0.5)
# Create corpus
corpus = [id2word.doc2bow(doc) for doc in processed_docs]

# Get topic words for reduced topics
print("Extracting topic words...")
n_words = 10
topics_dict = topic_model.get_topics()
topic_words = []
skipped_topics = []

# Build one list of dictionary words per topic
for topic_id in sorted(topics_dict.keys()):
    if topic_id == -1:  # Skip outlier topic
        continue

    topic = topic_model.get_topic(topic_id)
    if topic:
        # Extract just the words (without weights)
        words = [word for word, _ in topic[:n_words]]
        # Split multi-word terms (bigrams) into their single words
        words_expanded = []
        for word in words:
            words_expanded.extend(word.split() if " " in word else [word])
        # Splitting bigrams can repeat a word ("intrusive thoughts" and
        # "thoughts"). Keep the first occurrence only, so that a word is not
        # paired with itself, which would presumably inflate the score.
        words = list(dict.fromkeys(words_expanded))

        # Report which words are missing from the dictionary
        for word in words:
            if word not in id2word.token2id:
                print(f"Topic {topic_id}: Term '{word}' not in dictionary")
        # Filter out words that aren't in the id2word dictionary
        words_in_vocab = [word for word in words if word in id2word.token2id]

        if len(words_in_vocab) >= 3:  # Need at least a few words for coherence
            topic_words.append(words_in_vocab)
        else:
            skipped_topics.append(topic_id)
            print(f"Warning: Topic {topic_id} has too few words in vocabulary: {words_in_vocab}")
    else:
        skipped_topics.append(topic_id)
        print(f"Warning: Topic {topic_id} is empty")

# Verify we have topics to calculate coherence on
if not topic_words:
    print("ERROR: No valid topics found for coherence calculation!")
    coherence_results = {
        'cv_coherence': None,
        'npmi_coherence': None,
        'num_topics': 0,
        'error': 'No valid topics for coherence calculation'
    }
else:
    # Debug print to help diagnose issues
    print(f"First topic words sample: {topic_words[0]}")
    print(f"Number of topics for coherence: {len(topic_words)}")

    # Calculate coherence metrics - make sure topics and texts match format
    print("Calculating coherence scores...")
    try:
        coherence_cv = CoherenceModel(
            topics=topic_words,
            texts=processed_docs,
            dictionary=id2word,
            coherence='c_v',
            processes=1  # Force single process mode
        ).get_coherence()

        coherence_npmi = CoherenceModel(
            topics=topic_words,
            texts=processed_docs,
            dictionary=id2word,
            coherence='c_npmi',
            processes=1  # Force single process mode
        ).get_coherence()

        # Save coherence results
        coherence_results = {
            'cv_coherence': float(coherence_cv),
            'npmi_coherence': float(coherence_npmi),
            'num_topics': len(topic_words)
        }

        print("Coherence scores:")
        for metric, value in coherence_results.items():
            print(f"  {metric}: {value}")
    except Exception as e:
        print(f"Error in coherence calculation: {e}")
        # Fallback so that the metrics file is still written
        coherence_results = {
            'error': str(e),
            'num_topics': len(topic_words)
        }

if skipped_topics:
    print("Skipped empty or invalid topics:", skipped_topics)
else:
    print("All topics were valid for coherence calculation.")

with open(f"coherence_metrics_{timestamp}.json", "w") as f:
    json.dump(coherence_results, f)

# Bar chart of the number of documents assigned to each topic
sizes_by_topic = pd.Series(reduced_topics).value_counts()
not_outlier = sizes_by_topic.index != -1  # mask that drops the outlier topic
topic_counts = sizes_by_topic[not_outlier]

histogram_ax = topic_counts.sort_index().plot(kind='bar', figsize=(15, 5))
histogram_ax.set_title('Number of Documents per Topic')
histogram_ax.set_xlabel('Topic ID')
histogram_ax.set_ylabel('Count')
histogram_ax.get_figure().savefig(f"topic_size_histogram_{timestamp}.png", dpi=300)

# 7. Export the topic overview table
topic_info = topic_model.get_topic_info()
topic_info.to_csv(f"reduced_topic_info_{timestamp}.csv", index=False)
if len(probs.shape) > 1:
    probabilities = probs.max(axis=1)
else:
    probabilities = np.ones(len(topics))

# Group document positions by their reduced topic in a single pass
# (positions stay in ascending order within each topic)
doc_positions_by_topic = {}
for position, assigned_topic in enumerate(reduced_topics):
    doc_positions_by_topic.setdefault(assigned_topic, []).append(position)

# topic id -> documents, document count, mean probability and keywords
topic_to_docs = {}

for topic_id in set(reduced_topics):
    # The outlier topic is not exported
    if topic_id == -1:
        continue

    topic_docs_indices = doc_positions_by_topic[topic_id]
    topic_probabilities = []
    topic_documents = []

    for idx in topic_docs_indices:
        # A row of per-topic probabilities is indexed by topic id;
        # a single value is used as it is
        raw_prob = reduced_probs[idx]
        if isinstance(raw_prob, np.ndarray):
            doc_prob = float(raw_prob[topic_id])
        else:
            doc_prob = float(raw_prob)
        topic_probabilities.append(doc_prob)

        record = data[idx]

        # Prefer the stored comment count, otherwise count the saved comments
        num_comments = record.get("num_comments")
        if num_comments is None:
            num_comments = len(record.get("comments", []))

        # Full document content together with its topic probability
        topic_documents.append({
            'title': record.get('title', ''),
            'selftext': record.get('selftext', ''),
            'comments': record.get('comments', []),
            'num_comments': num_comments,
            'topic_probability': doc_prob,
        })

    # Highest probability first
    topic_documents = sorted(
        topic_documents,
        key=lambda entry: entry['topic_probability'],
        reverse=True,
    )

    # Mean probability (the divisor is at least 1)
    avg_probability = sum(topic_probabilities) / max(1, len(topic_probabilities))

    top_keywords = [word for word, _ in topic_model.get_topic(topic_id)][:10]

    topic_to_docs[int(topic_id)] = {
        'documents': topic_documents,
        'document_count': len(topic_docs_indices),
        'avg_topic_probability': avg_probability,
        'keywords': top_keywords,
    }

# Write all documents grouped by topic
with open(f"documents_by_topic_{timestamp}.json", "w", encoding="utf-8") as f:
    json.dump(topic_to_docs, f)

# 9. Visualizations (each figure is written as a standalone HTML file)
fig_heatmap = topic_model.visualize_heatmap()
fig_heatmap.write_html(f"topic_document_heatmap_{timestamp}.html")

fig_topics = topic_model.visualize_topics()
fig_topics.write_html(f"intertopic_distance_map_{timestamp}.html")

fig_barchart = topic_model.visualize_barchart(top_n_topics=50)
fig_barchart.write_html(f"topic_barchart_{timestamp}.html")

fig_hierarchy = topic_model.visualize_hierarchy()
fig_hierarchy.write_html(f"topic_hierarchy_{timestamp}.html")

# 10. Save model
topic_model.save(f"bertopic_model_{timestamp}")

# Count every topic except the outlier topic (-1). Subtracting 1 from the
# table length would be off by one whenever no outlier row exists.
num_topics_found = int((topic_info["Topic"] != -1).sum())
print(f"Analysis complete. Found {num_topics_found} topics.")
print(f"Topic information:")
print(topic_info.head(10))

## Create Labels Using GPT-4o

In [None]:
# Import libraries
import os
import pandas as pd
import ast
import time
import openai
from bertopic import BERTopic

# Set the OpenAI API key
# setdefault() keeps a key that is already exported in the environment; the
# placeholder is only used when none is set (a plain assignment would
# overwrite a real key with the placeholder).
os.environ.setdefault("OPENAI_API_KEY", "xxxxxxxxxxxxxxxxxxxx")

# Load topic keywords from CSV
df_keywords = pd.read_csv("reduced_topic_info_2025-04-22_22-52-49.csv")
# Keep real topics only (the outlier topic has id -1)
df_keywords = df_keywords[df_keywords["Topic"] >= 0].copy()
# The keyword lists are stored as their string form; parse them back into lists
df_keywords["Parsed_Keywords"] = df_keywords["Representation"].apply(ast.literal_eval)
topic_to_keywords = dict(zip(df_keywords["Topic"], df_keywords["Parsed_Keywords"]))

# Load the full thesis document summary
df_full_summary = pd.read_csv("Thesis_Topic_Document_Summary_FULL.csv")

# Create OpenAI client
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Detailed prompt for OCD topic labeling
# The two placeholders {keywords} and {sample_posts} are filled with str.format().
# NOTE(review): the rules ask for the format "topic: <label>", while the last
# line asks for the label only. Confirm which output format is intended.
# NOTE(review): one request is sent per topic, so the model cannot check the
# "do not repeat labels across topics" rule from this prompt alone.
prompt_template = """You will receive topic keywords and full Reddit posts (including titles, selftexts, and comments).
Your task is to generate a short, accurate, and uniquely descriptive topic label for each topic, 
that captures the specific OCD theme, concern, or experience in this topic.

You are analyzing a topic from an OCD subreddit. The topic contains posts with these keywords:
{keywords}

Here are sample posts from this topic:
{sample_posts}

Follow these strict rules:
- Format: topic: <label>
- The label must be no more than 4 words
- Do not repeat labels across topics — each label must be semantically distinct
- Avoid vague or generic labels like "topic 2", or "miscellaneous"
- Base the label primarily on the post content, not just the keywords
- Consider the types of distress, compulsions, fears, or themes being expressed
- Use natural language, not diagnostic or clinical jargon unless reflected in the posts
- Focus on the specific fears, compulsions, or themes expressed

Examples of good labels: 'Religious and Moral Scrupulosity', 'Memory issues', 'OCD Treatment', 'Pregnancy-related OCD',
 'Numerical OCD', 'Pregnancy-Related Intrusive Thoughts'

Respond only with the label, without quotation marks or explanation."""

# Process each topic individually
topic_labels = {}
failed_topics = []  # topics for which no label could be obtained
valid_topics = sorted(topic_to_keywords.keys())
total_topics = len(valid_topics)

# The summary CSV may carry the topic number only on each topic's header row,
# with the document rows below it left empty. Forward-filling gives every
# document row the number of the header above it. It changes nothing when
# the column is already filled on every row.
topic_number_filled = df_full_summary["topic_number"].ffill()


def _as_text(value):
    """Return ``value`` as a string, with missing (NaN/None) cells as ""."""
    return "" if pd.isna(value) else str(value)


for index, topic_id in enumerate(valid_topics, start=1):
    print(f"\nProcessing Topic {topic_id} ({index}/{total_topics})...")

    # Get keywords for this topic
    keywords = topic_to_keywords[topic_id]
    keywords_str = ", ".join(keywords[:15])

    # Get all posts for this topic (rows with a post title are document rows)
    topic_docs = df_full_summary[(topic_number_filled == topic_id) & df_full_summary["post_title"].notna()]

    # Use all available posts for each topic (no limit)
    # NOTE(review): large topics may exceed the model's context window.
    sample_posts = []
    for _, doc in topic_docs.iterrows():
        title = _as_text(doc["post_title"])
        selftext = _as_text(doc["post_body"])
        comments = _as_text(doc["comments"])

        # Create a combined sample with title, body, and comments
        sample_posts.append(f"POST TITLE: {title}\nPOST CONTENT: {selftext}\nCOMMENTS:\n{comments}")

    # Combine samples for the final prompt
    sample_posts_str = "\n".join(sample_posts)

    # Create the complete prompt
    prompt = prompt_template.format(
        keywords=keywords_str,
        sample_posts=sample_posts_str
    )

    # Call OpenAI API directly
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system",
                 "content": "You are a helpful assistant specializing in psychology and mental health."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            # Still short, but leaves room for a 4-word label plus an optional
            # "topic:" prefix without cutting it off.
            max_tokens=20
        )

        # Extract the label; drop a leading "topic:" if the model echoed the format
        label = (response.choices[0].message.content or "").strip()
        if label.lower().startswith("topic:"):
            label = label[len("topic:"):].strip()
        if not label:
            raise ValueError("empty label returned")

        # Store the label
        topic_labels[topic_id] = label
        print(f"  Topic {topic_id} label: {label}")

        # Wait between API calls to avoid rate limits
        time.sleep(15)

    except Exception as e:
        failed_topics.append(topic_id)
        print(f" Error processing Topic {topic_id}: {str(e)}")

        # Wait longer if rate limited
        if "rate_limit" in str(e).lower():
            time.sleep(60)
        else:
            time.sleep(15)

if failed_topics:
    print(f"\nNo label generated for topics: {failed_topics}")

# Update model with new labels
if topic_labels:
    print(f"\nUpdating model with {len(topic_labels)} new topic labels...")

    # Save just the topic labels to a CSV file
    topic_labels_df = pd.DataFrame({
        'Topic': list(topic_labels.keys()),
        'Label': list(topic_labels.values())
    })
    topic_labels_df.to_csv("topic_labels_only.csv", index=False)
    print("Saved raw topic labels to topic_labels_only.csv")

    topic_model = BERTopic.load("bertopic_model_2025-04-22_22-52-49")
    topic_model.set_topic_labels(topic_labels)
    print("Labels applied successfully")

    # Save the updated model. The path is held in one variable so that the
    # message always names the file that was actually written.
    updated_model_path = "TOPICS_updated_with_gpt4o_labels3"
    topic_model.save(updated_model_path)
    print(f"Updated model saved as '{updated_model_path}'")

    # Export topic info to CSV
    topic_info = topic_model.get_topic_info()
    topic_info.to_csv("GPT4o_Topic_Labels2.csv", index=False)
    print("Saved topic labels to GPT4o_Topic_Labels2.csv")
else:
    print("No labels were generated, model not updated")

## Create an overview of all topics

In [None]:
import os
import json
import pandas as pd
import ast
import time
from bertopic import BERTopic

# Read the per-topic document export
with open("documents_by_topic_2025-04-22_22-52-49.json", "r") as f:
    documents_by_topic = json.loads(f.read())

# Topic id -> topic name, taken from the saved model
topic_model = BERTopic.load("bertopic_model_2025-04-22_22-52-49")
topic_info = topic_model.get_topic_info()
label_map = {
    topic: name
    for topic, name in zip(topic_info["Topic"], topic_info["Name"])
}

# Keyword table; the stringified keyword lists are parsed back into lists
df_keywords = pd.read_csv("reduced_topic_info_2025-04-22_22-52-49.csv")
df_keywords["Parsed_Keywords"] = [
    ast.literal_eval(representation)
    for representation in df_keywords["Representation"]
]

# Build grouped CSV structure: one header row per topic, followed by one row
# per document of that topic
rows = []
for _, row in df_keywords.iterrows():
    topic_id = row["Topic"]

    # Skip the -1 topic (outlier group)
    if topic_id == -1:
        print(f"Skipping outlier topic -1")
        continue

    topic_label = label_map.get(topic_id, "UNLABELED")
    topic_keywords = ", ".join(row["Parsed_Keywords"])
    # The JSON keys are strings, hence str(topic_id)
    documents = documents_by_topic.get(str(topic_id), {}).get("documents", [])

    # Calculate average topic probability
    if documents:
        avg_prob = round(sum(doc.get("topic_probability", 0) for doc in documents) / len(documents), 4)
    else:
        avg_prob = 0.0

    # Sort documents by topic probability (highest to lowest)
    sorted_docs = sorted(documents, key=lambda doc: doc.get("topic_probability", 0), reverse=True)

    # Add topic header row (document columns stay empty)
    rows.append({
        "topic_number": topic_id,
        "topic_label": topic_label,
        "topic_keywords": topic_keywords,
        "average_topic_probability": avg_prob,
        "document_count": len(documents),
        "post_title": "",
        "post_body": "",
        "comments": "",
        "number_of_comments": "",
        "topic_probability": ""
    })

    # Add sorted document rows (topic columns stay empty)
    for doc in sorted_docs:
        title = doc.get("title", "")
        selftext = doc.get("selftext", "")

        # Add numbered comments with line breaks. str() guards against
        # non-string entries, which the preprocessing step keeps in the lists
        # and which have no .strip().
        comments_list = doc.get("comments", [])
        comments = "\n".join(
            f"{number}. {str(comment).strip()}"
            for number, comment in enumerate(comments_list, start=1)
        )

        comment_count = doc.get("num_comments", len(comments_list))
        topic_prob = round(doc.get("topic_probability", 0), 6)  # Use 6 decimals for consistency

        rows.append({
            "topic_number": "",
            "topic_label": "",
            "topic_keywords": "",
            "average_topic_probability": "",
            "document_count": "",
            "post_title": title,
            "post_body": selftext,
            "comments": comments,
            "number_of_comments": comment_count,
            "topic_probability": topic_prob
        })

# Export to CSV
# The file name is held in one variable so the message names the file that
# was actually written (it previously named a different file).
summary_csv_path = "Thesis_Topic_Document_Summary_FULL2.csv"
final_df = pd.DataFrame(rows)
final_df.to_csv(summary_csv_path, index=False)
print(f"✅ Exported: {summary_csv_path} (grouped by topic)")

# Print sample of topic labels
print("\n📋 Sample topic labels:")
print(topic_info[["Topic", "Name"]].head(10).to_string(index=False))

# Step 4: Sentiment Analysis and Emotion Recognition
## Using twitter-roberta-base-sentiment and twitter-roberta-base-emotion-multilabel-latest

In [None]:
# Import libraries
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager, rcParams
from tqdm import tqdm
from collections import Counter
from datetime import datetime

# Set font for Results section globally (optional)
# Load SF Pro Display font manually (optional)
from matplotlib import font_manager

# Font used for the figures (points to a local font file)
font_path = '/Users/leonaweise/Desktop/SF-Pro-Display-Regular.otf'  # <-- path to your font file
sfpro_font = font_manager.FontProperties(fname=font_path)

# Run timestamp used in output file names
timestamp = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

# Topic-assigned documents exported for this step
results_df = pd.read_csv("bertopic_results_for_sentiment.csv")

# Sentiment classifier and its tokenizer
print("Loading sentiment analysis model...")
sentiment_model_name = "cardiffnlp/twitter-roberta-base-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)

# Multi-label emotion classifier and its tokenizer
print("Loading emotion analysis model...")
emotion_model_name = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)

# Class names, indexed by the position of the model output they are paired with
sentiment_labels = [
    "Negative",
    "Neutral",
    "Positive",
]

emotion_labels = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "love",
    "optimism",
    "pessimism",
    "sadness",
    "surprise",
    "trust",
]


def analyze_sentiment(texts, max_length=512):
    """Classify each text with the sentiment model.

    Args:
        texts: Iterable of documents. Missing values (``None`` or float NaN)
            are scored as the empty string; any other non-string value is
            converted with ``str()``.
        max_length: Token limit passed to the tokenizer; longer inputs are
            truncated.

    Returns:
        One dict per text with the keys ``"sentiment"`` (label with the
        highest score), ``"score"`` (that score) and ``"scores"`` (label ->
        softmax score for every entry of ``sentiment_labels``).
    """
    results = []
    for text in tqdm(texts, desc="Analyzing sentiment"):
        # None used to be turned into the literal text "None" and scored
        is_missing = text is None or (isinstance(text, float) and np.isnan(text))
        text = "" if is_missing else str(text)

        # Character cap applied before tokenization
        text = text[:10000]
        inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = sentiment_model(**inputs)
        # Softmax over the class dimension; [0] selects the single input
        scores = torch.nn.functional.softmax(outputs.logits, dim=1).detach().numpy()[0]
        label_id = int(np.argmax(scores))
        results.append({
            "sentiment": sentiment_labels[label_id],
            "score": float(scores[label_id]),
            "scores": {label: float(score) for label, score in zip(sentiment_labels, scores)}
        })
    return results


def analyze_emotions(texts, threshold=0.5, max_length=512):
    """Score each text with the multi-label emotion model.

    Args:
        texts: Iterable of documents. Missing values (``None`` or float NaN)
            are scored as the empty string; any other non-string value is
            converted with ``str()``.
        threshold: An emotion is listed for a text when its sigmoid score is
            strictly greater than this value.
        max_length: Token limit passed to the tokenizer; longer inputs are
            truncated.

    Returns:
        One dict per text with the keys ``"emotions"`` (labels above the
        threshold) and ``"scores"`` (label -> sigmoid score for every entry
        of ``emotion_labels``).
    """
    results = []
    for text in tqdm(texts, desc="Analyzing emotions"):
        # None used to be turned into the literal text "None" and scored
        is_missing = text is None or (isinstance(text, float) and np.isnan(text))
        text = "" if is_missing else str(text)

        # Character cap applied before tokenization
        text = text[:10000]
        inputs = emotion_tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = emotion_model(**inputs)
        # Independent sigmoid per label; [0] selects the single input
        scores = torch.sigmoid(outputs.logits).detach().numpy()[0]
        emotions = [emotion_labels[i] for i, score in enumerate(scores) if score > threshold]
        emotion_scores = {emotion_labels[i]: float(score) for i, score in enumerate(scores)}
        results.append({
            "emotions": emotions,
            "scores": emotion_scores
        })
    return results
def analyze_topic_sentiments_emotions(df, docs_per_topic=None):
    """Summarise sentiment and emotion scores for the documents of each topic.

    Args:
        df: DataFrame with a ``"topic"`` column (topic ID per row) and a
            ``"document"`` column (the text to analyse).
        docs_per_topic: Maximum number of rows analysed per topic; the first
            rows of each topic are used. Defaults to 50 when ``None``.
            NOTE(review): taking the first rows assumes ``df`` is already
            ordered most-representative first - confirm against the caller.

    Returns:
        A dict keyed by topic ID. Each value holds
        ``"sentiment_distribution"`` (share of analysed documents per
        predicted sentiment label; labels that never occur are absent),
        ``"emotion_distribution"`` (mean score per label in
        ``emotion_labels``) and ``"sample_size"`` (number of analysed
        documents).
    """
    if docs_per_topic is None:
        docs_per_topic = 50

    topic_results = {}
    for topic_id in sorted(df["topic"].unique()):
        print(f"\nAnalyzing Topic {topic_id}")
        # head() already returns every row when the topic has fewer than
        # docs_per_topic documents, so no explicit length check is needed.
        sampled_docs = df[df["topic"] == topic_id].head(docs_per_topic)
        texts = sampled_docs["document"].tolist()

        sentiment_results = analyze_sentiment(texts)
        emotion_results = analyze_emotions(texts)

        sentiment_counts = Counter(r["sentiment"] for r in sentiment_results)
        sentiment_dist = {s: count/len(sentiment_results) for s, count in sentiment_counts.items()}

        # Average score of each emotion across the analysed documents; an
        # empty sample (e.g. docs_per_topic=0) yields 0.0 instead of raising
        # ZeroDivisionError.
        n_docs = len(emotion_results)
        emotion_scores = {
            emotion: (
                sum(r["scores"].get(emotion, 0) for r in emotion_results) / n_docs
                if n_docs else 0.0
            )
            for emotion in emotion_labels
        }

        topic_results[topic_id] = {
            "sentiment_distribution": sentiment_dist,
            "emotion_distribution": emotion_scores,
            "sample_size": len(sampled_docs)
        }
    return topic_results

# Run the per-topic sentiment and emotion analysis
topic_sentiment_emotion_results = analyze_topic_sentiments_emotions(results_df)

# Flatten the nested results into one row per topic. Rows are collected in a
# list and converted once, instead of growing a DataFrame with pd.concat on
# every iteration.
topic_rows = []
for topic_id, data in topic_sentiment_emotion_results.items():
    row = {"Topic": topic_id, "Sample_Size": data["sample_size"]}
    for sentiment, value in data["sentiment_distribution"].items():
        row[f"Sentiment_{sentiment}"] = value
    for emo, value in data["emotion_distribution"].items():
        row[f"Emotion_{emo}"] = value
    topic_rows.append(row)
sentiment_emotion_df = pd.DataFrame(topic_rows)

# Ensure the sentiment columns used by the summary table exist. A topic's
# sentiment distribution only lists labels that were actually predicted, so a
# missing label means a proportion of 0 - not NaN, and not a missing column.
for col_name in ['Sentiment_Negative', 'Sentiment_Neutral', 'Sentiment_Positive']:
    if col_name not in sentiment_emotion_df.columns:
        sentiment_emotion_df[col_name] = 0.0
sentiment_cols = [col for col in sentiment_emotion_df.columns if col.startswith('Sentiment_')]
sentiment_emotion_df[sentiment_cols] = sentiment_emotion_df[sentiment_cols].fillna(0.0)

# Ensure all emotions columns exist
for emotion in emotion_labels:
    col_name = f"Emotion_{emotion}"
    if col_name not in sentiment_emotion_df.columns:
        sentiment_emotion_df[col_name] = 0.0

# Save CSV
sentiment_emotion_df.to_csv(f"topic_sentiment_emotion_{timestamp}.csv", index=False)

# Create a clean summary table of sentiment distributions by topic
sentiment_summary = sentiment_emotion_df[['Topic', 'Sample_Size', 'Sentiment_Negative', 'Sentiment_Neutral', 'Sentiment_Positive']].copy()

# Sort by topic ID
sentiment_summary = sentiment_summary.sort_values('Topic')

# Save this simplified table
sentiment_summary.to_csv(f"topic_sentiment_summary_{timestamp}.csv", index=False)

# Horizontal stacked bar chart: one bar per topic, split into sentiment shares
fig, ax = plt.subplots(figsize=(12, 10))

# APA 7 style: hide the top and right axes borders
for spine_name in ('top', 'right'):
    ax.spines[spine_name].set_visible(False)

# Pull the plotted series out of the summary table
topics = sentiment_summary['Topic'].tolist()
neg = sentiment_summary['Sentiment_Negative'].tolist()
neu = sentiment_summary['Sentiment_Neutral'].tolist()
pos = sentiment_summary['Sentiment_Positive'].tolist()

# One bar position per topic
indices = list(range(len(topics)))
width = 0.8

# Segments are stacked left to right: negative, neutral, positive.
# The positive segment starts where negative + neutral ends.
neg_plus_neu = [n + ne for n, ne in zip(neg, neu)]
ax.barh(indices, neg, width, label='Negative', color='#d92f0d')
ax.barh(indices, neu, width, left=neg, label='Neutral', color='#aaaaaa')
ax.barh(indices, pos, width, left=neg_plus_neu, label='Positive', color='#2aad32')

# Label each bar with its topic ID
ax.set_yticks(indices)
ax.set_yticklabels([f"Topic {t}" for t in topics], fontproperties=sfpro_font)
ax.set_xlabel('Proportion', fontproperties=sfpro_font)

# Legend centred below the axes, boxed with a thin black border
leg = ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=3, prop=sfpro_font)
frame = leg.get_frame()
frame.set_edgecolor('black')
frame.set_linewidth(1.0)

# Save the figure, then close it
fig.tight_layout()
fig.savefig(f"sentiment_distribution_by_topic_{timestamp}.png", dpi=300, bbox_inches="tight")
plt.close(fig)

def _plot_grouped_topic_bars(topic_results, topic_ids, result_key, labels, *,
                             figsize, xlabel, title, filename, rotation=0):
    """Draw one grouped bar chart (one bar per topic under each label) and save it."""
    fig, ax = plt.subplots(figsize=figsize)
    x_vals = list(range(len(labels)))
    n_topics = len(topic_ids)
    # All topics share a 0.8-wide slot centred on each tick, so bars never
    # spill into the neighbouring label regardless of the topic count or of
    # the numeric value of the topic IDs.
    bar_width = 0.8 / max(n_topics, 1)
    for position, topic_id in enumerate(topic_ids):
        dist = topic_results[topic_id][result_key]
        y = [dist.get(label, 0) for label in labels]
        offset = (position - (n_topics - 1) / 2) * bar_width
        ax.bar([xi + offset for xi in x_vals], y, width=bar_width, label=f"Topic {topic_id}")
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Proportion")
    ax.set_title(title)
    ax.set_xticks(x_vals)
    ax.set_xticklabels(labels, rotation=rotation)
    if n_topics:
        # A legend without any plotted bars only produces a warning.
        ax.legend()
    fig.tight_layout()
    fig.savefig(filename, dpi=300, bbox_inches="tight")
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def plot_topic_sentiment_emotions(topic_results, top_n_topics=10):
    """Save grouped bar charts of sentiment and emotion values per topic.

    Two PNG files are written, named with the module-level ``timestamp``:
    ``topic_sentiment_distribution_<timestamp>.png`` and
    ``topic_emotion_distribution_<timestamp>.png``.

    Args:
        topic_results: Dict keyed by topic ID whose values contain the
            ``"sentiment_distribution"`` and ``"emotion_distribution"``
            dicts (label -> value).
        top_n_topics: Maximum number of topics to plot, taken in the
            iteration order of ``topic_results``. Topic ``-1`` is skipped.
    """
    plot_topic_ids = [t for t in topic_results if t != -1][:top_n_topics]
    _plot_grouped_topic_bars(
        topic_results, plot_topic_ids, "sentiment_distribution", sentiment_labels,
        figsize=(12, 8),
        xlabel="Sentiment",
        title="Sentiment Distribution Across Topics",
        filename=f"topic_sentiment_distribution_{timestamp}.png",
    )
    _plot_grouped_topic_bars(
        topic_results, plot_topic_ids, "emotion_distribution", emotion_labels,
        figsize=(14, 8),
        xlabel="Emotion",
        title="Emotion Distribution Across Topics",
        filename=f"topic_emotion_distribution_{timestamp}.png",
        rotation=45,
    )

# Plot up to 100 topics (topic -1 is skipped inside the function)
plot_topic_sentiment_emotions(topic_sentiment_emotion_results, top_n_topics=100)
print("Sentiment and emotion analysis complete. Results saved to CSV and visualizations created.")

# Calculate average sentiment for each topic
topic_sentiment_summary = sentiment_emotion_df.groupby('Topic')[['Sentiment_Positive', 'Sentiment_Neutral', 'Sentiment_Negative']].mean().reset_index()

# Find the dominant emotions for each topic
emotion_cols = [col for col in sentiment_emotion_df.columns if col.startswith('Emotion_')]
topic_emotion_summary = sentiment_emotion_df.groupby('Topic')[emotion_cols].mean().reset_index()

# For each topic, identify the top 2 emotions.
# The topic ID is carried as the index while iterating: iterrows() builds one
# Series per row, so leaving 'Topic' as a column next to the float scores
# would upcast it to float and write IDs such as "0.0" to the CSV.
top_emotions_by_topic = []
for topic, row in topic_emotion_summary.set_index('Topic')[emotion_cols].iterrows():
    # Get emotion scores for this topic
    emotion_scores = {col.replace('Emotion_', ''): row[col] for col in emotion_cols}
    # Sort by score and get top 2 (ties keep column order because the sort is stable)
    top_emotions = sorted(emotion_scores.items(), key=lambda x: x[1], reverse=True)[:2]
    top_emotions_by_topic.append({
        'Topic': topic,
        'Primary_Emotion': top_emotions[0][0],
        'Primary_Score': top_emotions[0][1],
        'Secondary_Emotion': top_emotions[1][0] if len(top_emotions) > 1 else None,
        'Secondary_Score': top_emotions[1][1] if len(top_emotions) > 1 else None
    })

# Convert to DataFrame
top_emotions_df = pd.DataFrame(top_emotions_by_topic)

# Save the topic sentiment summary
topic_sentiment_summary.to_csv(f"topic_sentiment_averages_{timestamp}.csv", index=False)

# Save the top emotions for each topic
top_emotions_df.to_csv(f"topic_dominant_emotions_{timestamp}.csv", index=False)

# Create visualizations for all 34 topics across 11 emotions using three line charts

# Define 13 distinct colors
primary_colors = [
    "#3498db",  # Bright Blue
    "#2ecc71",  # Bright Green
    "#f39c12",  # Orange-Yellow
    "#f1c40f",  # Bright Yellow
    "#d92f0d",  # Strong Red
    "#2c1b96",  # Deep Blue-Purple
    "#045c2d",  # Deep Forest Green
    "#d90d5f",  # Strong Pink
    "#d94e0d",  # Bright Orange-Red
    "#650dd9",  # Deep Violet
    "#7289a1",  # Steel Blue
    "#a6842e",  # Bronze Yellow
    "#de9a97",  # Soft Coral Pink
]

# Prepare emotion columns and labels
# NOTE(review): this rebinds the module-level emotion_labels, which
# analyze_emotions() indexes by model output position. The new list equals the
# old one only while the Emotion_ columns are in the model's label order.
emotion_cols = [col for col in sentiment_emotion_df.columns if col.startswith('Emotion_')]
emotion_labels = [col.replace('Emotion_', '') for col in emotion_cols]

# Topic groups: (start_topic, end_topic), both bounds inclusive; one chart per group
topic_groups = [
    (0, 11),
    (12, 24),
    (25, 34)
]

# Set figure size to A4 landscape
figsize = (11.7, 8.3)

# Timestamp for saving files (a fresh value, so these files are named
# independently of the ones saved earlier in the script)
from datetime import datetime

timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

for idx, (start_topic, end_topic) in enumerate(topic_groups, 1):
    # Filter topics
    group_df = sentiment_emotion_df[
        (sentiment_emotion_df['Topic'] >= start_topic) & (sentiment_emotion_df['Topic'] <= end_topic)
        ]

    # Create figure
    fig, ax = plt.subplots(figsize=figsize)
    # Remove the top and right borders for APA
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Plot each topic as one line across the emotion labels
    for i, (_, row) in enumerate(group_df.iterrows()):
        y = [row[f'Emotion_{emo}'] for emo in emotion_labels]
        color = primary_colors[i % len(primary_colors)]  # Loop colors safely
        # iterrows() yields the topic ID as a float, hence the int() for the label
        ax.plot(emotion_labels, y, label=f'Topic {int(row["Topic"])}',
                color=color, linewidth=3, alpha=0.9)

    # Customize axes and legend
    ax.set_xlabel('Emotion', fontsize=14, fontproperties=sfpro_font)
    ax.set_ylabel('Proportion', fontsize=14, fontproperties=sfpro_font)
    ax.set_ylim(0, 1)
    ax.tick_params(axis='x', rotation=45)
    # The legend is created once (the original built it twice with identical arguments)
    leg = ax.legend(loc='upper right', prop=sfpro_font)
    frame = leg.get_frame()
    frame.set_edgecolor('black')  # Set legend box border color to black
    frame.set_linewidth(1.0)  # Optional: set border thickness (1.0 is clean and APA)

    plt.tight_layout()

    # Save figure
    plt.savefig(f"emotion_distribution_topics_{start_topic}-{end_topic}_sfpro_{timestamp}.png",
                dpi=300, bbox_inches="tight")
    plt.close()

print("Emotion distribution line charts generated and saved successfully!")
