In [4]:
import pandas as pd
# Load the cleaned Reddit dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("datasets/reddit_cleaned.csv")

In [6]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Make sure the VADER lexicon is available locally before building the analyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

# Score each row's raw text (cast to str first) and keep only VADER's compound value
df['vader_score'] = df['text'].astype(str).map(lambda text: sia.polarity_scores(text)['compound'])

# Classify sentiment based on compound score
def classify_sentiment(score):
    """Map a compound sentiment score to a coarse label.

    Returns 'Positive' for scores of 0.05 or more, 'Negative' for scores of
    -0.05 or less, and 'Neutral' for everything else (including NaN, which
    fails both comparisons).
    """
    if score <= -0.05:
        return 'Negative'
    if score >= 0.05:
        return 'Positive'
    return 'Neutral'

# Turn each compound score into a Positive / Negative / Neutral label
df['sentiment'] = df['vader_score'].map(classify_sentiment)

# Parse timestamps (unparseable values become NaT), then keep the calendar date
# in its own column for trend analysis
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
df['date'] = df['datetime'].dt.date

# Persist the enriched frame and echo the output file name
df.to_csv('datasets/reddit_with_sentiment.csv', index=False)
print("reddit_with_sentiment.csv")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/harrietmathew/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


reddit_with_sentiment.csv


In [7]:
# Classify strong sentiment
def classify_strong_sentiment(score):
    """Bucket a compound sentiment score by strength.

    Returns 'strong_positive' for scores of 0.6 or more, 'strong_negative'
    for scores of -0.6 or less, and 'other' for anything in between
    (including NaN, which fails both comparisons).
    """
    if score <= -0.6:
        return 'strong_negative'
    if score >= 0.6:
        return 'strong_positive'
    return 'other'

# Tag every row with its source platform and bucket its compound score by strength
df['platform'] = 'Reddit'
df['strong_sentiment'] = df['vader_score'].map(classify_strong_sentiment)

# Per (platform, event): mean compound score, row count, and size of each strong bucket
grouped = df.groupby(['platform', 'event'])
summary = grouped.agg(
    avg_sentiment=('vader_score', 'mean'),
    total=('vader_score', 'count'),
    strong_positive=('strong_sentiment', lambda labels: labels.eq('strong_positive').sum()),
    strong_negative=('strong_sentiment', lambda labels: labels.eq('strong_negative').sum()),
).reset_index()

# Express each strong bucket as a percentage of the group's rows, rounded to one decimal
summary['% Strong Positive (≥ 0.6)'] = summary['strong_positive'].div(summary['total']).mul(100).round(1)
summary['% Strong Negative (≤ -0.6)'] = summary['strong_negative'].div(summary['total']).mul(100).round(1)

# Keep only the reporting columns and give the first three display-friendly headers
summary_table = summary[
    ['platform', 'event', 'avg_sentiment', '% Strong Positive (≥ 0.6)', '% Strong Negative (≤ -0.6)']
].rename(columns={
    'platform': 'Platform',
    'event': 'Event',
    'avg_sentiment': 'Avg. Sentiment',
})

# Write the table to disk and show it
summary_table.to_csv('datasets/reddit_emotional_unity_summary.csv', index=False)
print(summary_table)

  Platform      Event  Avg. Sentiment  % Strong Positive (≥ 0.6)  \
0   Reddit   Olympics        0.142080                       22.7   
1   Reddit  World Cup        0.120668                       22.0   

   % Strong Negative (≤ -0.6)  
0                         8.8  
1                         9.5  


In [8]:
# Load the per-platform summary tables from disk.
# NOTE(review): the YouTube CSV is not written by any cell visible here — presumably it
# comes from a separate notebook; confirm it exists and uses the same column names.
reddit_summary = pd.read_csv('datasets/reddit_emotional_unity_summary.csv')
youtube_summary = pd.read_csv('datasets/youtube_emotional_unity_summary.csv')

# Stack the two tables row-wise; ignore_index=True renumbers the rows from 0.
combined_summary = pd.concat([reddit_summary, youtube_summary], ignore_index=True)
print(combined_summary)

  Platform      Event  Avg. Sentiment  % Strong Positive (≥ 0.6)  \
0   Reddit   Olympics        0.142080                       22.7   
1   Reddit  World Cup        0.120668                       22.0   
2  YouTube   Olympics        0.060340                       14.9   
3  YouTube  World Cup        0.073496                       13.2   

   % Strong Negative (≤ -0.6)  
0                         8.8  
1                         9.5  
2                         9.3  
3                         5.6  


In [31]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Reload the sentiment-enriched dataset written earlier
df = pd.read_csv('datasets/reddit_with_sentiment.csv')

# LDA runs on the already-cleaned text; the strong-sentiment bucket is recomputed
# from the stored compound score
df = df.assign(
    lda_text=df['clean_text'],
    strong_sentiment=df['vader_score'].map(classify_strong_sentiment),
)

# LDA topic function
def get_topics_from_texts(texts, n_topics=4, n_top_words=10):
    """Fit an LDA topic model on `texts` and return each topic's top words.

    The texts are turned into a bag-of-words matrix with CountVectorizer
    (English stop words removed, terms kept only if they occur in at least
    5 documents and in at most 95% of them). A LatentDirichletAllocation
    model with `n_topics` components and a fixed random_state of 42 is then
    fitted on that matrix.

    Returns a list of ("Topic <k>", [word, ...]) tuples, one per topic, with
    k starting at 1 and the words ordered from highest to lowest topic weight,
    at most `n_top_words` of them.
    """
    bow = CountVectorizer(max_df=0.95, min_df=5, stop_words='english')
    doc_term = bow.fit_transform(texts)

    model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    model.fit(doc_term)

    vocab = bow.get_feature_names_out()
    # argsort is ascending, so the reversed tail slice yields the heaviest words first
    return [
        (f"Topic {number}", [vocab[i] for i in weights.argsort()[:-n_top_words - 1:-1]])
        for number, weights in enumerate(model.components_, start=1)
    ]
    
# Fit and print LDA topics separately for every (event, strong-sentiment) slice.
for event in ['World Cup', 'Olympics']:
    for sentiment in ['strong_positive', 'strong_negative']:
        subset = df[(df['event'] == event) & (df['strong_sentiment'] == sentiment)]
        # Drop rows with missing text here, after the slice exists. Doing it before the
        # loop referenced `subset` before assignment (NameError on a fresh kernel) and
        # had no effect on the slices that are actually passed to the topic model.
        subset = subset.dropna(subset=['lda_text'])
        if not subset.empty:
            print(f"\nTop Topics in {sentiment.replace('_', ' ').title()} Comments - {event}")
            topics = get_topics_from_texts(subset['lda_text'])
            for topic, keywords in topics:
                print(f"{topic}: {', '.join(keywords)}")



Top Topics in Strong Positive Comments - World Cup
Topic 1: thank, like, great, post, russia, love, thanks, question, qatar, people
Topic 2: like, team, year, post, time, race, point, make, sport, championship
Topic 3: good, really, like, game, happy, love, think, people, year, time
Topic 4: team, world, cup, player, game, win, best, final, like, play

Top Topics in Strong Negative Comments - World Cup
Topic 1: like, game, dont, time, people, team, shit, hell, going, really
Topic 2: world, cup, russia, country, fifa, team, russian, people, shit, war
Topic 3: day, group, thing, like, dead, cheating, let, damn, new, evidence
Topic 4: qatar, people, fuck, world, dont, cup, like, make, country, year

Top Topics in Strong Positive Comments - Olympics
Topic 1: like, love, great, make, dont, really, know, people, thing, good
Topic 2: song, like, love, album, really, good, feel, dont, track, sound
Topic 3: like, olympics, medal, time, gold, year, going, think, people, best
Topic 4: like, game