In [3]:
import networkx as nx
import pandas as pd
from collections import defaultdict

G = nx.read_graphml('graphs/reddit/merged_all_events_louvain.graphml')

comms = defaultdict(set)
for i, attr in G.nodes(data=True):
    comm = attr["community"]
    comms[comm].add(i)

comms_list = {
    10: None,
    0: None,
    7: None,
    1: None
}
def classify_strong_sentiment(score):
    if score >= 0.6:
        return 'strong_positive'
    elif score <= -0.6:
        return 'strong_negative'
    else:
        return 'other'
    
df = pd.read_csv('datasets/reddit_with_sentiment.csv')

df['lda_text'] = df['clean_text']
df['strong_sentiment'] = df['vader_score'].apply(classify_strong_sentiment)

for i in comms_list:
    comms_list[i] = df[df["author"].isin(comms[i])].copy()

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# LDA topic function
def get_topics_from_texts(texts, n_topics=8, n_top_words=10):
    # Drop NaN or empty strings before vectorization
    texts = texts.dropna()
    texts = texts[texts.str.strip() != '']
    
    vectorizer = CountVectorizer(max_df=0.95, min_df=5, stop_words='english')
    X = vectorizer.fit_transform(texts)
    
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)

    words = vectorizer.get_feature_names_out()
    topic_keywords = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topic_keywords.append((f"Topic {topic_idx + 1}", top_words))
    return topic_keywords

    
for comm in comms_list:
    for sentiment in ['strong_positive', 'strong_negative']:
        subset = comms_list[comm]
        subset = subset[subset["strong_sentiment"] == sentiment]
        if not subset.empty:
            print(f"\n({len(subset)}) Top Topics in {sentiment.replace('_', ' ').title()} Comments - {comm}")
            topics = get_topics_from_texts(subset['lda_text'])
            for topic, keywords in topics:
                print(f"{topic}: {', '.join(keywords)}")



(2333) Top Topics in Strong Positive Comments - 10
Topic 1: game, people, dont, love, like, thanks, world, olympics, better, wow
Topic 2: like, lol, medal, look, win, gold, athlete, thats, super, great
Topic 3: theme, series, final, fantasy, heart, hero, star, trigger, kingdom, chrono
Topic 4: sport, olympics, year, great, time, week, really, game, ive, watching
Topic 5: love, best, great, good, song, lmao, ceremony, time, athlete, thing
Topic 6: awesome, amazing, game, ceremony, olympics, think, happy, time, guy, like
Topic 7: ceremony, olympic, closing, medal, opening, fencing, love, bach, champion, thomas
Topic 8: olympics, like, paris, really, french, great, beautiful, world, france, people

(546) Top Topics in Strong Negative Comments - 10
Topic 1: dont, time, olympics, know, medal, watch, game, world, didnt, event
Topic 2: fuck, shit, damn, olympics, year, miss, covid, nbc, need, world
Topic 3: hell, fucking, year, fucked, fuck, wtf, yeah, way, theyre, country
Topic 4: olympics,

In [6]:
from collections import Counter

words = """
Topic 1: time, team, way, game, world, final, cup, lot, player, place
Topic 2: worst, argentina, team, world, bad, cup, year, qatar, final, game
Topic 3: fucked, world, going, performance, end, got, cup, dude, dont, god
Topic 4: penalty, england, ive, seen, worst, didnt, brazil, southgate, game, france
Topic 5: match, fuck, group, long, level, game, time, tournament, half, chance
Topic 6: penalty, kane, really, french, like, good, right, ref, missed, think
Topic 7: shit, world, holy, fuck, qatar, cup, today, croatia, like, sad
Topic 8: fifa, hell, people, world, game, good, cup, minute, tournament, player
"""

for i in range(8):
    words = words.replace(f"Topic {i+1}:", "")

words = [i.strip() for i in words.split(",")]

result = dict(Counter(words).most_common())
print(", ".join([i for i in result if result[i] > 1]))

world, cup, game, time, team, final, player, qatar, fuck, tournament, like, good
