# Topic Modeling

# Goal 
Understand the themes in negative tweets about an entity.

## Data Preparation

In [40]:
import pandas as pd
from gensim import corpora, models
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import ssl

# Some local Python installs (commonly macOS) cannot verify the certificate of
# the NLTK download host, so certificate verification is switched off -- but
# ONLY while the NLTK resources are fetched. The original default HTTPS context
# factory is restored afterwards so that any later HTTPS request made from this
# kernel is verified again.
# NOTE(review): disabling verification is a security trade-off; prefer fixing
# the local certificate store if possible.
_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
_original_https_context = getattr(ssl, '_create_default_https_context', None)

if _unverified_https_context is not None:
    ssl._create_default_https_context = _unverified_https_context
try:
    # Tokenizer models ('punkt', 'punkt_tab') and the stop-word lists used by
    # the topic-modeling cells below.
    for _resource in ('punkt', 'stopwords', 'punkt_tab'):
        nltk.download(_resource)
finally:
    if _unverified_https_context is not None and _original_https_context is not None:
        ssl._create_default_https_context = _original_https_context

# Render pyLDAvis visualizations inline in the notebook.
pyLDAvis.enable_notebook()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mengqizhou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mengqizhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mengqizhou/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [41]:
# Load the preprocessed tweets, drop every row that has a missing value, and
# group the remaining rows by the entity each tweet is about.
df = pd.read_csv(
    '~/sentiment_analysis/archive/twitter_training_preprocessed.csv'
).dropna()
grouped_df = df.groupby('entity')

## Topic Modeling With Latent Dirichlet Allocation (LDA)

In [43]:
from collections import Counter

stop_words = set(stopwords.words('english'))
def find_topics(group, num_topics=10, passes=10, random_state=42):
    """Fit an LDA topic model on the negative tweets of one entity and report it.

    The function tokenizes the ``text_processed`` column of every row whose
    ``label`` equals ``'Negative'``, keeps lower-cased alphabetic tokens that
    are not English stop words, trains a gensim ``LdaModel`` on the resulting
    bag-of-words corpus, prints the topics and their summed per-document
    weights, and renders a pyLDAvis visualization inline.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single entity; must contain the columns ``label`` and
        ``text_processed``.
    num_topics : int, default 10
        Number of LDA topics to fit.
    passes : int, default 10
        Number of training passes over the corpus.
    random_state : int, default 42
        Seed passed to the LDA model for reproducible topics.

    Returns
    -------
    None
        Results are printed / displayed as a side effect.
    """
    separator = '------------------------------------------------------------'

    negative_texts = group.loc[group['label'] == 'Negative', 'text_processed']
    processed_docs = [
        [word.lower() for word in word_tokenize(doc) if word.isalpha() and word.lower() not in stop_words]
        for doc in negative_texts
    ]
    print('length of negative tweets: ', len(processed_docs))
    dictionary = corpora.Dictionary(processed_docs)

    # LdaModel raises ValueError on an empty vocabulary, so bail out early when
    # the entity has no negative tweets (or none with usable tokens).
    if len(dictionary) == 0:
        print('No usable tokens found in negative tweets; skipping topic modeling.')
        print(separator)
        return

    # Create Corpus: Term Document Frequency
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
        passes=passes,
        alpha='auto',
        per_word_topics=True
    )
    print("\nTopics:")
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topic {idx}: {topic}")

    # "Frequency" here is the sum, over all documents, of the probability the
    # model assigns to each topic (as returned by get_document_topics).
    topic_counts = Counter()
    for doc_bow in corpus:
        for topic_id, prob in lda_model.get_document_topics(doc_bow):
            topic_counts[topic_id] += prob
    print("Topic Frequencies:")
    for topic_id, freq in topic_counts.most_common():
        print(f"Topic {topic_id}: {freq:.2f}")

    lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
    # Render inline instead of pyLDAvis.show(), which starts a local web server
    # and blocks the kernel until it is interrupted. `display` is the function
    # the IPython/Jupyter kernel makes available in every notebook cell.
    display(pyLDAvis.display(lda_vis))
    print(separator)
    
def process_group(entity, group):
    """Print the entity name as a header, then run ``find_topics`` on its rows.

    ``entity`` is only used for the printed header; ``group`` is passed to
    ``find_topics`` unchanged.
    """
    print('Entity', entity)
    find_topics(group)

In [44]:
# Run the topic analysis for a single entity of interest.
process_group(
    entity='MaddenNFL',
    group=grouped_df.get_group('MaddenNFL'),
)

Entity MaddenNFL
length of negative tweets:  1630

Topics:
Topic 0: 0.058*"eamaddennfl" + 0.036*"game" + 0.024*"easports" + 0.021*"like" + 0.017*"nfl" + 0.016*"madden" + 0.013*"play" + 0.012*"k" + 0.010*"well" + 0.010*"break"
Topic 1: 0.048*"eamaddennfl" + 0.030*"year" + 0.029*"shit" + 0.027*"madden" + 0.025*"rhandlerr" + 0.021*"need" + 0.020*"suck" + 0.016*"ea" + 0.014*"time" + 0.014*"game"
Topic 2: 0.053*"eamaddennfl" + 0.040*"play" + 0.024*"year" + 0.023*"garbage" + 0.020*"game" + 0.015*"madden" + 0.014*"like" + 0.013*"ass" + 0.011*"fucking" + 0.010*"easports"
Topic 3: 0.096*"eamaddennfl" + 0.023*"know" + 0.018*"trash" + 0.017*"profane" + 0.016*"ea" + 0.014*"nfl" + 0.014*"let" + 0.013*"franchise" + 0.012*"joke" + 0.010*"defense"
Topic 4: 0.061*"eamaddennfl" + 0.025*"game" + 0.017*"disrespect" + 0.016*"easports" + 0.016*"player" + 0.014*"man" + 0.012*"stupid" + 0.012*"fix" + 0.011*"get" + 0.011*"play"
Topic 5: 0.082*"game" + 0.078*"eamaddennfl" + 0.025*"fix" + 0.018*"year" + 0.013*"g

127.0.0.1 - - [04/Jan/2025 17:16:02] "GET / HTTP/1.1" 200 -



stopping Server...
------------------------------------------------------------
