# Topic Modeling

# Goal 
Understand the themes in negative tweets about an entity.

## Data Preparation

In [11]:
import pandas as pd
from gensim import corpora, models
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import ssl

# Replace the default HTTPS context factory with the unverified one, presumably
# so that the nltk.download() calls below succeed on machines where certificate
# verification fails.
# NOTE(review): this turns off HTTPS certificate verification for everything in
# this process that relies on ssl._create_default_https_context, not just the
# NLTK downloads -- confirm this is acceptable outside local experimentation.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no _create_unverified_context; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
    
# Fetch the NLTK resources used later: the 'stopwords' corpus backs
# stopwords.words('english'); 'punkt' / 'punkt_tab' are presumably the
# tokenizer data needed by word_tokenize.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mengqizhou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mengqizhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mengqizhou/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Load the preprocessed tweets and discard every row with a missing value,
# then bucket the remaining rows by the entity they mention.
df = pd.read_csv(
    '~/sentiment_analysis/archive/twitter_training_preprocessed.csv'
).dropna()
grouped_df = df.groupby('entity')

## Topic Modeling With Latent Dirichlet Allocation (LDA)

In [29]:
from collections import Counter
# Switch pyLDAvis into notebook mode so prepared visualizations display inline.
pyLDAvis.enable_notebook()
# Materialize the English stop-word list as a set once; process() tests
# membership against it for every token.
stop_words = set(stopwords.words('english'))

def process(group):
    """Turn the negative tweets of one entity group into LDA inputs.

    Rows of ``group`` whose ``'label'`` equals ``'Negative'`` are selected and
    their ``'text_processed'`` value is tokenized with ``word_tokenize``.
    Tokens that are not purely alphabetic, or whose lowercase form is in the
    module-level ``stop_words`` set, are dropped; the rest are lowercased.

    Args:
        group: DataFrame-like object with ``'label'`` and ``'text_processed'``
            columns.

    Returns:
        A ``(processed_docs, corpus, dictionary)`` tuple: the token list per
        tweet, the bag-of-words form of each token list, and the gensim
        ``Dictionary`` built from the token lists.
    """
    negative_rows = group[group['label'] == 'Negative']

    processed_docs = []
    for text in negative_rows['text_processed']:
        kept_tokens = []
        for token in word_tokenize(text):
            if not token.isalpha():
                continue
            lowered = token.lower()
            if lowered in stop_words:
                continue
            kept_tokens.append(lowered)
        processed_docs.append(kept_tokens)

    dictionary = corpora.Dictionary(processed_docs)
    # Term-document frequency: one bag-of-words vector per tweet.
    corpus = list(map(dictionary.doc2bow, processed_docs))
    return processed_docs, corpus, dictionary
    
def train_lda_model(corpus, dictionary, num_topics, passes):
    """Fit and return a gensim LDA model on a bag-of-words corpus.

    Args:
        corpus: Bag-of-words documents, e.g. the ``corpus`` returned by
            ``process``.
        dictionary: The gensim ``Dictionary`` that maps token ids in
            ``corpus`` back to words; passed to the model as ``id2word``.
        num_topics: Number of latent topics to fit.
        passes: Number of training passes over the corpus.

    Returns:
        The trained ``models.LdaModel`` instance.
    """
    # Everything that used to follow the return statement (topic printing,
    # topic-frequency counting and an `if visualize:` branch) was unreachable
    # and referenced the undefined names `lda_model` and `visualize`, so it
    # has been removed.
    return models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,  # fixed seed so repeated runs are comparable
        passes=passes,
        alpha='auto',
        per_word_topics=True,
    )
    
def process_group(entity, group, num_topics=10, passes=10, eval_every=None, visualize=False): 
    """Run topic discovery for one entity's tweets by delegating to ``find_topics``.

    NOTE(review): ``find_topics`` is not defined anywhere in the visible part
    of this notebook; unless it is defined elsewhere, calling this function
    raises ``NameError``. Confirm whether it was renamed or removed.

    Args:
        entity: Name of the entity the group belongs to. Currently unused
            (only referenced by the commented-out debug print below).
        group: The rows for that entity; forwarded unchanged.
        num_topics: Forwarded to ``find_topics`` (default 10).
        passes: Forwarded to ``find_topics`` (default 10).
        eval_every: Forwarded to ``find_topics`` (default ``None``).
        visualize: Forwarded to ``find_topics`` (default ``False``).

    Returns:
        Whatever ``find_topics`` returns.
    """
    #print('Entity', entity)
    return find_topics(group, num_topics, passes, eval_every, visualize)

## Number of Topics Sampling

In [30]:
# Build the LDA inputs from the negative "Amazon" tweets.
amazon_group = grouped_df.get_group('Amazon')
texts, corpus, dictionary = process(amazon_group)

# Fit one model per candidate topic count, keyed by that count; the number
# of training passes is three times the number of topics.
candidate_topic_counts = [3, 5, 10, 15, 20]
num_to_models = {}
for num_topics in candidate_topic_counts:
    passes = num_topics * 3
    num_to_models[num_topics] = train_lda_model(corpus, dictionary, num_topics, passes)

In [31]:
from gensim.models.coherencemodel import CoherenceModel

# Report the c_v coherence of every trained model so the candidate topic
# counts can be compared.
for num_topics, model in num_to_models.items():
    coherence_model = CoherenceModel(
        model=model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_score = coherence_model.get_coherence()
    print(f"Num Topics: {num_topics} Coherence Score: {coherence_score:.4f}")

Num Topics: 3 Coherence Score: 0.4673
Num Topics: 5 Coherence Score: 0.4685
Num Topics: 10 Coherence Score: 0.3918
Num Topics: 15 Coherence Score: 0.3693
Num Topics: 20 Coherence Score: 0.4023


In [34]:
def show(lda_model, corpus, dictionary):
    """Prepare the pyLDAvis visualization data for a trained LDA model.

    Args:
        lda_model: The trained gensim LDA model to visualize.
        corpus: The bag-of-words corpus passed alongside the model.
        dictionary: The gensim ``Dictionary`` passed alongside the model.

    Returns:
        The object produced by ``gensimvis.prepare``.
    """
    prepared_vis = gensimvis.prepare(lda_model, corpus, dictionary)
    return prepared_vis

In [33]:
# Visualize the 5-topic model: it had the highest c_v coherence (0.4685) of the
# candidate topic counts scored above.
show(num_to_models[5], corpus, dictionary)