# Topic Modeling with Twitter Data

## Package Imports

In [4]:
import sys
from os import getcwd
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS, WordCloud
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
from pprint import pprint
import string
import os
import re

We need some code that lives in the parent directory of this notebook's folder. In order for the imports to work, I add the
parent directory to the system path.

In [5]:
# Make the project root importable so the local modules imported below resolve.
cwd = getcwd()
# Strip only a trailing "notebooks" path component. A plain string replace of
# '/notebooks' would also mangle paths that merely contain that substring
# (e.g. ".../notebooks_archive/...") and never matches Windows separators.
parent = os.path.dirname(cwd) if os.path.basename(cwd) == 'notebooks' else cwd
# Guard the insert so re-running this cell does not stack duplicate entries.
if parent not in sys.path:
    sys.path.insert(0, parent)

from extract_dataframe import read_json
from extract_dataframe import TweetDfExtractor
from clean_tweets_dataframe import CleanTweets
from tweets_preprocess import SADataPreparation
from utils import DataLoader

In [6]:
# Loader for the processed tweets CSV; the arguments are presumably
# (directory, file name) — confirm against utils.DataLoader.
DataLoader_obj = DataLoader("../", "processed_tweet_data.csv")

In [7]:
# Read the processed tweets; no rows are dropped at load time.
tweets_df = DataLoader_obj.read_csv()
print("len: ", tweets_df.shape[0])
# Last expression of the cell, so the notebook renders a preview of the frame.
tweets_df.head()

len:  24637


Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Fri Apr 22 22:20:18 +0000 2022,Twitter for Android,RT @nikitheblogger: Irre: Annalena Baerbock sa...,0.0,0.0,de,0,355.0,McMc74078966,3,12,,[],['nikitheblogger'],
1,Fri Apr 22 22:19:16 +0000 2022,Twitter for Android,RT @sagt_mit: Merkel schaffte es in 1 Jahr 1 M...,0.0,0.0,de,0,505.0,McMc74078966,3,12,,[],['sagt_mit'],
2,Fri Apr 22 22:17:28 +0000 2022,Twitter for Android,RT @Kryptonoun: @WRi007 Pharma in Lebensmittel...,0.0,0.0,de,0,4.0,McMc74078966,3,12,,[],"['Kryptonoun', 'WRi007']",
3,Fri Apr 22 22:17:20 +0000 2022,Twitter for Android,RT @WRi007: Die #Deutschen sind ein braves Vol...,0.0,0.0,de,0,332.0,McMc74078966,3,12,,"['Deutschen', 'Spritpreisen', 'inflation', 'Ab...",['WRi007'],
4,Fri Apr 22 22:13:15 +0000 2022,Twitter for Android,RT @RolandTichy: Baerbock verkündet mal so neb...,0.0,0.0,de,0,386.0,McMc74078966,3,12,,[],['RolandTichy'],


## Text Data Preparation

## Data Cleaner
We have imported the `CleanTweets` class for this task.

In [8]:
# Run the imported CleanTweets pipeline over the loaded tweets.
cleaner = CleanTweets()
cleaned_df = cleaner.run_pipeline(tweets_df)
# Summarise columns, dtypes and non-null counts of the cleaned frame.
cleaned_df.info()

Automation in Action...!!!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4307 entries, 0 to 4306
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   created_at          4307 non-null   datetime64[ns, UTC]
 1   source              4307 non-null   object             
 2   original_text       4307 non-null   object             
 3   polarity            4307 non-null   float64            
 4   subjectivity        4307 non-null   float64            
 5   lang                4307 non-null   object             
 6   favorite_count      4307 non-null   int64              
 7   retweet_count       4307 non-null   float64            
 8   original_author     4307 non-null   object             
 9   followers_count     4307 non-null   int64              
 10  friends_count       4307 non-null   int64              
 11  possibly_sensitive  4307 non-null   object             
 12  hashtag

In [9]:
class PrepareData:
    """Turn a tweets DataFrame into the inputs needed to train a gensim LDA model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``lang`` column and an ``original_text`` column.
        The frame is only read, never modified.
    """

    def __init__(self, df):
        self.df = df

    def preprocess_data(self, stopwords=None):
        """Tokenise the English tweets and build the dictionary and corpus.

        Only rows whose ``lang`` equals ``"en"`` are used. Each tweet is
        lower-cased, stripped of ASCII punctuation and split on whitespace.

        Parameters
        ----------
        stopwords : iterable of str, optional
            Words to drop from every tweet. They are normalised the same way
            as the tweets (lower-cased, punctuation removed), so ``"Don't"``
            matches the token ``dont``. ``None`` (the default) keeps every
            token.

        Returns
        -------
        word_list : list of list of str
            One token list per English tweet, in the frame's row order.
        word_to_id : gensim.corpora.Dictionary
            Dictionary built from ``word_list``.
        corpus_1 : list
            ``word_to_id.doc2bow(tokens)`` for every entry of ``word_list``.
        """
        # Select the text column directly instead of assigning into a filtered
        # slice of the frame; this avoids pandas' SettingWithCopyWarning and
        # guarantees that self.df is left untouched.
        is_english = self.df['lang'] == "en"
        raw_texts = self.df.loc[is_english, 'original_text']

        # Missing tweets become empty documents. Without this, astype(str)
        # would turn NaN into the literal token "nan". Rows are kept (not
        # dropped) so the outputs stay aligned with the English rows.
        punctuation_table = str.maketrans('', '', string.punctuation)
        texts = (
            raw_texts
            .fillna('')
            .astype(str)
            .str.lower()
            .str.translate(punctuation_table)
        )

        # Converting tweets to lists of words for feature engineering
        word_list = [text.split() for text in texts]

        if stopwords is not None:
            # Normalise the stop words like the tweets so they can match.
            stop_set = {
                str(word).lower().translate(punctuation_table)
                for word in stopwords
            }
            word_list = [
                [token for token in tokens if token not in stop_set]
                for tokens in word_list
            ]

        # Create dictionary which contains id and word
        word_to_id = corpora.Dictionary(word_list)
        corpus_1 = [word_to_id.doc2bow(tweet) for tweet in word_list]

        return word_list, word_to_id, corpus_1

In [10]:
# Build the tokenised tweets, the gensim dictionary and the bag-of-words corpus.
# NOTE(review): this uses the raw `tweets_df`, not the `cleaned_df` produced by
# CleanTweets above — confirm that is intended.
PrepareData_obj = PrepareData(tweets_df)
word_list, id2word, corpus = PrepareData_obj.preprocess_data()

In [11]:
# Number of bag-of-words documents in the corpus (one per English tweet).
print(len(corpus))

16465


### Topic Modeling using Latent Dirichlet Allocation 
LDA is based on the distributional hypothesis (i.e. similar topics make use of similar words) and the statistical mixture hypothesis (i.e. documents talk about several topics), for which a statistical distribution can be determined.

*  The purpose of LDA is to map each tweet in our corpus to a set of topics
that covers a good deal of the words in the tweet

In [12]:
# Build the LDA model: 3 topics learned from the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    random_state=100,      # fixed seed so the topics are reproducible
    update_every=1,        # online (incremental) updates rather than batch learning
    chunksize=512,         # documents per training chunk
    passes=5,              # full passes over the corpus
    alpha='auto',          # learn the document-topic prior from the data
    per_word_topics=True,  # also compute the most likely topics per word
)

In [14]:
# Show the top words of every topic as (word, weight) pairs
# (formatted=False) instead of pre-formatted strings.
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('we', 0.016874667),
   ('our', 0.009928418),
   ('please', 0.005698203),
   ('president', 0.004345086),
   ('man', 0.003346508),
   ('thank', 0.0032037112),
   ('details', 0.0031766891),
   ('love', 0.0031146999),
   ('🟩🟩🟩🟩🟩', 0.0030106565),
   ('pnpkakampimo', 0.0029432138)]),
 (1,
  [('the', 0.04357081),
   ('rt', 0.037276402),
   ('to', 0.028883902),
   ('of', 0.02274348),
   ('in', 0.019179167),
   ('a', 0.019001065),
   ('and', 0.016870314),
   ('is', 0.01582212),
   ('for', 0.013602853),
   ('on', 0.010785839)]),
 (2,
  [('india', 0.01583555),
   ('aitcofficial', 0.008558942),
   ('srilanka', 0.008279453),
   ('lka', 0.005601042),
   ('sri', 0.0050456594),
   ('cartoon', 0.0046119452),
   ('mamataofficial', 0.0044812784),
   ('ukraine', 0.0039761323),
   ('economiccrisislk', 0.0036615052),
   ('lanka', 0.0032493053)])]


# **Model Analysis**

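Perplexity is a measure of model quality and in natural language processing is often used as “perplexity per number of words”. It describes how well a model predicts a sample, i.e. how much it is “perplexed” by a sample from the observed data. The lower the score, the better the model for the given data. Note that gensim's `log_perplexity` reports the per-word likelihood bound on a log scale (a negative number); the perplexity itself is $2^{-\text{bound}}$, so a bound closer to zero means a lower perplexity.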

A coherence metric is used to assess the quality of the model's topics. Topic coherence is a measure that compares different topic models based on their human-interpretability. The coherence score ‘C_V’ assigns a numerical value to the interpretability of the topics; it is not a classification accuracy.

In [15]:
# Model fit
#
# log_perplexity() returns the per-word likelihood bound on a log scale, which
# is why the printed value is negative. The perplexity itself is 2 ** (-bound),
# so a bound closer to zero (i.e. a lower perplexity) indicates a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Apply the trained model to the corpus; not used further in this cell.
doc_lda = lda_model[corpus]

# Topic coherence
#
# c_v coherence measures how interpretable the topics are (higher is better);
# it is a coherence score, not an accuracy.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=word_list,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\n Ldamodel Coherence Score/Accuracy on Tweets: ', coherence_lda)


Perplexity:  -8.802934203688965

 Ldamodel Coherence Score/Accuracy on Tweets:  0.40493752279116224


**Analyzing results**
Exploring the Intertopic Distance Plot can help you learn about how topics relate to each other, including potential higher-level structure between groups of topics

In [16]:
# NOTE(review): these imports are local to this cell rather than in the import
# cell at the top of the notebook, and `pickle` is not used in this cell.
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
# Visualize the topics
# Switch pyLDAvis to notebook display mode before preparing the visualisation.
pyLDAvis.enable_notebook()

# Build the visualisation data from the trained model, the bag-of-words corpus
# and the dictionary.
LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
# Last expression of the cell, so the notebook displays the prepared object.
LDAvis_prepared

  from imp import reload


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
