# CHALLENGE - 3 - topic analysis

### Import modules and working files

In [33]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from pprint import pprint
import string
import os
import re

# visual
import matplotlib.pyplot as plt
import seaborn as sns

# filter out noise words and more clean up on word
from wordcloud import STOPWORDS,WordCloud
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# gensim
import gensim
from gensim.models import CoherenceModel
from gensim import corpora



file_name = 'processed_tweet_data.csv'
df = pd.read_csv(file_name)

## DATA-PREPARATION

In [34]:
# glimpes fo the data
# 
df.head(2)

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Fri Jun 18 17:55:49 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...","ðŸš¨Africa is ""in the midst of a full-blown third...",0.166667,0.188889,en,548,612.0,ketuesriche,551,351,,,,Mass
1,Fri Jun 18 17:55:59 +0000 2021,"<a href=""https://mobile.twitter.com"" rel=""nofo...","Dr Moeti is head of WHO in Africa, and one of ...",0.133333,0.455556,en,195,92.0,Grid1949,66,92,,,,"Edinburgh, Scotland"


In [35]:
# size of data set
df.shape

(6532, 15)

In [36]:
df.columns

Index(['created_at', 'source', 'original_text', 'polarity', 'subjectivity',
       'lang', 'favorite_count', 'retweet_count', 'original_author',
       'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags',
       'user_mentions', 'place'],
      dtype='object')

In [37]:
# check if we have null 
count_texts = df['original_text'].isnull().sum()
print("Data Frame before removing null rows ", df.shape)
print("Number of null orignal_text ", count_texts)

Data Frame before removing null rows  (6532, 15)
Number of null orignal_text  2812


In [38]:
# remove null rows
df = df[~df['original_text'].isnull()]
count_texts = df['original_text'].isnull().sum()
print("Data Frame after removing null rows ", df.shape)
print("Number of null orignal_text ", count_texts)

Data Frame after removing null rows  (3720, 15)
Number of null orignal_text  0


In [39]:
# clean out unwanted values from the text, like links and other staff
import re
def clean_text(text):
    hash_tag_removed = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
    removed_links = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', hash_tag_removed, flags=re.MULTILINE)
    result = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', removed_links)
    return result

df['original_text'] = df['original_text'].apply(clean_text)

In [40]:
# SETUP NEW DATA FRAME FOR CLEANED DATA
cleanTweet = pd.DataFrame()
cleanTweet['clean_text'] = df['original_text']
cleanTweet['polarity'] = df['polarity']

cleanTweet.head()

Unnamed: 0,clean_text,polarity
0,"ðŸš¨Africa is ""in the midst of a full-blown third...",0.166667
1,"Dr Moeti is head of WHO in Africa, and one of ...",0.133333
2,Thank you for creating this amazing campaign ...,0.316667
3,"Former Pfizer VP and Virologist, Dr. Michael Y...",0.086111
4,I think itâ€™s important that we donâ€™t sell COVA...,0.28


In [41]:
cleanTweet.columns

Index(['clean_text', 'polarity'], dtype='object')

In [42]:
cleanTweet.dropna()

Unnamed: 0,clean_text,polarity
0,"ðŸš¨Africa is ""in the midst of a full-blown third...",0.166667
1,"Dr Moeti is head of WHO in Africa, and one of ...",0.133333
2,Thank you for creating this amazing campaign ...,0.316667
3,"Former Pfizer VP and Virologist, Dr. Michael Y...",0.086111
4,I think itâ€™s important that we donâ€™t sell COVA...,0.280000
...,...,...
6521,Australia is sending vaccines.\nAustralia is s...,0.100000
6522,The Truth Behind COVID-19 Vaccines (6) â€”â€” Bell...,-0.386111
6524,Covid19 vaccines reach the remotest places of ...,-0.050000
6528,"Former Pfizer VP and Virologist, Dr. Michael Y...",0.086111


In [43]:
# Noise words need to be removed.
def process_data():
    cleanTweet['clean_text'] = cleanTweet['clean_text'].apply(lambda x: x.lower())
    cleanTweet['clean_text'] = cleanTweet['clean_text'].apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation)))
    
    sentence_list = [tweet for tweet in cleanTweet['clean_text']]
    word_list = [sentence.split() for sentence in sentence_list]
    
    word_to_id = corpora.Dictionary(word_list)
    corpus_1= [word_to_id.doc2bow(tweet) for tweet in word_list]
    
    return word_list, word_to_id, corpus_1
    

In [44]:
word_list, id2word, corpus = process_data()
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 3), (21, 1), (22, 2), (23, 1), (24, 4), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)]


In [45]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [29]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('the', 0.059252065),
   ('of', 0.04634105),
   ('in', 0.02593329),
   ('and', 0.020663017),
   ('to', 0.018838525),
   ('vaccines', 0.016757298),
   ('by', 0.016396975),
   ('have', 0.015626773),
   ('amp', 0.013902199),
   ('on', 0.012009615)]),
 (1,
  [('in', 0.044179715),
   ('the', 0.039230857),
   ('india', 0.037722204),
   ('of', 0.035731066),
   ('a', 0.02738511),
   ('and', 0.021746458),
   ('wave', 0.020910054),
   ('africa', 0.020825444),
   ('amp', 0.020103788),
   ('third', 0.020095471)]),
 (2,
  [('to', 0.058688432),
   ('vaccines', 0.033360817),
   ('need', 0.032535426),
   ('we', 0.02992966),
   ('the', 0.026091734),
   ('are', 0.02354833),
   ('and', 0.023392623),
   ('you', 0.021670066),
   ('from', 0.01607607),
   ('with', 0.015258023)]),
 (3,
  [('to', 0.06913963),
   ('the', 0.03681693),
   ('is', 0.028010882),
   ('of', 0.026883086),
   ('and', 0.026510036),
   ('africa', 0.021838428),
   ('vaccines', 0.019329678),
   ('australia', 0.017714309),
   ('in', 

In [46]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\n Ldamodel Coherence Score/Accuracy on Tweets: ', coherence_lda)


Perplexity:  -6.079593998776833

 Ldamodel Coherence Score/Accuracy on Tweets:  0.41991022045536186


In [47]:
#### pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis