# Data preprocessing

In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import string
import os
import re
from textblob import TextBlob
from wordcloud import STOPWORDS,WordCloud
from sklearn.model_selection import train_test_split
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
from pprint import pprint
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis

In [23]:
def read_data():
    data = pd.read_csv('processed_tweet_data.csv')
    df = data.copy()
    df = df.drop(['place_coord_boundaries'], axis=1)
    cleanTweet = pd.DataFrame(columns=['text','polarity'])
    cleanTweet['text'] = df['cleaned_text'].to_list()
    cleanTweet['polarity'] = df['polarity'].to_list()
    def text_category(p):
        if p < 0:
            category = 'negative'
        elif p == 0:
            category = 'neutral'
        else:
            category = 'positive'
        return category
    polarities = [TextBlob(t).sentiment.polarity for t in df['original_text'].to_list()]
    scores = [text_category(p) for p in polarities]
    cleanTweet['scores'] = scores
    scoremap = {'positive':1, 'negative':0} 
    cleanTweet['scoremap'] = cleanTweet['scores'].map(scoremap)
    df['scores'] = cleanTweet['scoremap'].to_list()
    df = df.loc[df['scores'].isna() == False]
    df['scores'] = df['scores'].astype('int')
    return df

In [24]:
df = read_data()

In [25]:
df

Unnamed: 0,created_at,source,original_text,cleaned_text,sentiment,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,screen_count,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place,statuses_count,scores
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,iameztoy extra random image ilets focus one sp...,"Sentiment(polarity=-0.125, subjectivity=0.190625)",-1.250000e-01,0.190625,en,15760,2,i_ameztoy,232,20497,2621,False,"[{'text': 'City', 'indices': [132, 137]}]","[{'screen_name': 'i_ameztoy', 'name': 'Iban Am...",,8097,0
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,indopacinfo chinas media explains military rea...,"Sentiment(polarity=-0.1, subjectivity=0.1)",-1.000000e-01,0.100000,en,6967,201,ZIisq,3,65,272,False,"[{'text': 'China', 'indices': [18, 24]}, {'tex...","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",,5831,0
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",putin xijinping told friend taiwan vassal stat...,"Sentiment(polarity=0.1, subjectivity=0.35)",1.000000e-01,0.350000,en,2166,0,Fin21Free,0,85,392,,"[{'text': 'XiJinping', 'indices': [9, 19]}]",[],Netherlands,1627,1
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",chinauncensored im sorry thought taiwan indepe...,"Sentiment(polarity=-6.938893903907228e-18, sub...",-6.938894e-18,0.556250,en,17247,381,VizziniDolores,0,910,2608,False,[],"[{'screen_name': 'ChinaUncensored', 'name': 'C...","Ayent, Schweiz",18958,0
5,Sun Aug 07 22:31:02 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @benedictrogers: We must not let this happe...,benedictrogers must let happenwe must readywe ...,"Sentiment(polarity=0.2, subjectivity=0.5)",2.000000e-01,0.500000,en,41770,36,GraceCh15554845,0,207,54,False,"[{'text': 'Taiwan', 'indices': [84, 91]}]","[{'screen_name': 'benedictrogers', 'name': 'Be...","Melbourne, Victoria",48483,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,Sat Aug 06 18:03:29 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: A good infographic of #China...,indopacinfo good infographic chinas missile la...,"Sentiment(polarity=0.7, subjectivity=0.6000000...",7.000000e-01,0.600000,en,9683,183,VandelayT,2,62,471,False,"[{'text': 'China', 'indices': [40, 46]}, {'tex...","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",,6600,1
21996,Sat Aug 06 18:03:27 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: A good infographic of #China...,indopacinfo good infographic chinas missile la...,"Sentiment(polarity=0.7, subjectivity=0.6000000...",7.000000e-01,0.600000,en,11538,183,sashalenik,0,94,1751,False,"[{'text': 'China', 'indices': [40, 46]}, {'tex...","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",Gelendzhik,3739,1
21997,Sat Aug 06 18:03:27 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",@Reuters Thanks #Pelosi smart move.,reuters thanks pelosi smart move,"Sentiment(polarity=0.20714285714285713, subjec...",2.071429e-01,0.421429,en,1940,0,ZeitounRimal,0,88,0,,"[{'text': 'Pelosi', 'indices': [16, 23]}]","[{'screen_name': 'Reuters', 'name': 'Reuters',...",🇺🇲🇷🇺🇺🇦🇫🇷🇦🇪🇮🇱🏳️‍🌈,3540,1
21998,Sat Aug 06 18:03:26 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #Taiwan people's Desire for ...,indopacinfo taiwan peoples desire unification ...,"Sentiment(polarity=0.05, subjectivity=0.35)",5.000000e-02,0.350000,en,11849,67,SazzyCowgirl1,5,537,317,False,"[{'text': 'Taiwan', 'indices': [18, 25]}, {'te...","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...","Oregon, USA",21833,1


In [34]:
class PrepareData:
    def __init__(self,df):
        self.df=df
    
    def hashtagsProcess(self):
        
        def find_hashtags(text):
            '''This function will extract hashtags'''
            return re.findall('(#[A-Za-z]+[A-Za-z0-9-_]+)', text)
        
        # update Hashtage columon with hashtages extracted from Orignal_text columon
        df['hashtags'] = df.original_text.apply(find_hashtags)
        
        # take the rows from the hashtag columns where there are actually hashtags
        hashtags_list_df = df.loc[
                               df.hashtags.apply(
                                   lambda hashtags_list: hashtags_list !=[]
                               ),['hashtags']]        
        flattened_hashtags_df = pd.DataFrame(
            [hashtag for hashtags_list in hashtags_list_df.hashtags
            for hashtag in hashtags_list],
            columns=['hashtag'])
        
        #add flatten_hashtags to tweet_df
        df["flattened_hashtags"]= flattened_hashtags_df
        
        return df
    
    def preprocess_data(self):

        #text Preprocessing
        df['cleaned_text'] = df['cleaned_text'].astype(str)
        df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation)))
        
        #Converting tweets to list of words For feature engineering
        sentence_list = [tweet for tweet in df['cleaned_text']]
        word_list = [sent.split() for sent in sentence_list]
        
        #Create dictionary which contains Id and word 
        word_to_id = corpora.Dictionary(word_list) #generate unique tokens
        #  we can see the word to unique integer mapping
        # print(word_to_id.token2id)
        # using bag of words(bow), we create a corpus that contains the word id and its frequency in each document.
        corpus_1= [word_to_id.doc2bow(tweet) for tweet in word_list]
        # TFIDF

        return word_list, word_to_id, corpus_1

In [35]:
PrepareData_obj = PrepareData(df)

In [36]:
df = PrepareData.hashtagsProcess(df)

In [37]:
df

Unnamed: 0,created_at,source,original_text,cleaned_text,sentiment,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,screen_count,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place,statuses_count,scores,flattened_hashtags
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,iameztoy extra random image ilets focus one sp...,"Sentiment(polarity=-0.125, subjectivity=0.190625)",-1.250000e-01,0.190625,en,15760,2,i_ameztoy,232,20497,2621,False,"[#City, #Ta]","[{'screen_name': 'i_ameztoy', 'name': 'Iban Am...",,8097,0,#City
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,indopacinfo chinas media explains military rea...,"Sentiment(polarity=-0.1, subjectivity=0.1)",-1.000000e-01,0.100000,en,6967,201,ZIisq,3,65,272,False,"[#China, #Taiwan]","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",,5831,0,#Ta
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",putin xijinping told friend taiwan vassal stat...,"Sentiment(polarity=0.1, subjectivity=0.35)",1.000000e-01,0.350000,en,2166,0,Fin21Free,0,85,392,,[#XiJinping],[],Netherlands,1627,1,#Taiwan
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",chinauncensored im sorry thought taiwan indepe...,"Sentiment(polarity=-6.938893903907228e-18, sub...",-6.938894e-18,0.556250,en,17247,381,VizziniDolores,0,910,2608,False,[],"[{'screen_name': 'ChinaUncensored', 'name': 'C...","Ayent, Schweiz",18958,0,#XiJinping
5,Sun Aug 07 22:31:02 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @benedictrogers: We must not let this happe...,benedictrogers must let happenwe must readywe ...,"Sentiment(polarity=0.2, subjectivity=0.5)",2.000000e-01,0.500000,en,41770,36,GraceCh15554845,0,207,54,False,[#Taiwan],"[{'screen_name': 'benedictrogers', 'name': 'Be...","Melbourne, Victoria",48483,1,#Taiwan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,Sat Aug 06 18:03:29 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: A good infographic of #China...,indopacinfo good infographic chinas missile la...,"Sentiment(polarity=0.7, subjectivity=0.6000000...",7.000000e-01,0.600000,en,9683,183,VandelayT,2,62,471,False,"[#China, #Taiwan, #ChinaTaiwanCrisis]","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",,6600,1,#Taiwan
21996,Sat Aug 06 18:03:27 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: A good infographic of #China...,indopacinfo good infographic chinas missile la...,"Sentiment(polarity=0.7, subjectivity=0.6000000...",7.000000e-01,0.600000,en,11538,183,sashalenik,0,94,1751,False,"[#China, #Taiwan, #ChinaTaiwanCrisis]","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",Gelendzhik,3739,1,#Chinese
21997,Sat Aug 06 18:03:27 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",@Reuters Thanks #Pelosi smart move.,reuters thanks pelosi smart move,"Sentiment(polarity=0.20714285714285713, subjec...",2.071429e-01,0.421429,en,1940,0,ZeitounRimal,0,88,0,,[#Pelosi],"[{'screen_name': 'Reuters', 'name': 'Reuters',...",🇺🇲🇷🇺🇺🇦🇫🇷🇦🇪🇮🇱🏳️‍🌈,3540,1,#Taiwan
21998,Sat Aug 06 18:03:26 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #Taiwan people's Desire for ...,indopacinfo taiwan peoples desire unification ...,"Sentiment(polarity=0.05, subjectivity=0.35)",5.000000e-02,0.350000,en,11849,67,SazzyCowgirl1,5,537,317,False,"[#Taiwan, #China]","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...","Oregon, USA",21833,1,#To


In [38]:
word_list ,id2word,corpus=PrepareData_obj.preprocess_data()

In [39]:
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)], [(17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)], [(18, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1)], [(24, 1), (28, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1)], [(28, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 3), (60, 1)], [(47, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1)], [(28, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1)], [(19, 1), (28, 1), (70, 1), (79, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1)], [(18, 1), (21, 1), (28, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (9

In [40]:
id_words = [[(id2word[id], count) for id, count in line] for line in corpus]

In [41]:
print(id_words)



In [None]:
## Retain relevant features (fields)

In [None]:
## Data imputation and additional data transformation

# Topic modeling

In [42]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [43]:
pprint(lda_model.print_topics())

[(0,
  '0.045*"taiwan" + 0.027*"china" + 0.017*"pelosi" + 0.014*"ukraine" + '
  '0.013*"us" + 0.010*"spokespersonchn" + 0.009*"war" + 0.007*"world" + '
  '0.007*"ccp" + 0.007*"video"'),
 (1,
  '0.038*"taiwans" + 0.028*"taiwan" + 0.023*"chinese" + 0.022*"island" + '
  '0.021*"indopacinfo" + 0.018*"near" + 0.018*"amp" + 0.015*"military" + '
  '0.014*"found" + 0.014*"dead"'),
 (2,
  '0.046*"part" + 0.035*"taiwan" + 0.021*"small" + 0.021*"dont" + '
  '0.020*"fulliv" + 0.018*"matter" + 0.018*"baddecisions" + '
  '0.017*"ukrainianarmy" + 0.017*"clintonsnowbird" + 0.015*"missiles"'),
 (3,
  '0.085*"taiwan" + 0.044*"missile" + 0.036*"chinas" + 0.033*"indopacinfo" + '
  '0.028*"good" + 0.028*"august" + 0.027*"military" + 0.026*"launches" + '
  '0.026*"infographic" + 0.024*"visit"'),
 (4,
  '0.038*"china" + 0.017*"taiwan" + 0.015*"new" + 0.014*"like" + 0.013*"us" + '
  '0.013*"cgmeifangzhang" + 0.011*"today" + 0.011*"c" + 0.010*"well" + '
  '0.010*"region"')]


In [44]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('taiwan', 0.045099575),
   ('china', 0.027197734),
   ('pelosi', 0.017139299),
   ('ukraine', 0.014170255),
   ('us', 0.0134665575),
   ('spokespersonchn', 0.01047103),
   ('war', 0.008648602),
   ('world', 0.007464854),
   ('ccp', 0.0069047622),
   ('video', 0.0068009053)]),
 (1,
  [('taiwans', 0.037669312),
   ('taiwan', 0.027748499),
   ('chinese', 0.022973994),
   ('island', 0.021846779),
   ('indopacinfo', 0.021272086),
   ('near', 0.01822703),
   ('amp', 0.018083699),
   ('military', 0.014537824),
   ('found', 0.014306254),
   ('dead', 0.0142380325)]),
 (2,
  [('part', 0.04622452),
   ('taiwan', 0.03473166),
   ('small', 0.021044375),
   ('dont', 0.020564713),
   ('fulliv', 0.019736027),
   ('matter', 0.018497903),
   ('baddecisions', 0.018369475),
   ('ukrainianarmy', 0.017362896),
   ('clintonsnowbird', 0.016995743),
   ('missiles', 0.014550201)]),
 (3,
  [('taiwan', 0.085032865),
   ('missile', 0.04396922),
   ('chinas', 0.03578042),
   ('indopacinfo', 0.03261408),
  

## Model Analysis

In [46]:
# Compute Perplexity

#It's a measure of how good the model is. The lower the better. Perplexity is a negative value
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
doc_lda = lda_model[corpus]


# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\n Ldamodel Coherence Score/Accuracy on Tweets: ', coherence_lda)


Perplexity:  -8.029637225364997

 Ldamodel Coherence Score/Accuracy on Tweets:  0.262727907406686


## Visualize the topics

In [48]:
pyLDAvis.enable_notebook()

LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
LDAvis_prepared

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from scipy.sparse.base import spmatrix
  from scipy.sparse.base import spmatrix
  from scipy.sparse.base import spmatrix
  from scipy.sparse.base import spmatrix
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1


In [None]:
# Tranining on different values of topics/k

# for k in range(2,25): # Train LDA on different values of k
#     print('Round: '+str(k))
#     LDA = gensim.models.ldamulticore.LdaMulticore
#     ldamodel = LDA(doc_term_matrix, num_topics=k, id2word = dictionary, passes=20, iterations=100,
#                    chunksize = 10000, eval_every = 10, random_state=20)
    
#     ldamodel.save(f"ldamodel_for_{k}topics_Run_10")
#     pprint(ldamodel.print_topics())

In [None]:
# Coherence score
# coherence = []
# for k in range(2,25):
#     LDA = gensim.models.ldamulticore.LdaMulticore
#     ldamodel = LDA.load(f"ldamodel_for_{k}topics_Run_10")
#     cm = gensim.models.coherencemodel.CoherenceModel(model=ldamodel, texts=Complete_Content, dictionary=dictionary, coherence='c_v')
#     coherence.append((k, 'default', 'default', cm.get_coherence()))

# Sentiment Analysis