In [21]:
import pandas as pd
import numpy as np
import os
from gensim.utils import simple_preprocess
import json
import nltk
import spacy
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import gensim
import pickle
from wordcloud import WordCloud
import pyLDAvis
import requests
import sys, fitz
from os import listdir
from os.path import basename
import logging
import gensim.models
from gensim.models.wrappers import LdaMallet
logger = logging.getLogger('ftpuploader')


In [52]:
from spacy.tokenizer import Tokenizer
from gensim.parsing.preprocessing import STOPWORDS as SW
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)
import re
import string

In [3]:
with open('tweet-text.txt', 'r') as f:
    contents = f.read()
    tweets = contents.split('--------------------TWEET BREAK--------------------')

In [7]:
print('There are ' + str(len(tweets)) + ' tweets in this file.')

There are 109134 tweets in this file.


## Pre-processing

In [8]:
# replace newline character with a space
for i in range(len(tweets)):
    tweets[i] = tweets[i].replace("\n", " ")

In [24]:
# normalise words with the american spelling dictionary if needed
with open('american_spellings.json','r') as json_file:
    normalising_dict=json.load(json_file)

In [25]:
def normalise_spelling(word):
    if word in normalising_dict:
        return normalising_dict[word]
    else:
        return word

In [26]:
# get stopwords from nltk library
stop_words = nltk.corpus.stopwords.words('english')
stop_words = list(set(stop_words))

In [45]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
nlp = spacy.load('en_core_web_lg')
     

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)


# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(tweets, batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)


In [47]:
df = pd.DataFrame()
df['tokens'] = tokens
df.head()

Unnamed: 0,tokens
0,"[@elnescio, @goergegalloway, @botthescott, @la..."
1,"[@theeejawsh, @midrangeszn, @joelembiid, milwa..."
2,"[@arnahunas, @ign, hope, right, hard, af, ty, ..."
3,"[@von_battenberg, sorry, hassle., kindly, send..."
4,"[kenneth, branagh, 👵🏻, https://t.co/ikkiml6vyw]"


In [48]:
# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

In [59]:
# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', tokens)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', tokens) # Remove punctuation
    tokens = re.sub(r'\w*\d\w*', '', tokens) # Remove words containing numbers
#     tokens = re.sub('\w*\d\w*', '', tokens) 
    tokens = re.sub(r'\b@\w+\b', '', tokens)
#     tokens = re.sub('@*!*$*', '', text) # Remove @ ! $
#     tokens = tokens.strip(',') # TESTING THIS LINE
#     tokens = tokens.strip('?') # TESTING THIS LINE
#     tokens = tokens.strip('!') # TESTING THIS LINE
#     tokens = tokens.strip("'") # TESTING THIS LINE
#     tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
     

In [60]:
df['lemma_tokens']

0         [elnescio, goergegalloway, botthescott, lavbaa...
1         [theeejawsh, midrangeszn, joelembiid, milwauke...
2         [arnahuna, ign, hope, right, hard, af, ty, fra...
3         [vonbattenberg, sorry, hassle, kindly, send, e...
4                                        [kenneth, branagh]
                                ...                        
109129    [replicantpinky, meet, cure, meat, isle, eye, ...
109130    [replicantpinky, ask, try, fromunda, cheese, r...
109131                 [rihanna, oontz, oontz, music, well]
109132                  [hindsite, joke, laugh, funnytweet]
109133                                                   []
Name: lemma_tokens, Length: 109134, dtype: object

In [72]:
# Create a id2word dictionary
dictionary = corpora.Dictionary(df['lemma_tokens'])
dictionary.filter_extremes(no_below = 500, no_above = 0.9)
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['lemma_tokens']]

In [73]:
# dictionary df
dict_corpus = {}

for i in range(len(doc_term_matrix)):
    for idx, freq in doc_term_matrix[i]:
        if dictionary[idx] in dict_corpus:
            dict_corpus[dictionary[idx]] += freq
        else:
            dict_corpus[dictionary[idx]] = freq

dict_df = pd.DataFrame.from_dict(dict_corpus, orient='index', columns=['freq'])

In [74]:
dict_df.sort_values('freq', ascending=False).head(10)

Unnamed: 0,freq
good,4582
people,4256
think,4094
amp,4056
know,3589
time,3468
thank,3228
look,3154
go,3017
need,2933


In [76]:
corpus_lda = [dictionary.doc2bow(text) for text in df['lemma_tokens']]

In [77]:
mallet_path = '/Users/scottsang/Documents/GitHub/topic_modelling_subak/mallet-2.0.8/bin/mallet'

In [80]:
# lda topic modelling
ldamallet_60 = LdaMallet(mallet_path, corpus=corpus_lda, num_topics=60, id2word=dictionary, random_seed = 122)
tm_results_60 = ldamallet_60[corpus_lda]
corpus_topics_60 = [sorted(topics, key=lambda record: -record[1])[0] for topics in tm_results_60]
lda_topics_60 = [[(term, round(wt, 3)) for term, wt in ldamallet_60.show_topic(n, topn=10)] for n in range(0, ldamallet_60.num_topics)]
lda_topics_60_df = pd.DataFrame([[term for term, wt in topic] for topic in lda_topics_60], columns = ['Term'+str(i) for i in range(1, 11)], index=['Topic '+str(t) for t in range(1, ldamallet_60.num_topics+1)])
lda_topics_60_df.head()


Mallet LDA: 60 topics, 6 topic bits, 111111 topic mask
Data loaded.
max tokens: 28
total tokens: 149363
<10> LL/token: -12.12566
<20> LL/token: -11.73198
<30> LL/token: -11.41357
<40> LL/token: -11.03927

0	0.83333	covid person people find good hard hope hate start guy bad great fuck lose happen lol thing talk world sound 
1	0.83333	good day people love change bring call post work power election australia hope give turn australian year yeah home friend 
2	0.83333	school people love work buy beautiful make show thing day absolutely hard child amp wrong care hear agree state play 
3	0.83333	amp play good win money share day woman time pretty talk post run people turn work state reason problem case 
4	0.83333	amp lose time people morrison good election country live bit woman wait feel work leave beautiful absolutely australian kid albomp 
5	0.83333	job amp miss call problem free shit thing work learn love house today post people support fact australian reason issue 
6	0.83333	make thing t


0	0.83333	covid people friend stop good problem thing world absolutely care hard hope person bad lose australia talk lnp beautiful great 
1	0.83333	day good people hope guy work change call turn yeah give australian bad bring friend love election year post hard 
2	0.83333	school work love hard give absolutely beautiful people agree world play start buy hear child fuck bring friend albomp woman 
3	0.83333	amp election leave lol 
4	0.83333	election lose people albomp leave morrison government time good country feel bit absolutely beautiful work house vote head power australian 
5	0.83333	job amp free shit post call work miss feel love problem make thing agree buy bad people reason lol change 
6	0.83333	thing time make good love happen talk shit stop day watch high sound change happy amazing guess link friend bad 
7	0.83333	morrison albomp auspol tamepunk watch lnp thing understand big australia point day play samanthamaiden stop reason nice run bring open 
8	0.83333	home start love peop

<160> LL/token: -9.15774
<170> LL/token: -9.09403
<180> LL/token: -9.04786
<190> LL/token: -9.00726

0	0.83333	care covid people stop hard feel word nft friend morrison big yeah number child 
1	0.83333	day guy hope good work question call long discrimination end link lose 
2	0.83333	school hard high work absolutely give people lot albomp share beautiful home long nice 
3	0.83333	amp talk work 
4	0.83333	leave election lose albomp government people morrison fuck family start book win yeah 
5	0.83333	amp job free shit bit fact love samanthamaiden woman hope 
6	0.83333	time make stop change talk guess good call thing turn love labor watch true case problem happy fuck point pay 
7	0.83333	morrison auspol albomp lnp tamepunk follow send 
8	0.83333	home love start beautiful open day bring big people hear absolutely good point mate happen covid hate leave news green 
9	0.83333	good long post sound world live point feel day job give understand week time head year home open green power 
10	0.83

<260> LL/token: -8.77197
<270> LL/token: -8.75425
<280> LL/token: -8.74373
<290> LL/token: -8.73391

0	0.83333	care covid people election job word school happy 
1	0.83333	day hope bring guy stop watch nice work law 
2	0.83333	school high work hard absolutely give hear miss 
3	0.83333	amp question nice call 
4	0.83333	leave election lose auspol open albomp morrison number fact stop read government reason nft high family time 
5	0.83333	amp job morrison high man feel idea issue australia bring 
6	0.83333	stop make time guess listen leave real hard lol morrison week guy today nft buy pass support 
7	0.83333	morrison auspol albomp lnp labor hard god home feel school reason 
8	0.83333	home love beautiful bring family open start long day case money give 
9	0.83333	good sound world long post wrong bit albomp give 
10	0.83333	feel hope play happy turn today buy australian book 
11	0.83333	case show australia today people lol bad good call post month true place change news amp happen 
12	0.8333

<360> LL/token: -8.67939
<370> LL/token: -8.67365
<380> LL/token: -8.6626
<390> LL/token: -8.65654

0	0.83333	care covid people school law win word country 
1	0.83333	day hope month green find feel fact 
2	0.83333	school high work absolutely hard give love story year nice beautiful 
3	0.83333	amp tamepunk home 
4	0.83333	leave lose election auspol albomp government make mate pass high shit bill play beautiful 
5	0.83333	amp job leave week medium head twitter albomp person pretty people pass share work follow 
6	0.83333	stop make time guess leave story hold australia run 
7	0.83333	morrison auspol lnp labor albomp election learn party 
8	0.83333	home beautiful love family bring open follow show high wait child amp 
9	0.83333	good sound world bit watch labor auspol time true leave hope 
10	0.83333	feel hope play turn election sound hard people world time 
11	0.83333	case today show australia lol good question call government albomp make story idea family number work 
12	0.83333	hear happ

<460> LL/token: -8.62937
<470> LL/token: -8.61704
<480> LL/token: -8.61771
<490> LL/token: -8.61542

0	0.83333	care covid people hear happen listen join true end lol nice tweet 
1	0.83333	day hope call link listen 
2	0.83333	school high give work love absolutely country australia real nice today 
3	0.83333	amp team discrimination news power game 
4	0.83333	leave lose election auspol albomp friend government hope 
5	0.83333	amp job leave show hold pay lnp end free share woman 
6	0.83333	stop make time guess god run world open lol happen absolutely child school 
7	0.83333	morrison auspol lnp labor election albomp link follow agree bad lose case 
8	0.83333	home family beautiful love bring number join mate nice green party 
9	0.83333	good sound reason world support turn point time friend 
10	0.83333	feel hope play albomp place medium check turn watch 
11	0.83333	case show today australia lol good free issue read world wrong give hold place money 
12	0.83333	hear happy great make hope sound

<560> LL/token: -8.61112
<570> LL/token: -8.61232
<580> LL/token: -8.60048
<590> LL/token: -8.60188

0	0.83333	care covid people god absolutely end tamepunk twitter government 
1	0.83333	day hope hear wait kid bill absolutely issue 
2	0.83333	school high give work absolutely love amazing guy big stop real lol morrison year australia 
3	0.83333	amp hope samanthamaiden medium listen twitter live 
4	0.83333	leave lose election auspol call government head true lot buy home 
5	0.83333	amp job leave open people question community 
6	0.83333	make stop time team care happen agree head today covid hard 
7	0.83333	morrison auspol lnp election albomp labor support australian hold happy guy open listen call 
8	0.83333	home beautiful family love samanthamaiden care god labor 
9	0.83333	good sound world reason case absolutely talk watch real send 
10	0.83333	feel hope true find child covid reason end people link 
11	0.83333	today case show australia lol good australian understand happen law pretty h

<660> LL/token: -8.59873
<670> LL/token: -8.58779
<680> LL/token: -8.59959
<690> LL/token: -8.60502

0	0.83333	care covid people big lnp listen book high miss week love 
1	0.83333	day hope experience news find 
2	0.83333	school high give work absolutely check love wait hold big shit call yeah number game join book guy win 
3	0.83333	amp lose high feel pay 
4	0.83333	leave lose election auspol government buy listen vote show 
5	0.83333	amp job leave talk people free listen open show read run power twitter real week 
6	0.83333	make stop time support watch problem home mate amendment pay person care 
7	0.83333	morrison auspol lnp albomp election labor wait change life 
8	0.83333	home family love beautiful understand amazing free show 
9	0.83333	good sound reason hear lot world country 
10	0.83333	feel hope hear support post law share give 
11	0.83333	today case show australia share good speak lol story wait buy mate place team true sound fact 
12	0.83333	hear happy great make sound agree 

<760> LL/token: -8.59895
<770> LL/token: -8.59751
<780> LL/token: -8.60981
<790> LL/token: -8.59536

0	0.83333	care covid people amp today month live thing absolutely show 
1	0.83333	day hope hold wrong happen medium time 
2	0.83333	school high give work kid understand good check bill stop read case number child 
3	0.83333	amp link australian turn check 
4	0.83333	leave lose election government auspol medium case run wait understand support 
5	0.83333	amp job people person leave world vote hope 
6	0.83333	make stop time miss show story real good 
7	0.83333	morrison auspol lnp labor election albomp great wrong play hear 
8	0.83333	love home family beautiful story absolutely today vote learn talk 
9	0.83333	good sound reason hear make man woman 
10	0.83333	feel hope love twitter mate man australia 
11	0.83333	today case show great australia share australian good lol hate free auspol job law care 
12	0.83333	happy hear great make guess check lose law australian green 
13	0.83333	great new

<860> LL/token: -8.58577
<870> LL/token: -8.59305
<880> LL/token: -8.589
<890> LL/token: -8.58499

0	0.83333	care covid people miss call love government tamepunk news start beautiful 
1	0.83333	day hope wait share miss hate story religious watch work 
2	0.83333	school kid high work game home child 
3	0.83333	amp power idea community job start reason 
4	0.83333	leave lose election government auspol covid happen support listen thing month green today give 
5	0.83333	amp job people year idea question bring person turn 
6	0.83333	stop make time give call find idea samanthamaiden lie law news true issue 
7	0.83333	morrison auspol lnp labor albomp election bit hate follow school samanthamaiden god 
8	0.83333	home family beautiful love happen number time power good year make point post hear morrison twitter share 
9	0.83333	good sound reason hear wrong check real learn change lot morrison covid 
10	0.83333	feel hope make friend play high change call samanthamaiden stop great link 
11	0.83333	

<960> LL/token: -8.57755
<970> LL/token: -8.57885
<980> LL/token: -8.57644
<990> LL/token: -8.56915

0	0.83333	care covid people australia open bit hold family number watch money good 
1	0.83333	day hope hard check guy auspol amp school 
2	0.83333	school kid high miss family send country listen question open read real give 
3	0.83333	amp leave medium pay amazing problem 
4	0.83333	leave lose election government happen home wrong problem country end find read big guy true beautiful time 
5	0.83333	amp job people bring listen hold morrison speak green point pay fact 
6	0.83333	stop make time work call long albomp win tweet kid house hear wrong ago run hard 
7	0.83333	morrison auspol lnp election albomp labor samanthamaiden bill hold today amendment question word experience 
8	0.83333	home beautiful love family long support nice job idea money man 
9	0.83333	good sound reason hear man wrong hope question 
10	0.83333	feel hope good understand number post question person night share place p

Unnamed: 0,Term1,Term2,Term3,Term4,Term5,Term6,Term7,Term8,Term9,Term10
Topic 1,care,covid,people,open,australia,bit,hold,family,number,lol
Topic 2,day,hope,hard,school,amp,watch,old,post,feel,story
Topic 3,school,kid,high,miss,family,country,send,listen,australian,read
Topic 4,amp,leave,medium,fuck,wrong,tell,new,help,feel,story
Topic 5,leave,lose,election,government,happen,wrong,home,find,problem,read


In [81]:
with pd.ExcelWriter('Twitter-topic_modelling_keywords.xlsx') as writer:  
    lda_topics_60_df.to_excel(writer, sheet_name = '60-topic')