In [43]:
import re
import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

import pymongo
import json

from collections import defaultdict

import numpy as np

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\GradziPC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GradziPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing

In [77]:
def preprocess(string):
    #lower case
    string =string.lower()
    
    #remove non letters
    regex = re.compile('[^a-z ]')
    string = regex.sub('', string)
    
    #break sentences into words and lemmatisation
    lemmatizer = WordNetLemmatizer()
    words_list = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i) for i,j in pos_tag(word_tokenize(string))]
    
    #remove english stopwords
    stopword_list = stopwords.words('english')
    stopword_list.append('rt')
    stopword_list = set(stopword_list)
    words_list = [w for w in words_list if not w in stopword_list]
    #add rt to 
    
    return words_list

## Import Dataset

In [79]:
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client['Assignment3']
collection = db['tweets']

In [80]:
tweets = collection.find(projection={'text'})
tweets_list = []
for i in range(tweets.count()):
    t = tweets.next()['text']
    t = preprocess(t)
    tweets_list.append(t)

  This is separate from the ipykernel package so we can avoid doing imports until


## LDA

In [81]:
#calclate document frequency per word
def calc_token_frequencies(doc_list):
    frequencies = defaultdict(int)# Each dict item will start off as int(0)
    
    for token_set in doc_list:
        frequencies_tw = {}
        for token in token_set:
            frequencies_tw[token] = 1
        for tok in frequencies_tw:
            frequencies[tok] += 1
    return frequencies

In [82]:
#Remove words that occur in less than 10 documents, and words that occur in more than 90% of the documents
words_count = calc_token_frequencies(tweets_list)
filted_dict = {k:v for k,v in words_count.items() if v>9 and v<len(tweets_list)*0.9}

#Transform each document to a vectorized form by computing the frequency of each word. (50000x5104 vector)
tweets_vect = np.zeros((len(filted_dict),len(tweets_list)))
for i in range(len(tweets_list)):
    tw_dict = dict.fromkeys(filted_dict,0)
    for word in tweets_list[i]:
        if word in tw_dict:
            tw_dict[word] +=1
    tweets_vect[:,i] = [int(a) for a in list(tw_dict.values())]


In [83]:
#Apply the LDA model.
#Examine the topics, with your own proposal for the “name” of each topic that is meaningful to you. Display your results in the notebook.
import lda

X= tweets_vect.astype(int)
X = X.transpose()
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)
model.fit(X)  # model.fit_transform(X) is also available
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 5

INFO:lda:n_documents: 50000
INFO:lda:vocab_size: 4735
INFO:lda:n_words: 440360
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -5226611
INFO:lda:<10> log likelihood: -3220307
INFO:lda:<20> log likelihood: -3052857
INFO:lda:<30> log likelihood: -3023662
INFO:lda:<40> log likelihood: -3011148
INFO:lda:<50> log likelihood: -3003903
INFO:lda:<60> log likelihood: -2999472
INFO:lda:<70> log likelihood: -2994881
INFO:lda:<80> log likelihood: -2992297
INFO:lda:<90> log likelihood: -2990867
INFO:lda:<100> log likelihood: -2987660
INFO:lda:<110> log likelihood: -2986675
INFO:lda:<120> log likelihood: -2983914
INFO:lda:<130> log likelihood: -2982660
INFO:lda:<140> log likelihood: -2979963
INFO:lda:<150> log likelihood: -2978323
INFO:lda:<160> log likelihood: -2977262
INFO:lda:<170> log likelihood: -2977210
INFO:lda:<180> log likelihood: -2977489
INFO:lda:<190> log likelihood: -2975944
INFO:lda:<200> log likelihood: -2974675
INFO:lda:<210> log likelihood: -2974509
INFO:lda

In [85]:
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(tuple(filted_dict))[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: covid get home spread go
Topic 1: amp use pandemic take update
Topic 2: uk death break piersmorgan spike
Topic 3: watch covid pandemic brief point
Topic 4: china covid day wuhan say
Topic 5: help mask fight combat amp
Topic 6: death u rate die south
Topic 7: death report may break covid
Topic 8: covid home stay know life
Topic 9: people die help lockdown worker
Topic 10: covid time week http life
Topic 11: covid fight help quicktake pandemic
Topic 12: pandemic covid impact support crisis
Topic 13: case new covid death total
Topic 14: latest daily thanks day american
Topic 15: test state china warn could
Topic 16: get tomfitton public realdonaldtrump shutdown
Topic 17: care boris johnson minister intensive
Topic 18: crisis people continue current accelerate
Topic 19: trump medical president staff nh


## Results with title

#### Topic Go home" :              covid get home spread go

Topic **Update**:               amp use pandemic take update

Topic **UK spike**:             uk death break piersmorgan spike
   
Topic **Quick stats**:          watch covid pandemic brief point

Topic **Wuhan discharged**:     china covid day wuhan say

Topic **Mask**:                 help mask fight combat amp

Topic **Death Rate**:           death u rate die south

Topic **May report**:           death report may break covid

Topic **Stay@home**:            covid home stay know life

Topic **Lockdown&workers**:     people die help lockdown worker

Topic **Times**:                covid time week http life

Topic **Fight covid**:          covid fight help quicktake pandemic

Topic **Covid impact**:         pandemic covid impact support crisis

Topic **Covid stats**:          case new covid death total

Topic **Thanks day**:           latest daily thanks day american

Topic **China's late warning**: test state china warn could

Topic **Tim Fitton**:           get tomfitton public realdonaldtrump shutdown

Topic **Boris johnson**:        care boris johnson minister intensive

Topic **Crisis**:         people continue current accelerate

Topic **Trump & medical staff**:      trump medical president staff nh