In [12]:
import pandas as pd
from pandas import DataFrame
data = pd.read_csv("sample_data.csv")
list(data.columns.values)

['coordinates',
 'created_at',
 'hashtags',
 'media',
 'urls',
 'favorite_count',
 'id',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'lang',
 'place',
 'possibly_sensitive',
 'retweet_count',
 'reweet_id',
 'retweet_screen_name',
 'source',
 'text',
 'tweet_url',
 'user_created_at',
 'user_screen_name',
 'user_default_profile_image',
 'user_description',
 'user_favourites_count',
 'user_followers_count',
 'user_friends_count',
 'user_listed_count',
 'user_location',
 'user_name',
 'user_screen_name.1',
 'user_statuses_count',
 'user_time_zone',
 'user_urls',
 'user_verified']

In [21]:
df = DataFrame(data, columns = data.columns.values)
df = df.loc[df["lang"] == "en"]
tweet_text = df["text"]
tweet_text['index'] = tweet_text.index
tweets = tweet_text
print(len(tweets))
print(tweets[2410])

27666
We will not meet our monthly fundraising goal for this month and that is ok. 

We will continue fighting. Come join us when you are ready and able. We'll leave a spot for you.

Very grateful for the people who have contributed and the donors who have offered matching funds. https://t.co/qmDQIX05Ms


In [28]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/fayland/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

 Functions to perform lemmatize and stem preprocessing steps on the data set. (from  https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

In [31]:
stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

### Check if the stemming function works

In [39]:
words = []
for word in tweets[2410].split(' '):
    if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 3:
        words.append(word)
print("original: ",words)
sample_process = preprocess(tweets[2410])
print("Processed:", sample_process)

original:  ['meet', 'monthly', 'fundraising', 'goal', 'month', '\n\nWe', 'continue', 'fighting.', 'Come', 'join', 'ready', 'able.', "We'll", 'leave', 'spot', 'you.\n\nVery', 'grateful', 'people', 'contributed', 'donors', 'offered', 'matching', 'funds.', 'https://t.co/qmDQIX05Ms']
Processed: ['meet', 'month', 'fundrais', 'goal', 'month', 'continu', 'fight', 'come', 'join', 'readi', 'abl', 'leav', 'spot', 'grate', 'peopl', 'contribut', 'donor', 'offer', 'match', 'fund', 'https', 'qmdqix']


### Process the tweets

In [47]:
processed_tweets = tweets.fillna('').astype(str).map(preprocess)
processed_tweets[:10]

1     [wionew, gravita, china, biggest, strain, viru...
2     [dailycal, rick, scott, want, probe, china, co...
3     [farouqsajoh, today, mark, lockdown, abuja, re...
4     [mailonlin, wuhan, doctor, alert, medic, sprea...
5     [christufton, dear, jamaican, social, distanc,...
6     [kt_so_it_go, tomorrow, realiti, host, turn, m...
7     [spectatorindex, coronavirus, death, itali, sp...
8           [jrocismajor_, florida, liter, nigga, care]
10    [gayconservativ, busi, bing, compani, busi, ma...
11    [adityarajkaul, indian, prime, minist, narendr...
Name: text, dtype: object

In [70]:
dictionary = gensim.corpora.Dictionary(processed_tweets)
print("dictionary size:", len(dictionary))
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

dictionary size: 31149
0 biggest
1 china
2 equip
3 gravita
4 handl
5 keep
6 sampl
7 strain
8 virus
9 wionew
10 cover


### Filter out tokens that appear in
less than 15 documents (absolute number) or
more than 0.8 documents (fraction of total corpus size, not absolute number).
after the above two steps, keep only the first 100000 most frequent tokens. 

In [73]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
print("dictionary size:", len(dictionary))
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

dictionary size: 2316
0 biggest
1 china
2 equip
3 handl
4 keep
5 sampl
6 virus
7 cover
8 dailycal
9 https
10 rick


### Gensim doc2bow
For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [74]:
bow_corpus = [dictionary.doc2bow(tweet) for tweet in processed_tweets]
bow_corpus[2410]

[(9, 1),
 (81, 1),
 (95, 1),
 (151, 1),
 (198, 1),
 (257, 1),
 (258, 1),
 (259, 1),
 (847, 1),
 (1067, 1),
 (1940, 1)]

In [63]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(bow_corpus))

Number of unique tokens: 2316
Number of documents: 27666


### train LDA model

In [76]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=bow_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [77]:
top_topics = model.top_topics(bow_corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -5.8246.
[([(0.08420193, 'https'),
   (0.045797788, 'coronavirus'),
   (0.033386055, 'covid'),
   (0.027639877, 'trump'),
   (0.018805657, 'pandem'),
   (0.016973855, 'china'),
   (0.016536236, 'say'),
   (0.016375512, 'peopl'),
   (0.011346821, 'time'),
   (0.011289188, 'like'),
   (0.009991487, 'know'),
   (0.009387468, 'presid'),
   (0.008708884, 'go'),
   (0.008301215, 'virus'),
   (0.0074680247, 'work'),
   (0.0073828166, 'american'),
   (0.0065728123, 'need'),
   (0.0063553127, 'respons'),
   (0.006340456, 'help'),
   (0.0062009273, 'think')],
  -3.3907247196380776),
 ([(0.026925016, 'covid'),
   (0.026822621, 'test'),
   (0.025913974, 'case'),
   (0.025300793, 'coronavirus'),
   (0.022030132, 'lockdown'),
   (0.019439049, 'death'),
   (0.015253407, 'health'),
   (0.014799189, 'report'),
   (0.014494035, 'posit'),
   (0.013674314, 'state'),
   (0.013092979, 'hospit'),
   (0.011997539, 'come'),
   (0.009235028, 'break'),
   (0.008878068, 'million'),
   (0.