In [9]:
# Import libraries
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from time import time

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Defining parameters
n_features = 10000
n_components = 10
n_top_word = 7

In [4]:
# Function to print topics
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [7]:
# Loading Dataset
print("Loading dataset...")
dataset = pd.read_csv('Corona_tweets.csv', engine='python')
data_samples = dataset.iloc[1:,0]
n_samples = len(data_samples)
dataset.head()

Loading dataset...


Unnamed: 0,User_name,Time,Location,text
0,1,16-03-2020,London,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...
1,2,16-03-2020,UK,advice Talk to your neighbours family to excha...
2,3,16-03-2020,Vagabonds,Coronavirus Australia: Woolworths to give elde...
3,4,16-03-2020,,My food stock is not the only one which is emp...
4,5,16-03-2020,,"Me, ready to go at supermarket during the #COV..."


In [None]:
# StopWords
stopwords.words('english')

In [14]:
new_comment = []
# Pre-processing
for i in range(dataset.shape[0]):
    comment = re.sub('[^a-zA-Z]',' ',dataset['text'][i]) # Remove non-letters
    comment = comment.split("http", 1)[0] # Remove address from string
    comment = comment.lower() # Set lower case
    comment = comment.split() # Divide into a list
    comment = [word for word in comment if not word in stopwords.words('english')] # Select important words
    comment =' '.join(comment)
    new_comment.append(comment)

new_comment[0:5]

['menyrbie phil gahan chrisitv',
 'advice talk neighbours family exchange phone numbers create contact list phone numbers neighbours schools employer chemist gp set online shopping accounts poss adequate supplies regular meds order',
 'coronavirus australia woolworths give elderly disabled dedicated shopping hours amid covid outbreak',
 'food stock one empty please panic enough food everyone take need stay calm stay safe covid france covid covid coronavirus confinement confinementotal confinementgeneral',
 'ready go supermarket covid outbreak paranoid food stock litteraly empty coronavirus serious thing please panic causes shortage coronavirusfrance restezchezvous stayathome confinement']

In [16]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('english'))
tfidf = tfidf_vectorizer.fit_transform(new_comment)

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [17]:
# Use tf features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('english'))
tf = tf_vectorizer.fit_transform(new_comment)

print("tf features for LDA extraction is completed!")

Extracting tf features for LDA...
tf features for LDA extraction is completed!


In [19]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=41156 and n_features=10000...
done in 7.897s.

Topics in NMF model (Frobenius norm):
Topic #0: covid pandemic uk corona retail virus coronavirusoutbreak
Topic #1: coronavirus sanitizer toiletpaper hand toilet paper pandemic
Topic #2: store grocery workers go get employees retail
Topic #3: food panic demand stock buying need supply
Topic #4: prices oil price pandemic gas low due
Topic #5: amp workers hand sanitizer us home health
Topic #6: supermarket workers shelves go staff get one
Topic #7: online shopping home delivery time get shop
Topic #8: people need buying panic get many going
Topic #9: consumer pandemic new us crisis business behavior





In [20]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=41156 and n_features=10000...
done in 18.635s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: covid coronavirus virus lockdown spread us new
Topic #1: coronavirus toiletpaper toilet paper like get quarantine
Topic #2: store grocery covid workers retail employees stores
Topic #3: food panic stock buying demand need supply
Topic #4: prices oil price gas market pandemic low
Topic #5: amp supermarket workers staff home social work
Topic #6: people supermarket go like get going need
Topic #7: shopping online home stay delivery get help
Topic #8: sanitizer hand people masks scams use hands
Topic #9: consumer pandemic crisis new demand business impact



In [21]:
# Fit the LDA model
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_word)

Fitting LDA models with tf features, n_samples=41156 and n_features=10000...
done in 64.527s.

Topics in LDA model:
Topic #0: coronavirus covid toiletpaper high find supermarket behavior
Topic #1: covid consumer amp coronavirus social safe new
Topic #2: shopping online covid delivery coronavirus amazon thing
Topic #3: amp services bank company ever top die
Topic #4: much workers weeks next staff full two
Topic #5: day help quarantine public shoppers trying supermarket
Topic #6: supermarket covid food people coronavirus need get
Topic #7: prices covid food consumers items businesses country
Topic #8: prices covid coronavirus demand oil pandemic consumer
Topic #9: store grocery coronavirus covid sanitizer hand workers



In [22]:
# Fit the LSA model
print("Fiting LSA model")

lsa = TruncatedSVD(n_components=n_components, n_iter=40, tol=0.01)

lsa.fit(tf)

print("\nTopics in LSA model:")

print_top_words(lsa, tf_feature_names, n_top_word)

Fiting LSA model

Topics in LSA model:
Topic #0: covid coronavirus store food prices grocery supermarket
Topic #1: covid prices consumer oil demand impact due
Topic #2: store grocery workers covid go employees people
Topic #3: food people amp panic supermarket buying stock
Topic #4: prices amp store grocery oil food demand
Topic #5: amp supermarket people shopping online workers staff
Topic #6: amp consumer online coronavirus shopping covid food
Topic #7: online shopping people prices home delivery time
Topic #8: people consumer sanitizer hand covid amp behavior
Topic #9: consumer workers supermarket pandemic demand us crisis

