In [1]:
# Import necessary libraries
import re
import nltk
import numpy as np
import pandas as pd

from time import time
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Topic Modeling

In [13]:
# Defining parameters
n_features = 10000
n_components = 15
n_top_word = 7

In [14]:
# Function to print topics
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

### Dataset

In [15]:
# Loading dataset
print("Loading dataset...")
dataset = pd.read_csv('tweets-covid-positive.csv', engine='python')
data_samples = dataset.iloc[1:,0]
n_samples = len(data_samples)
dataset.head()

Loading dataset...


Unnamed: 0.1,Unnamed: 0,User_name,Time,Location,text,Sentiment
0,1,2,16-03-2020,UK,advice Talk to your neighbours family to excha...,Positive
1,3,4,16-03-2020,,My food stock is not the only one which is emp...,Positive
2,5,6,16-03-2020,"ÃƒÆ’Ã‚Å“T: 36.319708,-82.363649",As news of the regionÃƒâ€šÃ‚â€™s first confirm...,Positive
3,6,7,16-03-2020,"35.926541,-78.753267",Cashier at grocery store was sharing his insig...,Positive
4,8,9,16-03-2020,"Atlanta, GA USA",Due to COVID-19 our retail store and classroom...,Positive


### Preprocessing

In [None]:
# StopWords
stopwords.words('english')

In [5]:
new_comment = []

# Preprocessing
for i in range(dataset.shape[0]):
    comment = re.sub('[^a-zA-Z]',' ',dataset['text'][i]) # Remove non-letters
    comment = comment.split("http", 1)[0] # Remove address from string
    comment = comment.lower() # Set lower case
    comment = comment.split() # Divide into a list
    comment = [word for word in comment if not word in stopwords.words('english')] # Select important words
    comment =' '.join(comment)
    new_comment.append(comment)

new_comment[0:5]

['advice talk neighbours family exchange phone numbers create contact list phone numbers neighbours schools employer chemist gp set online shopping accounts poss adequate supplies regular meds order',
 'food stock one empty please panic enough food everyone take need stay calm stay safe covid france covid covid coronavirus confinement confinementotal confinementgeneral',
 'news region first confirmed covid case came sullivan county last week people flocked area stores purchase cleaning supplies hand sanitizer food toilet paper goods tim dodson reports',
 'cashier grocery store sharing insights covid prove credibility commented civics class know talking',
 'due covid retail store classroom atlanta open walk business classes next two weeks beginning monday march continue process online phone orders normal thank understanding']

### Topic Extraction

In [16]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('english'))
tfidf = tfidf_vectorizer.fit_transform(new_comment)

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [17]:
# Use tf features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('english'))
tf = tf_vectorizer.fit_transform(new_comment)

print("tf features for LDA extraction is completed!")

Extracting tf features for LDA...
tf features for LDA extraction is completed!


In [18]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=17341 and n_features=10000...
done in 4.315s.

Topics in NMF model (Frobenius norm):
Topic #0: covid pandemic retail uk virus spread due
Topic #1: store grocery get go like employees retail
Topic #2: coronavirus toiletpaper pandemic quarantine right outbreak stophoarding
Topic #3: food demand stock need bank panic supply
Topic #4: online shopping delivery shop time support amazon
Topic #5: supermarket staff get go like shelves one
Topic #6: amp staff delivery support health stores keep
Topic #7: sanitizer hand masks hands use alcohol wash
Topic #8: prices time get price masks free oil
Topic #9: workers thank drivers staff delivery care nurses
Topic #10: consumer pandemic business new behavior time consumers
Topic #11: people many need like going also buying
Topic #12: home stay safe keep work please hands
Topic #13: help need us please local support get
Topic #14: paper toilet toiletpaper like get would rolls





In [19]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=17341 and n_features=10000...
done in 13.253s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: covid pandemic store retail good one open
Topic #1: store grocery coronavirus go like socialdistancing going
Topic #2: coronavirus pandemic toiletpaper good corona new stophoarding
Topic #3: food stock demand panic buy local supply
Topic #4: online shopping delivery shop time free amazon
Topic #5: supermarket staff shelves go like local going
Topic #6: amp covid health best retail measures public
Topic #7: sanitizer hand masks use gloves hands make
Topic #8: prices price high time oil market low
Topic #9: workers thank pandemic health employees care staff
Topic #10: consumer pandemic business new read impact consumers
Topic #11: people covid coronavirus many stayathome buying coronavirusoutbreak
Topic #12: home stay safe keep covid work everyone
Topic #13: help covid us pl

In [10]:
# Fit the LDA model
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_word)

Fitting LDA models with tf features, n_samples=17341 and n_features=10000...
done in 29.006s.

Topics in LDA model:
Topic #0: supermarket covid coronavirus people get food like
Topic #1: covid consumer prices coronavirus amp us social
Topic #2: prices covid market high coronavirus oil free
Topic #3: coronavirus toiletpaper paper toilet price stayhomesavelives covid
Topic #4: online covid shopping food stay home amp
Topic #5: covid store grocery coronavirus working supermarket people
Topic #6: consumer data behavior consumers industry read covid
Topic #7: store grocery covid coronavirus workers amp employees
Topic #8: workers supermarket staff covid thank stop food
Topic #9: sanitizer hand coronavirus masks covid use hands



In [12]:
# Fit the LSA model
print("Fiting LSA model")

lsa = TruncatedSVD(n_components=n_components, n_iter=40, tol=0.01)

lsa.fit(tf)

print("\nTopics in LSA model:")

print_top_words(lsa, tf_feature_names, n_top_word)

Fiting LSA model

Topics in LSA model:
Topic #0: covid coronavirus store grocery food supermarket amp
Topic #1: covid consumer prices food online supermarket shopping
Topic #2: store grocery workers covid thank employees retail
Topic #3: food amp supermarket people online shopping need
Topic #4: online shopping amp home shop delivery time
Topic #5: food sanitizer store grocery hand online demand
Topic #6: amp hand sanitizer consumer prices workers masks
Topic #7: hand sanitizer people supermarket online shopping food
Topic #8: prices people get time need like help
Topic #9: workers prices consumer thank us pandemic online

