In [1]:
# Import necessary libraries
import re
import nltk
import numpy as np
import pandas as pd

from time import time
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Topic Modeling

In [2]:
# Defining parameters
n_features = 10000
n_components = 10
n_top_word = 7

In [3]:
# Function to print topics
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

### Dataset

In [4]:
# Loading dataset
print("Loading dataset...")
dataset = pd.read_csv('data/tweets-covid-neutral.csv', engine='python')
data_samples = dataset.iloc[1:,0]
n_samples = len(data_samples)
dataset.head()

Loading dataset...


Unnamed: 0.1,Unnamed: 0,User_name,Time,Location,text,Sentiment
0,0,1,16-03-2020,London,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,2,3,16-03-2020,Vagabonds,Coronavirus Australia: Woolworths to give elde...,Neutral
2,7,8,16-03-2020,Austria,Was at the supermarket today. Didn't buy toile...,Neutral
3,10,11,16-03-2020,"Makati, Manila",All month there hasn't been crowding in the su...,Neutral
4,14,15,16-03-2020,,ADARA Releases COVID-19 Resource Center for Tr...,Neutral


### Preprocessing

In [5]:
# StopWords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
new_comment = []

# Preprocessing
for i in range(dataset.shape[0]):
    comment = re.sub('[^a-zA-Z]',' ',dataset['text'][i]) # Remove non-letters
    comment = comment.split("http", 1)[0] # Remove address from string
    comment = comment.lower() # Set lower case
    comment = comment.split() # Divide into a list
    comment = [word for word in comment if not word in stopwords.words('english')] # Select important words
    comment =' '.join(comment)
    new_comment.append(comment)

new_comment[0:5]

['menyrbie phil gahan chrisitv',
 'coronavirus australia woolworths give elderly disabled dedicated shopping hours amid covid outbreak',
 'supermarket today buy toilet paper rebel toiletpapercrisis covid',
 'month crowding supermarkets restaurants however reducing hours closing malls means everyone using entrance dependent single supermarket manila lockdown covid philippines',
 'adara releases covid resource center travel brands insights help travel brands stay date consumer travel behavior trends']

### Topic Extraction

In [8]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('english'))
tfidf = tfidf_vectorizer.fit_transform(new_comment)

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [9]:
# Use tf features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('english'))
tf = tf_vectorizer.fit_transform(new_comment)

print("tf features for LDA extraction is completed!")

Extracting tf features for LDA...
tf features for LDA extraction is completed!


In [10]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=7373 and n_features=10000...
done in 1.071s.

Topics in NMF model (Frobenius norm):
Topic #0: covid corona coronavirusoutbreak uk coronaviruspandemic quarantine coronacrisis
Topic #1: coronavirus toiletpaper pandemic sanitizer lockdown quarantine outbreak
Topic #2: store grocery go retail socialdistancing line workers
Topic #3: supermarket go uk local one get socialdistancing
Topic #4: shopping online pandemic new stores home grocery
Topic #5: prices oil gas market due fall impact
Topic #6: consumer behavior pandemic impact new amp trends
Topic #7: toilet paper toiletpaper roll rolls need quarantine
Topic #8: food stock amp stores delivery demand due
Topic #9: people home go social many outside line





In [10]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=7373 and n_features=10000...
done in 1.952s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: covid coronavirus corona toiletpaper pandemic coronavirusoutbreak lockdown
Topic #1: coronavirus sanitizer pandemic hand time could right
Topic #2: store grocery retail workers going today go
Topic #3: supermarket uk local shelves one workers shoppers
Topic #4: shopping online consumer behavior new pandemic trends
Topic #5: prices oil gas market due fall rise
Topic #6: consumer impact pandemic new amp latest us
Topic #7: toilet toiletpaper paper need last quarantine roll
Topic #8: food stores stock amp due delivery closed
Topic #9: people go get home going social socialdistancing



In [15]:
# Fit the LDA model
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_word)

Fitting LDA models with tf features, n_samples=7373 and n_features=10000...
done in 9.688s.

Topics in LDA model:
Topic #0: coronavirus covid supermarket toiletpaper toilet paper people
Topic #1: gt five fast increases inflation canadian narendramodi
Topic #2: online shopping covid pandemic coronavirus amid outbreak
Topic #3: store grocery coronavirus covid today going go
Topic #4: covid coronavirus food consumer new home stock
Topic #5: consumer covid impact behavior coronavirus via data
Topic #6: coronavirus covid sanitizer amp stayathome got stayhome
Topic #7: supermarket need covid week food supply products
Topic #8: prices covid coronavirus oil gas face india
Topic #9: coronavirus supermarket social workers due distancing march



In [16]:
# Fit the LSA model
print("Fiting LSA model")

lsa = TruncatedSVD(n_components=n_components, n_iter=40, tol=0.01)

lsa.fit(tf)

print("\nTopics in LSA model:")

print_top_words(lsa, tf_feature_names, n_top_word)

Fiting LSA model

Topics in LSA model:
Topic #0: covid coronavirus store supermarket grocery prices consumer
Topic #1: covid consumer behavior impact response changes report
Topic #2: store grocery go covid retail line socialdistancing
Topic #3: supermarket people go get socialdistancing local social
Topic #4: shopping online consumer behavior pandemic new retail
Topic #5: consumer behavior coronavirus toiletpaper store impact toilet
Topic #6: toiletpaper covid paper toilet coronavirus corona quarantine
Topic #7: toilet paper toiletpaper people prices consumer amp
Topic #8: food stock amp stores people delivery demand
Topic #9: people grocery go toiletpaper get consumer line

