In [1]:
# Import necessary libraries
import re
import nltk
import numpy as np
import pandas as pd

from time import time
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Topic Modeling

In [13]:
# Defining parameters
n_features = 10000
n_components = 15
n_top_word = 7

In [3]:
# Function to print topics
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

### Dataset

In [4]:
# Loading dataset
print("Loading dataset...")
dataset = pd.read_csv('data/tweets-covid-negative.csv', engine='python')
data_samples = dataset.iloc[1:,0]
n_samples = len(data_samples)
dataset.head()

Loading dataset...


Unnamed: 0.1,Unnamed: 0,User_name,Time,Location,text,Sentiment
0,4,5,16-03-2020,,"Me, ready to go at supermarket during the #COV...",Negative
1,18,19,16-03-2020,North America,"Amazon Glitch Stymies Whole Foods, Fresh Groce...",Negative
2,20,21,16-03-2020,southampton soxx xxx,with 100 nations inficted with covid 19 th...,Negative
3,24,25,16-03-2020,Downstage centre,@10DowningStreet @grantshapps what is being do...,Negative
4,26,27,16-03-2020,"Ketchum, Idaho",In preparation for higher demand and a potenti...,Negative


### Preprocessing

In [None]:
# StopWords
stopwords.words('english')

In [5]:
new_comment = []

# Preprocessing
for i in range(dataset.shape[0]):
    comment = re.sub('[^a-zA-Z]',' ',dataset['text'][i]) # Remove non-letters
    comment = comment.split("http", 1)[0] # Remove address from string
    comment = comment.lower() # Set lower case
    comment = comment.split() # Divide into a list
    comment = [word for word in comment if not word in stopwords.words('english')] # Select important words
    comment =' '.join(comment)
    new_comment.append(comment)

new_comment[0:5]

['ready go supermarket covid outbreak paranoid food stock litteraly empty coronavirus serious thing please panic causes shortage coronavirusfrance restezchezvous stayathome confinement',
 'amazon glitch stymies whole foods fresh grocery deliveries covid spread seen significant increase people shopping online groceries spokeswoman said statement today resulted systems impact affecting',
 'nations inficted covid world must play fair china goverments must demand china adopts new guilde lines food safty chinese goverment guilty irosponcible life global scale',
 'downingstreet grantshapps done ensure food essential products stocked supermarkets panic buying actively discouraged cannot left checkout staff police actions selfish profiteer',
 'preparation higher demand potential food shortage hunger coalition purchased percent food implemented new protocols due covid coronavirus']

### Topic Extraction

In [14]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('english'))
tfidf = tfidf_vectorizer.fit_transform(new_comment)

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [15]:
# Use tf features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('english'))
tf = tf_vectorizer.fit_transform(new_comment)

print("tf features for LDA extraction is completed!")

Extracting tf features for LDA...
tf features for LDA extraction is completed!


In [9]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=16440 and n_features=10000...
done in 2.669s.

Topics in NMF model (Frobenius norm):
Topic #0: covid online shopping virus due lockdown uk
Topic #1: prices oil price low pandemic demand gas
Topic #2: coronavirus toiletpaper sanitizer toilet pandemic shopping paper
Topic #3: food demand stock supply need banks get
Topic #4: amp us home online health help risk
Topic #5: people need get going many think still
Topic #6: store grocery workers go work get going
Topic #7: panic buying stop need buy shelves food
Topic #8: supermarket go shelves get empty going home
Topic #9: consumer crisis demand pandemic us economy scams





In [17]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=16440 and n_features=10000...
done in 12.603s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: covid coronavirus virus one even like paper
Topic #1: prices covid price gas market low pandemic
Topic #2: coronavirus prices sanitizer hand masks news trump
Topic #3: food panic buying supply demand stock supplies
Topic #4: amp masks govt people many risk help
Topic #5: people going prices time many still think
Topic #6: store grocery retail stores food employees like
Topic #7: panic supermarket buying shelves stop empty buy
Topic #8: supermarket home one go social local day
Topic #9: consumer pandemic new scams business economy consumers
Topic #10: shopping online delivery home time order please
Topic #11: oil pandemic global covid russia amid demand
Topic #12: get covid go need stock us home
Topic #13: crisis covid workers us health essential staff
Topic #14: demand cov

In [11]:
# Fit the LDA model
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_word)

Fitting LDA models with tf features, n_samples=16440 and n_features=10000...
done in 26.437s.

Topics in LDA model:
Topic #0: covid oil consumer coronavirus prices price pandemic
Topic #1: prices shops current better selling sold shame
Topic #2: prices day covid health crisis care amp
Topic #3: covid coronavirus supermarket people store grocery food
Topic #4: consumer crisis covid amp china response company
Topic #5: prices covid coronavirus food demand amp pandemic
Topic #6: workers toiletpaper good chains gt stayhomesavelives employees
Topic #7: covid online shopping food demand new distancing
Topic #8: pay coronaviruspandemic story rent employee street cnn
Topic #9: higher cuts bill potential new fed step



In [12]:
# Fit the LSA model
print("Fiting LSA model")

lsa = TruncatedSVD(n_components=n_components, n_iter=40, tol=0.01)

lsa.fit(tf)

print("\nTopics in LSA model:")

print_top_words(lsa, tf_feature_names, n_top_word)

Fiting LSA model

Topics in LSA model:
Topic #0: covid coronavirus prices food people amp panic
Topic #1: food panic people coronavirus buying store grocery
Topic #2: covid food panic supermarket buying consumer need
Topic #3: food prices panic buying demand oil amp
Topic #4: amp store grocery people workers home prices
Topic #5: people prices oil supermarket store go need
Topic #6: store grocery food prices oil demand workers
Topic #7: supermarket panic buying prices shelves oil store
Topic #8: panic buying consumer people covid amp stop
Topic #9: consumer crisis oil demand pandemic panic buying

