In [1]:
import re 
import nltk 
import string 
import numpy as np
import pandas as pd
import multiprocessing
from datetime import datetime
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from nltk.stem.porter import *
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
# Shared WordNet lemmatizer instance; pre_process calls lemmatizer.lemmatize on every token.
lemmatizer = WordNetLemmatizer() 
from nltk.tokenize import word_tokenize
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/numpy deprecation and chained-assignment
# warnings raised by later cells.
warnings.filterwarnings('ignore')
# pandas display options.
# NOTE(review): -1 is the legacy "do not truncate" value for max_colwidth; newer
# pandas releases expect None instead - confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
# Render floats with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Matplotlib style sheet used for all figures.
plt.style.use('seaborn-white')
# Named hex colours; none of them is referenced in the cells visible in this section.
orange, purple, elm ='#FFCC00', '#666599', '#217C7E'
blue, red, grey, green ='#336699', '#9A3334', '#AAAAAA', '#335333'
# Seed numpy's global random number generator.
np.random.seed(2018)
%matplotlib inline
# NOTE(review): presumably the DPI used when saving figures; not referenced in
# the cells visible in this section - confirm.
mydpi=600
# NOTE(review): purpose not evident from the visible cells (possibly a font or
# marker size) - confirm.
s=18

# Notebook metadata. __version__ holds the target interpreter rather than a
# release number; later cells do use Python 2 print statements.
__author__ = 'HK Dambanemuya'
__version__ = 'Python2'

In [2]:
# Load the news posts and normalise their dates.
# NOTE(review): the recorded output of this cell also shows "Reading Blogs..."
# and "Reading Discussions...", but the code that builds `blogs` and
# `discussions` is not present here, although later cells use both frames.
# Restore that code before relying on Restart & Run All.
print ("Reading News...")
news = pd.read_csv("../Data/Fusion/news.csv")
# Keep only the rows whose `length` column exceeds 100. The explicit .copy()
# makes the filtered frame independent of the original, so the column
# assignment below is a plain assignment rather than a chained assignment
# whose SettingWithCopyWarning is hidden by warnings.filterwarnings('ignore').
news = news[news['length']>100].copy()
# Re-format every date as a 'YYYY-MM-DD' string (parsed one value at a time).
news['date'] = [datetime.strftime(pd.to_datetime(date), '%Y-%m-%d') for date in tqdm_notebook(news['date'])]

Reading News...


HBox(children=(IntProgress(value=0, max=2323528), HTML(value=u'')))


Reading Blogs...


HBox(children=(IntProgress(value=0, max=985949), HTML(value=u'')))


Reading Discussions...


HBox(children=(IntProgress(value=0, max=412165), HTML(value=u'')))




In [3]:
# Select countries common in all 3 data sets.
# Each Counter is built from a set, so a country contributes at most 1 per data
# set; after summing the three Counters, a count of 3 means the country occurs
# in news, blogs and discussions alike.
country_map = Counter(set(news['country']))+Counter(set(blogs['country']))+ Counter(set(discussions['country']))
# Missing countries (NaN) can also reach a count of 3, so they are excluded
# explicitly with pd.notnull rather than the opaque `k == k` trick.
common_locations = [
    country
    for country, n_sources in tqdm_notebook(country_map.items())
    if n_sources == 3 and pd.notnull(country)
]
# A single formatted string prints the same text on Python 2 and 3; the
# two-argument form printed a tuple under Python 2.
print("Common Countries: %d" % len(common_locations))

HBox(children=(IntProgress(value=0, max=202), HTML(value=u'')))


('Common Countries:', 79)


In [4]:
# Filter data by common countries: keep only the rows whose country occurs in
# all three data sets (see common_locations).
news = news[news['country'].isin(common_locations)]
blogs = blogs[blogs['country'].isin(common_locations)]
discussions = discussions[discussions['country'].isin(common_locations)]
# Formatted strings print as plain text on Python 2 and 3; the two-argument
# print(...) form rendered as a tuple under Python 2.
print("News: %d" % len(news))
print("Blogs: %d" % len(blogs))
print("Discussions: %d" % len(discussions))

('News:', 2277563)
('Blogs:', 946755)
('Discussions:', 409091)


## Popular News Posts

In [6]:
# A news post counts as "popular" when its `participants` value reaches this threshold.
min_participants = 300
popular_news = news[news['participants']>=min_participants]
# Function-call form with one formatted argument: same output as the former
# Python 2 print statement, and also valid on Python 3.
print("# of popular news posts: %d" % len(popular_news))

# of popular news posts: 254


In [7]:
# NLTK's English stop words plus the topic words that remove_stopwords should
# also drop from every post.
stop_words = stopwords.words("english") + [
    'coronavirus', 'covid', 'virus', 'news', 'new', 'outbreak', 'pandemic',
]

In [11]:
# Matches any single ASCII punctuation character; compiled once and reused by
# remove_punctuation.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def _ascii_text(text):
    """Return `text` as a text (unicode) string with non-ASCII characters dropped.

    Accepts byte strings and text strings alike. The explicit 'ignore' error
    handler avoids the implicit strict ASCII coercion that raises
    UnicodeEncodeError / UnicodeDecodeError on non-ASCII input.
    """
    if isinstance(text, bytes):
        return text.decode('ascii', 'ignore')
    return text.encode('ascii', 'ignore').decode('ascii')


def _ascii_native(text):
    """Return `text` as the interpreter's native ``str`` with non-ASCII dropped.

    On Python 2 the native ``str`` is a byte string (the type the original
    helpers returned); on Python 3 it is a text string.
    """
    cleaned = _ascii_text(text)
    return cleaned.encode('ascii') if str is bytes else cleaned


# convert to lowercase
def text_lowercase(text):
    """Return `text` lower-cased, as a text string with non-ASCII characters dropped."""
    return _ascii_text(text).lower()


# remove numbers
def remove_numbers(text):
    """Return `text` with every run of digits removed (non-ASCII characters dropped)."""
    return re.sub(r'\d+', '', _ascii_native(text))


# remove punctuation
def remove_punctuation(text):
    """Return `text` with every character of string.punctuation removed."""
    return _PUNCTUATION_RE.sub('', _ascii_native(text))


# remove whitespace from text
def remove_whitespace(text):
    """Return `text` stripped, with each run of whitespace collapsed to one space."""
    return " ".join(_ascii_native(text).split())


# remove stopwords
def remove_stopwords(text):
    """Tokenize `text` and return the tokens that are not in `stop_words`.

    `word_tokenize` and `stop_words` are defined in earlier cells.
    """
    word_tokens = word_tokenize(_ascii_native(text))
    # Build a set once per call: membership tests against the stop-word list
    # would otherwise scan the whole list for every token.
    excluded = set(stop_words)
    return [word for word in word_tokens if word not in excluded]


# pre-process text
def pre_process(text):
    """Turn one raw post into a list of cleaned, lemmatized tokens.

    Steps: lower-case, strip digits, strip punctuation, collapse whitespace,
    tokenize and drop stop words, lemmatize, then drop one-character tokens.

    Parameters
    ----------
    text : byte string, text string, or any object convertible with str()

    Returns
    -------
    list
        Lemmatized tokens longer than one character, in their original order.
    """
    # Non-string values are stringified, as before; strings are passed through
    # untouched so that non-ASCII text is dropped instead of raising.
    if not isinstance(text, (bytes, type(u''))):
        text = str(text)
    text_lower = text_lowercase(text)
    no_numbers = remove_numbers(text_lower)
    # A regex substitution replaces the Python 2-only
    # str.translate(None, string.punctuation) call.
    no_punctuation = remove_punctuation(no_numbers)
    no_whitespace = remove_whitespace(no_punctuation)
    no_stopwords = remove_stopwords(no_whitespace)
    lemmas = [lemmatizer.lemmatize(token) for token in no_stopwords]
    no_chars = [lemma for lemma in lemmas if len(lemma) > 1]
    return no_chars

# get bi-grams
def bi_grams(tokens):
    """Return the bigrams that nltk.bigrams produces for `tokens`, as a list."""
    pairs = nltk.bigrams(tokens)
    return [pair for pair in pairs]

In [12]:
# Pool of 8 worker processes, reused by the pre-processing and bigram cells below.
# NOTE(review): no pool.close()/pool.join() appears in the visible cells -
# confirm that the workers are released somewhere later.
pool = multiprocessing.Pool(processes=8)

In [13]:
# Run pre_process in parallel over every non-null `text` value of the popular
# posts; the result holds one token list per post.
text = pool.map(pre_process, popular_news['text'].dropna())

In [14]:
# Flatten the per-post token lists into one list of words.
words = []
for post_tokens in text:
    words.extend(post_tokens)

In [15]:
# The 100 most frequent tokens across all popular posts. Function-call form of
# print: same output as the former Python 2 print statement, also valid on Python 3.
print(Counter(words).most_common(100))

[('people', 472), ('flu', 314), ('trump', 226), (u'case', 132), ('vaccine', 125), ('get', 123), ('help', 121), ('disease', 116), ('china', 103), ('time', 101), ('going', 98), ('day', 98), ('yet', 93), ('one', 91), ('spread', 88), (u'population', 85), ('rate', 85), ('like', 85), ('dont', 83), ('would', 82), (u'death', 80), (u'community', 79), ('president', 78), ('much', 77), ('democrat', 77), ('year', 76), (u'thing', 76), (u'system', 72), ('good', 71), (u'number', 71), ('vulnerable', 71), ('unlike', 70), ('medical', 69), ('still', 69), ('every', 68), ('many', 67), ('infected', 65), ('take', 64), ('response', 58), ('might', 58), (u'american', 58), ('state', 57), ('immune', 56), ('world', 55), ('feb', 55), ('even', 54), (u'medium', 54), ('control', 54), (u'need', 54), ('health', 53), ('know', 53), ('however', 52), ('thats', 51), ('related', 51), (u'country', 50), ('life', 50), ('bill', 50), (u'body', 49), ('care', 48), (u'kill', 48), ('including', 47), (u'symptom', 46), ('said', 46), ('mi

In [17]:
# Apply bi_grams in parallel to each post's token list; one bigram list per post.
grams_collection = pool.map(bi_grams, text)

In [18]:
# Pool the bigrams of every post into one flat list.
topics = [gram for grams in tqdm_notebook(grams_collection) for gram in grams]

HBox(children=(IntProgress(value=0, max=254), HTML(value=u'')))




In [19]:
# The 100 most frequent bigrams across all popular posts, with their counts.
Counter(topics).most_common(100)

[(('disease', 'people'), 70),
 (('unlike', 'flu'), 67),
 (('mortality', 'rate'), 43),
 (('immune', 'response'), 29),
 ((u'body', 'yet'), 28),
 (('vulnerable', 'related'), 28),
 (('going', 'kill'), 27),
 (('heart', 'disease'), 27),
 (('united', u'state'), 27),
 (('immune', u'system'), 26),
 (('including', 'life'), 25),
 (('might', 'vulnerable'), 25),
 (('life', 'might'), 25),
 ((u'condition', 'people'), 25),
 (('liver', 'disease'), 25),
 (('asymptomatically', 'spread'), 25),
 (('people', 'people'), 25),
 (('kidney', 'disease'), 25),
 (('donald', 'trump'), 24),
 (('asthma', 'people'), 24),
 (('still', 'asymptomatically'), 24),
 (('people', 'diabetes'), 24),
 (('people', 'lung'), 24),
 (('cancer', 'people'), 24),
 (('people', 'heart'), 24),
 (('built', 'natural'), 24),
 (('people', 'kidney'), 24),
 (('people', 'cancer'), 24),
 (('people', 'asthma'), 24),
 (('diabetes', 'people'), 24),
 ((u'population', 'vulnerable'), 24),
 (('related', 'complicationsdeaths'), 24),
 (('president', 'trump')

Further investigation of the data shows that these posts relate to the nature of the coronavirus disease: how it spreads, its mortality rate, and conditions such as asthma, diabetes, and cancer, as well as heart, liver, kidney, and lung diseases, that are believed to be related to COVID-19 complications and deaths.