In [1]:
# Import python libraries
import os
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
import wikipediaapi
from nltk.tokenize import sent_tokenize, word_tokenize

# Specify which US State to analyze.
# This value is inserted into the Wikipedia article title that the notebook
# fetches and, lower-cased, becomes the name of the CSV file written when the
# labelled sentences are saved.
state = "Pennsylvania"

In [2]:
# Fetch the state's pandemic article from English Wikipedia.
wiki = wikipediaapi.Wikipedia('en')

# Washington is special-cased: its article title is built from
# "Washington (state)". NOTE: this rebinds `state`, so every later cell that
# reads `state` (e.g. the CSV filename) sees the suffixed name too.
if state == "Washington":
    state = "Washington (state)"

# Spaces in the state name are written as underscores in the page title.
page_title = "2020 coronavirus pandemic in {}".format(state.replace(' ','_'))
state_wiki = wiki.page(page_title)

# Fail here, with the offending title in the message, instead of letting an
# empty article flow into tokenization and surface later as an unrelated
# IndexError when the manual labels are assigned.
if not state_wiki.exists():
    raise ValueError("Wikipedia page not found: {!r}".format(page_title))

In [13]:
# Split the article text into sentences and keep the ones worth labelling.
# Before tokenizing, a space is inserted after every period and each newline is
# turned into a sentence break. (Side effect visible in the printed sentences:
# abbreviations such as "U.S." come out as "U. S.".)
# The hand-assigned label indices further down depend on this exact splitting.
article_text = state_wiki.text
spaced_text = article_text.replace('.','. ').replace('\n','. ')
sentences = sent_tokenize(spaced_text)

# A sentence is kept when it is longer than one character and has more than
# three word tokens.
valid_sents = [
    candidate
    for candidate in sentences
    if len(candidate) > 1 and len(word_tokenize(candidate)) > 3
]
print("{} sentences of interest".format(len(valid_sents)))

40 sentences of interest


In [18]:
# Build one severity label per entry of valid_sents (severity of gov actions)
"""
Label definitions:
    0 ~ No actions taken or no date included
    1 ~ Minor actions taken (School or specific business closure)
    2 ~ Moderate actions taken (Industry closures or very localized, strict actions)
    3 ~ Major actions taken (Large-scale closures or shelter-in-place orders)
"""
# Every sentence starts with label 0.
action_label = np.zeros(len(valid_sents))

# Show each sentence next to its index so the labels below can be assigned by hand.
for idx, sent in enumerate(valid_sents):
    print("{}: {}".format(idx, sent))

# Hand-made training labels, keyed by the sentence index printed above.
# Sentences that are not listed keep the default label 0.
manual_labels = {
    6: 1,
    11: 1,
    12: 3,
    20: 2,
    21: 2,
    24: 2,
    25: 3,
    26: 2,
}
for sent_idx, severity in manual_labels.items():
    action_label[sent_idx] = severity
print(action_label)

0: This article details the viral pandemic of coronavirus disease 2019 (COVID-19) in the U. S.  state of Pennsylvania.
1: As of March 24, 2020, the Pennsylvania Department of Health has confirmed 851 cases and 7 deaths in the state.
2: On March 6, Governor Tom Wolf reported Pennsylvania's first two confirmed cases of coronavirus in Delaware County and in Wayne County.
3: Both cases were related to travel.
4: March 9 brought 4 more, total 10. .
5: March 10 saw 2 case, total standing at 12. .
6: On March 13, Governor Wolf announced that all Pennsylvania schools will be closed for 10 days.
7: Additionally, park programs were canceled.
8: By March 17, there were 96 cases in the state; more than half of them were in the Philadelphia area with Montgomery County as the highest number.
9: On March 18, the department of health reported the state's first death related to the virus, a patient at St.  Luke's Fountain Hill campus in Northampton County.
10: As of March 18, there are 133 cases in the

In [27]:
# Pair every kept sentence with its severity label (cast to int) in one DataFrame
df = pd.DataFrame({
    "Sentences": valid_sents,
    "Labels": action_label.astype(int),
})
df

Unnamed: 0,Sentences,Labels
0,This article details the viral pandemic of cor...,0
1,"As of March 24, 2020, the Pennsylvania Departm...",0
2,"On March 6, Governor Tom Wolf reported Pennsyl...",0
3,Both cases were related to travel.,0
4,"March 9 brought 4 more, total 10. .",0
5,"March 10 saw 2 case, total standing at 12. .",0
6,"On March 13, Governor Wolf announced that all ...",1
7,"Additionally, park programs were canceled.",0
8,"By March 17, there were 96 cases in the state;...",0
9,"On March 18, the department of health reported...",0


In [28]:
# Save the labelled sentences into a CSV for training a supervised ML model.
# The file is named after the lower-cased value of `state`.
dirn = "nlp_data/"
filename = "{}.csv".format(state.lower())

# to_csv does not create missing parent directories, so make sure the output
# folder exists first (this is a no-op when it is already there).
os.makedirs(dirn, exist_ok=True)

# index=False leaves the DataFrame's row index out of the file.
df.to_csv(dirn+filename, index=False)