In [32]:
import glob
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [7]:
# Locate the input CSV file(s).
# glob returns matches in arbitrary, filesystem-dependent order, so sort them
# to keep the row order of the joined DataFrame (and everything derived from
# it) the same on every run.
data_dirn = "nlp_data/"
data_files = sorted(glob.glob(data_dirn + "*.csv"))
print(data_files)
print(len(data_files))

['nlp_data/pennsylvania.csv']
1


In [13]:
# Read every data file and join them into one DataFrame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it inside a loop re-copies the accumulated frame on every iteration; collect
# the per-file frames and concatenate them once instead.
frames = [pd.read_csv(data_file) for data_file in data_files]
# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame (what the original loop produced) when no files were found.
# ignore_index=True gives the joined frame a unique 0..n-1 index; without it
# each file's rows restart at 0 and a label lookup such as df["Sentences"][1]
# would match one row per file instead of a single sentence.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df

Unnamed: 0,Sentences,Labels
0,This article details the viral pandemic of cor...,0
1,"As of March 24, 2020, the Pennsylvania Departm...",0
2,"On March 6, Governor Tom Wolf reported Pennsyl...",0
3,Both cases were related to travel.,0
4,"March 9 brought 4 more, total 10. .",0
5,"March 10 saw 2 case, total standing at 12. .",0
6,"On March 13, Governor Wolf announced that all ...",1
7,"Additionally, park programs were canceled.",0
8,"By March 17, there were 96 cases in the state;...",0
9,"On March 18, the department of health reported...",0


In [33]:
# Transform each sentence into a list of integer word indices (one index per
# word, obtained by hashing the word into a space of size vocab_size). These
# are not embeddings yet; the model's Embedding layer maps them to vectors.
# NOTE(review): per the Keras docs, one_hot wraps hashing_trick with Python's
# built-in hash(), so different words may collide and the indices are not
# stable across interpreter sessions unless PYTHONHASHSEED is fixed — confirm
# this is acceptable before persisting a trained model.
vocab_size = 10000
# Characters stripped from the text before splitting on spaces. Written as a
# raw string because "\]" is not a valid escape sequence in a regular string
# literal (it triggers a warning on newer Python versions); the resulting
# characters are exactly the same as before.
punctuation_filter = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'
sentences = df["Sentences"]
y_train = df["Labels"]
x_train = [
    one_hot(sentence, vocab_size, filters=punctuation_filter, lower=True, split=' ')
    for sentence in sentences
]
# x_train is a plain list and is indexed by position, so fetch the matching
# sentence by position as well (.iloc) rather than by index label.
print("{}:\n{}".format(sentences.iloc[1], x_train[1]))

As of March 24, 2020, the Pennsylvania Department of Health has confirmed 851 cases and 7 deaths in the state.:
[3876, 4978, 4970, 2786, 4352, 9972, 4904, 8953, 4978, 8648, 4479, 7460, 942, 9558, 6968, 7463, 4209, 6858, 9972, 3112]


In [34]:
# Bring every encoded sentence to one common length (max_length) so that all
# of them have the same size; padding='pre' puts the filler zeros in front of
# the word indices. The lengths of the first two sequences are printed before
# and after padding as a quick sanity check.
for idx in (0, 1):
    print(len(x_train[idx]))  # raw lengths, before padding
max_length = 40
x_train = pad_sequences(x_train, maxlen=max_length, padding='pre')
for idx in (0, 1):
    print(len(x_train[idx]))  # both should now equal max_length
print(x_train[1])

19
20
40
40
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0 3876 4978 4970 2786 4352 9972 4904 8953
 4978 8648 4479 7460  942 9558 6968 7463 4209 6858 9972 3112]


In [54]:
# Create the ML model: an 8-dimensional embedding for each of the max_length
# token positions, flattened and fed into a single linear output unit.
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='linear'))

# NOTE(review): the labels appear to be class ids (0-3 in the printed labels
# further down), yet they are fitted as a regression target with MSE and a
# linear output; the 'acc' metric under that setup is presumably exact-match
# on the raw output — TODO confirm, and consider a softmax output with
# sparse categorical cross-entropy if this is meant to be a classifier.
model.compile(optimizer='adam', loss='mse', metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the cell output.
model.summary()

# Train silently (verbose=0); `history` keeps the per-epoch metrics.
history = model.fit(x_train, y_train, epochs=100, verbose=0)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 40, 8)             80000     
_________________________________________________________________
flatten_7 (Flatten)          (None, 320)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 321       
Total params: 80,321
Trainable params: 80,321
Non-trainable params: 0
_________________________________________________________________
None


In [55]:
# Evaluate the fitted model and report its accuracy as a percentage.
# The evaluation runs on the same x_train / y_train the model was fitted on,
# so the figure is a training score, not a held-out estimate.
loss, accuracy = model.evaluate(x=x_train, y=y_train, verbose=1)
accuracy_pct = accuracy * 100
print('Training Accuracy is {}%'.format(accuracy_pct))

Training Accuracy is 100.0%


In [56]:
# Run inference on the training inputs and print the predictions next to the
# true labels. The raw model outputs are rounded to the nearest integer and
# flattened to 1-D so they line up with the label array.
preds = model.predict(x_train)
rounded_preds = np.round(preds).astype(int).flatten()
true_labels = y_train.values
print("Predictions:\n{}\nLabels:\n{}".format(rounded_preds, true_labels))

Predictions:
[0 0 0 0 0 0 1 0 0 0 0 1 3 0 0 0 0 0 0 0 2 2 0 0 2 3 2 0 0 0 0 0 0 0 0 0 0
 0 0 0]
Labels:
[0 0 0 0 0 0 1 0 0 0 0 1 3 0 0 0 0 0 0 0 2 2 0 0 2 3 2 0 0 0 0 0 0 0 0 0 0
 0 0 0]
