In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams

In [4]:
df = pd.read_csv('data/tokenized_clean.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df.replace(to_replace=r"\[|\]|\' ", value='', inplace=True, regex=True)
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this may allah fo...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,people receive evacuation orders in california,1
4,7,,,just got sent this photo from ruby as smoke f...,1


In [5]:
freqs = {}
for i in df['text']:
    tokens = nltk.tokenize.word_tokenize(i, language="english")
    for n in range(1, 4):
        for j in ngrams(tokens, n):
            j = str(j)
            freqs.setdefault(j, 0)
            freqs[j]+=1
freqs

{"('our',)": 99,
 "('deeds',)": 2,
 "('are',)": 401,
 "('the',)": 3261,
 "('reason',)": 20,
 "('of',)": 1823,
 "('this',)": 477,
 "('may',)": 88,
 "('allah',)": 4,
 "('forgive',)": 2,
 "('us',)": 163,
 "('all',)": 257,
 "('our', 'deeds')": 1,
 "('deeds', 'are')": 1,
 "('are', 'the')": 15,
 "('the', 'reason')": 2,
 "('reason', 'of')": 2,
 "('of', 'this')": 9,
 "('this', 'may')": 1,
 "('may', 'allah')": 3,
 "('allah', 'forgive')": 1,
 "('forgive', 'us')": 1,
 "('us', 'all')": 5,
 "('our', 'deeds', 'are')": 1,
 "('deeds', 'are', 'the')": 1,
 "('are', 'the', 'reason')": 1,
 "('the', 'reason', 'of')": 1,
 "('reason', 'of', 'this')": 1,
 "('of', 'this', 'may')": 1,
 "('this', 'may', 'allah')": 1,
 "('may', 'allah', 'forgive')": 1,
 "('allah', 'forgive', 'us')": 1,
 "('forgive', 'us', 'all')": 1,
 "('forest',)": 63,
 "('fire',)": 247,
 "('near',)": 54,
 "('la',)": 25,
 "('ronge',)": 1,
 "('sask',)": 1,
 "('canada',)": 9,
 "('forest', 'fire')": 8,
 "('fire', 'near')": 3,
 "('near', 'la')": 1,


In [24]:
indexer = {}
index = 0
for i in df['text']:
    tokens = nltk.tokenize.word_tokenize(i, language="english")
    for n in range(1, 3):
        for j in ngrams(tokens, n):
            j = str(j)
            if j not in indexer.keys():# and freqs[j] >= 2:
                indexer[j] = index
                index+=1
indexer

{"('our',)": 0,
 "('deeds',)": 1,
 "('are',)": 2,
 "('the',)": 3,
 "('reason',)": 4,
 "('of',)": 5,
 "('this',)": 6,
 "('may',)": 7,
 "('allah',)": 8,
 "('forgive',)": 9,
 "('us',)": 10,
 "('all',)": 11,
 "('our', 'deeds')": 12,
 "('deeds', 'are')": 13,
 "('are', 'the')": 14,
 "('the', 'reason')": 15,
 "('reason', 'of')": 16,
 "('of', 'this')": 17,
 "('this', 'may')": 18,
 "('may', 'allah')": 19,
 "('allah', 'forgive')": 20,
 "('forgive', 'us')": 21,
 "('us', 'all')": 22,
 "('forest',)": 23,
 "('fire',)": 24,
 "('near',)": 25,
 "('la',)": 26,
 "('ronge',)": 27,
 "('sask',)": 28,
 "('canada',)": 29,
 "('forest', 'fire')": 30,
 "('fire', 'near')": 31,
 "('near', 'la')": 32,
 "('la', 'ronge')": 33,
 "('ronge', 'sask')": 34,
 "('sask', 'canada')": 35,
 "('residents',)": 36,
 "('asked',)": 37,
 "('to',)": 38,
 "('shelter',)": 39,
 "('in',)": 40,
 "('place',)": 41,
 "('being',)": 42,
 "('notified',)": 43,
 "('by',)": 44,
 "('officers',)": 45,
 "('no',)": 46,
 "('other',)": 47,
 "('evacuation

In [25]:
len(indexer)*len(df)*4/10e9

0.2094671272

In [26]:
n_gram_onehot = np.zeros(shape=(len(df), len(indexer)), dtype=np.float32)
n_gram_onehot.shape

(7613, 68786)

In [27]:
import json
with open('indexes.sav', 'w') as f:
    f.write(json.dumps(indexer))

In [28]:
missed = 0
for i, e in enumerate(df['text']):
    tokens = nltk.tokenize.word_tokenize(e, language="english")
    matched = False
    for j in ngrams(tokens, n):
        j = str(j)
        if j in indexer.keys():
            n_gram_onehot[i][indexer[j]] = 1.0
            matched = True
    if not matched:
        missed +=1
print(missed)

53


In [29]:
labelsOneHot = np.zeros((n_gram_onehot.shape[0], 2))
for i, e in enumerate(df['target']):
    labelsOneHot[i][e]=1
labelsOneHot.shape

(7613, 2)

In [11]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

In [30]:
keras.utils.set_random_seed(1234)
ngram_NN = keras.Sequential()
ngram_NN.bias = 0.2
ngram_NN.add(keras.layers.Input(68786))

ngram_NN.add(keras.layers.Reshape((326, 211)))

ngram_NN.add(keras.layers.Convolution1D(128, (4), strides=(2), activation='relu'))#, padding="causal"))
ngram_NN.add(keras.layers.MaxPool1D(2))

ngram_NN.add(keras.layers.Convolution1D(64,  (4), strides=(2), activation='relu'))#, padding="causal"))
ngram_NN.add(keras.layers.MaxPool1D(2))

ngram_NN.add(keras.layers.Flatten())

ngram_NN.add(keras.layers.Dense(512, activation='relu'))
ngram_NN.add(keras.layers.Dense(128,  activation='relu'))
ngram_NN.add(keras.layers.Dense(32,  activation='relu'))
ngram_NN.add(keras.layers.Dense(8,  activation='relu'))



ngram_NN.add(keras.layers.Dropout(.5))


ngram_NN.add(keras.layers.Dense(2, activation='softmax'))
ngram_NN.output_shape

(None, 2)

In [13]:
callbacks = [
    keras.callbacks.ModelCheckpoint("save_at_{epoch}.h5"),
]

In [31]:
ngram_NN.compile( metrics=["accuracy"], 
optimizer = keras.optimizers.Adam(learning_rate=1e-3), 
loss="categorical_crossentropy")

In [32]:
ngram_NN.fit(n_gram_onehot, labelsOneHot, epochs=25, callbacks=callbacks)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1f64ff5bcd0>