In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams

In [10]:
df = pd.read_csv('data/tokenized_clean.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df.replace(to_replace=r"\[|\]|\' ", value='', inplace=True, regex=True)
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this may allah fo...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,people receive evacuation orders in california,1
4,7,,,just got sent this photo from ruby as smoke f...,1


In [11]:
freqs = {}
for i in df['text']:
    tokens = nltk.tokenize.word_tokenize(i, language="english")
    for n in range(1, 4):
        for j in ngrams(tokens, n):
            j = str(j)
            freqs.setdefault(j, 0)
            freqs[j]+=1
freqs

{"('our',)": 99,
 "('deeds',)": 2,
 "('are',)": 401,
 "('the',)": 3263,
 "('reason',)": 20,
 "('of',)": 1823,
 "('this',)": 477,
 "('may',)": 88,
 "('allah',)": 4,
 "('forgive',)": 2,
 "('us',)": 163,
 "('all',)": 257,
 "('our', 'deeds')": 1,
 "('deeds', 'are')": 1,
 "('are', 'the')": 15,
 "('the', 'reason')": 2,
 "('reason', 'of')": 2,
 "('of', 'this')": 9,
 "('this', 'may')": 1,
 "('may', 'allah')": 3,
 "('allah', 'forgive')": 1,
 "('forgive', 'us')": 1,
 "('us', 'all')": 5,
 "('our', 'deeds', 'are')": 1,
 "('deeds', 'are', 'the')": 1,
 "('are', 'the', 'reason')": 1,
 "('the', 'reason', 'of')": 1,
 "('reason', 'of', 'this')": 1,
 "('of', 'this', 'may')": 1,
 "('this', 'may', 'allah')": 1,
 "('may', 'allah', 'forgive')": 1,
 "('allah', 'forgive', 'us')": 1,
 "('forgive', 'us', 'all')": 1,
 "('forest',)": 63,
 "('fire',)": 247,
 "('near',)": 54,
 "('la',)": 25,
 "('ronge',)": 1,
 "('sask',)": 1,
 "('canada',)": 9,
 "('forest', 'fire')": 8,
 "('fire', 'near')": 3,
 "('near', 'la')": 1,


In [12]:
indexer = {}
index = 0
for i in df['text']:
    tokens = nltk.tokenize.word_tokenize(i, language="english")
    for n in range(1, 4):
        for j in ngrams(tokens, n):
            j = str(j)
            if j not in indexer.keys() and freqs[j] > 2:
                indexer[j] = index
                index+=1
indexer

{"('our',)": 0,
 "('are',)": 1,
 "('the',)": 2,
 "('reason',)": 3,
 "('of',)": 4,
 "('this',)": 5,
 "('may',)": 6,
 "('allah',)": 7,
 "('us',)": 8,
 "('all',)": 9,
 "('are', 'the')": 10,
 "('of', 'this')": 11,
 "('may', 'allah')": 12,
 "('us', 'all')": 13,
 "('forest',)": 14,
 "('fire',)": 15,
 "('near',)": 16,
 "('la',)": 17,
 "('canada',)": 18,
 "('forest', 'fire')": 19,
 "('fire', 'near')": 20,
 "('residents',)": 21,
 "('asked',)": 22,
 "('to',)": 23,
 "('shelter',)": 24,
 "('in',)": 25,
 "('place',)": 26,
 "('being',)": 27,
 "('by',)": 28,
 "('officers',)": 29,
 "('no',)": 30,
 "('other',)": 31,
 "('evacuation',)": 32,
 "('or',)": 33,
 "('orders',)": 34,
 "('expected',)": 35,
 "('shelter', 'in')": 36,
 "('in', 'place')": 37,
 "('are', 'being')": 38,
 "('people',)": 39,
 "('california',)": 40,
 "('in', 'california')": 41,
 "('just',)": 42,
 "('got',)": 43,
 "('sent',)": 44,
 "('photo',)": 45,
 "('from',)": 46,
 "('as',)": 47,
 "('smoke',)": 48,
 "('into',)": 49,
 "('a',)": 50,
 "('s

In [13]:
len(indexer)*len(df)*4/10e9

0.0373828752

In [26]:
n_gram_onehot = np.zeros(shape=(len(df), len(indexer)), dtype=np.float32)
n_gram_onehot.shape

(7613, 12276)

In [15]:
import json
with open('indexes.sav', 'w') as f:
    f.write(json.dumps(indexer))

In [16]:
for i, e in enumerate(df['text']):
    tokens = nltk.tokenize.word_tokenize(e, language="english")
    for j in ngrams(tokens, n):
        if j in indexer.keys():
            n_gram_onehot[i][indexer[j]] = 1.0

In [19]:
labelsOneHot = np.zeros((n_gram_onehot.shape[0], 2))
for i, e in enumerate(df['target']):
    labelsOneHot[i][e]=1
labelsOneHot.shape

(7613, 2)

In [20]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

In [21]:
keras.utils.set_random_seed(1234)
ngram_NN = keras.Sequential()
ngram_NN.bias = 0.2
ngram_NN.add(keras.layers.Input(12276))

ngram_NN.add(keras.layers.Reshape((99, 124)))

ngram_NN.add(keras.layers.Convolution1D(128, (4), strides=(2), activation='relu'))
ngram_NN.add(keras.layers.MaxPool1D(2))

ngram_NN.add(keras.layers.Convolution1D(64,  (4), strides=(2), activation='relu'))
ngram_NN.add(keras.layers.MaxPool1D(2))

ngram_NN.add(keras.layers.Flatten())

ngram_NN.add(keras.layers.Dense(512, activation='relu'))
ngram_NN.add(keras.layers.Dense(128,  activation='relu'))
ngram_NN.add(keras.layers.Dense(32,  activation='relu'))
ngram_NN.add(keras.layers.Dense(8,  activation='relu'))



ngram_NN.add(keras.layers.Dropout(.5))


ngram_NN.add(keras.layers.Dense(2, activation='softmax'))
ngram_NN.output_shape

(None, 2)

In [24]:
callbacks = [
    keras.callbacks.ModelCheckpoint("save_at_{epoch}.h5"),
]

In [22]:
ngram_NN.compile( metrics=["accuracy"], 
optimizer = keras.optimizers.Adam(learning_rate=1e-3), 
loss="categorical_crossentropy")

In [27]:
ngram_NN.fit(n_gram_onehot, labelsOneHot, epochs=25, callbacks=callbacks)

Epoch 1/25
Epoch 2/25

KeyboardInterrupt: 