In [0]:
try:
  !pip install tensorflow-gpu==2.1.0
except:
  print("Couldn't install tensorflow-gpu 2.1.0")

import tensorflow as tf
import numpy as np
import pandas as pd
import string
from sklearn.utils import shuffle

In [0]:
# Download the train and test CSV files (skipped when they already exist locally)
import os

if(os.path.isfile('test.csv') == False): 
  !wget https://raw.githubusercontent.com/Mathisco-01/disaster-tweet-NLP/master/test.csv
else:
  print("test.csv already exists")

if(os.path.isfile('train.csv') == False): 
  !wget https://raw.githubusercontent.com/Mathisco-01/disaster-tweet-NLP/master/train.csv
else:
  print("train.csv already exists")

In [0]:
# Read both splits from the working directory (train first, then test).
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# ASCII punctuation followed by the digit characters '0'..'9'.
# NOTE(review): nothing in the visible notebook reads this list - confirm
# whether it was meant to be applied during text cleaning.
chars_to_remove = [*string.punctuation, *string.digits]

def remove_garbage(text):
  """Normalise one tweet: collapse URLs to "http" and lowercase everything.

  The text is split on single spaces. Any piece containing the (case-sensitive)
  substring "http" is replaced by the literal word "http"; every other piece is
  lowercased. Each piece is followed by one space in the result, so the
  returned string always ends with a trailing space.

  NOTE(review): punctuation and digits are kept as-is; the module-level
  `chars_to_remove` list is not consulted here. Confirm whether filtering was
  intended - enabling it would change the vocabulary built later on.

  Args:
    text: the raw tweet string.

  Returns:
    The cleaned string (pieces joined back with a space after each one).
  """
  cleaned_pieces = []
  for piece in text.split(" "):
    # The URL test runs on the original casing, before lowercasing.
    if "http" in piece:
      piece = "http"
    # Lowercase character by character, exactly like the original loop.
    cleaned_pieces.append("".join(ch.lower() for ch in piece))

  # One space after every piece, including the last one.
  return "".join(piece + " " for piece in cleaned_pieces)

# Clean the tweet text of both splits (overwrites the 'text' column).
train['text'] = train['text'].apply(remove_garbage)
test['text'] = test['text'].apply(remove_garbage)

# Materialise the frames as arrays once; `.values` on a mixed-dtype frame
# builds a fresh object array on every access.
train_values = train.values
test_values = test.values

# Columns are addressed by position: index 3 feeds the tokenizer and index 4
# is used as the label (presumably the text / target columns - verify against
# the CSV header).
train_X = train_values[:, 3]
# Take column 4 as a 1-D slice and convert in bulk. Calling float() on the
# 1-element rows of a 2-D slice relies on array-to-scalar conversion, which
# NumPy has deprecated.
train_Y = train_values[:, 4].astype(float).tolist()

# Fixed seed so the row order (and therefore the token ids assigned later and
# the validation split) is the same on every run.
SHUFFLE_SEED = 42
train_X, train_Y = shuffle(train_X, train_Y, random_state=SHUFFLE_SEED)

test_X = test_values[:, 3]

# Quick sanity peek at each array.
print("train_X len: ({},)".format(len(train_X)))
print(train_X[:5])
print("\n")

print("train_Y len: ({},)".format(len(train_Y)))
print(train_Y[:5])
print("\n")

print("test_X len: ({},)".format(len(test_X)))
print(test_X[:5])
print("\n")

In [0]:
sentence_length = 15 #maximum sentence length, shorter sentences will be padded
vocab_cutoff = 10 #minimum frequency of a word before it becomes a valid token

# Pass 1: count every space-separated word in the training texts.
# A plain dict keeps words in first-seen order, which fixes the id order below.
word_counts = {}
for text in train_X:
  for word in text.split(" "):
    word_counts[word] = word_counts.get(word, 0) + 1

# Pass 2: assign ids to the words that occur at least `vocab_cutoff` times.
# Id 0 is reserved for "<oot>", the fallback for any word outside the
# vocabulary; it is never reassigned even if "<oot>" appears in the data.
tokens = {"<oot>": 0}
tokens_to_words = {0: "<oot>"}
tokens_frequency = {0: word_counts.get("<oot>", 0)}
for word, count in word_counts.items():
  if word == "<oot>" or count < vocab_cutoff:
    continue
  token_id = len(tokens)
  tokens[word] = token_id
  # Keep the reverse map and the frequency table keyed by the FINAL ids so
  # they stay consistent with `tokens` after the cutoff.
  tokens_to_words[token_id] = word
  tokens_frequency[token_id] = count

# Vocabulary size including the reserved id 0.
tokens_length = len(tokens)

def tokenize_dataset(dataset, vocab=None, max_len=None):
  """Convert texts into fixed-length rows of token ids.

  Each string is split on single spaces (the same rule used when the
  vocabulary is built) and every word is looked up in the vocabulary; unknown
  words become 0. Rows longer than `max_len` are truncated, shorter rows are
  right-padded with 0.

  Previously the loop iterated over the string itself, i.e. over its
  characters, so only one-character vocabulary entries could ever match.

  Args:
    dataset: sequence of texts. Items that are not `str` are treated as
      already split into words and iterated as-is.
    vocab: word -> id mapping. Defaults to the module-level `tokens`.
    max_len: row length. Defaults to the module-level `sentence_length`.

  Returns:
    np.ndarray built from the list of rows, one row of `max_len` ids per item.
  """
  if vocab is None:
    vocab = tokens
  if max_len is None:
    max_len = sentence_length

  ds = []
  for text in dataset:
    words = text.split(" ") if isinstance(text, str) else text
    # Unknown words fall back to id 0 (oot); slice handles over-long rows.
    sentence = [vocab.get(word, 0) for word in words][:max_len]
    # Right-pad with 0 up to the fixed length (no-op when already full).
    sentence += [0] * (max_len - len(sentence))
    ds.append(sentence)

  return np.array(ds)

# Applying tokenization on datasets.
# Note: this rebinds train_X / test_X from raw strings to id arrays, so the
# cell should be run once per kernel session.
train_X = tokenize_dataset(train_X)
test_X = tokenize_dataset(test_X)

print("train_X shape: {}".format(train_X.shape))
print(train_X[:3])
print("\n")

# Report the shape of test_X itself (this line used to print train_X.shape).
print("test_X shape: {}".format(test_X.shape))
print(test_X[:3])
print("\n")

In [0]:
# Build the classifier layer by layer: embedding -> two stacked bidirectional
# LSTMs -> two dense blocks with dropout -> single sigmoid output.
model = tf.keras.Sequential()

# One 100-dimensional vector per token id; inputs are rows of `sentence_length` ids.
model.add(tf.keras.layers.Embedding(tokens_length, 100, input_length=sentence_length))

# The first recurrent layer returns the full sequence so the second can consume it.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50)))

# Dense head: dropout precedes each fully connected layer.
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(1024, activation='relu'))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

# Binary target -> binary cross-entropy; accuracy is tracked during training.
model.compile(
    optimizer='RMSprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [0]:
#
# LOAD THE PRE-TRAINED WEIGHTS
#

if(os.path.isfile('model.h5') == False): 
  !wget https://raw.githubusercontent.com/Mathisco-01/disaster-tweet-NLP/master/model.h5
  
model.load_weights("model.h5")

In [0]:
#
# OR TRAIN THEM YOURSELF!!
#

# Make sure features and labels are NumPy arrays before handing them to Keras.
train_X, train_Y = np.asarray(train_X), np.asarray(train_Y)

# 20 epochs, with 10% of the data held out for validation.
history = model.fit(
    train_X,
    train_Y,
    epochs=20,
    validation_split=0.1,
)

In [0]:
# Persist the model's current weights to model.h5.
# The cell previously started with an unfinished `if` statement, which is a
# SyntaxError and prevented it from running at all.
# TODO(review): the intended condition is unknown - add it back if saving
# should be conditional (e.g. only after training).
model.save_weights("model.h5")

In [0]:
import csv

preds = model.predict(test_X)

# Round every prediction to 0/1 in one vectorised step; the first (only used)
# output column is taken for each row.
labels = np.round(preds)[:, 0].astype(int)

# Read the first column of the test frame once instead of rebuilding
# `test.values` on every loop iteration.
row_ids = test.values[:, 0]

# newline="" lets the csv module control line endings; without it some
# platforms write an extra blank line after every row.
with open("testfile.csv", "w", newline="") as testfile:
  filewriter = csv.writer(testfile)
  filewriter.writerow(['id', 'target'])
  for row_id, label in zip(row_ids, labels):
    filewriter.writerow([row_id, str(label)])

# Offer the file as a browser download when running inside Google Colab.
try:
  from google.colab import files
except ImportError:
  print("Couldn't download testfile. You're probably not using a google colab notebook")
else:
  # Still best-effort, but report the real reason instead of blaming the
  # environment for every kind of failure.
  try:
    files.download('testfile.csv')
  except Exception as err:
    print("Couldn't download testfile: {}".format(err))
  