In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import string

In [2]:
# Load the training and test CSV files from the current working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
def clean_string(dataframe, field, targets=None, is_train=True):
  """Turn a text column into a list of cleaned, non-empty strings.

  Each ASCII punctuation character (``string.punctuation``) is replaced by a
  space, the text is lower-cased and surrounding whitespace is stripped.
  Rows that are missing, or that are empty after cleaning, are dropped along
  with their entry in ``targets``.

  Args:
    dataframe: pandas DataFrame containing the text column. It is read only;
      the cleaned text is NOT written back to it.
    field: Name of the column to clean.
    targets: Optional sequence indexed by row position (0..len-1). When given,
      the entries belonging to the kept rows are collected in order.
    is_train: If True return ``(lines, target)``; otherwise return ``lines``.

  Returns:
    ``(lines, target)`` when ``is_train`` is True, where ``target`` is an empty
    list if ``targets`` was None; otherwise just ``lines``.
  """
  # A translation table replaces punctuation without any regex. This does not
  # depend on the pandas default for `regex=` in `str.replace`, and it also
  # covers the backslash, which an unescaped "[...]" character class misses
  # because "\]" is consumed as an escape sequence.
  punctuation_to_space = str.maketrans(
      string.punctuation, " " * len(string.punctuation))
  cleaned = dataframe[field].str.translate(punctuation_to_space).str.lower()

  lines = []
  target = []

  for idx, line in enumerate(cleaned.to_numpy()):
    # Missing values come back as NaN/None from the .str accessor; skip them
    # instead of failing on `.strip()`.
    if not isinstance(line, str):
      continue

    # Strip first so rows made only of whitespace/punctuation are dropped too.
    line = line.strip()
    if not line:
      continue

    lines.append(line)
    if targets is not None:
      target.append(targets[idx])

  if is_train:
    return lines, target

  return lines

In [5]:
# Extract the labels first: `train` is rebound to the cleaned text list below.
labels = train["target"].to_numpy()

train, labels = clean_string(dataframe=train, field="text", targets=labels)
test = clean_string(dataframe=test, field="text", is_train=False)

  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
# Sanity check: every kept text must still have exactly one label.
len(labels) == len(train)

True

In [7]:
# Show the first ten cleaned test sentences.
test[0:10]

['just happened a terrible car crash',
 'heard about  earthquake is different cities  stay safe everyone',
 'there is a forest fire at spot pond  geese are fleeing across the street  i cannot save them all',
 'apocalypse lighting   spokane  wildfires',
 'typhoon soudelor kills 28 in china and taiwan',
 'we re shaking   it s an earthquake',
 'they d probably still show more life than arsenal did yesterday  eh  eh',
 'hey  how are you',
 'what a nice hat',
 'fuck off']

In [8]:
def train_val_split(data, target, train_size=0.8, seed=None):
  """Randomly split aligned examples and labels into train and validation parts.

  Args:
    data: Sequence (or tensor) of examples.
    target: Labels aligned with ``data``; must have the same length.
    train_size: Fraction of the examples assigned to the training part.
      Must lie in the closed interval [0, 1].
    seed: Optional operation-level seed forwarded to ``tf.random.shuffle``.
      The default (None) keeps the original unseeded behaviour. For a fully
      reproducible split also call ``tf.random.set_seed`` beforehand.

  Returns:
    ``(x_train, y_train), (x_val, y_val)`` produced with ``tf.gather``.

  Raises:
    ValueError: If ``data`` and ``target`` differ in length, or ``train_size``
      is outside [0, 1].
  """
  if len(data) != len(target):
    raise ValueError(
        "data and target must have the same length, got {} and {}".format(
            len(data), len(target)))
  if not 0.0 <= train_size <= 1.0:
    raise ValueError(
        "train_size must be between 0 and 1, got {}".format(train_size))

  train_len = int(len(data) * train_size)

  # One shared permutation keeps examples and labels aligned after the split.
  index = tf.random.shuffle(tf.range(len(data)), seed=seed)
  train_index = index[:train_len]
  val_index = index[train_len:]

  x_train = tf.gather(data, train_index)
  y_train = tf.gather(target, train_index)

  x_val = tf.gather(data, val_index)
  y_val = tf.gather(target, val_index)

  return (x_train, y_train), (x_val, y_val)

In [9]:
# Default 80/20 split; each result is an (examples, labels) tuple.
train, val = train_val_split(data=train, target=labels)

In [10]:
# Slice each (texts, labels) tuple so every dataset element is one (text, label) pair.
train = tf.data.Dataset.from_tensor_slices(tensors=train)
val = tf.data.Dataset.from_tensor_slices(tensors=val)

In [11]:
# Peek at a single (text, label) element of the training dataset.
for X, Y in train.take(count=1):
  print(X, Y, sep="\n")

tf.Tensor(b'wrecked tired but not gonna be asleep before 3', shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int32)


In [12]:
MAX_SEQUENCE = 250   # length of every vectorized sequence
VOCABSIZE = 10_000   # cap on the vocabulary learned by the vectorizer

# Maps raw strings to fixed-length sequences of integer token ids.
vectorizer = tf.keras.layers.TextVectorization(
    output_sequence_length=MAX_SEQUENCE,
    output_mode="int",
    max_tokens=VOCABSIZE,
)

In [13]:
# Build the vocabulary from the training texts only; labels are dropped here.
text = train.map(lambda sentence, _label: sentence)
vectorizer.adapt(text)

In [14]:
def vectorize_text(text, labels):
  """Run `text` through the module-level `vectorizer`; pass `labels` through unchanged."""
  token_ids = vectorizer(text)
  return token_ids, labels

In [15]:
# `tf.data.AUTOTUNE` is the stable name; `tf.data.experimental.AUTOTUNE` is a
# deprecated alias for the same constant.
AUTOTUNE = tf.data.AUTOTUNE

# Convert every (text, label) element into (token ids, label).
train = train.map(vectorize_text, num_parallel_calls=AUTOTUNE)
val = val.map(vectorize_text, num_parallel_calls=AUTOTUNE)

In [16]:
# Inspect one vectorized element: shapes first, then the values.
for X, Y in train.take(count=1):
  print(X.shape, Y.shape, X, Y, sep="\n")

(250,)
()
tf.Tensor(
[ 449 1390   34   38  365   29 2992  236   83    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0 

In [17]:
BATCH_SIZE = 64

# Training data is cached, shuffled through a 1000-element buffer (reshuffled
# each epoch by default), then batched and prefetched.
train = train.cache().shuffle(1000).batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Validation data is not shuffled: the loss and accuracy are averaged over the
# whole set, so order does not affect them, and skipping the shuffle avoids the
# buffer work and keeps the evaluation order deterministic.
val = val.cache().batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [18]:
# Confirm the batched shapes of one (features, labels) batch.
for line, label in train.take(count=1):
    print(line.shape, label.shape, sep="\n")

(64, 250)
(64,)


In [57]:
# Model: embedding -> two stacked LSTMs -> one output unit, with dropout
# after each stage.
embedding_dim = 64
l2 = tf.keras.regularizers.l2(0.1)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(
    input_dim=VOCABSIZE,
    output_dim=embedding_dim,
    mask_zero=True,  # token id 0 is treated as padding and masked downstream
    input_length=MAX_SEQUENCE))
model.add(tf.keras.layers.Dropout(0.5))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(tf.keras.layers.LSTM(50, return_sequences=True, kernel_regularizer=l2))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.LSTM(50, kernel_regularizer=l2))
model.add(tf.keras.layers.Dropout(0.5))
# No activation on the final layer: the model emits one raw value per example.
model.add(tf.keras.layers.Dense(1))

In [58]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 250, 64)           640000    
                                                                 
 dropout_19 (Dropout)        (None, 250, 64)           0         
                                                                 
 lstm_12 (LSTM)              (None, 250, 50)           23000     
                                                                 
 dropout_20 (Dropout)        (None, 250, 50)           0         
                                                                 
 lstm_13 (LSTM)              (None, 50)                20200     
                                                                 
 dropout_21 (Dropout)        (None, 50)                0         
                                                                 
 dense_16 (Dense)            (None, 1)                

In [59]:
# The final Dense layer has no activation, so the model outputs logits.
# The loss is told so via `from_logits=True`; the accuracy metric must use the
# matching decision boundary of 0.0 (logit 0 == probability 0.5). The string
# 'accuracy' would apply the default 0.5 threshold directly to the logits.
# `name='accuracy'` keeps the history keys 'accuracy' / 'val_accuracy'.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [60]:
# Stop once the monitored quantity (val_loss by default) has not improved for
# 5 consecutive epochs.
earlystopping = tf.keras.callbacks.EarlyStopping(patience=5)

# The keyword is `save_best_only`; `save_only_best` is not a ModelCheckpoint
# argument. With the correct name the file is only overwritten when the
# monitored quantity improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint("disaster.h5", save_best_only=True)

In [61]:
# Train for at most 10 epochs; the callbacks handle early stopping and checkpointing.
history = model.fit(
    train,
    validation_data=val,
    epochs=10,
    callbacks=[earlystopping, checkpoint],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
