In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from matplotlib import pyplot as plt
import text_hammer as th

## Data loading and preprocessing

#### Data reading
The data comes with the columns 'keyword' and 'location'. However, a quick look into the data shows that there is not much relationship between 'keyword', 'location' and whether or not a disaster happens. Therefore, we will just consider the 'text' column.

In [2]:
df_train = pd.read_csv('data/train.csv', usecols=['text','target'])
df_pred = pd.read_csv('data/test.csv', usecols=['text'])

print('df_train:')
display(df_train.head(5))
print()
print('df_pred:')
display(df_pred.head(5))

df_train:


Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1



df_pred:


Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


#### Data cleanup
We clean the text by the following steps:
1. lowercase all characters, remove '\\' and replace '_' by ' '
2. remove emails, urls
3. remove special characters, accented characters

In [3]:
def clean_up_text(col_data):
    col_data = col_data.apply(lambda x: str(x).lower().replace('\\','')).replace('_',' ')   
    col_data = col_data.apply(lambda x: th.remove_emails(x))
    col_data = col_data.apply(lambda x: th.remove_urls(x))
    col_data = col_data.apply(lambda x: th.remove_special_chars(x))
    col_data = col_data.apply(lambda x: th.remove_accented_chars(x)) 
    return col_data

In [4]:
df_train['text'] = clean_up_text(df_train['text'])
df_pred['text'] = clean_up_text(df_pred['text'])

print('df_train:')
display(df_train.head(5))
print()
print('df_pred:')
display(df_pred.head(5))

df_train:


Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1



df_pred:


Unnamed: 0,text
0,just happened a terrible car crash
1,heard about earthquake is different cities sta...
2,there is a forest fire at spot pond geese are ...
3,apocalypse lighting spokane wildfires
4,typhoon soudelor kills 28 in china and taiwan


#### Match data length for each label

In [5]:
print('Original train data:')
display(df_train['target'].value_counts())
print()

# collect all negative and all positive data
df_n = df_train[df_train['target']==0]
df_p = df_train[df_train['target']==1]

# randomly select from df_n the same number of samples as df_p
df_n = df_n.sample(df_p.shape[0])

# concatenate df_n and df_p together to get a new df_train
df_train = pd.concat([df_n, df_p])

print('Processed train data:')
display(df_train['target'].value_counts())
print()

Original train data:


0    4342
1    3271
Name: target, dtype: int64


Processed train data:


0    3271
1    3271
Name: target, dtype: int64




#### Seperate train, validation and test data_sets

In [6]:
# split train data
x_train, x_val, y_train, y_val = train_test_split(
df_train['text'],df_train['target'],test_size=0.2, stratify=df_train['target'])
# split validation data and test data
x_val, x_test, y_val, y_test = train_test_split(x_val, y_val, test_size=0.5, stratify=y_val)

print('Length of train data =', len(x_train))
print('Length of val data =', len(x_val))
print('Length of test data =', len(x_test))

Length of train data = 5233
Length of val data = 654
Length of test data = 655


In [7]:
# show part of data:
print('Part of x_train:')
display(x_train.head(5))
print('Corresponding labels:')
display(y_train.head(5))
print('-------------------------')
print()
print('Part of x_val:')
display(x_val.head(5))
print('Corresponding labels:')
display(y_val.head(5))
print('-------------------------')
print()
print('Part of x_test:')
display(x_test.head(5))
print('Corresponding labels:')
display(y_test.head(5))

Part of x_train:


5031    lolly_knickers its a mudslide its like chewing...
4591    judson1360 xtra1360 oline and pass rush rest o...
3122    seriously look like a get electrocuted after i...
2866    tips so that finding the customers ego drought...
2691    ignition knock detonation sensorsenso fits 010...
Name: text, dtype: object

Corresponding labels:


5031    1
4591    0
3122    0
2866    0
2691    1
Name: target, dtype: int64

-------------------------

Part of x_val:


4619    follownflnews michael floyds hand injury shoul...
7432    small bag from the bottom the wounded hero sha...
4062    climate consequences us forest service says sp...
5708    video were picking up bodies from water rescue...
5344    if she dont know bout that pandemonium album s...
Name: text, dtype: object

Corresponding labels:


4619    0
7432    0
4062    1
5708    1
5344    0
Name: target, dtype: int64

-------------------------

Part of x_test:


2101    i tell my cousins i dont wanna hang out and th...
4113      adriasimon_ hailstorm day 2 round2 yyc yycstorm
3929    1 pair new 27w 4 round led work driving flood ...
2552    just made anthonys bed considering i destroy i...
3851    my gang walking round with them brown flames a...
Name: text, dtype: object

Corresponding labels:


2101    0
4113    1
3929    0
2552    1
3851    0
Name: target, dtype: int64

## Model setup and machine learning

#### Model setup
We choose Bert model as out natural language embedding tool

In [8]:
#Load the Bert preprocessor and encoder
preprocessor = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [9]:
# Model definition

# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
encoder_input = preprocessor(text_input)
embeddings = encoder(encoder_input)

# neural network layers
layer = tf.keras.layers.Dropout(0.1, name='dropout_1')(embeddings['pooled_output'])
layer = tf.keras.layers.Dense(128, activation='relu', name='dense_1')(layer)
layer = tf.keras.layers.Dropout(0.1, name='dropout_2')(layer)
layer = tf.keras.layers.Dense(32, activation='relu', name='dense_2')(layer)
output = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(layer)

# Construct the model
model = tf.keras.Model(inputs=[text_input], outputs=[output])

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [10]:
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]

model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = METRICS
)

In [11]:
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_data=(x_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2287c3fcc10>

#### Run data_test

In [12]:
scores = model.evaluate(x_test, y_test)



#### Model save

In [13]:
model.save('NLP_disaster_tweets_v1.h5')