# Model 4: BERT

In [1]:
# We will use the official tokenization script created by the Google team.
# The file is saved next to the notebook so that `import tokenization` works.
# NOTE(review): the URL tracks the `master` branch, so the downloaded file is
# unpinned and can change or move upstream — consider pinning a tag or commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

#from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import math, os, re, time, random, string

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tokenization
from collections import defaultdict
import wordcloud
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Every text is tokenized with ``tokenizer.tokenize``, cut down so that the
    ``[CLS]`` / ``[SEP]`` markers still fit inside ``max_len``, converted to
    ids with ``tokenizer.convert_tokens_to_ids`` and right-padded with zeros.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: length of each encoded row, special tokens included.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays with one
        row per text. ``masks`` holds 1 for real tokens and 0 for padding;
        ``segment_ids`` is all zeros.
    """
    id_rows, mask_rows, segment_rows = [], [], []
    # Two slots are reserved for the [CLS] and [SEP] markers.
    body_budget = max_len - 2

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        id_rows.append(tokenizer.convert_tokens_to_ids(sequence) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [16]:
def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier on top of a BERT layer.

    Args:
        bert_layer: callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs, the second being the per-token sequence output.
        max_len: length of each of the three integer input sequences.

    Returns:
        A compiled ``Model`` with three int32 inputs of shape ``(max_len,)``
        and a single sigmoid output, trained with binary cross-entropy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # NOTE(review): assumes the hub module returns (pooled_output,
    # sequence_output) in that order — confirm for the module version in use.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # The vector at position 0 (the [CLS] slot) represents the whole sequence.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras optimizers.
    model.compile(Adam(learning_rate=2e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [21]:
# Download BERT from TensorFlow Hub; trainable=True marks the layer's weights
# as trainable so they are updated together with the classifier head.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

# Read the CSV files (paths are relative to the working directory).
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

# Build the tokenizer from the vocabulary file and lower-casing flag that the
# BERT layer itself exposes, so tokenization matches the downloaded weights.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)


# Encode the text into tokens, masks, and segment flags (rows of length 160).
# NOTE(review): in file order this encodes the raw text, before the
# "Text preprocessing" cells rewrite train['text'] / test['text']; no visible
# cell re-encodes afterwards, so the cleaned text never reaches the model.
# Confirm whether that is intended.
train_input = bert_encode(train.text.values, tokenizer, max_len=160)
test_input = bert_encode(test.text.values, tokenizer, max_len=160)
train_labels = train.target.values

In [6]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


# Text preprocessing

In [22]:
def remove_punctuation(x):
    """Return ``x`` with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return x.translate(drop_table)

def remove_stopwords(x):
    """Drop whitespace-separated tokens of ``x`` found in ``wordcloud.STOPWORDS``.

    The surviving tokens are re-joined with single spaces. Matching is exact
    and case-sensitive, so callers are expected to normalise case beforehand.
    """
    kept = []
    for token in x.split():
        if token not in wordcloud.STOPWORDS:
            kept.append(token)
    return ' '.join(kept)

def remove_non_alphabet(x):
    """Keep only the whitespace-separated tokens of ``x`` that are fully alphabetic.

    A token containing any digit, punctuation mark or other non-letter is
    dropped as a whole; survivors are joined with single spaces.
    """
    return ' '.join(filter(str.isalpha, x.split()))

#def strip_all_entities(x):
#    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split())

def remove_hashtag(x):
    """Split ``x`` on ``#`` and ``_`` and re-join the trimmed pieces with spaces.

    Empty pieces are kept, so a leading ``#`` yields a leading space.
    """
    pieces = re.split('#|_', x)
    trimmed = [piece.strip() for piece in pieces]
    return " ".join(trimmed)

def remove_URL(text):
    """Delete every ``http://`` / ``https://`` / ``www.`` URL from ``text``.

    A URL extends up to the next whitespace character; the surrounding
    whitespace itself is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


In [23]:
# Clean the training tweets in place: lowercase -> strip URLs -> delete
# punctuation -> drop stop words -> keep purely alphabetic tokens.
# NOTE(review): train_input is built from train.text in an earlier cell, so
# this cleaning only reaches the model if the text is re-encoded afterwards.
train['text'] = train['text'].str.lower()
# A single URL pass through the shared remove_URL helper replaces the two
# inline regex passes; the second one used the mistyped pattern `http?://`
# (optional "p") and only duplicated the first.
train['text'] = train['text'].apply(remove_URL)
train['text'] = train['text'].apply(remove_punctuation)
train['text'] = train['text'].apply(remove_stopwords)
train['text'] = train['text'].apply(remove_non_alphabet)
# NOTE(review): '#' and '_' are already gone at this point (both belong to
# string.punctuation), so remove_hashtag changes nothing here. Move it before
# remove_punctuation if splitting on '#'/'_' is actually wanted. The former
# trailing remove_URL call was dropped for the same reason: after punctuation
# removal no '://' or 'www.' can remain.
train['text'] = train['text'].apply(remove_hashtag)

In [24]:
# TEST DATA
# Clean the test tweets in place with the same steps as the training text:
# lowercase -> strip URLs -> delete punctuation -> drop stop words -> keep
# purely alphabetic tokens.
# NOTE(review): test_input is built from test.text in an earlier cell, so
# this cleaning only reaches the model if the text is re-encoded afterwards.
test['text'] = test['text'].str.lower()
# A single URL pass through the shared remove_URL helper replaces the two
# inline regex passes; the second one used the mistyped pattern `http?://`
# (optional "p") and only duplicated the first.
test['text'] = test['text'].apply(remove_URL)
test['text'] = test['text'].apply(remove_punctuation)
test['text'] = test['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_non_alphabet)
# Fixed copy-paste bug: the closing steps used to be applied to `train`
# instead of `test`.
# NOTE(review): '#' and '_' are already gone at this point (both belong to
# string.punctuation), so remove_hashtag changes nothing here. Move it before
# remove_punctuation if splitting on '#'/'_' is actually wanted. The former
# trailing remove_URL call was dropped for the same reason.
test['text'] = test['text'].apply(remove_hashtag)

# Model: Build, Train, Predict

In [25]:
# Build the classifier on top of the hub BERT layer; max_len=160 matches the
# length passed to bert_encode for train_input / test_input.
model = build_model(bert_layer, max_len=160)

In [17]:
# Write model.h5 only when val_loss improves, so the file ends up holding the
# best epoch seen during training.
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)
# 20% of the training rows are held out for validation (validation_split=0.2).
# NOTE(review): the saved output below reports "Epoch 1/2" although this cell
# requests epochs=3 — the output looks stale; re-run to refresh it.
train_history = model.fit(train_input, train_labels, validation_split=0.2,
    epochs=3, callbacks=[checkpoint], batch_size=16)

Train on 6090 samples, validate on 1523 samples
Epoch 1/2
Epoch 2/2


In [18]:
# Restore the weights saved by the ModelCheckpoint callback during training.
model.load_weights('model.h5')

# The model emits sigmoid scores; rounding turns them into hard 0/1 labels.
# (The former `test_pred_BERT = pd.DataFrame()` placeholder was removed: it
# was overwritten on the very next line.)
test_pred_BERT = model.predict(test_input)
test_pred_BERT_int = test_pred_BERT.round().astype('int')

# Predictions on the data the model was fitted on (train_input also contains
# the rows held out by validation_split).
train_pred_BERT = model.predict(train_input)
train_pred_BERT_int = train_pred_BERT.round().astype('int')

In [20]:
# Print the last 50 training texts, each followed by the corresponding entry
# of the last 50 values of submission_bert['target'].
# NOTE(review): `submission_bert` is not defined in any cell visible in this
# notebook, so this fails on a fresh kernel. The texts also come from `train`:
# if submission_bert holds test-set predictions the pairs are misaligned
# (test.text may be what was intended) — confirm.
for i, j in zip(train.text.values[-50:],submission_bert['target'][-50:]):
    print(i)
    print(j)

wrecked stomach help
0
ohhmyjoshh stevenrulles gonna thinking gets shit wrecked first day school
0
wrecked tired gonna asleep
0
cramer igers words wrecked disneys stock
0
wrecked emotions
0
riddler best earlyexit primary presidential wannabe certain chances gets wrecked rich guy
0
marynmck thats beyond adorable hope wont wrecked now noticed
0
cramer igers words wrecked disneys stock cnbc topnews
0
caitsroberts see u night wee barra absolutely wrecked
0
kirafrog mountwario wrecked
0
awesome time gettin wrecked bowling last night
1
cramer words wrecked dis stock
0
bright side wrecked
0
wrecked
0
hes gone relax thought wife wrecked cake goner mind lol whoops
0
cameronhacker wrecked
0
three days work theyve pretty much wrecked hahaha shoutout family one
0
fx forex trading cramer igers words wrecked disneys stock
1
engineshed great atmosphere british lion gig tonight hearing wrecked
1
cramer igers words wrecked disneys stock cnbc
1
pic old pkk suicide bomber detonated bomb turkey army trenc

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns


# Showing Confusion Matrix
def plot_cm(y_true, y_pred, title, figsize=(5,5)):
    """Draw an annotated confusion-matrix heatmap.

    Rows are the actual classes and columns the predicted classes, both taken
    from the unique values of ``y_true``. Each cell shows its share of the row
    total as a percentage plus the raw count; diagonal cells also show the row
    total (``count/total``). Off-diagonal cells with a zero count stay blank.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
        title: title placed above the heatmap.
        figsize: ``(width, height)`` of the figure in inches.
    """
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100

    nrows, ncols = cm.shape
    annot_rows = []
    for i in range(nrows):
        # Index the scalar row total: formatting a one-element array with %d
        # relies on NumPy's deprecated array-to-scalar conversion.
        row_total = cm_sum[i, 0]
        row_cells = []
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                row_cells.append('%.1f%%\n%d/%d' % (p, c, row_total))
            elif c == 0:
                row_cells.append('')
            else:
                row_cells.append('%.1f%%\n%d' % (p, c))
        annot_rows.append(row_cells)
    # Building the array from finished strings sizes its dtype to the longest
    # annotation; a pre-allocated fixed-width string array (what astype(str)
    # on an integer array produces) silently truncates longer labels.
    annot = np.array(annot_rows)

    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title(title)
    sns.heatmap(cm, cmap= "YlGnBu", annot=annot, fmt='', ax=ax)

# Showing Confusion Matrix for BERT model
# plot_cm takes (y_true, y_pred): ground-truth labels first, predictions
# second. The two were swapped, which labelled the predictions as "Actual".
plot_cm(train['target'].values, train_pred_BERT_int, 'Confusion matrix for BERT model', figsize=(6,6))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,make_scorer
# These scores are computed on the data the model was fitted on
# (train_pred_BERT_int comes from model.predict(train_input)), so they do not
# estimate performance on unseen data.
# Arguments follow the (y_true, y_pred) order of the scikit-learn API, matching
# the classification_report call below; accuracy itself is unaffected by the order.
print('accuracy score: ',accuracy_score(train["target"].values, train_pred_BERT_int))

print(classification_report(train["target"].values, train_pred_BERT_int))