##### Fine-tuning with BERT

Fine-tune the pre-trained BERT model on the challenge: Real Disaster or Not.

Here the TensorFlow Hub implementation of BERT is used as a trainable Keras layer.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import tokenization

import warnings 
warnings.filterwarnings('ignore')

print(tf.__version__)

2.1.0


##### Load the pre-cleaned data

In [2]:
# Training set: raw text lines and their binary targets.
df_train = pd.read_csv('./data/cleaned_train.csv')
lines, y = df_train['text'].values, df_train['target'].values

# Test set, read from a file that also carries a target column.
df_test = pd.read_csv('./data/perfect_test.csv')
lines_test, y_test = df_test['text'].values, df_test['target'].values

##### TensorFlow Hub BERT layer

In [4]:
# TF Hub handle of the encoder; trainable=True so its weights are fine-tuned too.
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(BERT_HUB_URL,trainable=True)

# Build the tokenizer from the vocabulary file and casing flag exposed by the
# loaded module, so tokenization matches what the encoder was trained with.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

# Helper functions that turn raw text into the three inputs expected by the BERT layer
def get_ids(lines,max_len):
    """Convert tokenized lines into a fixed-width tensor of vocabulary ids.

    Args:
        lines: sequence of token lists (the caller has already added any
            special tokens).
        max_len: number of columns in the returned tensor.

    Returns:
        An int32 tensor with one row per line. Rows shorter than ``max_len``
        are right-padded with 0 (assumed to be the vocabulary's padding id --
        TODO confirm); rows longer than ``max_len`` are clipped to it.
    """
    ixs = []
    for line in lines:
        # Clip before converting: an over-long line would otherwise get a
        # negative pad count, leave the nested list ragged, and make
        # tf.convert_to_tensor raise.
        tmp = tokenizer.convert_tokens_to_ids(line[:max_len])
        tmp = tmp + [0] * (max_len - len(tmp))
        ixs.append(tmp)
    return tf.convert_to_tensor(ixs,dtype=tf.int32)
    
def get_masks(lines,max_len):
    """Build the attention mask matching the padded id tensor.

    Args:
        lines: sequence of token lists.
        max_len: number of columns in the returned tensor.

    Returns:
        An int32 tensor with one row per line: 1 for each real token position
        (at most ``max_len`` of them) followed by 0 for padding positions.
    """
    masks = []
    for line in lines:
        # Cap the number of ones at max_len so an over-long line still yields
        # a row of exactly max_len entries instead of a ragged list.
        real_len = min(len(line),max_len)
        masks.append([1] * real_len + [0] * (max_len - real_len))
    return tf.convert_to_tensor(masks,dtype=tf.int32)

def get_segments(lines,max_len):
    """Return all-zero segment ids for a batch of single-segment inputs.

    Args:
        lines: sequence whose length gives the number of rows.
        max_len: number of columns in the returned tensor.

    Returns:
        An int32 tensor of zeros with shape ``(len(lines), max_len)``.
    """
    # tf.zeros gives the right shape and dtype directly, including for an
    # empty batch, without first materialising a nested Python list.
    return tf.zeros((len(lines),max_len),dtype=tf.int32)
    
def preprocess_bert(batch_lines,max_len=None):
    """Tokenize raw text lines and build the three input tensors for BERT.

    Each line is wordpiece-tokenized and wrapped as ``[CLS] ... [SEP]``.

    Args:
        batch_lines: iterable of raw text strings.
        max_len: fixed sequence length including the two special tokens.
            When ``None``, the length of the longest wrapped line in the
            batch is used and nothing is truncated.

    Returns:
        A tuple ``(ixs, masks, segments)`` of tensors with one row per input
        line and ``max_len`` columns each.

    Raises:
        ValueError: if ``max_len`` is given but smaller than 2, which leaves
            no room for the ``[CLS]`` and ``[SEP]`` tokens.
    """
    if max_len is not None and max_len < 2:
        raise ValueError('max_len must be at least 2 to fit [CLS] and [SEP]')

    # Reserve two positions for the special tokens so an over-long text is
    # truncated *before* wrapping; [SEP] is then never cut off and every row
    # fits in max_len. A slice bound of None keeps the full token list.
    max_tokens = None if max_len is None else max_len - 2
    batch_lines = [['[CLS]'] + tokenizer.tokenize(line)[:max_tokens] + ['[SEP]'] for line in batch_lines]
    if max_len is None:
        max_len = max(map(len,batch_lines))

    ixs = get_ids(batch_lines,max_len)
    masks = get_masks(batch_lines,max_len)
    segments = get_segments(batch_lines,max_len)

    return ixs,masks,segments

In [5]:
# Fixed sequence length passed to preprocess_bert for every split.
max_len = 120

# Hold out 20% of the labelled lines as a dev set; the fixed random_state
# keeps the split identical across runs.
lines_train, lines_dev, y_train, y_dev = train_test_split(
    lines,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_dev = (preprocess_bert(split,max_len) for split in (lines_train,lines_dev))

In [6]:
def fmeasure(y_true,y_pred):
    """Batch-wise F1 score for binary classification, usable as a Keras metric.

    Predictions are clipped to [0, 1] and rounded to hard 0/1 labels before
    counting, so the value follows the usual F1 definition instead of a
    "soft" score computed on raw probabilities.

    Args:
        y_true: tensor of ground-truth labels (0 or 1).
        y_pred: tensor of predicted probabilities.

    Returns:
        A scalar tensor with the F1 score of the given batch; 0 when the
        batch contains no true positives.
    """
    y_true = K.cast(y_true,y_pred.dtype)
    y_pred = K.round(K.clip(y_pred,0,1))

    true_positives = K.sum(y_true * y_pred)
    pred_positives = K.sum(y_pred)
    real_positives = K.sum(y_true)
    precision = true_positives / (pred_positives + K.epsilon())
    recall = true_positives / (real_positives + K.epsilon())
    # Epsilon in the final denominator as well: with zero true positives both
    # precision and recall are 0 and 0/0 would otherwise produce NaN.
    return 2 * precision * recall / (precision + recall + K.epsilon())

In [7]:
# Three integer inputs of length max_len: token ids, attention mask, segment ids.
input_ixs = L.Input(shape=(max_len,),dtype=tf.int32,name='token_ids')
input_masks = L.Input(shape=(max_len,),dtype=tf.int32,name='masks')
input_segments = L.Input(shape=(max_len,),dtype=tf.int32,name='segments')
bert_inputs = [input_ixs,input_masks,input_segments]

pooled_output, seq_output = bert_layer(bert_inputs)

# Classify from the sequence output at position 0 (the [CLS] token) rather
# than from pooled_output.
cls_embedding = seq_output[:,0,:]
output = L.Dense(1,activation='sigmoid')(cls_embedding)

model = Model(inputs=bert_inputs,outputs=output)

# Small learning rate for fine-tuning (range tried: 2e-5 to 5e-5).
optimizer = tf.optimizers.Adam(learning_rate=2e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy',fmeasure],
)

In [8]:
#train at most 4 epochs due to memory limit (observed that more epochs will not improve much)
BATCH_SIZE = 32
EPOCHS = 4

# patience must be smaller than EPOCHS for the callback to have any effect:
# with patience=5 and only 4 epochs it could never trigger, so it neither
# stopped training nor (in TF 2.1) restored the best weights. With patience=1
# training stops at the first epoch whose val_loss fails to improve and the
# best weights seen so far are restored.
es = keras.callbacks.EarlyStopping(monitor='val_loss',patience=1,restore_best_weights=True)
model.fit(X_train,y_train,validation_data=(X_dev,y_dev),epochs=EPOCHS,batch_size=BATCH_SIZE,callbacks=[es])

Train on 5428 samples, validate on 1358 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x24017ee6148>

In [9]:
# Encode the test lines with the same max_len used for the other splits,
# then get the model's predictions for them.
X_test = preprocess_bert(lines_test,max_len=max_len)
pred = model.predict(x=X_test,batch_size=BATCH_SIZE)

In [10]:
# F1 on the test set, with the sigmoid outputs rounded to hard 0/1 labels.
test_f1 = f1_score(y_true=y_test,y_pred=np.round(pred))
print(test_f1)

0.7780898876404494


In [11]:
# Round predictions to integer labels and flatten to one value per test row.
pred_labels = np.round(pred).astype('int').reshape(-1,)
df_sub = pd.DataFrame({'id':df_test.id.values,'target':pred_labels})
df_sub.to_csv('./submissions/bert_fine_tuning.csv',index=False)