
Sarcasm Detection: BERT (fine-tuned on the `fake-news` dataset)

In [None]:
# Download the BERT tokenization script published in the tensorflow/models repository
# so that `import tokenization` resolves in the imports cell below.
# NOTE(review): the URL tracks the `master` branch, so the fetched file can change or
# move between runs — consider pinning a tag or commit hash.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 7.7MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from sklearn import model_selection
from sklearn import metrics
import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

import tokenization

In [None]:
def bert_encode(texts, tokenizer, max_len=160):
    """Encode raw strings into three fixed-width integer arrays for a BERT layer.

    Every text is tokenized with ``tokenizer.tokenize``, truncated so that it fits
    together with the ``[CLS]`` and ``[SEP]`` markers, converted to ids and
    right-padded with ``0`` up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` (returning a list of tokens)
            and ``convert_tokens_to_ids(tokens)``.
        max_len: Width of every encoded row, including the two special tokens.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of shape
        ``(len(texts), max_len)``. ``masks`` is 1 on real tokens and 0 on padding;
        ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because a row could then not
            hold both special tokens and rows would come out with unequal lengths.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to fit [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the [CLS] / [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))

        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # Explicit dtype + reshape keep the result a 2-D integer array even when
        # `texts` is empty (a bare np.array([]) would be float with shape (0,)).
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [None]:
def build_model(bert_layer, max_len=160, learning_rate=2e-6):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Args:
        bert_layer: Callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns two outputs;
            the second one is indexed here as ``(batch, position, hidden)``.
        max_len: Sequence length of each of the three integer inputs.
        learning_rate: Step size passed to the Adam optimizer.

    Returns:
        A compiled ``Model`` with a single sigmoid output unit, trained with
        binary cross-entropy and reporting accuracy, precision, recall and
        true-positive count.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 of every sequence is the "[CLS]" slot that bert_encode prepends.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(
        # `learning_rate` replaces the deprecated `lr` keyword.
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        # Metrics come from tf.keras so they belong to the same Keras
        # implementation as the model itself.
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
            tf.keras.metrics.TruePositives(),
        ],
    )

    return model

In [None]:
%%time
# TF Hub handle of the BERT module to fine-tune (uncased English, L-24 / H-1024 / A-16
# per the handle name).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# trainable=True: the BERT weights are updated during training, not frozen.
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 22.9 s, sys: 4.54 s, total: 27.5 s
Wall time: 34.6 s


In [None]:
# Mount Google Drive at /content/drive so the dataset files can be read from it.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the labelled data, expose the `text` column under the name `content`,
# and discard every row that has a missing value in any column.
cod_train = (
    pd.read_csv("/content/drive/MyDrive/Colab Notebooks/fake-news/train")
    .rename(columns={'text': 'content'})
    .dropna()
)

# No separate test file is read: 20% of the rows are held out as the test set,
# then 20% of the remainder becomes the validation set. The fixed random_state
# keeps both splits reproducible.
cod_train2, test = train_test_split(cod_train, test_size=0.2, random_state=0)
train, val = train_test_split(cod_train2, test_size=0.2, random_state=0)

In [None]:

# Build the tokenizer from the vocabulary file and lower-casing flag exposed by the
# loaded hub module, so tokenization uses the module's own vocabulary and casing.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Preview the first rows of the training split.
train.head()


Unnamed: 0,id,title,author,content,label
11904,11904,Il Regno Unito riprende l’addestramento degli ...,Rachele Marmetti,Il Regno Unito riprende l’addestramento degli ...,1
1512,1512,Beyond Voting: the Limits of Electoral Politics,Ken Knabb,(4) Representative democracy \n(5) Overt minor...,1
10790,10790,"Immunotherapy Offers Hope to a Cancer Patient,...",Matt Richtel,DENVER — A cancer patient nicknamed the Ste...,0
122,122,“Chapo Trap House”: New Left-Wing Podcast is a...,Eric Striker,“Chapo Trap House”: New Left-Wing Podcast is a...,1
8779,8779,Trump Budget Director Mulveney: We Are ‘Dead S...,Charlie Spiering,President Donald Trump’s budget director Mick ...,0


In [None]:

# Encode the `content` column of each split into (token ids, masks, segment ids),
# all at a fixed width of 160.
train_input, test_input, val_input = (
    bert_encode(frame.content.values, tokenizer, max_len=160)
    for frame in (train, test, val)
)

# Target vectors, in the same row order as the encoded inputs.
train_labels, test_labels, val_labels = (
    frame.label.values for frame in (train, test, val)
)

In [None]:
# Build the classifier on top of the loaded BERT layer; max_len must match the width
# used when encoding the inputs (160). Then print the layer-by-layer summary.
model = build_model(bert_layer, max_len = 160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [None]:
# Keep only the best model: the checkpoint file is overwritten at the end of an epoch
# only when validation accuracy improves. The metric is compiled as 'accuracy', so
# Keras logs it as 'val_accuracy'; the previous 'val_acc' key is never reported and
# the checkpoint would never be written. The deprecated `period` argument is dropped
# (checking once per epoch is the default).
saveBestModel = ModelCheckpoint('best_model.hdf5', monitor='val_accuracy', verbose=0, save_best_only=True, save_weights_only=False, mode='max')
# Stop training once validation loss has failed to improve for 3 consecutive epochs.
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')





In [None]:
# Fine-tune for at most 8 epochs with mini-batches of 10, validating on the held-out
# validation split after every epoch; the callbacks checkpoint the best model and
# stop early when validation loss stalls.
train_history = model.fit(
    train_input, train_labels,
    validation_data=(val_input, val_labels),
    epochs=8,
    batch_size=10,
    callbacks=[saveBestModel, earlyStopping]
)
# Backward-compatible alias for the original, misspelled variable name.
rain_history = train_history

#model.save('model.h5')

Epoch 1/8




Epoch 2/8




Epoch 3/8




Epoch 4/8




Epoch 5/8




In [None]:
# Score the test split and round the sigmoid outputs to hard integer class labels
# (the array keeps the shape returned by model.predict).
test_pred = model.predict(test_input).round().astype(int)


In [None]:
# Threshold-based metrics are computed from the hard 0/1 predictions.
recall = metrics.recall_score(test_labels,test_pred)
precision = metrics.precision_score(test_labels,test_pred)
f1_score = metrics.f1_score(test_labels,test_pred)
accuracy = metrics.accuracy_score(test_labels,test_pred)
# Log loss must be scored on predicted probabilities: on rounded labels it only
# counts errors multiplied by the clipping penalty. `test_pred` holds rounded labels,
# so the probabilities are recomputed here (one extra inference pass).
test_prob = model.predict(test_input).ravel()
loss = metrics.log_loss(test_labels,test_prob)

In [None]:
# Report the held-out test metrics, one per line.
print(
    f'Loss: {loss}',
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'f1 score: {f1_score}',
    sep='\n',
)

Loss: 0.4250102622134122
Accuracy: 0.9876948318293683
Precision: 0.9863861386138614
Recall: 0.9857761286332715
f1 score: 0.9860810392824003


In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
# Cohen's kappa between the hard predictions and the true labels.
kappa = cohen_kappa_score(test_labels,test_pred)
print('Cohens kappa: %f' % kappa)
# ROC AUC is a ranking metric and needs continuous scores; on thresholded labels it
# collapses to a single operating point. `test_pred` holds rounded labels, so the
# probabilities are recomputed here (one extra inference pass).
test_prob = model.predict(test_input).ravel()
auc = roc_auc_score(test_labels,test_prob)
print('ROC AUC: %f' % auc)
# Confusion matrix of true labels (rows) against hard predictions (columns).
matrix = confusion_matrix(test_labels,test_pred)
print(matrix)

Cohens kappa: 0.975054
ROC AUC: 0.987496
[[2018   22]
 [  23 1594]]


In [None]:
def getFP_FN_lists(test_X, test_y, pred_y):
    """Collect the texts and index labels of false positives and false negatives.

    The texts are taken from ``test_X`` (the function no longer reaches for a
    module-level ``test`` frame), and rows are matched by position.

    Args:
        test_X: pandas Series of texts, row-aligned with ``test_y``.
        test_y: pandas Series of true 0/1 labels; its index labels are reported.
        pred_y: Predicted 0/1 labels, one per row of ``test_y``. Any array-like
            that flattens to ``len(test_y)`` values, e.g. shape ``(n,)`` or
            ``(n, 1)``.

    Returns:
        Tuple ``(FP_text, FP_index, FN_text, FN_index)`` of lists: texts and
        index labels of rows predicted 1 but labelled 0, followed by texts and
        index labels of rows predicted 0 but labelled 1.

    Raises:
        ValueError: If the three inputs do not have the same number of rows.
    """
    # Flatten so each prediction is a scalar rather than a 1-element array.
    predicted = np.asarray(pred_y).ravel()
    if not (len(test_X) == len(test_y) == len(predicted)):
        raise ValueError(
            "test_X, test_y and pred_y must have the same number of rows"
        )

    FP_text = []
    FP_index = []
    FN_text = []
    FN_index = []
    for pos, guess in enumerate(predicted):
        truth = test_y.iloc[pos]
        if guess == 1 and truth == 0:
            FP_text.append(test_X.iloc[pos])
            FP_index.append(test_y.index[pos])
        elif guess == 0 and truth == 1:
            FN_text.append(test_X.iloc[pos])
            FN_index.append(test_y.index[pos])

    return FP_text,FP_index,FN_text,FN_index


def getFP_FN(test_X, test_y, pred_y):
    """Return the false positives and false negatives as two DataFrames.

    Delegates to ``getFP_FN_lists`` and packs its four lists into a frame with
    columns ``FP_text`` / ``FP_index`` and a frame with columns ``FN_text`` /
    ``FN_index``, returned in that order.
    """
    fp_texts, fp_labels, fn_texts, fn_labels = getFP_FN_lists(test_X, test_y, pred_y)

    false_positives = pd.DataFrame({'FP_text': fp_texts, 'FP_index': fp_labels})
    false_negatives = pd.DataFrame({'FN_text': fn_texts, 'FN_index': fn_labels})

    return false_positives, false_negatives


# Collect the misclassified test rows (false positives / false negatives) and write
# each group to its own CSV file, including the DataFrame index.
#df_FP,df_FN, df_TP = getFP_FN_TP(test_input, test_labels, test_pred)
df_FP,df_FN = getFP_FN(test['content'], test['label'],test_pred)
df_FP.to_csv('FP_BERT.csv', index=True)
df_FN.to_csv('FN_BERT.csv', index=True)


In [None]:
def getTPTN_lists(test_X, test_y, pred_y):
    """Collect the texts and index labels of true positives and true negatives.

    The texts are taken from ``test_X`` (the function no longer reaches for a
    module-level ``test`` frame), and rows are matched by position.

    Args:
        test_X: pandas Series of texts, row-aligned with ``test_y``.
        test_y: pandas Series of true 0/1 labels; its index labels are reported.
        pred_y: Predicted 0/1 labels, one per row of ``test_y``. Any array-like
            that flattens to ``len(test_y)`` values, e.g. shape ``(n,)`` or
            ``(n, 1)``.

    Returns:
        Tuple ``(TP_text, TP_index, TN_text, TN_index)`` of lists: texts and
        index labels of rows predicted 1 and labelled 1, followed by texts and
        index labels of rows predicted 0 and labelled 0.

    Raises:
        ValueError: If the three inputs do not have the same number of rows.
    """
    # Flatten so each prediction is a scalar rather than a 1-element array.
    predicted = np.asarray(pred_y).ravel()
    if not (len(test_X) == len(test_y) == len(predicted)):
        raise ValueError(
            "test_X, test_y and pred_y must have the same number of rows"
        )

    TP_text = []
    TP_index = []
    TN_text = []
    TN_index = []
    for pos, guess in enumerate(predicted):
        truth = test_y.iloc[pos]
        if guess == 1 and truth == 1:
            TP_text.append(test_X.iloc[pos])
            TP_index.append(test_y.index[pos])
        elif guess == 0 and truth == 0:
            TN_text.append(test_X.iloc[pos])
            TN_index.append(test_y.index[pos])

    return TP_text,TP_index,TN_text,TN_index

def getTPTN(test_X, test_y, pred_y):
    """Return the true positives and true negatives as two DataFrames.

    Delegates to ``getTPTN_lists`` and packs its four lists into a frame with
    columns ``TP_text`` / ``TP_index`` and a frame with columns ``TN_text`` /
    ``TN_index``, returned in that order.
    """
    tp_texts, tp_labels, tn_texts, tn_labels = getTPTN_lists(test_X, test_y, pred_y)

    true_positives = pd.DataFrame({'TP_text': tp_texts, 'TP_index': tp_labels})
    true_negatives = pd.DataFrame({'TN_text': tn_texts, 'TN_index': tn_labels})

    return true_positives, true_negatives

 
# Collect the correctly classified test rows (true positives / true negatives) and
# write each group to its own CSV file, including the DataFrame index.
df_TP,df_TN = getTPTN(test['content'], test['label'],test_pred)
df_TP.to_csv('TP_BERT.csv', index=True)
df_TN.to_csv('TN_BERT.csv', index=True)