In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the dataset; later cells read its `text` and `label` columns.
df = pd.read_csv(filepath_or_buffer='../input/biases-dataset/ibc_data.csv')

In [None]:
import re

# clean text from noise
def clean_text(text):
    """Normalise a raw string before tokenisation.

    Every character that is not an ASCII letter or an apostrophe is replaced
    by one space (runs of spaces are not collapsed), then the result is
    lower-cased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # The character class is ASCII-only, so this single substitution already
    # removes digits, punctuation and every non-ASCII character.  A separate
    # "remove Unicode characters" pass after it could never match anything.
    text = re.sub(r'[^a-zA-Z\']', ' ', text)

    # convert to lowercase to maintain consistency
    return text.lower()

# Replace the raw `text` column with its cleaned version.
df['text'] = df['text'].apply(clean_text)

In [None]:
from transformers import AutoTokenizer

# Fixed token length of every encoded sentence: shorter inputs are padded to
# this length and longer ones are truncated (see the encode options below).
SEQ_LEN = 350

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def tokenize(sentence):
    """Encode one sentence with the module-level BERT tokenizer.

    Args:
        sentence: The text to encode.

    Returns:
        A pair ``(input_ids, attention_mask)`` taken from the tokenizer's
        output, requested as TensorFlow tensors.
    """
    encode_options = dict(
        max_length=SEQ_LEN,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )
    encoded = tokenizer.encode_plus(sentence, **encode_options)
    return encoded['input_ids'], encoded['attention_mask']

# Pre-allocate one row per sentence for the two input tensors.  Token ids and
# mask flags are whole numbers, so store them as int32 (the dtype the model's
# input layers declare) instead of NumPy's default float64.
Xids = np.zeros((len(df), SEQ_LEN), dtype=np.int32)
Xmask = np.zeros((len(df), SEQ_LEN), dtype=np.int32)

# Encode sentence by sentence, filling row i of both arrays.
for i, sentence in enumerate(df['text']):
    Xids[i, :], Xmask[i, :] = tokenize(sentence)
    if i % 1000 == 0:
        print(i)  # do this so we can see some progress

In [None]:
from sklearn.preprocessing import LabelEncoder
import pickle
# Replace the `label` column with the integer class ids produced by the encoder.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

# Persist the fitted encoder so the ids can be mapped back to the original
# labels later.  The context manager closes the file even if pickling raises.
with open("LabelEncoder.pickle", "wb") as pickle_out:
    pickle.dump(le, pickle_out)

In [None]:
# One-hot encode the integer class ids held in the `label` column.
arr = df['label'].values
# Row k of the identity matrix is the one-hot vector for class k, so indexing
# it with `arr` yields one row per sample with a single 1 in column arr[i].
labels = np.eye(arr.max() + 1)[arr]

In [None]:

# Write the encoded inputs and the one-hot labels to .npy files.
for npy_name, npy_array in (
    ('biases-xids.npy', Xids),
    ('biases-xmask.npy', Xmask),
    ('biases-labels.npy', labels),
):
    with open(npy_name, 'wb') as f:
        np.save(f, npy_array)

In [None]:
import tensorflow as tf

BATCH_SIZE = 32

# load arrays into tensorflow dataset
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))


def map_func(input_ids, masks, labels):
    """Restructure one ``(ids, mask, label)`` triple into ``(inputs, target)``.

    The dictionary keys match the names given to the model's two input layers
    ('input_ids' and 'attention_mask').
    """
    return {'input_ids': input_ids, 'attention_mask': masks}, labels


# using map method to apply map_func to dataset
dataset = dataset.map(map_func)

# Shuffle and batch.  The dataset is split afterwards with take()/skip(); with
# the default reshuffle_each_iteration=True every pass over the data would draw
# a new order, so the two splits would overlap and validation samples would
# leak into training.  Fixing the order keeps the splits disjoint.
# (If per-epoch shuffling of the training part is wanted, shuffle it again
# after the split.)
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(BATCH_SIZE)

In [None]:
SPLIT = 0.8  # fraction of the batches used for training (80-20 split)

# Number of batches, counted by iterating over the batched dataset once.
DS_LEN = sum(1 for _ in dataset)

# take()/skip() cut the batch sequence at the same position; the two parts are
# only disjoint if the dataset yields its batches in the same order each time
# it is iterated.
train = dataset.take(round(DS_LEN * SPLIT))  # first 80% of the batches
val = dataset.skip(round(DS_LEN * SPLIT))  # remaining 20% of the batches

del dataset  # drop the name; later cells only use `train` and `val`

## Model Definition

In [None]:
from transformers import TFBertModel

bert = TFBertModel.from_pretrained('bert-base-uncased')

# Input length is taken from SEQ_LEN so it cannot drift from the length the
# tokenizer pads/truncates to.  The layer names must match the dictionary keys
# produced by map_func.
input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

# we consume the last_hidden_state tensor from bert (discarding pooled_outputs)
embeddings = bert.bert(input_ids, attention_mask=mask)[0]

# Classification head on top of the BERT token embeddings.
X = tf.keras.layers.LSTM(64)(embeddings)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(64, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
# One output unit per one-hot label column, so the head always matches the
# number of classes found in the data.
y = tf.keras.layers.Dense(labels.shape[1], activation='softmax', name='outputs')(X)

# define input and output layers of our model
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# freeze the BERT layer - otherwise we will be training 100M+ parameters...
# Referencing the layer object directly avoids relying on its position in
# model.layers.
bert.bert.trainable = False

In [None]:
optimizer = tf.keras.optimizers.Adam(0.001)
loss = tf.keras.losses.CategoricalCrossentropy()  # categorical = one-hot
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

# All callbacks watch the validation metric.  Monitoring the training
# 'accuracy' would checkpoint, stop and decay the learning rate based on how
# well the model fits the training data, not on how well it generalises.
MONITOR = 'val_accuracy'

# Keep only the best model seen so far on disk.
checkpoint = tf.keras.callbacks.ModelCheckpoint("./Biases_Checkpoint",
                             monitor=MONITOR,
                             mode="max",
                             save_best_only = True,
                             verbose=1)
# Stop when the monitored value has not improved for `patience` epochs and
# roll back to the best weights.
earlystop = tf.keras.callbacks.EarlyStopping(monitor = MONITOR, # value being monitored for improvement
                          min_delta = 0, # minimum change that counts as an improvement
                          patience = 5, # number of epochs without improvement before stopping
                          verbose = 1,
                          restore_best_weights = True)
# Multiply the learning rate by `factor` after 3 epochs without improvement.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor = MONITOR, factor = 0.01, patience = 3,
                                                 verbose = 1, min_delta = 0.0001)

callbacks = [checkpoint, earlystop, reduce_lr]

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

history = model.fit(train,
                    validation_data=val,
                    epochs=50,
                    callbacks = callbacks)

In [None]:
# Export the trained model in TensorFlow's SavedModel format.
tf.saved_model.save(obj=model, export_dir="./saved_model")

In [None]:
# Highest training accuracy reached in any epoch; used to tag the history file name.
accuracy = max(history.history['accuracy'])
history_dict = history.history

In [None]:
import pickle 

# Store the per-epoch metrics; the file name carries the best training accuracy.
# The context manager closes the file even if pickling raises.
with open("BIASES_history_{}.pickle".format(accuracy), "wb") as pickle_out:
    pickle.dump(history.history, pickle_out)

In [None]:
# Plotting our loss charts
import matplotlib.pyplot as plt

history_dict = history.history

# Per-epoch loss curves; epochs are numbered from 1 on the x-axis.
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)

# Line styling is passed straight to plot() instead of a separate setp() call.
line1 = plt.plot(epochs, val_loss_values, label='Validation/Test Loss',
                 linewidth=2.0, marker='+', markersize=10.0)
line2 = plt.plot(epochs, loss_values, label='Training Loss',
                 linewidth=2.0, marker='4', markersize=10.0)

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Plotting our accuracy charts
import matplotlib.pyplot as plt

history_dict = history.history

# Per-epoch accuracy curves for the training and validation data.
acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']
# Derive the x-axis from this cell's own data rather than from `loss_values`,
# which only exists if the loss-plot cell has been run first.
epochs = range(1, len(acc_values) + 1)

line1 = plt.plot(epochs, val_acc_values, label='Validation/Test Accuracy')
line2 = plt.plot(epochs, acc_values, label='Training Accuracy')
plt.setp(line1, linewidth=2.0, marker = '+', markersize=10.0)
plt.setp(line2, linewidth=2.0, marker = '4', markersize=10.0)
plt.xlabel('Epochs') 
plt.ylabel('Accuracy')
plt.grid(True)
plt.legend()
plt.show()