In [109]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import TFAutoModel, BertTokenizer

In [110]:
# Load and preprocess the data
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin', engine='python', header=None)
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
df = df.sample(n=2000)

In [111]:
X = df['text']
labels = df['target']
labels[labels == 4] = 1

print(labels)

695325     0
223711     0
491579     0
438065     0
1436152    1
          ..
1526877    1
1226042    1
214676     0
1385448    1
413250     0
Name: target, Length: 2000, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels[labels == 4] = 1


In [112]:
# Tokenize and pad the sequences
'''tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)'''

'tokenizer = Tokenizer()\ntokenizer.fit_on_texts(X)\nsequences = tokenizer.texts_to_sequences(X)'

In [113]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the input data
input_ids = []
for example in X:
    tokens = tokenizer.encode(example, add_special_tokens=True, max_length=50, padding='max_length', truncation=True)
    input_ids.append(tokens)

In [114]:
print(input_ids[0])

[101, 1030, 2123, 2102, 4783, 19895, 23606, 5480, 1045, 2359, 2000, 2272, 2061, 2919, 999, 999, 2045, 2001, 2053, 2380, 3258, 2290, 2054, 2061, 2412, 2295, 999, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [115]:
labels = tf.keras.utils.to_categorical(labels)

In [116]:
print(labels)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]


In [117]:
# Create the attention mask matrix
attention_mask = np.ones_like(input_ids)
padded_positions = np.equal(input_ids, 0)
attention_mask[padded_positions] = 0

In [118]:
SEQ_LEN = 50

Xids = np.zeros((len(df), SEQ_LEN))
Xmask = np.zeros((len(df), SEQ_LEN))

In [119]:
print(attention_mask.shape)

(2000, 50)


In [120]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

def map_func(sequence, attention, labels):
    return {'input_ids': sequence, 'attention_mask': attention}, labels

dataset = dataset.map(map_func)

for i in dataset.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(50,), dtype=float64, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])>, 'attention_mask': <tf.Tensor: shape=(50,), dtype=float64, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])>}, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1., 0.], dtype=float32)>)


In [121]:
dataset = dataset.batch(32)

In [122]:
DS_LEN = len(df)

train_len = 0.8  # 90-10 split
val_len = 0.5

train = dataset.take(round(DS_LEN*train_len))  # get first 90% of batches
test = dataset.skip(round(DS_LEN*train_len))  # skip first 90% and keep final 10%

test = dataset.take(round(DS_LEN*val_len))  # get first 90% of batches
val = dataset.skip(round(DS_LEN*val_len))  # skip first 90% and keep final 10%



del dataset  # optionally, delete dataset to free up disk-space

In [123]:
# Define the BERT model
bert = TFAutoModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [124]:
input_ids = tf.keras.layers.Input(shape=(50,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(50,), name='attention_mask', dtype='int32')

# we consume the last_hidden_state tensor from bert (discarding pooled_outputs)
embeddings = bert(input_ids, attention_mask=mask)[0]

X = tf.keras.layers.LSTM(64)(embeddings)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(64, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
y = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(X)

# define input and output layers of our model
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# freeze the BERT layer - otherwise we will be training 100M+ parameters...
model.layers[2].trainable = False

In [125]:
optimizer = tf.keras.optimizers.Adam(0.01)
loss = tf.keras.losses.CategoricalCrossentropy()  # categorical = one-hot
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [126]:
history = model.fit(train, validation_data=val, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40