In [1]:
import math
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
import os

# Record the TensorFlow version in the notebook output so the run can be reproduced.
print(tf.__version__)

2.10.0


In [2]:
# Make tf.function-wrapped code execute eagerly rather than as traced graphs.
# NOTE(review): this switch is normally a debugging aid and can slow training
# considerably -- confirm it is still required for this notebook.
tf.config.run_functions_eagerly(True)

In [3]:
# Show the GPU devices TensorFlow can see; an empty list means none is visible.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# Build the training dataframe in a single pipeline: read the CSV, discard
# incomplete rows, and keep just the comment text and its label.
sarc_train = (
    pd.read_csv("../shared_data/train-balanced-sarcasm.csv")
    # dropna() runs before column selection, so a null in ANY column of the
    # CSV removes the row, not only nulls in 'label' / 'comment'.
    .dropna()
    .loc[:, ['label', 'comment']]
    # expose the comment column under the name 'text'
    .rename(columns={'comment': 'text'})
    # order the columns as (text, label)
    .reindex(columns=['text', 'label'])
)

# peek at the first few rows
sarc_train.head()

Unnamed: 0,text,label
0,NC and NH.,0
1,You do know west teams play against west teams...,0
2,"They were underdogs earlier today, but since G...",0
3,"This meme isn't funny none of the ""new york ni...",0
4,I could use one of those tools.,0


In [5]:
# Sanity check: after dropna() each column's non-null count should equal the
# total number of entries reported for the frame.
sarc_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1010773 entries, 0 to 1010825
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1010773 non-null  object
 1   label   1010773 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 23.1+ MB


In [6]:
# Number of cleaned rows available for splitting.
subset_size = len(sarc_train)

# Row counts for the testing (40%) and validation (20%) portions, truncated
# to whole rows.
testing_size = int(0.4 * subset_size)
validation_size = int(0.2 * subset_size)

# Buffer size handed to Dataset.shuffle() when the training dataset is built.
shuffle_size = subset_size - validation_size

# Examples per batch for the tf.data pipelines.
data_batch_size = 32

In [7]:
# Shuffle the rows once with a fixed seed so the train/validation/test split
# is reproducible across kernel restarts.
split_seed = 42
data = sarc_train.sample(frac=1, random_state=split_seed).reset_index(drop=True)

# Hold out the last `testing_size` rows for testing and keep them OUT of the
# training/validation pool. Taking head(subset_size) here would select every
# row, so the test rows would also be trained on and the reported test
# accuracy would be inflated by leakage.
train_data = data.head(subset_size - testing_size)
test_data = data.tail(testing_size)

# Guard against any future change to the sizes re-introducing an overlap.
assert train_data.index.intersection(test_data.index).empty, \
    "train/validation rows overlap with test rows"

# Within the training pool, the first `validation_size` rows are used for
# validation and the remainder for training.
val_rows = train_data.iloc[:validation_size]
train_rows = train_data.iloc[validation_size:]

# A shuffle buffer at least as large as the dataset gives a full uniform
# shuffle each epoch; `shuffle_size` is >= the number of training rows.
train_ds = tf.data.Dataset.from_tensor_slices(
    (
        train_rows['text'],
        train_rows['label']
    )
).shuffle(shuffle_size).batch(data_batch_size)

val_ds = tf.data.Dataset.from_tensor_slices(
    (
        val_rows['text'],
        val_rows['label']
    )
).batch(data_batch_size)

# Left unbatched here; it is batched at the point where it is evaluated.
test_ds = tf.data.Dataset.from_tensor_slices(
    (
        test_data['text'],
        test_data['label']
    )
)

# Learning-rate schedule inputs: total optimizer steps across all epochs,
# with the first 10% of those steps used for warmup.
epochs = 100
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)
init_lr = 3e-5

#define the parameters for tokenizing and padding
# NOTE(review): none of the cells visible in this notebook read these three
# values -- confirm whether they are still needed.
vocab_size = 10000
embedding_dim = 32
max_length = 500

In [9]:
# TF Hub handles for the text preprocessing model and the small BERT encoder.
# NOTE(review): the recorded run of this cell failed with an OSError about a
# missing SavedModel in the local tfhub_modules temp cache -- presumably a
# stale or partial download; confirm whether clearing that cache resolves it.
preprocess_handle = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoder_handle = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/2'

# Layer applied to the raw string input before the encoder.
preprocessing_layer = hub.KerasLayer(preprocess_handle, name='preprocessing')

# Encoder layer, created with trainable=True.
bert_encoder = hub.KerasLayer(encoder_handle, trainable=True, name='BERT_encoder')

OSError: SavedModel file does not exist at: C:\Users\Joshua\AppData\Local\Temp\tfhub_modules\602d30248ff7929470db09f7385fc895e9ceb4c0\{saved_model.pbtxt|saved_model.pb}

In [None]:
def build_classifier_model():
    """Assemble the classifier: raw text -> preprocessing -> BERT -> one output unit.

    Relies on the module-level ``preprocessing_layer`` and ``bert_encoder``
    layers.

    Returns:
        A ``tf.keras.Model`` whose input is a string tensor named ``'text'``
        and whose output comes from a single-unit Dense layer named
        ``'classifier'`` with no activation applied.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    encoder_outputs = bert_encoder(preprocessing_layer(raw_text))
    pooled = encoder_outputs['pooled_output']
    dropped = tf.keras.layers.Dropout(0.1)(pooled)
    # activation=None leaves the output un-squashed (a raw score per example).
    score = tf.keras.layers.Dense(1, activation=None, name='classifier')(dropped)
    return tf.keras.Model(inputs=raw_text, outputs=score)


classifier_model = build_classifier_model()

In [None]:
# Loss and metric. from_logits=True tells the loss that the model output has
# not been passed through a sigmoid.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

# AdamW optimizer built by the official.nlp helper from the step counts and
# initial learning rate defined earlier in the notebook.
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Only write a checkpoint when validation accuracy reaches a new maximum.
saved_model_path = './model_saves/bert_v0/'
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=saved_model_path,
    monitor='val_binary_accuracy',
    mode='max',
    save_best_only=True,
)

history = classifier_model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[checkpoint],
)

# Show which per-epoch series were recorded during training.
history_dict = history.history
print(history_dict.keys())

### Test the model
# The test dataset is batched here, at evaluation time.
# Note: `loss` is rebound below from the loss object to the evaluated value.
test_batches = test_ds.batch(data_batch_size)
loss, accuracy = classifier_model.evaluate(test_batches)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')