In [1]:
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 4.3 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.7.3


In [2]:
pip install -q tf-models-official

[K     |████████████████████████████████| 1.8 MB 4.2 MB/s 
[K     |████████████████████████████████| 47.7 MB 2.0 MB/s 
[K     |████████████████████████████████| 213 kB 57.7 MB/s 
[K     |████████████████████████████████| 352 kB 18.1 MB/s 
[K     |████████████████████████████████| 90 kB 9.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 41.3 MB/s 
[K     |████████████████████████████████| 1.1 MB 54.2 MB/s 
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
[K     |████████████████████████████████| 99 kB 10.0 MB/s 
[K     |████████████████████████████████| 596 kB 53.3 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
import tensorflow as tf 
import tensorflow_text as text 
import numpy as np 
import os 
import shutil
import tensorflow_hub as hub 
from official.nlp import optimization 
import matplotlib.pyplot as plt 


In [4]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')

# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [5]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=batch_size)

test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [6]:
bert_preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_model = hub.KerasLayer('https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/2')
text = ['This is a piece of input text'] 
tokenized = bert_preprocessor(text)
tokenized #input_type_ids simply returns the segments index we use. Since only one sentence they are all of same index 0, if two then 
#there would also be an array full of 1s. By default the input is padded/trunacted to 128 tokens. So this is suitable for larger sentences
#but not document level. input mask handles not deploying attention on parts of the input that dont matter, and input_word_ids are what
#the BERT model uses to create its vector embeddings. the input type ids again describe which segment we are in. This is what we will
#be inputting into our BERT Model.

{'input_mask': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       dtype=int32)>,
 'input_type_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 

In [7]:
vals = bert_model(tokenized)
vals['pooled_output'] #The pooled output represents the entire sentence as one embedding of size 768. So if we input 2 sentences
#then pooled_outputs size is (2,768).
#If we use vals['sequence_output'] we get the embeddings for each token itself, so if 2 sentences, 17 words each then its (2,17,768)
#512 is the default size of the embeddings used to represent our BERT output 
vals['sequence_output'].shape

TensorShape([1, 128, 768])

In [8]:
#Now all we have to do is simply use this BERT model and its preprocessor inside of our network and build dense layers on top.
def FineTunedBERT():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name = "input layer")
  tokensmaskids = bert_preprocessor(text_input)
  output = bert_model(tokensmaskids)['pooled_output'] #Since we are classifying sequence at a time the sentiment 
  output = tf.keras.layers.Dense(768, activation = 'relu')(output)
  output = tf.keras.layers.Dense(1, activation = None)(output)
  return tf.keras.Model(inputs = text_input, outputs = output)

loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.keras.metrics.BinaryAccuracy()
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [9]:
TextClassifModel = FineTunedBERT()
TextClassifModel.compile(loss = loss, metrics = metrics, optimizer = optimizer)
TextClassifModel.fit(x=train_ds, validation_data = val_ds, epochs = epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fcc0bb982d0>

We can use other pretrained models for better results or even different hyperparameters(such as the loss optimizer), however as we can see above this got about 72.52% accuracy on detecting whether the sentiment is happy or sad which is passing at my university :). 