In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

In [4]:
imdb_train, ds_info = tfds.load(name="imdb_reviews", 
    split="train", with_info=True, 
    as_supervised=True)
imdb_test = tfds.load(name="imdb_reviews", 
    split="test",
    as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteVXQAX6/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteVXQAX6/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteVXQAX6/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Use the default tokenizer settings
tokenizer = tfds.features.text.Tokenizer()
vocabulary_set = set()
MAX_TOKENS = 0
for example, label in imdb_train:
    some_tokens = tokenizer.tokenize(example.numpy())
    if MAX_TOKENS < len(some_tokens):
        MAX_TOKENS = len(some_tokens)
    vocabulary_set.update(some_tokens)

In [None]:
imdb_encoder = tfds.features.text.TokenTextEncoder(vocabulary_set,lowercase=True,tokenizer=tokenizer)

vocab_size = imdb_encoder.vocab_size

print(vocab_size, MAX_TOKENS)

In [None]:
# transformation functions to be used with the dataset
from tensorflow.keras.preprocessing import sequence

def encode_pad_transform(sample):
    encoded = imdb_encoder.encode(sample.numpy())
    pad = sequence.pad_sequences([encoded], padding='post', maxlen=150)
    return np.array(pad[0], dtype=np.int64)

def encode_tf_fn(sample, label):
    encoded = tf.py_function(encode_pad_transform, inp=[sample], Tout=(tf.int64))
    encoded.set_shape([None])
    label.set_shape([])
    return encoded, label

In [None]:
encoded_train = imdb_train.map(encode_tf_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

encoded_test = imdb_test.map(encode_tf_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# **Download GloVe**

In [None]:
# Download the GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
dict_w2v = {}
with open('glove.6B.50d.txt', "r") as file:
    for line in file:
        tokens = line.split()
        word = tokens[0]
        vector = np.array(tokens[1:], dtype=np.float32)
        
        if vector.shape[0] == 50:
            dict_w2v[word] = vector
        else:
            print("There was an issue with " + word)
print("Dictionary Size: ", len(dict_w2v))

In [None]:
embedding_dim = 50
embedding_matrix = np.zeros((imdb_encoder.vocab_size, embedding_dim))

In [None]:
unk_cnt = 0
unk_set = set()
for word in imdb_encoder.tokens:
    embedding_vector = dict_w2v.get(word)
    if embedding_vector is not None:
        tkn_id = imdb_encoder.encode(word)[0]
        embedding_matrix[tkn_id] = embedding_vector
    else:
        unk_cnt += 1
        unk_set.add(word)
# Print how many weren't found
print("Total unknown words: ", unk_cnt)

In [None]:
# Length of the vocabulary in chars
vocab_size = imdb_encoder.vocab_size # len(chars)

# Number of RNN units
rnn_units = 64

#batch size
BATCH_SIZE=100

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

def build_model_bilstm(vocab_size, embedding_dim, rnn_units, batch_size, train_emb=False):
    model = tf.keras.Sequential([
        Embedding(vocab_size, embedding_dim, mask_zero=True, weights=[embedding_matrix], trainable=train_emb),
        Bidirectional(LSTM(rnn_units, return_sequences=True, dropout=0.5)),
        Bidirectional(LSTM(rnn_units, dropout=0.25)),
        Dense(1, activation='sigmoid')
    ])
    return model

# **Feature Extraction**

In [None]:
model_fe = build_model_bilstm(
    vocab_size = vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

model_fe.summary()

In [None]:
model_fe.compile(loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'Precision', 'Recall'])

In [None]:
# Prefetch for performance
encoded_train_batched = encoded_train.batch(BATCH_SIZE).prefetch(100)

model_fe.fit(encoded_train_batched, epochs=10)

In [None]:
model_fe.evaluate(encoded_test.batch(BATCH_SIZE))

# **Fine-tuning**

In [None]:
model_ft = build_model_bilstm(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
    train_emb=True)
model_ft.summary()

In [None]:
model_ft.compile(loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'Precision', 'Recall'])
model_ft.fit(encoded_train_batched, epochs=10)

In [None]:
model_ft.evaluate(encoded_test.batch(BATCH_SIZE))

# **BERT**

In [1]:
from transformers import BertTokenizer

bert_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_name,
    add_special_tokens=True,
    do_lower_case=False,
    max_length=150,
    pad_to_max_length=True)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [2]:
def bert_encoder(review):
    txt = review.numpy().decode('utf-8')
    encoded = tokenizer.encode_plus(txt, add_special_tokens=True,
        max_length=150,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_token_type_ids=True)

    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

In [5]:
bert_train = [bert_encoder(r) for r, l in imdb_train]
bert_lbl = [l for r, l in imdb_train]
bert_train = np.array(bert_train)
bert_lbl = tf.keras.utils.to_categorical(bert_lbl, num_classes=2)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
# create training and validation splits
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(bert_train,
    bert_lbl,
    test_size=0.2,
    random_state=42)
print(x_train.shape, y_train.shape)

(20000, 3, 150) (20000, 2)


In [7]:
tr_reviews, tr_segments, tr_masks = np.split(x_train, 3, axis=1)
val_reviews, val_segments, val_masks = np.split(x_val, 3, axis=1)

tr_reviews = tr_reviews.squeeze()
tr_segments = tr_segments.squeeze()
tr_masks = tr_masks.squeeze()

val_reviews = val_reviews.squeeze()
val_segments = val_segments.squeeze()
val_masks = val_masks.squeeze()

In [9]:
def example_to_features(input_ids,attention_masks,token_type_ids,y):
    return {"input_ids": input_ids,
        "attention_mask": attention_masks,
        "token_type_ids": token_type_ids},y

In [10]:
train_ds = tf.data.Dataset.from_tensor_slices((tr_reviews,
    tr_masks, tr_segments, y_train)).\
    map(example_to_features).shuffle(100).batch(16)

valid_ds = tf.data.Dataset.from_tensor_slices((val_reviews,
    val_masks, val_segments, y_val)).\
    map(example_to_features).shuffle(100).batch(16)

# **Pre-built BERT classification model**

In [11]:
from transformers import TFBertForSequenceClassification
bert_model = TFBertForSequenceClassification.from_pretrained(bert_name)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

bert_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [14]:
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [None]:
print("Fine-tuning BERT on IMDB")
bert_history = bert_model.fit(train_ds, epochs=3, validation_data=valid_ds)

Fine-tuning BERT on IMDB
Epoch 1/3

In [None]:
# prep data for testing
bert_test = [bert_encoder(r) for r,l in imdb_test]
bert_tst_lbl = [l for r, l in imdb_test]

bert_test2 = np.array(bert_test)
bert_tst_lbl2 = tf.keras.utils.to_categorical (bert_tst_lbl, num_classes=2)

ts_reviews, ts_segments, ts_masks = np.split(bert_test2, 3, axis=1)
ts_reviews = ts_reviews.squeeze()
ts_segments = ts_segments.squeeze()
ts_masks = ts_masks.squeeze()

test_ds = tf.data.Dataset.from_tensor_slices((ts_reviews,
    ts_masks, ts_segments, bert_tst_lbl2)).\
    map(example_to_features).shuffle(100).batch(16)

In [None]:
bert_model.evaluate(test_ds)

# **Custom model with BERT**

In [None]:
from transformers import TFBertModel
bert_name = 'bert-base-cased'
bert = TFBertModel.from_pretrained(bert_name)
bert.summary()

In [None]:
max_seq_len = 150
inp_ids = tf.keras.layers.Input((max_seq_len,), dtype=tf.int64,
    name="input_ids")
att_mask = tf.keras.layers.Input((max_seq_len,), dtype=tf.int64,
    name="attention_mask")
seg_ids = tf.keras.layers.Input((max_seq_len,), dtype=tf.int64,
    name="token_type_ids")

In [None]:
train_ds.element_spec

In [None]:
inp_dict = {"input_ids": inp_ids,
    "attention_mask": att_mask,
    "token_type_ids": seg_ids}
outputs = bert(inp_dict)
# let's see the output structure
outputs

In [None]:
x = tf.keras.layers.Dropout(0.2)(outputs[1])
x = tf.keras.layers.Dense(200, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(2, activation='sigmoid')(x)
custom_model = tf.keras.models.Model(inputs=inp_dict, outputs=x)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
custom_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
custom_model.summary()

In [None]:
print("Custom Model: Fine-tuning BERT on IMDB")
custom_history = custom_model.fit(train_ds, epochs=3, validation_data=valid_ds)

In [None]:
custom_model.evaluate(test_ds)

Make the BERT not trainable, so the pre-trained parameters will not be changed a lot. 

In [None]:
bert.trainable = False # don't train BERT any more
optimizer = tf.keras.optimizers.Adam() # standard learning rate
custom_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
custom_model.summary()

In [None]:
print("Custom Model: Keep training custom model on IMDB")
custom_history = custom_model.fit(train_ds, epochs=10, validation_data=valid_ds)

In [None]:
custom_model.evaluate(test_ds)