# Text classification

Unfortunately, Google has stopped releasing tensorflow-text for Apple Silicon (and for Windows as well).
Therefore, I recommend uploading this notebook together with the training data to Google Colab (or any other Linux system).


In [1]:
# Install the required packages
!pip install -U "tensorflow-text==2.15.*"
!pip install "tf-models-official==2.15.*"

Collecting tensorflow-text==2.15.*
  Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.15.0
Collecting tf-models-official==2.15.*
  Downloading tf_models_official-2.15.0-py2.py3-none-any.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting sacrebleu (from tf-models-official==2.15.*)
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from tf-models-official==2.15.*)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.4 M

In [2]:
import os
import shutil
import json
import re

import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

import numpy as np

# Only let TensorFlow's Python logger emit messages of level ERROR and above
tf.get_logger().setLevel('ERROR')

In [3]:
# If running on Google Colab, mount Google Drive and prefix project paths with it.
# `pre_path` is prepended to the input/output paths used further down.
try:
  from google.colab import drive
except ImportError:
  # The google.colab package could not be imported, so we are not on Colab.
  print('Not running on Google Colab')
  pre_path = ''
else:
  # Only the import is guarded: a failing mount now surfaces as an error
  # instead of being silently reported as "Not running on Google Colab".
  drive.mount('/content/drive', force_remount=True)
  # Relative path: resolved against the current working directory
  # (presumably /content on Colab -- TODO confirm).
  pre_path = 'drive/MyDrive/NLP/'
  print('Running on Google Colab')

Mounted at /content/drive
Running on Google Colab


# Clean the data

Load the dataset and strip the scraping residue from every review and storyline.
Then store the cleaned texts as one file per sample, in a directory layout that TensorFlow/Keras can read.


In [4]:
def clean_story(story):
    """Extract the storyline text from a raw scraped storyline string.

    The first 16 characters are skipped (presumably a fixed header left by the
    scraper -- TODO confirm). The text then runs up to the first em dash
    (U+2014); if there is none, up to the first newline; if there is neither,
    up to the end of the string.

    Args:
        story: Raw storyline string.

    Returns:
        The extracted storyline with surrounding whitespace removed.
    """
    start = 16
    end_index = story.find("\u2014", start)
    if end_index == -1:
        end_index = story.find("\n", start)
    if end_index == -1:
        # No terminator at all: keep everything. Slicing with -1 here would
        # silently drop the last character.
        end_index = len(story)
    return story[start:end_index].strip()

def clean_review(review):
    """Extract the review text from a raw scraped review string.

    The review body is the text between the "REZENSION\\n" marker and the next
    "\\nhilfreich" marker. Newlines inside the body are replaced by spaces and
    a leading rating of the form "<1-2 digits> /10" is removed.

    If the start marker is missing the body starts at the beginning of the
    string; if the end marker is missing it runs to the end of the string.

    Args:
        review: Raw review string.

    Returns:
        The cleaned, single-line review text.
    """
    start_marker = "REZENSION\n"
    marker_pos = review.find(start_marker)
    # Without this guard a missing marker (-1) would yield a start of 9 and
    # cut into the text.
    start_index = marker_pos + len(start_marker) if marker_pos != -1 else 0

    end_index = review.find("\nhilfreich", start_index)
    if end_index == -1:
        # Slicing with -1 would silently drop the last character.
        end_index = len(review)

    cleaned = review[start_index:end_index].strip().replace("\n", " ")
    # Drop a leading rating such as "8 /10 "
    return re.sub(r'^\d{1,2} /10\s*', '', cleaned)

In [5]:
# Load the scraped movies; every entry is read via its 'review' and 'storyline' keys.
with open(pre_path + 'data/scraping_results.json', 'r', encoding='utf-8') as f:
    old = json.load(f)

reviews = [clean_review(movie['review']) for movie in old]
storylines = [clean_story(movie['storyline']) for movie in old]

# Split data into train and test: the first 80 % of the entries (in file order,
# no shuffling) are used for training, the rest for testing. Reviews and
# storylines have the same length, so one split index serves both.
split_index = int(len(reviews) * 0.8)
train_reviews = reviews[:split_index]
train_storylines = storylines[:split_index]
test_reviews = reviews[split_index:]
test_storylines = storylines[split_index:]

print(f"Train reviews: {len(train_reviews)}")
print(f"Test reviews: {len(test_reviews)}")
print(f"Train storylines: {len(train_storylines)}")
print(f"Test storylines: {len(test_storylines)}")

# Save every text as <split>/<class>/<index>.txt. The directories are written
# relative to the working directory (not under pre_path).
# NOTE: existing files are overwritten but never deleted, so files left over
# from an earlier run with more samples would stay in place.
output_dirs = {
    "data/train/review": train_reviews,
    "data/test/review": test_reviews,
    "data/train/storyline": train_storylines,
    "data/test/storyline": test_storylines,
}
for directory, samples in output_dirs.items():
    # Create directories if they don't exist
    os.makedirs(directory, exist_ok=True)
    for index, sample in enumerate(samples):
        with open(f"{directory}/{index}.txt", "w", encoding="utf-8") as f:
            f.write(sample)

Train reviews: 5608
Test reviews: 1402
Train storylines: 5608
Test storylines: 1402


In [6]:
# Build the training, validation and test datasets from the text files on disk
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# One shared options dict keeps batch size, validation_split and seed identical
# for the 'training' and 'validation' subsets of data/train.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'data/train', subset='training', **split_options)
# The class names are read from the raw dataset before cache/prefetch is applied
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = tf.keras.utils.text_dataset_from_directory(
    'data/train', subset='validation', **split_options
).cache().prefetch(buffer_size=AUTOTUNE)

# The test set lives in its own directory and is used as a whole
test_ds = tf.keras.utils.text_dataset_from_directory(
    'data/test', batch_size=batch_size
).cache().prefetch(buffer_size=AUTOTUNE)

Found 11216 files belonging to 2 classes.
Using 8973 files for training.
Found 11216 files belonging to 2 classes.
Using 2243 files for validation.
Found 2804 files belonging to 2 classes.


Let's take a look at a few samples (reviews and storylines) together with their labels.


In [7]:
# Print the first three samples of the first training batch with their labels
for text_batch, label_batch in train_ds.take(1):
  # Convert each tensor to NumPy once instead of once per printed sample
  batch_texts = text_batch.numpy()
  batch_labels = label_batch.numpy()
  for i in range(3):
    label = batch_labels[i]
    print(f'Review: {batch_texts[i]}')
    print(f'Label : {label} ({class_names[label]})')

Review: b"Athena-1, a space station owned by gene manipulation company Energyne, is destroyed after a laboratory rat mutates and wreaks havoc. Dr. Kerry Atkins, the lone surviving crew member, is ordered by CEO Claire Wyden to retrieve research canisters containing a pathogen. Atkins is able to flee in the escape pod when the station implodes, but due to damage from the rat it disintegrates upon re-entry, killing her. The falling pieces leave a trail of debris across the United States, including the Everglades in Florida, where a canister is consumed by an American crocodile, and a forest in Wyoming, where a gray wolf is exposed to the pathogen. Primatologist Davis Okoye, a former US Army Special Forces soldier and member of an anti-poaching unit, works at the San Diego Wildlife Sanctuary. He has befriended a rare albino western lowland gorilla named George, having saved him from poachers, and communicates with George using sign language and hand gestures. At night, one of the canister

In [8]:
def name_to_model_and_preprocess(name):
    """Create the encoder and preprocessing layers for a model name.

    Args:
        name: Short model name, e.g. 'small_bert/bert_en_uncased_L-2_H-128_A-2'
            or 'electra_small'.

    Returns:
        A tuple ``(bert_model, bert_preprocess_model)`` of ``hub.KerasLayer``
        objects: the encoder (created with ``trainable=True``) and its
        matching preprocessing layer (created with ``trainable=False``).

    Raises:
        KeyError: If ``name`` is not one of the known model names.
    """
    uncased_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

    # name -> (encoder handle, preprocessing handle). Keeping both handles in
    # one table guarantees that every encoder has a preprocessing entry.
    handles = {
        'bert_en_uncased_L-12_H-768_A-12': (
            'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
            uncased_preprocess),
        'bert_en_cased_L-12_H-768_A-12': (
            'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
            'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3'),
        'bert_multi_cased_L-12_H-768_A-12': (
            'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
            'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3'),
        'albert_en_base': (
            'https://tfhub.dev/tensorflow/albert_en_base/2',
            'https://tfhub.dev/tensorflow/albert_en_preprocess/3'),
        'electra_small': (
            'https://tfhub.dev/google/electra_small/2',
            uncased_preprocess),
        'electra_base': (
            'https://tfhub.dev/google/electra_base/2',
            uncased_preprocess),
        'experts_pubmed': (
            'https://tfhub.dev/google/experts/bert/pubmed/2',
            uncased_preprocess),
        'experts_wiki_books': (
            'https://tfhub.dev/google/experts/bert/wiki_books/2',
            uncased_preprocess),
        'talking-heads_base': (
            'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
            uncased_preprocess),
    }

    # The 24 small BERT variants form a regular grid (layers x hidden size /
    # attention heads); they all share version 1 and the uncased preprocessor.
    for layers in (2, 4, 6, 8, 10, 12):
        for hidden, heads in ((128, 2), (256, 4), (512, 8), (768, 12)):
            key = f'small_bert/bert_en_uncased_L-{layers}_H-{hidden}_A-{heads}'
            handles[key] = (f'https://tfhub.dev/tensorflow/{key}/1', uncased_preprocess)

    if name not in handles:
        # Fail before any layer is created, and say which names are valid.
        raise KeyError(f"Unknown model name {name!r}; expected one of: {sorted(handles)}")

    encoder_handle, preprocess_handle = handles[name]
    bert_model = hub.KerasLayer(encoder_handle, trainable=True, name="BERT_Encoder")
    bert_preprocess_model = hub.KerasLayer(preprocess_handle, trainable=False, name="BERT_Preprocessing")
    return bert_model, bert_preprocess_model


In [9]:
def build_classifier_model(model, preprocess):
    """Assemble a Keras classifier on top of an encoder and its preprocessor.

    The model takes a batch of raw strings (input named 'text'), runs them
    through ``preprocess`` and ``model``, and feeds the encoder's
    'pooled_output' through dropout (rate 0.1) into a single-unit dense layer
    named 'classifier' that has no activation.

    Args:
        model: Encoder layer; its output is indexed with 'pooled_output'.
        preprocess: Preprocessing layer that turns strings into encoder inputs.

    Returns:
        A ``tf.keras.Model`` mapping the text input to the classifier output.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    pooled = model(preprocess(text_input))['pooled_output']
    dropped = tf.keras.layers.Dropout(0.1)(pooled)
    logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(dropped)
    return tf.keras.Model(text_input, logits)


In [10]:
# Output locations for the evaluation results and the saved models
os.makedirs(pre_path + "results", exist_ok=True)
os.makedirs(pre_path + "models", exist_ok=True)

# Names of all models to fine-tune; each must be a key understood by
# name_to_model_and_preprocess().
available_models = [
    "bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12",
    "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8",
    "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4",
    "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2",
    "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12",
    "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8",
    "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4",
    "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2",
    "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12",
    "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"
]

# Number of training epochs per model
epochs = 5
# JSON file that collects the results of all models
output_file = pre_path + "results/results.json"
# In-memory results, keyed by model name
data = {}


In [17]:
# Fine-tune and evaluate every model in `available_models`. Results are stored
# in `output_file` after each model, so an interrupted run can be resumed:
# models that already have an entry in the file are skipped.

# The training set is the same for every model, so the schedule lengths are
# computed once instead of once per model.
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)  # warm-up over the first 10 % of the steps
init_lr = 3e-5

for model_name in available_models:
    if os.path.exists(output_file):
        with open(output_file, 'r') as file:
            data = json.load(file)
        if model_name in data:
            print(f"skipping {model_name}")
            continue

    print(f"evaluating model {model_name}")

    # Drop the Keras global state left behind by the previously trained model
    # so that memory does not pile up over the many models in the list.
    tf.keras.backend.clear_session()

    bert_model, bert_preprocess_model = name_to_model_and_preprocess(model_name)
    classifier_model = build_classifier_model(bert_model, bert_preprocess_model)

    # The classifier emits raw logits (its final Dense layer has no activation),
    # hence from_logits=True for the loss and a decision threshold of 0.0 for
    # the accuracy: logit 0 corresponds to probability 0.5. The default
    # threshold of 0.5 would be applied to the logits and misreport accuracy.
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

    optimizer = optimization.create_optimizer(init_lr=init_lr, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, optimizer_type='adamw')

    classifier_model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)

    history = classifier_model.fit(x=train_ds, validation_data=val_ds, epochs=epochs)

    loss, accuracy = classifier_model.evaluate(test_ds)
    trainable_params = np.sum([np.prod(v.get_shape()) for v in classifier_model.trainable_weights])

    # Save the model BEFORE recording its results: an entry in the results file
    # makes later runs skip this model, so it must only exist once the model
    # itself is safely on disk.
    saved_model_path = pre_path + 'models/{}_bert'.format(model_name.replace('/', '_'))
    classifier_model.save(saved_model_path, include_optimizer=False)

    data[model_name] = {
        "loss": float(loss),
        "accuracy": float(accuracy),
        "trainable_params": int(trainable_params),
        "history": {
            "loss": history.history["loss"],
            "accuracy": history.history["binary_accuracy"],
            "val_loss": history.history["val_loss"],
            "val_accuracy": history.history["val_binary_accuracy"],
        }
    }

    with open(output_file, 'w') as file:
        json.dump(data, file)


skipping bert_en_uncased_L-12_H-768_A-12
evaluating model bert_en_cased_L-12_H-768_A-12
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
evaluating model bert_multi_cased_L-12_H-768_A-12
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
evaluating model small_bert/bert_en_uncased_L-2_H-128_A-2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
evaluating model small_bert/bert_en_uncased_L-2_H-256_A-4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
evaluating model small_bert/bert_en_uncased_L-2_H-512_A-8
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
evaluating model small_bert/bert_en_uncased_L-2_H-768_A-12
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
evaluating model small_bert/bert_en_uncased_L-4_H-128_A-2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
evaluating model small_bert/bert_en_uncased_L-4_H-256_A-4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
evaluating model small_bert/bert_en_uncased_L-4_H-512_A-8
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
evalu