# Text classification using pretrained model
---
Sergei Papulin (papulin.study@yandex.ru)

⚠️ **Warning.** Fine-tuning is slow on CPU; it is strongly recommended to run this notebook in `Colab` with a `GPU` runtime.

In [None]:
# On Colab, uncomment the line below to download and install the required packages
# !pip install datasets==3.6.0 evaluate transformers==4.50.2 tensorflow tf_keras

In [None]:
import numpy as np

In [None]:
from datasets import (
    DatasetDict,
    Dataset,
    ClassLabel,
    Features,
    Value,
    concatenate_datasets
)

## Preparing dataset

In [None]:
# Datasets

def fetch_20news():
  """Download the 20 Newsgroups corpus and return train/val/test splits.

  Headers, footers and quotes are stripped from the posts. The string class
  names are converted to a ``ClassLabel`` feature, and all splits are
  stratified by that target.

  Returns:
    DatasetDict with keys ``train``, ``test`` and ``val``; each split has the
    columns ``text`` (str) and ``target`` (ClassLabel).
  """
  from sklearn.datasets import fetch_20newsgroups

  # one seed for the download shuffle and both splits, so that the
  # resulting splits are reproducible between runs
  seed = 123
  # download if needed
  dataset = fetch_20newsgroups(
      subset="all",
      shuffle=True,
      remove=("headers", "footers", "quotes"),
      random_state=seed
  )
  # create dictionary
  arr_names = np.array(dataset.target_names)
  # Note: We intentionally use names of classes instead of
  # labels here to demonstrate ClassLabel
  dataset_dict = {'text': dataset.data, 'target': arr_names[dataset.target]}
  ds = Dataset.from_dict(dataset_dict)
  # convert target to ClassLabel (sorted names -> stable name/id mapping)
  unique_target_names = sorted(set(ds['target']))
  target_feature = ClassLabel(names=unique_target_names)
  features = Features({
      **ds.features,
      'target': target_feature  # replace existing target feature
  })
  ds = ds.cast(features)
  # split data: 30% goes to test, then 10% of the remainder goes to val
  ds = ds.train_test_split(
      test_size=0.3, stratify_by_column='target', seed=seed
  )
  ds_trainval = ds['train'].train_test_split(
      test_size=0.1, stratify_by_column='target', seed=seed
  )
  ds['train'] = ds_trainval['train']
  ds['val'] = ds_trainval['test']
  return ds


def fetch_50authors():
  """Download the Reuter 50/50 archive from UCI and return train/val/test splits.

  Every file found under ``C50train/<author>/`` and ``C50test/<author>/`` in
  the extracted archive becomes one example whose ``target`` is the name of
  its parent directory. The validation split is carved out of the training
  files (10%, stratified by target); the test split is the archive's own
  test folder.

  Returns:
    DatasetDict with keys ``train``, ``val`` and ``test``; each split has the
    columns ``target`` (ClassLabel) and ``text`` (str).

  Raises:
    requests.HTTPError: if the download returns an error status.
  """
  import requests
  from pathlib import Path
  import zipfile
  import io

  def download_and_extract(url, path='downloaded_files'):
    """Download the zip archive at `url` and extract it into `path`."""
    # NOTE: the archive is re-downloaded on every call (no caching).
    # The whole payload is needed in memory for ZipFile, so streaming is
    # not used; the context manager releases the connection and the
    # timeout prevents hanging forever on a stalled server.
    with requests.get(url, timeout=60) as response:
      response.raise_for_status()
      payload = response.content
    Path(path).mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(io.BytesIO(payload)) as z:
      z.extractall(path)
    return path

  def gen_data(split_name='train', path='downloaded_files'):
    """Yield one {'target', 'text'} example per file of the given split."""
    # sorted() makes the example order independent of the filesystem
    for file_path in sorted(Path(path).glob(f"C50{split_name}/*/*")):
      # explicit encoding: do not depend on the platform default
      with open(file_path, mode='rt', encoding='utf-8') as f:
        # only the fields declared in the dataset features are yielded
        yield {
            'target': file_path.parent.name,
            'text': f.read()
        }

  # download data
  DATA_URL = 'https://archive.ics.uci.edu/static/public/217/reuter+50+50.zip'
  path = download_and_extract(url=DATA_URL)
  # both splits are first loaded with plain string columns
  raw_features = Features({
      'target': Value(dtype='string'),
      'text': Value(dtype='string')
  })
  ds_train = Dataset.from_generator(
      generator=gen_data,
      gen_kwargs={'split_name': 'train', 'path': path},
      features=raw_features
  )
  ds_test = Dataset.from_generator(
      generator=gen_data,
      gen_kwargs={'split_name': 'test', 'path': path},
      features=raw_features
  )
  # convert target to ClassLabel; the names come from the train split and
  # the same mapping is applied to the test split
  unique_target_names = sorted(set(ds_train['target']))
  target_feature = ClassLabel(names=unique_target_names)
  features = Features({
      **ds_train.features,
      'target': target_feature  # replace existing target feature
  })
  ds_train = ds_train.cast(features)
  ds_test = ds_test.cast(features)
  # split train into train and val (seeded for reproducibility)
  ds = ds_train.train_test_split(
      test_size=0.1, stratify_by_column="target", seed=123
  )
  ds['val'] = ds.pop("test")
  ds['test'] = ds_test
  return ds


In [None]:
# Load the dataset (downloaded if needed) and show its splits
ds = fetch_20news()
ds

In [None]:
# Access a split by its name
ds['train']

In [None]:
# Access rows of a split by index or slice (here: the first two rows)
ds['train'][:2]

In [None]:
# Access the values of a single field for the selected rows
targets = ds['train'][:2]['target']
targets

In [None]:
# Convert the integer targets back to their class names
ds['train'].features['target'].int2str(targets)

## Tokenizing

In [None]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

In [None]:
# Name of the pretrained checkpoint; used for both the tokenizer and the model
CHECKPOINT = "bert-base-uncased"

In [None]:
# Load the tokenizer associated with the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
tokenizer

Single example

In [None]:
# sample text (reused by the following cells)
text_input = 'If the function is asynchronous, then map will run your function in parallel'

# tokenize and pad the result up to max_length=30
tokenizer(text_input, padding='max_length', max_length=30)

In [None]:
# Encode the text to a list of token ids
text__encoded = tokenizer.encode(text_input)
text__encoded

In [None]:
# Decode the token ids back to a string
tokenizer.decode(text__encoded)

In [None]:
# Split the text into tokens, including the special tokens
tokenizer.tokenize(text_input, add_special_tokens=True)

Entire dataset

In [None]:
def tokenize_function(example):
    """Tokenize the ``text`` field of `example` with truncation enabled.

    Meant to be passed to ``map``; relies on the module-level ``tokenizer``.
    """
    texts = example['text']
    return tokenizer(texts, truncation=True)

In [None]:
# Tokenize every split; batched=True passes batches of examples to the function
ds__tokenized = ds.map(tokenize_function, batched=True)
ds__tokenized

In [None]:
# Show the first five values of each tokenizer output for the first train example
first_example = ds__tokenized['train'][0]
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(f"{field}: {first_example[field][:5]}")

In [None]:
# Train split after tokenization
ds__tokenized['train']

In [None]:
# A collate function puts the samples of a batch together; this one pads them
# with the tokenizer and returns TensorFlow tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

In [None]:
# Example: collate the first 10 train samples and show the shape of each field
samples = ds__tokenized["train"][:10]
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

In [None]:
# create batches and convert to tensorflow datasets
# arguments shared by all three splits; only `shuffle` differs
tf_dataset_kwargs = dict(
    columns=['attention_mask', 'input_ids', 'token_type_ids'],
    label_cols='target',
    collate_fn=data_collator,
    batch_size=8,
)

# only the training data is shuffled
tf_train_dataset = ds__tokenized['train'].to_tf_dataset(shuffle=True, **tf_dataset_kwargs)
tf_val_dataset = ds__tokenized['val'].to_tf_dataset(shuffle=False, **tf_dataset_kwargs)
tf_test_dataset = ds__tokenized['test'].to_tf_dataset(shuffle=False, **tf_dataset_kwargs)

## Model

In [None]:
from transformers import TFAutoModelForSequenceClassification
# Loss, optimizer and schedule all come from the same Keras package (tf_keras)
# so that the objects passed to model.compile() are mutually compatible
from tf_keras.losses import SparseCategoricalCrossentropy
from tf_keras.optimizers import Adam
from tf_keras.optimizers.schedules import PolynomialDecay

**BERT** (uncased):
- params: `110M`
- layers: `12`
- heads: `12`
- hidden dimensions: `768`
- context length: `512`
- tokenizer: `WordPiece`

In [None]:
# Number of target classes, taken from the ClassLabel feature of the train split
NUM_CLASSES = len(ds['train'].features['target'].names)

In [None]:
# Load the pretrained model with a classification head of NUM_CLASSES outputs
model = TFAutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT,
    num_labels=NUM_CLASSES
)

In [None]:
# Model topology: layers and parameter counts
model.summary()

In [None]:
# Total number of parameters vs. number of trainable parameters
model.num_parameters(), model.num_parameters(only_trainable=True)

In [None]:
# Iterate over the top-level layers of the model
for layer in model.layers:
  print(f'{layer.name} {layer}')

In [None]:
# Iterate over the encoder layers inside the BERT model
for layer in model.bert.encoder.layer:
  print(f'{layer.name} {layer}')

In [None]:
# Number of units in the output (classifier) layer
model.classifier.units

In [None]:
# Include/exclude weights from training (freezing)

# current state of the BERT body
print(f'Trainable parameters: {model.bert.trainable}')

# freeze the whole BERT body
model.bert.trainable = False
print(f'Trainable parameters: {model.bert.trainable}')
model.summary()

# freeze only specific layers of BERT: first make everything trainable again,
# then freeze the embeddings and every encoder layer except the last one
model.bert.trainable = True
model.bert.embeddings.trainable = False
# NOTE(review): presumably redundant after `model.bert.trainable = True`;
# kept to make explicit that the pooler stays trainable
model.bert.pooler.trainable = True
for layer in model.bert.encoder.layer[:-1]:
  layer.trainable = False
print('\nSome layers frozen\n')
model.summary()

# show the trainable flag of every encoder layer
for layer in model.bert.encoder.layer:
  print(f'Layer: {layer.name}, trainable: {layer.trainable}')

## Fitting

In [None]:
def get_optimizer(num_batches, batch_size, num_epochs, initial_learning_rate=5e-5):
  """Build an Adam optimizer whose learning rate decays to zero over training.

  Args:
    num_batches: number of batches (optimizer steps) per epoch.
    batch_size: not used by the schedule; kept so existing callers that pass
      it keep working.
    num_epochs: number of training epochs.
    initial_learning_rate: learning rate at step 0.

  Returns:
    Adam optimizer driven by a PolynomialDecay schedule that reaches 0.0
    after ``num_batches * num_epochs`` steps.
  """
  num_train_steps = num_batches * num_epochs
  lr_scheduler = PolynomialDecay(
    initial_learning_rate=initial_learning_rate,
    end_learning_rate=0.0,
    decay_steps=num_train_steps
  )
  return Adam(learning_rate=lr_scheduler)


def build_model(optimizer):
  """Load the pretrained classifier, freeze most of BERT and compile it.

  The embeddings and every encoder layer except the last one are frozen.
  The model is compiled with sparse categorical cross-entropy on logits and
  the accuracy metric. Relies on the module-level CHECKPOINT and NUM_CLASSES.

  Args:
    optimizer: optimizer instance passed to ``compile``.

  Returns:
    The compiled model.
  """
  classifier = TFAutoModelForSequenceClassification.from_pretrained(
      CHECKPOINT, num_labels=NUM_CLASSES
  )
  # choose which parts of the BERT body are trained
  bert = classifier.bert
  bert.embeddings.trainable = False
  bert.pooler.trainable = True
  for encoder_block in bert.encoder.layer[:-1]:
    encoder_block.trainable = False
  # the model outputs raw logits, hence from_logits=True
  classifier.compile(
      optimizer=optimizer,
      loss=SparseCategoricalCrossentropy(from_logits=True),
      metrics=["accuracy"]
  )
  return classifier

In [None]:
# NOTE: the tf datasets were already built with batch_size=8; keep BATCH_SIZE
# equal to that value
BATCH_SIZE = 8
NUM_EPOCHS = 3

# setup model: len(tf_train_dataset) is used as the number of batches per epoch
optimizer = get_optimizer(
    num_batches=len(tf_train_dataset),
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS
)
model = build_model(optimizer)
model.summary()

In [None]:
# train model
history = model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE
)

In [None]:
# TODO: save model

In [None]:
# TODO: load model

## Evaluating

In [None]:
# ClassLabel feature of the target; used below to map predicted ids to names
class_label = ds['train'].features['target']
class_label

In [None]:
def get_predictions(outputs):
  """Return the index of the largest logit along the last axis.

  Args:
    outputs: mapping with a 'logits' entry (e.g. the model's prediction output).

  Returns:
    Array of predicted class indices.
  """
  return np.argmax(outputs['logits'], axis=-1)

Single example

In [None]:
# Predict for a single text: tokenize it to TensorFlow tensors and run the model
text_input = 'If the function is asynchronous, then map will run your function in parallel'
text__encoded = tokenizer(text_input, return_tensors="tf")
# Not recommended alternatives (they pass the token ids only):
# text__encoded = tokenizer.encode(text_input)
# text__encoded = tokenizer(text_input, return_tensors="tf")['input_ids']

outputs = model.predict(text__encoded)
outputs

In [None]:
# Predicted class index (argmax over the logits)
predictions = get_predictions(outputs)
predictions

In [None]:
# Class name of the predicted index
class_label.int2str(predictions)

Evaluate on test set

In [None]:
# Option 1: Using the TF Dataset with true labels (first 5 batches only)
model.evaluate(tf_test_dataset.take(5))

In [None]:
# Option 2: Predictions
NUM_EVAL_BATCHES = 5
# 8 is the batch size tf_test_dataset was built with; the test dataset is not
# shuffled, so its first batches correspond to the first rows of the split
num_eval_rows = 8 * NUM_EVAL_BATCHES
test_true = np.array(ds__tokenized['test'][:num_eval_rows]['target'])
test_pred = get_predictions(model.predict(tf_test_dataset.take(NUM_EVAL_BATCHES)))

# accuracy: fraction of predictions equal to the true labels
np.mean(test_pred == test_true)

Hugging Face `evaluate`

In [None]:
import evaluate

In [None]:
# Load the accuracy metric and print its description
accuracy = evaluate.load("accuracy")
print(accuracy.description)

In [None]:
# Input features (arguments) expected by the metric
accuracy.features

In [None]:
# Compute accuracy from the true labels and predictions obtained above
accuracy.compute(references=test_true, predictions=test_pred)

In [None]:
# Accumulate accuracy batch by batch over the first 5 test batches
accuracy = evaluate.load("accuracy")
for inputs, targets in tf_test_dataset.take(5):
    # a single batch is passed through the model directly: model.predict()
    # is not intended to be called inside a loop over batches
    outputs = model(inputs, training=False)
    accuracy.add_batch(references=targets, predictions=get_predictions(outputs))
accuracy.compute()

In [None]:
# model evaluation: macro-averaged precision, recall and F1 computed together
metrics = evaluate.combine([
    evaluate.load('precision'),
    evaluate.load('recall'),
    evaluate.load('f1')
])

for inputs, targets in tf_test_dataset.take(5):
    # a single batch is passed through the model directly: model.predict()
    # is not intended to be called inside a loop over batches
    outputs = model(inputs, training=False)
    metrics.add_batch(references=targets, predictions=get_predictions(outputs))
metrics.compute(average='macro')

## Advanced

Access to `BERT` outputs

In [None]:
# take the first batch of 8 items and keep only its feature inputs, i.e.
# element [0] of the (inputs, targets) pair
first_batch = next(iter(tf_test_dataset.take(1)))
test_input_ids = first_batch[0]
test_input_ids

In [None]:
# Run only the BERT submodule on the batch and list the available outputs
outputs = model.bert(test_input_ids)
outputs.keys()

In [None]:
# Shape of the last hidden state (first output)
outputs[0].shape

In [None]:
# Shape of the pooler output (second output)
outputs[1].shape

Custom classification Head

In [None]:
# TODO