## Keras Datasets Cleanups

In [42]:
import tensorflow_datasets as tfds

# Swap 'anli' for any dataset name listed in the TensorFlow Datasets (TFDS)
# catalog: https://www.tensorflow.org/datasets/catalog/overview
# with_info=True makes the call return (datasets, info); the datasets come back
# in the same order as the requested splits.
(ds_train, ds_test), ds_info = tfds.load(
    name='anli',
    split=['train', 'test'],
    with_info=True,
    shuffle_files=True,
)



In [26]:
# Replace 'context' with the correct field according to the dataset Feature documentation.
# Only a quarter of the training split is used. Take that subset *before*
# decoding, so the remaining 75% of the examples are never read or decoded.
# NOTE(review): the size comes from ds_info for the full 'train' split; adjust the
# key if the split requested in tfds.load is changed (e.g. to a sub-split).
subset_size = round(ds_info.splits['train'].num_examples / 4)
text = [example['context'].numpy().decode() for example in ds_train.take(subset_size)]

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer index from the selected paragraphs.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

# One extra slot on top of the vocabulary: index 0 is the value the padding
# step fills in later, so it must be counted as a class too.
total_words = 1 + len(tokenizer.word_index)
print('Total words: ', total_words)

Total words:  14204


In [28]:
# Turn every paragraph into its growing token prefixes:
# [w1, w2], [w1, w2, w3], ... up to the whole paragraph.
# Each prefix later becomes one training sample (all but last token -> last token).
input_sequences = []
for paragraph in text:
    # texts_to_sequences works on a list of texts; we pass one and unwrap it.
    token_ids = tokenizer.texts_to_sequences([paragraph])[0]

    # Prefixes start at length 2 so every sample has at least one input token and one target.
    input_sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

# Show the first few prefixes both as text and as token ids.
print(tokenizer.sequences_to_texts(input_sequences[:5]))
input_sequences[:5]

['joey heindle', 'joey heindle born', 'joey heindle born 14', 'joey heindle born 14 may', 'joey heindle born 14 may 1993']


[[11624, 11625],
 [11624, 11625, 22],
 [11624, 11625, 22, 551],
 [11624, 11625, 22, 551, 78],
 [11624, 11625, 22, 551, 78, 488]]

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# The prefixes have different lengths; bring them all to one fixed width.
# The longest prefix defines that width.
max_sequence_len = max(len(seq) for seq in input_sequences)

# padding='pre' puts the zeros at the front, so the real tokens (and the
# target word in the last column) stay right-aligned.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
input_sequences[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0, 11624, 11625])

In [30]:
# Split each padded row: every column but the last is the model input,
# and the last column is the word to predict.
predictors, labels = input_sequences[:, :-1], input_sequences[:, -1]

In [31]:
from tensorflow.keras import utils

# The targets are categorical: we predict one word out of the whole vocabulary.
# Instead of a scalar token id, the network is trained against a one-hot vector
# of length total_words, for example:
# 13810 ----> array([0., 0., 0., ..., 1., 0., 0.], dtype=float32)
# NOTE: this overwrites `labels`, so re-running this cell alone would one-hot
# encode the already encoded array; re-run the previous cell first.
labels = utils.to_categorical(labels, num_classes=total_words)

In [32]:
# That's all. Now just create a model and run:
# model.fit(predictors, labels, epochs=20, verbose=1)

# Hugging Face Datasets Cleanups

Reminders:
1. All Transformers models return logits (the outputs of a model before a softmax activation function is applied).
2. When calling compile(), Hugging Face models automatically choose a loss that is appropriate for their task and model architecture if the loss argument is left blank.
3. Hugging Face datasets are stored on disk by default.




### With PyTorch

In [None]:
# Transformers installation (to run in Google Colab)
! pip install transformers datasets
# Regular python install
# pip install transformers datasets
# For more installations: https://huggingface.co/docs/transformers/installation

In [None]:
from datasets import load_dataset

# Replace "yelp_review_full" with any other dataset name from https://huggingface.co/datasets
dataset = load_dataset("yelp_review_full")
# Peek at one training example to see the available fields.
dataset["train"][100]

In [None]:
# Create smaller subset of the full dataset to reduce time (if wanted):
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [None]:
from transformers import AutoTokenizer
# Initialize 
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Function to call on each dataset's example
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

#  Apply a preprocessing function over the entire dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Load the model and specify the number of expected labels
# (it should match the number of labels listed on the dataset page).
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

In [None]:
# TrainingArguments holds every tunable hyperparameter plus the flags that
# switch the different training options on or off.
# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="test_trainer",  # location of checkpoints from training
    evaluation_strategy="epoch",
)

In [None]:
import numpy as np
import evaluate
# Load the accuracy metric; the compute_metrics function in the next cell
# calls metric.compute on it.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Compute the loaded metric for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The logits are reduced to
    predicted class ids by taking the argmax over the last axis, and those ids
    are then passed to ``metric.compute`` together with the reference labels.
    """
    raw_scores, references = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
from transformers import Trainer
# Bundle the model, the training arguments, the (reduced) train/eval datasets
# and the metric function into a single Trainer object.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Fine-tune the model with the Trainer configured in the previous cell.
trainer.train()

### With Keras

#### For small datasets: 

In [None]:
from datasets import load_dataset

# Load GLUE/CoLA and keep just the training split for now.
dataset = load_dataset("glue", "cola")["train"]

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize all sentences at once as NumPy arrays. The tokenizer returns a
# BatchEncoding, which is converted to a plain dict for Keras.
tokenized_data = dict(
    tokenizer(dataset["sentence"], return_tensors="np", padding=True)
)

labels = np.array(dataset["label"])  # Label is already an array of 0 and 1

In [None]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Keras counterpart of the PyTorch AutoModelForSequenceClassification used above.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")

# Lower learning rates are often better for fine-tuning transformers.
# No loss argument: the model picks a suitable loss by itself.
model.compile(optimizer=Adam(learning_rate=3e-5))

model.fit(tokenized_data, labels)

#### For big datasets:

In [None]:
def tokenize_dataset(data, text_column=None):
    """Tokenize the text field of one dataset example.

    The keys of the returned dictionary are added to the dataset as columns.

    Args:
        data: the example passed in by ``dataset.map``.
        text_column: name of the field holding the raw text. When omitted,
            "text" is used if present; otherwise "sentence", which is the
            text column of the GLUE/CoLA split loaded earlier in this notebook.
    """
    if text_column is None:
        text_column = "text" if "text" in data else "sentence"
    return tokenizer(data[text_column])


# NOTE: `tokenizer` is the one created in an earlier cell of this notebook.
dataset = dataset.map(tokenize_dataset)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer; prepare_tf_dataset uses it to pad the samples of each batch.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# `dataset` may be a DatasetDict (a dict of splits) or, as earlier in this
# notebook, already the training split itself. Index by "train" only in the
# first case; indexing a single split with "train" would fail.
train_split = dataset["train"] if isinstance(dataset, dict) else dataset

# If you need to do something more complex than just padding samples (e.g. corrupting tokens for masked language modelling),
# you can use the collate_fn argument instead to pass a function that will be called to transform the list of samples into a batch and apply any preprocessing you want.
# NOTE(review): `model` here is the one created in an earlier cell; the next cell
# reloads it. Consider loading the model before this cell.
tf_dataset = model.prepare_tf_dataset(train_split, batch_size=16, shuffle=True, tokenizer=tokenizer)


In [None]:
# Load the model (TFAutoModelForSequenceClassification is the Keras counterpart of PyTorch's AutoModelForSequenceClassification).
# It is compiled in the next cell.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")

In [None]:
# No loss argument: the model picks a suitable loss by itself.
model.compile(optimizer=Adam(learning_rate=3e-5))
# tf_dataset already yields batches, so it is passed on its own.
model.fit(tf_dataset)