# Hugging Face TensorFlow example
Refer to these links for the original examples:

https://huggingface.co/docs/transformers/training 

https://github.com/huggingface/transformers/tree/master/examples

https://huggingface.co/docs/transformers/notebooks


In [None]:
import os

from datasets import load_dataset

import tensorflow as tf

from transformers import Trainer
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import TFAutoModelForSequenceClassification


In [None]:
# Boilerplate: turn on memory growth for every visible GPU so that
# TensorFlow does not have to claim all GPU memory up front.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be identical on all GPUs.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
    except RuntimeError as err:
        # Raised when the GPUs were already initialized before this cell
        # ran; memory growth can only be configured beforehand.
        print(err)


In [None]:
# Load the IMDB dataset via `datasets.load_dataset`; the "train" and
# "test" splits are used by the cells below.
raw_datasets = load_dataset(path="imdb")

In [None]:
# Tokenizer matching the "bert-base-cased" checkpoint that is loaded as
# the model further down.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-cased",
)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    The module-level ``tokenizer`` is called with ``padding="max_length"``
    and ``truncation=True``.

    Args:
        examples: Mapping that provides a ``"text"`` entry (this function
            is passed to ``Dataset.map`` with ``batched=True``).

    Returns:
        The tokenizer output for the given text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Apply the tokenizer to every split, one batch of examples at a time.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)

In [None]:
# Small 1000-example subsets (shuffled with a fixed seed) for quick runs.
# To train on the whole shuffled split instead, drop the `.select(...)`
# call from the two assignments below.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(indices=range(1000))
)
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(indices=range(1000))
)

# Complete, unshuffled splits.
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

In [None]:
# Trainer-style arguments with "test_trainer" as the output directory.
# NOTE(review): no later cell visible in this file reads `training_args`;
# training below goes through Keras `model.fit` instead.
training_args = TrainingArguments(output_dir="test_trainer")


In [None]:
# TensorFlow sequence-classification model with two output labels.
# (The PyTorch counterpart would be
#  AutoModelForSequenceClassification.from_pretrained(...), which is not
#  imported in this file.)
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

In [None]:
# Drop the raw "text" column and switch the subsets to TensorFlow format.
tf_train_dataset = (
    small_train_dataset
    .remove_columns(column_names=["text"])
    .with_format(type="tensorflow")
)
tf_eval_dataset = (
    small_eval_dataset
    .remove_columns(column_names=["text"])
    .with_format(type="tensorflow")
)

In [None]:
# Hyperparameters; `learning_rate` and `epochs` are consumed by the
# compile/fit cell further down.
batch_size = 2
learning_rate = 1e-5
epochs = 10

# Training pipeline: (features, label) pairs, shuffled with a buffer as
# large as the dataset, then batched.
train_features = {
    input_name: tf_train_dataset[input_name]
    for input_name in tokenizer.model_input_names
}
train_tf_dataset = tf.data.Dataset.from_tensor_slices(
    (train_features, tf_train_dataset["label"])
).shuffle(buffer_size=len(tf_train_dataset)).batch(batch_size=batch_size)

# Evaluation pipeline: same construction, but without shuffling.
eval_features = {
    input_name: tf_eval_dataset[input_name]
    for input_name in tokenizer.model_input_names
}
eval_tf_dataset = tf.data.Dataset.from_tensor_slices(
    (eval_features, tf_eval_dataset["label"])
).batch(batch_size=batch_size)

In [None]:
# Stop training once validation accuracy has not improved for 5 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(
    patience=5,
    monitor='val_sparse_categorical_accuracy',
)
# Write TensorBoard event files to ../logs.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir="../logs",
)

In [None]:
# Configure training: Adam at the chosen learning rate, cross-entropy
# computed on logits (from_logits=True), and accuracy as the metric.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.keras.metrics.SparseCategoricalAccuracy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
)

# Train for up to `epochs` epochs, validating on the evaluation pipeline;
# early stopping and TensorBoard logging run as callbacks.
model.fit(
    train_tf_dataset,
    epochs=epochs,
    validation_data=eval_tf_dataset,
    callbacks=[stop_early, tensorboard_callback],
)

In [None]:
# Save the fine-tuned model to the "my_imdb_model" directory.
model.save_pretrained(save_directory="my_imdb_model")

In [None]:
# Pull a single batch out of the training pipeline and keep it in
# `data_sample` for the inspection cells below.
for data_sample in train_tf_dataset.take(1):
    pass

In [None]:
# Display the first element of the sampled batch: the feature dict
# (the dataset was built from (features, label) pairs).
data_sample[0]

In [None]:
# Run the model on the features of the sampled batch and display the
# prediction output.
model.predict(x=data_sample[0])

In [None]:
# Display the second element of the sampled batch: the labels, for
# comparison with the prediction above.
data_sample[1]