<a href="https://colab.research.google.com/github/KCL-Health-NLP/nlp_examples/blob/master/fine_tuning_with_huggingface_and_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning a Hugging Face model with Keras
Based on this [Hugging Face tutorial](https://huggingface.co/docs/transformers/training).

In [None]:
# Hugging Face transformers and datasets installation
!pip install transformers datasets evaluate

## ***Restart the runtime*** before continuing, so that the newly installed packages are picked up

In [None]:
import numpy as np

from datasets import load_dataset

from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

from tensorflow.keras.optimizers import Adam

import evaluate

# For displaying models
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import model_to_dot
import matplotlib.pyplot as plt
from IPython.display import SVG

# IMDB

## Load the data

In [None]:
# Fetch the IMDB dataset by name, then display the returned object
# as the cell output to see what it contains.
ds_imdb = load_dataset(path="imdb")
ds_imdb

## Reduce the size of the dataset to speed up training

In [None]:
# Take a fixed random sample of 600 training reviews, then hold out 20% of it
# for validation. Both steps are seeded so that the sample *and* the
# train/validation split are identical on every run.
ds_train_sm = ds_imdb['train'].shuffle(seed=42).select(range(600))
ds_train_sm = ds_train_sm.train_test_split(test_size=0.2, seed=42)
ds_train_sm

## Tokenise

In [None]:
# Tokenizer belonging to the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize(batch, max_length=128):
    """Tokenise the texts in `batch` and pair them with their labels.

    Args:
        batch: Object indexable by 'text' and 'label' (here a dataset split).
            `batch['text']` is handed to the tokenizer and `batch['label']`
            is converted to a NumPy array.
        max_length: Maximum number of tokens kept per text; longer texts are
            truncated to this length. Defaults to 128.

    Returns:
        A tuple `(features, labels)`. `features` is a plain dict of the NumPy
        arrays returned by the tokenizer; `labels` is a NumPy array built
        from `batch['label']`.
    """
    encoded = tokenizer(
        batch['text'],
        return_tensors='np',
        padding=True,      # pad every text to the longest one in this call
        truncation=True,   # cut texts that exceed `max_length`
        max_length=max_length,
    )
    return dict(encoded), np.array(batch['label'])

In [None]:
# Encode both halves of the reduced dataset; its 'test' half is used as validation data.
train_x, train_y = tokenize(ds_train_sm['train'])
val_x, val_y = tokenize(ds_train_sm['test'])

# Show the labels first and the tokenised inputs second,
# separated by a block of blank lines so the two are easy to tell apart.
print(train_y, train_x, sep='\n' * 6)

## Create the model

In [None]:
# Load the pretrained checkpoint with a sequence-classification head.
# The number of output labels is stated explicitly: IMDB sentiment is a
# two-class task (negative / positive).
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

# Lower learning rates are often better for fine-tuning transformers.
# No `loss` argument is given on purpose: Hugging Face TensorFlow models fall
# back to their built-in, task-appropriate loss when none is passed to compile().
model.compile(optimizer=Adam(3e-5), metrics=["accuracy"])

In [None]:
# Fine-tune for 10 epochs, passing the validation split so it is evaluated
# alongside training; the returned object is kept for plotting afterwards.
history = model.fit(
    train_x,
    train_y,
    validation_data=(val_x, val_y),
    epochs=10,
)

In [None]:

# Plot the per-epoch curves recorded in `history`, one figure per metric.
# The second curve is read from the `val_*` keys, i.e. the validation split,
# so it is labelled 'validation' rather than 'test' -- the held-out test set
# is only evaluated further down in the notebook.
for metric in ('accuracy', 'loss'):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history[f'val_{metric}'], label='validation')
    ax.set(title=f'model {metric}', xlabel='epoch', ylabel=metric)
    ax.legend(loc='upper left')
    plt.show()

In [None]:
# Draw a seeded random sample of 500 reviews from the test split
# and display the resulting dataset as the cell output.
ds_test_sm = (
    ds_imdb['test']
    .shuffle(seed=42)
    .select(range(500))
)
ds_test_sm

In [None]:
# Encode the sampled test reviews with the same `tokenize` helper used for the training data.
test_x, test_y = tokenize(batch=ds_test_sm)

In [None]:
# Evaluate the fine-tuned model on the test sample. The first two entries of
# `score` are reported as loss and accuracy, in left-aligned 16-character columns.
score = model.evaluate(test_x, test_y)
for label, value in zip(('Test loss:', 'Test accuracy:'), score):
    print(f"{label:16}{value:.2f}")