<a href="https://colab.research.google.com/github/KCL-Health-NLP/nlp_examples/blob/master/ann/fine_tuning_with_huggingface_and_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Hugging Face transformers and datasets installation
!pip install transformers datasets evaluate
!pip install git+https://github.com/huggingface/accelerate

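## ***Restart the runtime***

After the installation cell above has finished, restart the runtime (kernel) so that the newly installed packages are picked up, then continue from the next cell.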

In [None]:
import numpy as np

from datasets import load_dataset
from datasets import ClassLabel

from transformers import AutoTokenizer
#from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

import evaluate

# For displaying models
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import model_to_dot
import matplotlib.pyplot as plt
from IPython.display import SVG

## Load data

In [None]:


#dataset = load_dataset("glue", "cola")
#dataset = dataset["train"]  # Just take the training split for now

## Tokenise

## Reduce the size of the dataset to speed up training

In [None]:
#small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
#small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [None]:
#from transformers import AutoTokenizer
#
#tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
#
#tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True, truncation=True, max_length=400)
#
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
#tokenized_data = dict(tokenized_data)
#
#labels = np.array(dataset["label"])  # Label is already an array of 0 and 1

In [None]:
#from transformers import TFAutoModelForSequenceClassification
#from tensorflow.keras.optimizers import Adam
#
# Load and compile our model
#model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
# Lower learning rates are often better for fine-tuning transformers
#model.compile(optimizer=Adam(3e-5))
#
#model.fit(tokenized_data, labels)

In [None]:


# Location of the labelled training CSV.
train_url = 'https://github.com/KCL-Health-NLP/nlp_examples/raw/master/classification/classification_trainingdata.csv'

# Column names are supplied explicitly as (label, text). Requesting
# split='train' yields a single dataset object, which the next cell splits.
ds = load_dataset(
    "csv",
    data_files=train_url,
    column_names=['label', 'text'],
    split='train',
)
ds

In [None]:
# Hold out 20% of the labelled data for validation during fine-tuning.
# A fixed seed keeps the split identical across "Restart & Run All" runs, so
# the training curves and scores are reproducible.
# Note: this rebinds `ds` from a single dataset to a dict with 'train' and
# 'test' splits, so the cell should be run only once per kernel session.
ds = ds.train_test_split(test_size=0.2, seed=42)
ds

In [None]:


# Mapping between the four class names found in the 'label' column and
# integer ids (the id of a class is its position in `names`).
classlabels = ClassLabel(
    num_classes=4,
    names=['psychiatrypsychology', 'hematology', 'pediatrics', 'pain'],
)

# Tokenizer matching the pre-trained checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize(batch):
    """Turn a batch of labelled texts into Keras-ready inputs and targets.

    Args:
        batch: a mapping with a 'text' entry (the documents) and a 'label'
            entry (class names known to `classlabels`).

    Returns:
        A ``(features, targets)`` tuple: ``features`` is a plain dict of
        NumPy arrays produced by the tokenizer, and ``targets`` is a NumPy
        array holding the integer id of each label.
    """
    # Pad to the longest sequence in this batch and truncate at 128 tokens.
    encoded = tokenizer(
        batch['text'],
        return_tensors='np',
        padding=True,
        truncation=True,
        max_length=128,
    )
    label_ids = np.array([classlabels.str2int(name) for name in batch['label']])
    # The tokenizer returns a BatchEncoding; convert it to a dict for Keras.
    return dict(encoded), label_ids



In [None]:
#pytorch
#tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
#
#labels = ClassLabel(num_classes=4, names=['psychiatrypsychology', 'hematology', 'pediatrics', 'pain'])
#
#def tokenize(batch):
#    tok_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
#    tok_batch['label'] = [labels.str2int(l) for l in batch['label']]
#    return tok_batch


In [None]:

# Encode both halves of the split: 'train' is used for fitting, and the
# 'test' half of the split serves as validation data.
train_x, train_y = tokenize(ds['train'])
val_x, val_y = tokenize(ds['test'])
#dev_tok = tokenize(ds['test'])

# Quick visual check: the label ids, a gap of blank lines, then the inputs.
print(train_y, '\n' * 4, train_x, sep='\n')

In [None]:


# pytorch
#model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=4)

In [None]:
# pytorch
#training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [None]:
# pytorch
#metric = evaluate.load('accuracy')
#
#def compute_metrics(eval_pred):
#    logits, labels = eval_pred
#    predictions = np.argmax(logits, axis=-1)
#    return metric.compute(predictions=predictions, references=labels)





In [None]:
#pytorch
#trainer = Trainer(
#    model=model,
#    args=training_args,
#    train_dataset=train_tok,
#    eval_dataset=dev_tok,
#    compute_metrics=compute_metrics,
#)

In [None]:
# pytorch
#trainer.train()

In [None]:
#from transformers import AutoTokenizer
#
#tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
#
#def tokenize_function(examples):
#    return tokenizer(examples["text"], padding="max_length", truncation=True)
#
#
#tokenized_ds = ds.map(tokenize_function, batched=True)
#tokenized_ds


#from transformers import AutoTokenizer
#
#tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
#
#tok_train = tokenizer(ds['train']['text'], return_tensors='np', padding=True, truncation=True, max_length=400)
#
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
#tok_train = dict(tok_train)
#
#labels_train = np.array(ds['train']['label'])  # Label is already an array of 0 and 1
#
#print(tok_train)
#print()
#print(labels_train)


In [None]:
# Load the pre-trained checkpoint with a classification head on top.
# The head needs one output per class in `classlabels`; without `num_labels`
# the model is built with the library default of two outputs, which does not
# match the four classes in this dataset.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=classlabels.num_classes,
)

# The targets are integer class ids and the model outputs raw logits, so the
# appropriate loss is sparse categorical cross-entropy with from_logits=True
# (binary cross-entropy is only meaningful for two-class / multi-label targets).
# Lower learning rates are often better for fine-tuning transformers.
model.compile(
    loss=SparseCategoricalCrossentropy(from_logits=True),
    optimizer=Adam(3e-5),
    metrics=["accuracy"],
)

In [None]:

# Fine-tune for five epochs, scoring the held-out validation split after each
# one. The returned History object is kept for the plots below.
history = model.fit(
    train_x,
    train_y,
    validation_data=(val_x, val_y),
    epochs=5,
)

In [None]:


# Plot the per-epoch training and validation curves, one figure per quantity.
# The second curve comes from the `validation_data` passed to `fit`, so it is
# labelled "validation" — the held-out test set is only evaluated further down.
for metric_name in ('accuracy', 'loss'):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric_name], label='train')
    ax.plot(history.history[f'val_{metric_name}'], label='validation')
    ax.set_title(f'model {metric_name}')
    ax.set_ylabel(metric_name)
    ax.set_xlabel('epoch')
    ax.legend(loc='upper left')
    plt.show()

In [None]:
# Location of the held-out test CSV.
test_url = 'https://github.com/KCL-Health-NLP/nlp_examples/raw/master/classification/classification_test_data.csv'

# No `split` is requested here, so the rows are accessed through the 'train'
# key of the returned object in the next cell.
test_ds = load_dataset(
    "csv",
    data_files=test_url,
    column_names=['label', 'text'],
)
test_ds

In [None]:
# Encode the held-out test rows with the same tokenizer and label mapping
# that were used for the training data.
test_x, test_y = tokenize(test_ds['train'])

# Quick visual check: the label ids, a gap of blank lines, then the inputs.
print(test_y, '\n' * 4, test_x, sep='\n')

In [None]:
# Score the fine-tuned model on the held-out test set. The first entry of the
# result is reported as the loss and the second as the accuracy.
score = model.evaluate(test_x, test_y)
test_loss, test_accuracy = score[0], score[1]

print(f"{'Test loss:':16}{test_loss:.2f}")
print(f"{'Test accuracy:':16}{test_accuracy:.2f}")