In [None]:
from datasets import load_dataset

dataset = load_dataset("glue", "cola")
dataset = dataset["train"]  # Just take the training split for now

In [None]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
tokenized_data = dict(tokenized_data)

labels = np.array(dataset["label"])  # Label is already an array of 0 and 1

In [3]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Load and compile our model
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# Lower learning rates are often better for fine-tuning transformers
model.compile(optimizer=Adam(3e-5))  # No loss argument!

model.fit(tokenized_data, labels)

# save model
model.save_pretrained("cola_model")
tokenizer.save_pretrained("cola_model_tokenizer")

# # # load model
# model = TFAutoModelForSequenceClassification.from_pretrained("cola_model")
# tokenizer = AutoTokenizer.from_pretrained("cola_model_tokenizer")



2023-11-02 21:19:30.128910: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-02 21:19:31.568752: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at cola_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [6]:
import tensorflow as tf
# test model
test_sentence = "This is a good sentence."
tokenized_test_sentence = tokenizer(test_sentence, return_tensors="tf")
print(tf.nn.softmax(model(tokenized_test_sentence).logits))
# test model
test_sentence = "This a sentence bad."
tokenized_test_sentence = tokenizer(test_sentence, return_tensors="tf")
print(tf.nn.softmax(model(tokenized_test_sentence).logits))

tf.Tensor([[0.02456144 0.9754386 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.806944 0.193056]], shape=(1, 2), dtype=float32)


In [5]:
# Code from https://huggingface.co/spaces/evaluate-metric/glue
from evaluate import load
glue_metric = load("glue", "cola")
glue_metric.compute(predictions=model.predict(tokenized_data).logits.argmax(axis=1), references=labels)



{'matthews_correlation': 0.7456700972528084}