In [18]:
# pip3 install datasets
from datasets import load_dataset

# Load the "yelp_review_full" dataset; the result is indexed by split name below.
dataset = load_dataset("yelp_review_full")
# Display one training record (index 100) to see what an example looks like.
dataset["train"][100]

{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [19]:
# Check what kind of container load_dataset returned.
type(dataset)

datasets.dataset_dict.DatasetDict

In [21]:
# List the column names available in each split.
dataset.column_names

{'train': ['label', 'text'], 'test': ['label', 'text']}

In [20]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Uses the notebook-level ``tokenizer``. Padding is requested with the
    ``"max_length"`` strategy and truncation is enabled, so the call is made
    with exactly ``padding="max_length", truncation=True``.

    Args:
        examples: Mapping that provides the raw strings under the ``"text"`` key.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Run tokenize_function over the dataset in batches (batched=True) and keep the result under a new name.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 50000/50000 [00:08<00:00, 5616.97 examples/s]


In [5]:
# Carve a 1000-example subset out of each split: shuffle with a fixed seed
# (42) and keep the first 1000 rows, so the selection is repeatable.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

In [6]:
from datasets import load_dataset

# Load the CoLA task from GLUE and keep only its training split for now.
# NOTE: this rebinds `dataset`, replacing whatever the name referred to before.
dataset = load_dataset("glue", "cola")["train"]

Downloading builder script: 100%|██████████| 28.8k/28.8k [00:00<00:00, 27.4MB/s]
Downloading metadata: 100%|██████████| 28.7k/28.7k [00:00<00:00, 43.8MB/s]
Downloading readme: 100%|██████████| 27.9k/27.9k [00:00<00:00, 27.9MB/s]
Downloading data: 100%|██████████| 377k/377k [00:00<00:00, 18.3MB/s]
Generating train split: 100%|██████████| 8551/8551 [00:00<00:00, 91213.91 examples/s]
Generating validation split: 100%|██████████| 1043/1043 [00:00<00:00, 85399.19 examples/s]
Generating test split: 100%|██████████| 1063/1063 [00:00<00:00, 74036.39 examples/s]


In [7]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8551
})

In [15]:
# Display the raw text in the "sentence" column.
# NOTE(review): this renders the entire column; consider slicing to the first
# few entries to keep the notebook output short.
dataset['sentence']

["Our friends won't buy this analysis, let alone the next one we propose.",
 "One more pseudo generalization and I'm giving up.",
 "One more pseudo generalization or I'm giving up.",
 'The more we study verbs, the crazier they get.',
 'Day by day the facts are getting murkier.',
 "I'll fix you a drink.",
 'Fred watered the plants flat.',
 'Bill coughed his way out of the restaurant.',
 "We're dancing the night away.",
 'Herman hammered the metal flat.',
 'The critics laughed the play off the stage.',
 'The pond froze solid.',
 'Bill rolled out of the room.',
 'The gardener watered the flowers flat.',
 'The gardener watered the flowers.',
 'Bill broke the bathtub into pieces.',
 'Bill broke the bathtub.',
 'They drank the pub dry.',
 'They drank the pub.',
 'The professor talked us into a stupor.',
 'The professor talked us.',
 'We yelled ourselves hoarse.',
 'We yelled ourselves.',
 'We yelled Harry hoarse.',
 'Harry coughed himself into a fit.',
 'Harry coughed himself.',
 'Harry coug

In [8]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize every sentence at once, asking for NumPy arrays with padding enabled.
# The tokenizer hands back a BatchEncoding; Keras wants a plain dict, so the
# call is wrapped in dict() directly.
tokenized_data = dict(
    tokenizer(dataset["sentence"], return_tensors="np", padding=True)
)

# The label column already holds 0/1 values, so a plain array conversion is enough.
labels = np.array(dataset["label"])

In [9]:
# Inspect the tokenizer output: a dict of NumPy arrays keyed by model input name.
tokenized_data

{'input_ids': array([[ 101, 3458, 2053, ...,    0,    0,    0],
        [ 101, 1448, 1167, ...,    0,    0,    0],
        [ 101, 1448, 1167, ...,    0,    0,    0],
        ...,
        [ 101, 1135, 1110, ...,    0,    0,    0],
        [ 101,  146, 1125, ...,    0,    0,    0],
        [ 101, 1327, 1155, ...,    0,    0,    0]]),
 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])}

In [23]:
# Print the shape of each tokenizer output array, one per line, in the order
# input_ids, token_type_ids, attention_mask.
print(
    *(
        tokenized_data[field].shape
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    ),
    sep="\n",
)

(8551, 47)
(8551, 47)
(8551, 47)


In [11]:
# Check the shape of the label array.
labels.shape

(8551,)

In [12]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers.legacy import Adam

# Load the pretrained "bert-base-cased" checkpoint with a sequence-classification head.
# NOTE(review): no label count is passed, so this presumably relies on the
# library's default head size matching the 0/1 labels — confirm.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
# Lower learning rates are often better for fine-tuning transformers, hence 3e-5.
# No loss is passed to compile() on purpose; the model is expected to fall back
# to its own built-in loss (verify against the transformers documentation).
model.compile(optimizer=Adam(3e-5))  # No loss argument!

# Train on the tokenized inputs and labels; no epochs or batch size are given,
# so the framework defaults apply.
model.fit(tokenized_data, labels)

2023-11-13 20:05:16.047804: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-13 20:05:16.065410: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-13 20:05:16.065438: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-13 20:05:16.065448: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-13 20:05:16.069457: I tensorflow/core/platform/cpu_feature_g



<keras.src.callbacks.History at 0x7f3a6cf1eeb0>