In [1]:
import evaluate
import transformers
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DistilBertForSequenceClassification,
)




In [2]:
"""
Load dataset.
"""
dataset = load_dataset("yelp_review_full")
print(dataset)

Downloading readme:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


In [3]:
"""
Initialize tokenizer and model.
"""
model_id = "distilbert-base-uncased"

# init tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# init model
model = DistilBertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=5,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
"""
Tokenize dataset.
"""
def tokenize(X):
    return tokenizer(
        X["text"],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

# tokenize data
tokenized_datasets = dataset.map(tokenize, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})


In [6]:
"""
Create dataset splits.
"""
seed = 777
n_samples = 1000

train_dataset = tokenized_datasets["train"].shuffle(seed=seed).select(range(n_samples))
eval_dataset = tokenized_datasets["test"].shuffle(seed=seed).select(range(0, n_samples))
test_dataset = tokenized_datasets["test"].shuffle(seed=seed).select(range(n_samples, 2 * n_samples))

In [7]:
"""
Create Trainer.
"""
# define metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# training args
training_args = TrainingArguments(
    output_dir="results",
    eval_strategy="epoch",
)

# init trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [8]:
"""
Train model.
"""
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.132435,0.49
2,No log,1.059448,0.535
3,No log,1.06829,0.538


TrainOutput(global_step=375, training_loss=0.9854241536458334, metrics={'train_runtime': 4196.4745, 'train_samples_per_second': 0.715, 'train_steps_per_second': 0.089, 'total_flos': 397423457280000.0, 'train_loss': 0.9854241536458334, 'epoch': 3.0})

In [9]:
"""
Evaluate trained model.
"""
trainer.evaluate(test_dataset)

{'eval_loss': 1.0660573244094849,
 'eval_accuracy': 0.546,
 'eval_runtime': 304.769,
 'eval_samples_per_second': 3.281,
 'eval_steps_per_second': 0.41,
 'epoch': 3.0}