In [1]:
pip install accelerate -U evaluate scikit-learn


[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from huggingface_hub import notebook_login

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
raw_datasets = load_dataset("mwarchalowski/names-data", split="train[:10%]")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [5]:
raw_datasets

Dataset({
    features: ['name', 'label'],
    num_rows: 468400
})

In [6]:
def tokenize_function(example):
    return tokenizer(example["name"], truncation=True)

In [7]:
raw_datasets = raw_datasets.train_test_split(test_size=0.1)

In [8]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['name', 'label'],
        num_rows: 421560
    })
    test: Dataset({
        features: ['name', 'label'],
        num_rows: 46840
    })
})

In [9]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


Map:   0%|          | 0/421560 [00:00<?, ? examples/s]

Map:   0%|          | 0/46840 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
tokenized_datasets = tokenized_datasets.remove_columns(["name"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names


['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [12]:
from torch.utils.data import DataLoader


In [45]:
train_dataloader = DataLoader(
    tokenized_datasets["train"], batch_size=100, collate_fn=data_collator
)

In [46]:
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=100, collate_fn=data_collator
)


In [29]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}


{'labels': torch.Size([400]),
 'input_ids': torch.Size([400, 15]),
 'token_type_ids': torch.Size([400, 15]),
 'attention_mask': torch.Size([400, 15])}

In [16]:
batch

{'labels': tensor([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
        1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
        1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 1, 

In [17]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
outputs = model(**batch)
outputs

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [19]:
print(outputs.loss, outputs.logits.shape)


tensor(0.6610, grad_fn=<NllLossBackward0>) torch.Size([800, 2])


In [20]:
from torch.optim import AdamW

In [21]:
optimizer = AdamW(model.parameters(), lr=5e-5)


The learning rate scheduler used by default is just a linear decay from the maximum value (5e-5) to 0. To properly define it, we need to know the number of training steps we will take, which is the number of epochs we want to run multiplied by the number of training batches (which is the length of our training dataloader). The Trainer uses three epochs by default, so we will follow that:



In [47]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)


12648


device(type='cuda')

In [33]:
from tqdm.auto import tqdm


In [49]:
progress_bar = tqdm(range(num_training_steps))


  0%|          | 0/12648 [00:00<?, ?it/s]

In [53]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


0

In [54]:
import evaluate

metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()


{'accuracy': 0.9841588385994876,
 'f1': 0.9778692436172751,
 'precision': 0.9718977885812533,
 'recall': 0.9839145309405197}