In [1]:
pip install accelerate -U

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset
from huggingface_hub import notebook_login

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
checkpoint = "bert-base-uncased"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
dataset = load_dataset("mwarchalowski/names-data")

In [7]:
dataset = load_dataset("mwarchalowski/names-data", split="train[:10%]")

In [8]:
dataset = dataset.filter(lambda e: e["name"] is not None)

In [9]:
def tokenize_function(examples):
    return tokenizer(examples["name"], truncation=True)
    

In [10]:
dataset

Dataset({
    features: ['name', 'label'],
    num_rows: 468400
})

In [11]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [12]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [13]:
samples = tokenized_datasets[:8]
samples.items()

dict_items([('name', ['Murphy Facility Solutions', 'Mansell Building Solutions', 'Johnny Boulos', 'Gurdeep Singh', 'Stefan Birrer', 'ECG Egineering Consulting Group', 'Topdeck Travel', 'Tamar Stiebel']), ('label', [0, 0, 1, 1, 1, 0, 0, 1]), ('input_ids', [[101, 7104, 4322, 7300, 102], [101, 16042, 5349, 2311, 7300, 102], [101, 5206, 8945, 18845, 2015, 102], [101, 19739, 25547, 13699, 5960, 102], [101, 8852, 12170, 14343, 2099, 102], [101, 14925, 2290, 1041, 11528, 20550, 10552, 2177, 102], [101, 2327, 26547, 3604, 102], [101, 17214, 2906, 2358, 2666, 8671, 102]]), ('token_type_ids', [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]]), ('attention_mask', [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]])])

In [14]:
samples = {k: v for k, v in samples.items() if k not in ["name", "label"]}

In [15]:
batch = data_collator(samples)
batch

{'input_ids': tensor([[  101,  7104,  4322,  7300,   102,     0,     0,     0,     0],
        [  101, 16042,  5349,  2311,  7300,   102,     0,     0,     0],
        [  101,  5206,  8945, 18845,  2015,   102,     0,     0,     0],
        [  101, 19739, 25547, 13699,  5960,   102,     0,     0,     0],
        [  101,  8852, 12170, 14343,  2099,   102,     0,     0,     0],
        [  101, 14925,  2290,  1041, 11528, 20550, 10552,  2177,   102],
        [  101,  2327, 26547,  3604,   102,     0,     0,     0,     0],
        [  101, 17214,  2906,  2358,  2666,  8671,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0,

In [16]:
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 9]),
 'token_type_ids': torch.Size([8, 9]),
 'attention_mask': torch.Size([8, 9])}

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")


In [18]:
from transformers import AutoModelForSequenceClassification

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
train_validation_set = tokenized_datasets.train_test_split(test_size=0.1)
train_validation_set.remove_columns(["


DatasetDict({
    train: Dataset({
        features: ['name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 421560
    })
    test: Dataset({
        features: ['name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 46840
    })
})

In [21]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)


KeyError: "Column train not in the dataset. Current columns in the dataset: ['name', 'label', 'input_ids', 'token_type_ids', 'attention_mask']"