In [1]:
from transformers import BertConfig, BertModel

config = BertConfig()

model = BertModel(config)

config

  from .autonotebook import tqdm as notebook_tqdm


BertConfig {
  "_attn_implementation_autoset": true,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [2]:
MODEL_ID = "bert-base-cased"
model = BertModel.from_pretrained(MODEL_ID)

In [3]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_ID)

In [4]:
from pprint import pprint
pprint(tokenizer("Using a Transformer network is simple"))

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [5]:
tokenizer.save_pretrained("./bert/")

('./bert/tokenizer_config.json',
 './bert/special_tokens_map.json',
 './bert/vocab.txt',
 './bert/added_tokens.json')

In [6]:
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
pprint(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [7]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [8]:
decoded_string = tokenizer.decode(ids)
print(decoded_string)

Using a Transformer network is simple


In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([ids, ids])
pprint(input_ids)
output = model(input_ids)
pprint(output.logits)

['using', 'a', 'transform', '##er', 'network', 'is', 'simple']
tensor([[ 2478,  1037, 10938,  2121,  2897,  2003,  3722],
        [ 2478,  1037, 10938,  2121,  2897,  2003,  3722]])
tensor([[ 2.5189, -2.1906],
        [ 2.5189, -2.1906]], grad_fn=<AddmmBackward0>)


In [10]:
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
pprint(tokenized_inputs)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[  101,  2478,  1037, 10938,  2121,  2897,  2003,  3722,   102]])}


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much."
]

# Pad the sequences up to max seq length
model_inputs = tokenizer(sequences, padding="longest")
print("Tokens with padding='longest': ", model_inputs)
# Pad the sequences up to max length (512 for BERT or distilBERT)
model_inputs = tokenizer(sequences, padding="max_length")
print("Tokens with padding='max_length': ", model_inputs)
# Pad the seqences up to SPECIFIED max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
print("Tokens with padding='max_length' & max_length='8': ", model_inputs)

Tokens with padding='longest':  {'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 2061, 2172, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]}
Tokens with padding='max_length':  {'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [26]:
# Putting it all together
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "Learning transformers is pretty easy.",
    "I totally agree."
]

tokens = tokenizer(sequences, 
                         padding=True,
                         truncation=True,
                         return_tensors="pt"
                        )

output = model(**tokens)

probs = torch.nn.functional.softmax(output.logits, dim=1)

print(output.logits)
print(probs)


tensor([[-1.8175,  1.7472],
        [-4.2654,  4.5955]], grad_fn=<AddmmBackward0>)
tensor([[2.7525e-02, 9.7247e-01],
        [1.4181e-04, 9.9986e-01]], grad_fn=<SoftmaxBackward0>)


In [1]:
from datasets import load_dataset

raw_dataset = load_dataset("glue", "mrpc")
raw_dataset

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 779209.23 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 197652.58 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 852400.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [16]:
raw_train_dataset = raw_dataset["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [21]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [40]:
from datasets import Dataset
from pprint import pprint

def get_label(idx: int, dataset: Dataset):
    labels = dataset.features["label"].names
    sample = dataset[idx]
    sample_label = sample["label"]
    return sample, labels[sample_label]

pprint(get_label(87, raw_train_dataset))

({'idx': 100,
  'label': 1,
  'sentence1': 'Tuition at four-year private colleges averaged $ 19,710 this '
               'year , up 6 percent from 2002 .',
  'sentence2': 'For the current academic year , tuition at public colleges '
               'averaged $ 4,694 , up almost $ 600 from the year before .'},
 'equivalent')


In [53]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentence_1 = tokenizer(raw_dataset["train"]["sentence1"])
tokenized_sentence_2 = tokenizer(raw_dataset["train"]["sentence2"])

In [58]:
tokenized_dataset = tokenizer(
    raw_dataset["train"]["sentence1"],
    raw_dataset["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [55]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [66]:
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map: 100%|██████████| 408/408 [00:00<00:00, 2883.26 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [67]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [69]:
samples = tokenized_dataset["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [70]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

In [71]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_dataset = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"],
                     example["sentence2"],
                     truncation=True)
    
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 3668/3668 [00:00<00:00, 30643.41 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 17901.69 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 29195.63 examples/s]


In [72]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [73]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [75]:
trainer.train()

 36%|███▋      | 500/1377 [00:48<01:24, 10.42it/s]

{'loss': 0.6352, 'grad_norm': 2.9263598918914795, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


 73%|███████▎  | 1000/1377 [01:38<00:35, 10.56it/s]

{'loss': 0.6032, 'grad_norm': 3.2335994243621826, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


100%|██████████| 1377/1377 [02:17<00:00, 10.02it/s]

{'train_runtime': 137.3896, 'train_samples_per_second': 80.093, 'train_steps_per_second': 10.023, 'train_loss': 0.5990198893051868, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.5990198893051868, metrics={'train_runtime': 137.3896, 'train_samples_per_second': 80.093, 'train_steps_per_second': 10.023, 'total_flos': 419446300011600.0, 'train_loss': 0.5990198893051868, 'epoch': 3.0})

In [76]:
predictions = trainer.predict(tokenized_dataset["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

100%|██████████| 51/51 [00:00<00:00, 53.70it/s]

(408, 2) (408,)





In [79]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [81]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.7426470588235294, 'f1': 0.8361934477379095}

In [82]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [83]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [84]:
trainer.train()

                                                  
 33%|███▎      | 460/1377 [00:48<09:16,  1.65it/s]

{'eval_loss': 0.59553062915802, 'eval_accuracy': 0.7205882352941176, 'eval_f1': 0.8235294117647058, 'eval_runtime': 2.9578, 'eval_samples_per_second': 137.938, 'eval_steps_per_second': 17.242, 'epoch': 1.0}


 36%|███▋      | 500/1377 [00:52<01:28,  9.89it/s]

{'loss': 0.636, 'grad_norm': 2.600222587585449, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


                                                  
 67%|██████▋   | 918/1377 [01:37<00:43, 10.64it/s]

{'eval_loss': 0.5403149724006653, 'eval_accuracy': 0.7622549019607843, 'eval_f1': 0.8462757527733756, 'eval_runtime': 2.9369, 'eval_samples_per_second': 138.922, 'eval_steps_per_second': 17.365, 'epoch': 2.0}


 73%|███████▎  | 1000/1377 [01:46<00:37, 10.05it/s]

{'loss': 0.5335, 'grad_norm': 3.487730026245117, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


                                                   
100%|██████████| 1377/1377 [02:28<00:00,  9.29it/s]

{'eval_loss': 0.5452971458435059, 'eval_accuracy': 0.7941176470588235, 'eval_f1': 0.86, 'eval_runtime': 2.9242, 'eval_samples_per_second': 139.527, 'eval_steps_per_second': 17.441, 'epoch': 3.0}
{'train_runtime': 148.1602, 'train_samples_per_second': 74.271, 'train_steps_per_second': 9.294, 'train_loss': 0.5337902060780909, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.5337902060780909, metrics={'train_runtime': 148.1602, 'train_samples_per_second': 74.271, 'train_steps_per_second': 9.294, 'total_flos': 419446300011600.0, 'train_loss': 0.5337902060780909, 'epoch': 3.0})

In [85]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_dataset = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
    )
    
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 3668/3668 [00:00<00:00, 29678.89 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 17838.43 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 28386.70 examples/s]
