In [1]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AdamW, AutoModelForSequenceClassification, get_scheduler
from datasets import load_dataset
from accelerate import Accelerator
import evaluate
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Data Processing

In [2]:
# get dataset
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [3]:
# check values of dataset
raw_datasets["validation"][86]

{'sentence1': 'He was arrested Friday night at an Alpharetta seafood restaurant while dining with his wife , singer Whitney Houston .',
 'sentence2': 'He was arrested again Friday night at an Alpharetta restaurant where he was having dinner with his wife .',
 'label': 1,
 'idx': 796}

In [4]:
# get model name and its tokenizer
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
# define a tokenizer function for mapping to dataset (all values in dataset tokenized optimally through mapping)
def tokenizer_fn(dataset):
    return tokenizer(dataset["sentence1"], dataset["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenizer_fn, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [6]:
# define collator (padding)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# check lengths of samples
samples = tokenized_datasets["train"][:5]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]] # different lengths

[50, 59, 47, 67, 59]

In [8]:
# test collator (padding should make each sample the same length)
batch = collator(samples)
{k: v.shape for k, v in batch.items()} # with padding now all same length

{'input_ids': torch.Size([5, 67]),
 'token_type_ids': torch.Size([5, 67]),
 'attention_mask': torch.Size([5, 67]),
 'labels': torch.Size([5])}

In [9]:
# remove unnecessary columns for training
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])

# rename for continuity
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# set to torch tensor
tokenized_dataset = tokenized_datasets.set_format("torch")

tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [10]:
batch_size = 16

# make dataloaders from dataset
train_dl = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=collator)
test_dl = DataLoader(tokenized_datasets["test"], shuffle=True, batch_size=batch_size, collate_fn=collator)

In [11]:
# check shapes for dataloaders (batch, sample length)
batch = next(iter(train_dl))
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 67]),
 'token_type_ids': torch.Size([16, 67]),
 'attention_mask': torch.Size([16, 67])}

### Load Model

In [12]:
# define accelerator that eases train-test process
accelerator = Accelerator()

In [13]:
# define model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# define optimizer
optimizer = AdamW(model.parameters(), lr=0.00001)



In [15]:
# set dataloaders, model and optimizer to accelerator (to gpu or smth)
train_dl, test_dl, model, optimizer = accelerator.prepare(train_dl, test_dl, model, optimizer)

In [16]:
# define epochs
epochs = 10
training_steps = epochs * len(train_dl)

In [17]:
# define scheduler for training
scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps=0,
    num_training_steps=training_steps
)

### Train Model

In [18]:
# get metrics for dataset
metric = evaluate.load("glue", "mrpc")

In [19]:
prog = tqdm(range(epochs*training_steps))

for epoch in range(epochs):
    
    # TRAINING
    model.train()

    for batch in train_dl:

        # forward pass
        output = model(**batch)
        loss = output.loss
        
        # metrics
        logits = output.logits
        preds = torch.argmax(logits, dim=1)
        metric.add_batch(predictions=preds, references=batch["labels"])
        
        # backprop
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        prog.update(1)

    print("Training: ", metric.compute())

    
    # TESTING
    model.eval()

    with torch.inference_mode():
        for batch in test_dl:

            # forward pass
            output = model(**batch)
            
            # metrics
            logits = output.logits
            preds = torch.argmax(logits, dim=1)
            metric.add_batch(predictions=preds, references=batch["labels"])

    print("Test: ", metric.compute())

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  1%|          | 229/23000 [00:41<1:10:48,  5.36it/s]

Training:  {'accuracy': 0.6968375136314068, 'f1': 0.8023462495556346}


  1%|          | 231/23000 [00:47<9:09:34,  1.45s/it]

Test:  {'accuracy': 0.7397101449275363, 'f1': 0.8}


  2%|▏         | 459/23000 [01:28<1:08:52,  5.46it/s]

Training:  {'accuracy': 0.8107960741548528, 'f1': 0.8614217252396166}


  2%|▏         | 461/23000 [01:34<9:05:37,  1.45s/it]

Test:  {'accuracy': 0.8046376811594202, 'f1': 0.854427645788337}


  3%|▎         | 690/23000 [02:15<58:13,  6.39it/s]  

Training:  {'accuracy': 0.8996728462377317, 'f1': 0.9248979591836735}


  3%|▎         | 691/23000 [02:21<11:19:40,  1.83s/it]

Test:  {'accuracy': 0.8289855072463768, 'f1': 0.8745214802211825}


  4%|▍         | 919/23000 [03:02<1:06:41,  5.52it/s] 

Training:  {'accuracy': 0.9511995637949836, 'f1': 0.9635214998981048}


  4%|▍         | 921/23000 [03:08<8:50:05,  1.44s/it]

Test:  {'accuracy': 0.8144927536231884, 'f1': 0.863130881094953}


  5%|▍         | 1149/23000 [03:49<1:04:50,  5.62it/s]

Training:  {'accuracy': 0.9762813522355507, 'f1': 0.982313478349258}


  5%|▌         | 1151/23000 [03:55<8:42:14,  1.43s/it]

Test:  {'accuracy': 0.8197101449275362, 'f1': 0.8663515255694026}


  6%|▌         | 1379/23000 [04:37<1:02:49,  5.74it/s]

Training:  {'accuracy': 0.9874591057797165, 'f1': 0.9906769355492501}


  6%|▌         | 1381/23000 [04:42<8:43:39,  1.45s/it]

Test:  {'accuracy': 0.8197101449275362, 'f1': 0.8704706372344856}


  7%|▋         | 1609/23000 [05:24<1:06:22,  5.37it/s]

Training:  {'accuracy': 0.9877317339149401, 'f1': 0.9908851529268786}


  7%|▋         | 1611/23000 [05:30<8:40:31,  1.46s/it]

Test:  {'accuracy': 0.8243478260869566, 'f1': 0.8735919899874843}


  8%|▊         | 1839/23000 [06:11<59:58,  5.88it/s]  

Training:  {'accuracy': 0.9945474372955289, 'f1': 0.9959514170040485}


  8%|▊         | 1841/23000 [06:17<8:30:58,  1.45s/it]

Test:  {'accuracy': 0.8220289855072463, 'f1': 0.8731928954977282}


  9%|▉         | 2039/23000 [06:54<1:02:14,  5.61it/s]

KeyboardInterrupt: 