In [1]:
#@title import libraries
from transformers import BertTokenizerFast, BertForSequenceClassification, BertModel
import torch.nn as nn
from sklearn.datasets import fetch_20newsgroups
from torch.optim import *
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch

In [2]:
#@title model + tokenizer
# The encoder weights and the tokenizer are pulled from the same pretrained
# checkpoint so that token ids line up with the embedding table.
_CHECKPOINT = 'bert-base-uncased'
model = BertModel.from_pretrained(_CHECKPOINT)
tokenizer = BertTokenizerFast.from_pretrained(_CHECKPOINT, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [3]:
#@title hyperparameters + design the model
# --- data-related settings ---
num_labels = 20     # number of target classes used to size the classification head
max_length = 512    # token budget per document passed to the tokenizer

# --- optimisation ---
criterion = nn.CrossEntropyLoss()
# The optimizer captures the parameters of whatever object `model` refers to
# right now; if `model` is rebound later, a new optimizer is needed for it.
optimizer = AdamW(model.parameters(), lr=2e-5)


In [4]:
#@title read data
def read_20newsgroups(test_size=0.2, random_state=None):
  """Fetch the 20 Newsgroups corpus and split it into train/validation parts.

  Args:
    test_size: Forwarded to ``train_test_split`` as the held-out fraction/size.
    random_state: Seed forwarded to ``train_test_split``. Pass an int to get
      the same split on every run; the default ``None`` keeps the previous
      unseeded behaviour.

  Returns:
    A pair ``(splits, target_names)`` where ``splits`` is the 4-element result
    of ``train_test_split`` (train texts, validation texts, train labels,
    validation labels) and ``target_names`` is the dataset's label-name list.
  """
  # download & load 20newsgroups dataset from sklearn's repos
  dataset = fetch_20newsgroups(subset="all", shuffle=True, remove=("headers", "footers", "quotes"))
  documents = dataset.data
  labels = dataset.target
  # split into training & testing and return the data as well as the label names
  splits = train_test_split(documents, labels, test_size=test_size, random_state=random_state)
  return splits, dataset.target_names

# Load the corpus, then unpack the four split pieces returned by the helper.
_splits, target_names = read_20newsgroups()
train_texts, valid_texts, train_labels, valid_labels = _splits

# Both splits are tokenized with identical settings: truncate to `max_length`
# tokens and pad within each call.
_tokenize_kwargs = dict(truncation=True, padding=True, max_length=max_length)
train_encodings = tokenizer(train_texts, **_tokenize_kwargs)
valid_encodings = tokenizer(valid_texts, **_tokenize_kwargs)

In [6]:
#@title A much faster way to load data :)
import torch

# Convert tokenized encodings to tensors.
# torch.as_tensor is used instead of torch.tensor so the cell is safe to
# re-run: `train_labels` / `valid_labels` are rebound to tensors below, and
# as_tensor hands an existing tensor back instead of copy-constructing it
# (which torch.tensor does with a warning).
train_inputs = torch.as_tensor(train_encodings['input_ids'])
train_masks = torch.as_tensor(train_encodings['attention_mask'])
# nn.CrossEntropyLoss expects int64 class indices, so the dtype is pinned
# rather than inherited from the source array's platform-dependent int width.
train_labels = torch.as_tensor(train_labels, dtype=torch.long)

valid_inputs = torch.as_tensor(valid_encodings['input_ids'])
valid_masks = torch.as_tensor(valid_encodings['attention_mask'])
valid_labels = torch.as_tensor(valid_labels, dtype=torch.long)

# Create TensorDatasets (each item is an (input_ids, attention_mask, label) tuple)
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
valid_dataset = TensorDataset(valid_inputs, valid_masks, valid_labels)

# Create DataLoader (only the training split is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)


In [7]:
#@title this is the traditional way
class TextDataset(Dataset):
    """Map-style dataset that serves one tokenized example per index as a dict."""

    def __init__(self, inputs, masks, labels):
        """Keep references to three parallel containers (assumed index-aligned)."""
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        """Number of examples, taken from the length of ``inputs``."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``idx``-th entry of each container under its batch key."""
        fields = (
            ('input_ids', self.inputs),
            ('attention_mask', self.masks),
            ('labels', self.labels),
        )
        return {key: column[idx] for key, column in fields}

# Wrap the tensors in the dict-returning dataset, one instance per split.
train_dataset, valid_dataset = (
    TextDataset(train_inputs, train_masks, train_labels),
    TextDataset(valid_inputs, valid_masks, valid_labels),
)

# Batch both splits in groups of 16; only the training order is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=16, shuffle=False)


In [2]:
#@title training function
def train(model, optimizer, train_loader, criterion):
    """Run one training pass over ``train_loader`` and report the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its return
            value is passed straight to ``criterion`` together with the labels.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        train_loader: Iterable of batches. A batch is either a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``, or a
            3-item sequence in that same order (as a ``TensorDataset`` yields).
        criterion: Loss callable ``criterion(outputs, labels)``.

    Returns:
        The mean per-batch loss as a float (``nan`` if the loader was empty).
        The same value is printed.
    """
    model.train()

    # Inputs are moved to the device of the model's first parameter so the
    # loop keeps working if the model has been placed on an accelerator.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    total_loss = 0.0
    num_batches = 0

    for batch in train_loader:
        # Accept both dict-style batches and plain (ids, mask, labels) tuples.
        if hasattr(batch, 'keys'):
            input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        else:
            input_ids, attention_mask, labels = batch
        if device is not None:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Counting batches ourselves avoids a ZeroDivisionError on an empty loader
    # and does not require the loader to implement __len__.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    print(f'Training loss: {avg_loss}')
    return avg_loss



In [9]:
#@title evaluation function
def evaluate(model, test_loader, criterion):
    """Compute mean batch loss and accuracy over ``test_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its output
            is passed to ``criterion`` and arg-maxed over dim 1 for predictions.
        test_loader: Iterable of batches. A batch is either a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``, or a
            3-item sequence in that same order.
        criterion: Loss callable ``criterion(outputs, labels)``.

    Returns:
        ``(avg_loss, accuracy_percent)`` as floats (``nan`` for either when the
        loader yields nothing). The same values are printed.
    """
    model.eval()

    # Inputs follow the model's first parameter onto its device, if it has one.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    total_loss = 0.0
    total_correct = 0
    num_batches = 0
    num_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            # Accept both dict-style batches and plain (ids, mask, labels) tuples.
            if hasattr(batch, 'keys'):
                input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
            else:
                input_ids, attention_mask, labels = batch
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

            outputs = model(input_ids, attention_mask)
            total_loss += criterion(outputs, labels).item()
            predictions = torch.argmax(outputs, dim=1)
            total_correct += (predictions == labels).sum().item()

            num_batches += 1
            # The denominator for accuracy is the number of labels actually
            # seen, not a global variable (the original referenced an
            # undefined name here and raised NameError).
            num_samples += len(labels)

    avg_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = total_correct / num_samples * 100 if num_samples else float('nan')
    print(f'Test loss: {avg_loss} Test acc: {accuracy}%')
    return avg_loss, accuracy

In [3]:
#@title inference
# One train-then-validate pass using whatever `model` / `optimizer` are bound
# at this point in the notebook.
# NOTE(review): the recorded output of this cell is a NameError for `model`,
# so it presumably ran in a fresh kernel before the model cell — confirm the
# intended execution order. Also, if `model` is still the bare BertModel from
# the setup cell, its output is presumably not a logits tensor that
# `criterion` can consume — verify before relying on this cell.
for epoch in range(1):
    train(model=model, optimizer=optimizer, train_loader=train_loader, criterion=criterion)
    evaluate(model=model, test_loader=valid_loader, criterion=criterion)

NameError: name 'model' is not defined

In [None]:
#@title testing model
class CustomBERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by a single linear classification layer."""

    def __init__(self, num_labels):
        """Load ``bert-base-uncased`` and attach a ``num_labels``-way linear head."""
        super().__init__()
        encoder = BertModel.from_pretrained('bert-base-uncased')
        hidden_dim = encoder.config.hidden_size
        self.bert = encoder
        # Head input width is read from the encoder config rather than hard-coded.
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return the linear head's output on ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(encoded.pooler_output)

# Instantiate the model
model = CustomBERTClassifier(num_labels)

# Build the optimizer from THIS model's parameters. The `optimizer` created in
# the hyperparameter cell was constructed from the earlier bare BertModel, so
# reusing it here would step that other object's weights and leave the
# classifier (encoder + linear head) completely untrained.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

for epoch in range(3):
    train(model, optimizer, train_loader, criterion)
    evaluate(model, valid_loader, criterion)



In [None]:
#@title experimenting
# Peek at the first training label and at one batch exactly as the loader
# yields it (the loop stops after the first batch).
first_label = train_labels[0]
print(first_label)
batches = iter(train_loader)
for batch in batches:
    text = batch
    print(text)
    break

tensor(11)
{'input_ids': tensor([[ 101, 2748, 1010,  ...,    0,    0,    0],
        [ 101, 2054, 2052,  ...,    0,    0,    0],
        [ 101, 1045, 2079,  ...,    0,    0,    0],
        ...,
        [ 101, 4346, 3808,  ...,    0,    0,    0],
        [ 101, 1996, 2878,  ...,    0,    0,    0],
        [ 101, 2006, 1037,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([12,  4,  8,  4, 14, 14, 12,  9,  8,  4,  8, 10, 18, 17, 19,  8])}
