<a href="https://colab.research.google.com/github/LUMII-AILab/NLP_Course/blob/main/notebooks/BERT.ipynb" target="_new"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

# BERT-based classifier

## Setting up the environment

In [None]:
!pip install transformers
!pip install scikit-learn
!pip install nltk

In [2]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

Note: enable a free GPU in your Colab environment.

In [9]:
# Fetch the pre-trained English BERT tokenizer and encoder from Hugging Face
bert_tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')
bert_model = BertModel.from_pretrained('google-bert/bert-base-uncased')

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Place the BERT model on the selected device
bert_model = bert_model.to(device)

Using device: cuda


In [None]:
# Download the 20_newsgroup.tsv dataset file that the cells below read
!wget https://raw.githubusercontent.com/LUMII-AILab/NLP_Course/main/notebooks/resources/news20/20_newsgroup.tsv

## Text preprocessing & vectorization

In [10]:
def get_sentence_vector(sentence):
    """Return the BERT [CLS] embedding of a text as a NumPy array.

    The text is tokenized with the notebook-level ``bert_tokenizer``
    (truncated to at most 512 tokens), passed through ``bert_model`` on
    ``device`` without gradient tracking, and the last-hidden-state vector
    at position 0 (the [CLS] token) is extracted.

    Args:
        sentence: Text to embed.

    Returns:
        The [CLS] vector as a NumPy array on the CPU, with size-1
        dimensions squeezed away.
    """
    # Tokenize and encode the sentence into token ids,
    # paying attention to BERT's maximum sequence length.
    # padding=True pads only up to the longest sequence of the call, so a
    # single sentence is no longer padded to 512 positions. Padded positions
    # are excluded through the attention mask anyway, so the [CLS] vector is
    # the same up to floating-point rounding, while the forward pass is much
    # cheaper for short texts.
    inputs = bert_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move the input tensors to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get BERT's output (last hidden state); no gradients are needed here
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Extract the embeddings for the [CLS] token (index 0).
    # Note: BERT embeddings are moved back to the CPU
    # before converting them to a NumPy array,
    # since scikit-learn logistic regression does not operate on GPU.
    sentence_vector = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

    return sentence_vector

In [11]:
def load_dataset(filename):
    """Read a tab-separated dataset and vectorize each text with BERT.

    The file is parsed without a header row; its two columns are named
    ``label`` and ``sentence``. Every sentence is converted with
    ``get_sentence_vector`` and progress is printed after each 1000 samples.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        Tuple ``(vectors, labels)`` of NumPy arrays built from the sentence
        vectors and the values of the label column, in file order.
    """
    frame = pd.read_csv(filename, sep='\t', header=None, names=['label', 'sentence'])

    vectors = []
    targets = []

    # Walk over the two columns in parallel, counting processed samples from 1
    for n_done, (label, text) in enumerate(zip(frame['label'], frame['sentence']), start=1):
        vectors.append(get_sentence_vector(text))
        targets.append(label)

        if n_done % 1000 == 0:
            print(f"Progress: {n_done}")

    print("[I] Samples loaded and vectorized:", len(vectors))

    return np.array(vectors), np.array(targets)

## Experimentation & evaluation

In [12]:
# Load and vectorize the dataset:
# X receives the array of sentence vectors, y the array of labels
X, y = load_dataset("20_newsgroup.tsv")

Progress: 1000
Progress: 2000
Progress: 3000
Progress: 4000
Progress: 5000
Progress: 6000
Progress: 7000
Progress: 8000
Progress: 9000
Progress: 10000
Progress: 11000
Progress: 12000
Progress: 13000
Progress: 14000
Progress: 15000
Progress: 16000
Progress: 17000
Progress: 18000
Progress: 19000
[I] Samples loaded and vectorized: 19885


In [14]:
# Hold out 20% of the samples as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier on the BERT sentence vectors
lr_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Report per-class precision, recall and F1 on the held-out samples
predictions = lr_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

                          precision    recall  f1-score   support

             alt.atheism       0.47      0.56      0.51       171
           comp.graphics       0.53      0.55      0.54       177
 comp.os.ms-windows.misc       0.66      0.57      0.61       227
comp.sys.ibm.pc.hardware       0.53      0.54      0.54       194
   comp.sys.mac.hardware       0.63      0.64      0.63       209
          comp.windows.x       0.67      0.74      0.70       199
            misc.forsale       0.75      0.74      0.75       200
               rec.autos       0.80      0.78      0.79       212
         rec.motorcycles       0.87      0.83      0.85       212
      rec.sport.baseball       0.88      0.93      0.90       194
        rec.sport.hockey       0.96      0.90      0.93       187
               sci.crypt       0.81      0.77      0.79       193
         sci.electronics       0.63      0.61      0.62       207
                 sci.med       0.86      0.86      0.86       218
         

Note: increased `max_iter` to avoid `ConvergenceWarning: failed to converge (TOTAL NO. of ITERATIONS REACHED LIMIT)`.

In [None]:
# Save the model for later use

import pickle

# Serialize the trained logistic regression classifier to a binary file
with open("bert_lr_classifier.pickle", mode="wb") as model_file:
    pickle.dump(lr_model, model_file)
    print("[I] BERT-based LR classifier stored in a file")

# Pre-trained vs. fine-tuned BERT

The embeddings extracted from a pre-trained BERT without fine-tuning are frozen: BERT's weights are not updated while the logistic regression is trained, so the embeddings may not capture task-specific nuances as effectively as they could if the model were fine-tuned.

In [2]:
# AdamW is taken from torch.optim (imported below). The copy that transformers
# used to export is deprecated and, as far as I know, missing from recent
# releases, so importing it from there can raise an ImportError.
from transformers import BertTokenizer, BertForSequenceClassification

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.optim import AdamW

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np

# Instantiate the tokenizer and the BERT sequence-classification model.
# num_labels has to match the number of classes of the task (20_newsgroup: 20)
tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'google-bert/bert-base-uncased',
    num_labels=20,
)

# Choose CUDA when it is available (CPU otherwise) and move the model there
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
print("Using device:", device)

def preprocess_data(sentences, labels, max_len=512):
    """Tokenize texts and bundle them with their labels into a TensorDataset.

    Args:
        sentences: Sequence of texts to encode with the notebook-level
            ``tokenizer``.
        labels: Class ids, one per text; converted with ``torch.tensor``
            (in this notebook they are the integer ids from LabelEncoder).
        max_len: Length every token sequence is padded or truncated to.

    Returns:
        TensorDataset whose items are ``(input_ids, attention_mask, label)``.
    """
    # encode_plus is deprecated in favour of calling the tokenizer directly.
    # A single batched call returns the stacked (num_texts, max_len) tensors,
    # replacing the per-sentence calls followed by torch.cat.
    encoded = tokenizer(
        list(sentences),
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = torch.tensor(labels)

    return TensorDataset(input_ids, attention_masks, labels)

def load_dataset(filename):
    """Read a header-less, tab-separated file of (label, sentence) rows.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        Tuple ``(sentences, labels)`` with the values of the ``sentence``
        and ``label`` columns, in file order.
    """
    frame = pd.read_csv(filename, sep='\t', header=None, names=['label', 'sentence'])
    return frame['sentence'].values, frame['label'].values

# Read the raw texts and their class labels
sentences, labels = load_dataset("20_newsgroup.tsv")

# Map the class labels to integer ids with LabelEncoder (0-19 for 20_Newsgroup)
label_encoder = LabelEncoder()
labels_encoded = np.array(label_encoder.fit_transform(labels))

# Hold out 20% of the samples for validation (fixed seed for a repeatable split)
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences,
    labels_encoded,
    test_size=0.2,
    random_state=42,
)

# Tokenize both splits into TensorDatasets
train_data = preprocess_data(train_sentences, train_labels)
val_data = preprocess_data(val_sentences, val_labels)

# Adjust batch size:
# - OutOfMemoryError with batch_size=64 on A100 40GB (=> batch_size=48)
# - Squeezing maximum with batch_size=32 on L4 22.5GB
# Training batches are drawn in random order, validation batches sequentially.
train_dataloader = DataLoader(
    train_data,
    sampler=RandomSampler(train_data),
    batch_size=48,
)
validation_dataloader = DataLoader(
    val_data,
    sampler=SequentialSampler(val_data),
    batch_size=48,
)

# Setup the optimizer (experiment with hyperparams)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training and validation loop
# Number of passes over the training data (used by the loop and the progress message)
num_epochs = 3

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')

    # Training phase
    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        b_labels = b_labels.to(device)

        # Clear the gradients left over from the previous step
        model.zero_grad()
        # Passing labels makes the model return the loss together with the logits
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss:.2f}')

    # Validation phase
    model.eval()
    total_val_correct = 0
    total_val_loss = 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        loss = outputs.loss
        total_val_loss += loss.item()

        # Compare the highest-scoring class of every sample with its true label
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions = np.argmax(logits, axis=1)
        total_val_correct += np.sum(predictions == label_ids)

    avg_val_loss = total_val_loss / len(validation_dataloader)
    # Divide by the actual number of validation samples: the last batch may
    # hold fewer than batch_size items, so len(dataloader) * batch_size would
    # overcount the samples and understate the accuracy.
    avg_val_accuracy = total_val_correct / len(validation_dataloader.dataset)
    print(f'Validation loss: {avg_val_loss:.2f}')
    print(f'Validation accuracy: {avg_val_accuracy:.2f}')

print('Training complete')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Epoch 1/3
Average training loss: 1.55
Validation loss: 0.73
Validation accuracy: 0.77
Epoch 2/3
Average training loss: 0.62
Validation loss: 0.54
Validation accuracy: 0.83
Epoch 3/3
Average training loss: 0.39
Validation loss: 0.50
Validation accuracy: 0.85
Training complete


In [3]:
# Save the model's state (weights) for later use
# Using PyTorch (recommended) instead of pickle
# Only the state_dict is written here, not the model object itself
torch.save(model.state_dict(), 'model_state_dict.pth')

# Save the entire model configuration along with the tokenizer
# Both calls write into the same ./model directory. As the last expression of
# the cell, the value returned by tokenizer.save_pretrained is shown as the
# cell output.
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')