In [1]:
# Fine-tuning BERT for sentiment classification on IMDB, based on the code given in the BERT lesson

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Identifiers of the dataset and of the pre-trained checkpoint used in this cell
DATASET_NAME = 'imdb'
TOKENIZER_CHECKPOINT = 'bert-base-uncased'

# Load the review dataset through the `datasets` library
dataset = load_dataset(DATASET_NAME)

# Load the tokenizer that belongs to the uncased BERT-base checkpoint
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` key holding the raw review texts
            (this is what ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        every sequence truncated and padded to exactly 128 tokens.
    """
    review_texts = examples['text']
    encoded = tokenizer(
        review_texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    return encoded

# Tokenize in batches, then rename the target column to "labels", the keyword
# under which the classification model receives its targets.
# NOTE(review): map() runs over every split that load_dataset returned, although
# only subsets of "train" and "test" are used below.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
)

# Expose only the model inputs and the targets, as torch tensors
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Use small, reproducibly shuffled subsets for quick training and evaluation
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
shuffled_test = tokenized_datasets["test"].shuffle(seed=42)
train_dataset = shuffled_train.select(range(2000))
test_dataset = shuffled_test.select(range(500))

# Load the pre-trained BERT encoder with a 2-class classification head
# (the head is newly initialised, so it has to be trained before use).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Run 1: learning rate 2e-5, batch size 16, 3 epochs.
# Checkpoints (one every 10 steps) and logs go to directories specific to this
# run, so they cannot be mixed up with, or overwritten by, the other
# fine-tuning runs in this notebook.
training_args = TrainingArguments(
    output_dir="./results/model1_lr2e-5_bs16",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs/model1_lr2e-5_bs16',
    logging_steps=10,
    save_steps=10,
)

# Trainer ties together the model, the hyperparameters and the two data subsets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune on the training subset
trainer.train()

# Evaluate the fine-tuned model on the held-out test subset
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.7143
20,0.6647
30,0.6559
40,0.551
50,0.407
60,0.5018
70,0.4868
80,0.5668
90,0.4267
100,0.4646


Evaluation results: {'eval_loss': 0.4383493661880493, 'eval_runtime': 95.109, 'eval_samples_per_second': 5.257, 'eval_steps_per_second': 0.336, 'epoch': 3.0}


In [6]:
# Fresh copy of pre-trained BERT with a newly initialised 2-class head
model2 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Run 2: learning rate 5e-5, batch size 32, 3 epochs.
# A checkpoint is saved every 10 steps, so this run gets its own output and
# logging directories: in a directory shared with the other runs, checkpoints
# with the same step number would overwrite each other.
training_args = TrainingArguments(
    output_dir="./results/model2_lr5e-5_bs32",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs/model2_lr5e-5_bs32',
    logging_steps=10,
    save_steps=10,
)

# Trainer for this run; trains on train_dataset and evaluates on test_dataset
trainer = Trainer(
    model=model2,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Baseline: evaluate before any fine-tuning (classification head still untrained)
baseline_result = trainer.evaluate()
print(f"Evaluation results: {baseline_result}")

# Fine-tune the model
trainer.train()

# Evaluate again after fine-tuning
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation results: {'eval_loss': 0.698914647102356, 'eval_model_preparation_time': 0.0013, 'eval_runtime': 69.7159, 'eval_samples_per_second': 7.172, 'eval_steps_per_second': 0.23}


Step,Training Loss
10,0.6992
20,0.6659
30,0.6135
40,0.5993
50,0.4742
60,0.4192
70,0.4119
80,0.3236
90,0.3203
100,0.3572


Evaluation results: {'eval_loss': 0.4207938015460968, 'eval_model_preparation_time': 0.0013, 'eval_runtime': 85.7102, 'eval_samples_per_second': 5.834, 'eval_steps_per_second': 0.187, 'epoch': 3.0}


In [7]:
# Fresh copy of pre-trained BERT with a newly initialised 2-class head
model3 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Run 3: learning rate 5e-6, batch size 32, 3 epochs.
# A checkpoint is saved every 10 steps, so this run gets its own output and
# logging directories: in a directory shared with the other runs, checkpoints
# with the same step number would overwrite each other.
training_args = TrainingArguments(
    output_dir="./results/model3_lr5e-6_bs32",
    learning_rate=5e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs/model3_lr5e-6_bs32',
    logging_steps=10,
    save_steps=10,
)

# Trainer for this run; trains on train_dataset and evaluates on test_dataset
trainer = Trainer(
    model=model3,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Baseline: evaluate before any fine-tuning (classification head still untrained)
baseline_result = trainer.evaluate()
print(f"Evaluation results: {baseline_result}")

# Fine-tune the model
trainer.train()

# Evaluate again after fine-tuning
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation results: {'eval_loss': 0.7042055726051331, 'eval_model_preparation_time': 0.003, 'eval_runtime': 86.108, 'eval_samples_per_second': 5.807, 'eval_steps_per_second': 0.186}


Step,Training Loss
10,0.7101
20,0.7022
30,0.6606
40,0.6709
50,0.6458
60,0.6361
70,0.6315
80,0.6077
90,0.5862
100,0.5495


Evaluation results: {'eval_loss': 0.5041405558586121, 'eval_model_preparation_time': 0.003, 'eval_runtime': 109.1304, 'eval_samples_per_second': 4.582, 'eval_steps_per_second': 0.147, 'epoch': 3.0}


In [8]:
# Fresh copy of pre-trained BERT with a newly initialised 2-class head
model4 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Run 4: learning rate 1e-4, batch size 16, 3 epochs.
# A checkpoint is saved every 10 steps, so this run gets its own output and
# logging directories: in a directory shared with the other runs, checkpoints
# with the same step number would overwrite each other.
training_args = TrainingArguments(
    output_dir="./results/model4_lr1e-4_bs16",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs/model4_lr1e-4_bs16',
    logging_steps=10,
    save_steps=10,
)

# Trainer for this run; trains on train_dataset and evaluates on test_dataset
trainer = Trainer(
    model=model4,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Baseline: evaluate before any fine-tuning (classification head still untrained)
baseline_result = trainer.evaluate()
print(f"Evaluation results: {baseline_result}")

# Fine-tune the model
trainer.train()

# Evaluate again after fine-tuning
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation results: {'eval_loss': 0.7042054533958435, 'eval_model_preparation_time': 0.0032, 'eval_runtime': 85.711, 'eval_samples_per_second': 5.834, 'eval_steps_per_second': 0.373}


Step,Training Loss
10,0.733
20,0.6798
30,0.6424
40,0.5907
50,0.4792
60,0.5525
70,0.4765
80,0.4446
90,0.5579
100,0.4688


Evaluation results: {'eval_loss': 0.5861983299255371, 'eval_model_preparation_time': 0.0032, 'eval_runtime': 100.8455, 'eval_samples_per_second': 4.958, 'eval_steps_per_second': 0.317, 'epoch': 3.0}


In [15]:
def predict_sentiment(texts, model=None):
    """Predict the sentiment class index for one or more texts.

    Args:
        texts: A single string, or an iterable of strings, to classify.
        model: Sequence-classification model to use. Defaults to the
            module-level ``model2`` when omitted.

    Returns:
        A list with one predicted class index per input text (the argmax over
        the model's logits). An empty input yields an empty list.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to classify: skip the tokenizer and the model entirely
    if not texts:
        return []

    if model is None:
        model = model2

    # Reuse tokenizer config from training
    inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    # The tokenizer output and the model may live on different devices (for
    # example after training on a GPU); move every input tensor to the model.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    return torch.argmax(logits, dim=1).tolist()

In [20]:
# Hand-written statements used as a quick sanity check of predict_sentiment
test_texts = [
    "This movie was amazing!",
    "Terrible plot and acting.",
    "GOT was very slow paced but immersive",
    "Sherlock made me think a lot",
    "Gintama didn't have a plot. It was mostly parody",
]

# Predicted class index -> sentiment name shown to the reader
label_map = {0: "Negative", 1: "Positive"}

# Classify one statement at a time; print the statement, then its label
for statement in test_texts:
    predicted_class = predict_sentiment(statement)[0]
    print("Testing statement:", statement)
    print(label_map[predicted_class])

Testing statement: This movie was amazing!
Positive
Testing statement: Terrible plot and acting.
Negative
Testing statement: GOT was very slow paced but immersive
Positive
Testing statement: Sherlock made me think a lot
Positive
Testing statement: Gintama didn't have a plot. It was mostly parody
Negative
