In [35]:
!pip -q install transformers datasets accelerate evaluate

In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier shared by the tokenizer and the classifier below.
model_name = "bert-base-uncased"

# Tokenizer for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence classifier with a two-label head on top of the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [37]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd

# CSV file to read from inside the Kaggle dataset.
file_path = "IMDB Dataset.csv"

# `dataset_load` is the current kagglehub entry point; `load_dataset` is the
# deprecated spelling of the same call. The positional arguments
# (adapter, dataset handle, file path) are unchanged.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
    file_path,
)

  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.


In [38]:
# Work on a copy so the frame returned by the loader is not modified in place.
df = df.copy()

# Integer target column derived from the text sentiment: negative -> 0, positive -> 1.
df["label"] = df["sentiment"].map({"negative": 0, "positive": 1}).astype(int)


In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the label so both parts keep the
# same class balance; the remaining 80% becomes the training set.
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df["label"], random_state=42
)

# Halve the held-out 20% into validation and test sets (10% of the data each),
# again stratified on the label.
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42
)

# Report the size of each split.
print("Train:", train_df.shape)
print("Val:", val_df.shape)
print("Test:", test_df.shape)

# Report the class proportions of the training split.
print("\nTrain label balance:")
print(train_df["label"].value_counts(normalize=True))

Train: (40000, 3)
Val: (5000, 3)
Test: (5000, 3)

Train label balance:
label
1    0.5
0    0.5
Name: proportion, dtype: float64


In [40]:
import re

def clean_text(text):
    """Strip HTML-like tags and normalise whitespace in a review string.

    Every ``<...>`` span (shortest match, not crossing a newline) is replaced
    by a space, each run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is removed.
    """
    without_tags = re.sub(r"<.*?>", " ", text)
    single_spaced = re.sub(r"\s+", " ", without_tags)
    return single_spaced.strip()

# Overwrite the "review" column of every split with its cleaned text.
for split_df in (train_df, val_df, test_df):
    split_df["review"] = split_df["review"].apply(clean_text)


In [41]:
from datasets import Dataset

# Keep only the text and the target. `preserve_index=False` stops the shuffled
# pandas index left over from the train/val/test split from being stored as an
# extra `__index_level_0__` column in each dataset.
train_ds = Dataset.from_pandas(train_df[["review", "label"]], preserve_index=False)
val_ds   = Dataset.from_pandas(val_df[["review", "label"]], preserve_index=False)
test_ds  = Dataset.from_pandas(test_df[["review", "label"]], preserve_index=False)

In [42]:
from transformers import AutoTokenizer

# Maximum number of tokens kept per review when tokenizing with truncation.
MAX_LEN = 128

# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tok(batch):
    """Tokenize the ``"review"`` entries of a batch, truncating each to ``MAX_LEN`` tokens."""
    reviews = batch["review"]
    return tokenizer(reviews, max_length=MAX_LEN, truncation=True)

# Tokenize each split in batches, then drop the raw text column, which is no
# longer needed once the tokenizer outputs have been added.
train_ds = train_ds.map(tok, batched=True).remove_columns(["review"])
val_ds = val_ds.map(tok, batched=True).remove_columns(["review"])
test_ds = test_ds.map(tok, batched=True).remove_columns(["review"])

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [43]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
import evaluate

# Load the classifier again from the pretrained checkpoint with a two-label head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Collator built on the tokenizer; it is handed to the Trainer to assemble batches.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric objects used by `compute_metrics`.
acc = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(p):
    """Compute accuracy and binary F1 for an evaluation result.

    ``p.predictions`` holds the model outputs (the first element is used when
    it arrives as a tuple) and ``p.label_ids`` the reference labels. The
    predicted class is the arg-max over axis 1.

    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw = p.predictions
    logits = raw[0] if isinstance(raw, tuple) else raw  # some versions return (logits,)
    predicted = np.argmax(logits, axis=1)
    references = p.label_ids

    accuracy = acc.compute(predictions=predicted, references=references)["accuracy"]
    f1_value = f1.compute(predictions=predicted, references=references, average="binary")["f1"]
    return {"accuracy": accuracy, "f1": f1_value}

# Fine-tuning configuration, grouped by purpose.
args = TrainingArguments(
    # outputs / logging
    output_dir="bert-imdb",
    report_to="none",
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best "f1" (as reported by `compute_metrics`) at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Train on the training split and validate on the validation split.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

# Run fine-tuning; as the last expression of the cell, the returned
# TrainOutput is displayed.
trainer.train()

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.284966,0.265261,0.8862,0.880937
2,0.176015,0.258858,0.899,0.89998


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

TrainOutput(global_step=2500, training_loss=0.24601055297851562, metrics={'train_runtime': 2104.1498, 'train_samples_per_second': 38.02, 'train_steps_per_second': 1.188, 'total_flos': 5262221107200000.0, 'train_loss': 0.24601055297851562, 'epoch': 2.0})

In [44]:
# Score the held-out test split; the returned metrics dict is the cell's output.
trainer.evaluate(eval_dataset=test_ds)

{'eval_loss': 0.26079341769218445,
 'eval_accuracy': 0.9046,
 'eval_f1': 0.9045045045045045,
 'eval_runtime': 41.8626,
 'eval_samples_per_second': 119.438,
 'eval_steps_per_second': 3.75,
 'epoch': 2.0}