In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("⚠ No GPU. Go to Runtime → Change runtime type → GPU.")

CUDA available: True
GPU: Tesla T4


In [2]:
!pip install -q transformers datasets accelerate evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from pathlib import Path
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate

In [4]:
# Download and extract IMDB
!wget -q https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz
!rm aclImdb_v1.tar.gz

# Quick sanity check
print("Train pos:", len(os.listdir("aclImdb/train/pos")))
print("Train neg:", len(os.listdir("aclImdb/train/neg")))
print("Test  pos:", len(os.listdir("aclImdb/test/pos")))
print("Test  neg:", len(os.listdir("aclImdb/test/neg")))

Train pos: 12500
Train neg: 12500
Test  pos: 12500
Test  neg: 12500


Build DataFrame (train + test)

In [5]:
def load_split(split: str) -> pd.DataFrame:
    base = Path("aclImdb") / split
    rows = []
    for label_name, label_int in [("pos", 1), ("neg", 0)]:
        for file in (base / label_name).glob("*.txt"):
            text = file.read_text(encoding="utf-8", errors="ignore")
            rows.append((text, label_int, split))
    return pd.DataFrame(rows, columns=["review", "label", "split"])

df_train = load_split("train")
df_test  = load_split("test")

df = pd.concat([df_train, df_test], ignore_index=True)
df.head(), df["label"].value_counts()

(                                              review  label  split
 0  The Last Hard Men finds James Coburn an outlaw...      1  train
 1  Darius Goes West is a film depicting American ...      1  train
 2  ****SPOILERS**** Powerhouse movie that shows h...      1  train
 3  Simply one of the best ever! Richard Brooks' a...      1  train
 4  Have you seen The Graduate? It was hailed as t...      1  train,
 label
 1    25000
 0    25000
 Name: count, dtype: int64)

Simple text cleaning

In [6]:
def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"<br\s*/?>", " ", text)        # remove HTML breaks
    text = re.sub(r"[^a-z0-9\s]", " ", text)      # keep letters/numbers/spaces
    text = re.sub(r"\s+", " ", text)              # squeeze spaces
    return text.strip()

df["clean"] = df["review"].apply(clean_text)
df[["review", "clean"]].head()

Unnamed: 0,review,clean
0,The Last Hard Men finds James Coburn an outlaw...,the last hard men finds james coburn an outlaw...
1,Darius Goes West is a film depicting American ...,darius goes west is a film depicting american ...
2,****SPOILERS**** Powerhouse movie that shows h...,spoilers powerhouse movie that shows how men i...
3,Simply one of the best ever! Richard Brooks' a...,simply one of the best ever richard brooks ada...
4,Have you seen The Graduate? It was hailed as t...,have you seen the graduate it was hailed as th...


Train/validation split

In [7]:
df_train_full = df[df["split"] == "train"].reset_index(drop=True)

X_train, X_val, y_train, y_val = train_test_split(
    df_train_full["clean"].values,
    df_train_full["label"].values,
    test_size=0.2,
    random_state=42,
    stratify=df_train_full["label"].values,
)

len(X_train), len(X_val)

(20000, 5000)

HuggingFace Datasets + tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_ds = Dataset.from_dict({"text": X_train, "label": y_train})
val_ds   = Dataset.from_dict({"text": X_val,   "label": y_val})

def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

train_ds = train_ds.map(tokenize, batched=True)
val_ds   = val_ds.map(tokenize,   batched=True)

train_ds = train_ds.remove_columns(["text"])
val_ds   = val_ds.remove_columns(["text"])

train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format(type="torch",   columns=["input_ids", "attention_mask", "label"])

train_ds, val_ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

(Dataset({
     features: ['label', 'input_ids', 'attention_mask'],
     num_rows: 20000
 }),
 Dataset({
     features: ['label', 'input_ids', 'attention_mask'],
     num_rows: 5000
 }))

Model

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)
model.to("cuda" if torch.cuda.is_available() else "cpu")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


Metrics (accuracy + F1)

In [10]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy.compute(predictions=preds, references=labels)["accuracy"]
    f1_score = f1.compute(predictions=preds, references=labels, average="binary")["f1"]
    return {"accuracy": acc, "f1": f1_score}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

TrainingArguments & Trainer

In [11]:
batch_size = 16

training_args = TrainingArguments(
    output_dir="distilbert_imdb",
    eval_strategy="epoch",           # <— important
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

Train

In [12]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2642,0.235957,0.9018,0.90225
2,0.2003,0.27546,0.9022,0.903835


TrainOutput(global_step=2500, training_loss=0.2391310317993164, metrics={'train_runtime': 988.9271, 'train_samples_per_second': 40.448, 'train_steps_per_second': 2.528, 'total_flos': 2649347973120000.0, 'train_loss': 0.2391310317993164, 'epoch': 2.0})

Final eval

In [13]:
metrics = trainer.evaluate()
metrics

{'eval_loss': 0.27546045184135437,
 'eval_accuracy': 0.9022,
 'eval_f1': 0.9038348082595871,
 'eval_runtime': 32.128,
 'eval_samples_per_second': 155.627,
 'eval_steps_per_second': 9.742,
 'epoch': 2.0}

In [14]:
def predict_sentiment(text: str):
    model.eval()
    device = model.device
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = outputs.logits.softmax(dim=1)[0].cpu().numpy()

    pred_class = int(probs.argmax())
    confidence = float(probs.max())
    label = "positive" if pred_class == 1 else "negative"
    return label, confidence

In [15]:
print(predict_sentiment("This movie was absolutely amazing, I loved it!"))
print(predict_sentiment("This was a boring waste of time."))

('positive', 0.9941772222518921)
('negative', 0.9912511110305786)
