In [1]:
!pip install evaluation

Collecting evaluation
  Downloading evaluation-0.0.2.tar.gz (2.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting glog (from evaluation)
  Downloading glog-0.3.1-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting python-gflags>=3.1 (from glog->evaluation)
  Downloading python-gflags-3.1.2.tar.gz (52 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m52.1/52.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading glog-0.3.1-py2.py3-none-any.whl (7.8 kB)
Building wheels for collected packages: evaluation, python-gflags
  Building wheel for evaluation (setup.py) ... [?25l[?25hdone
  Created wheel for evaluation: filename=evaluation-0.0.2-py3-none-any.whl size=2445 sha256=8e3ae3185c754d25dcceb527c42a302636b9656351d12d84cc003e599f119f1e
  Stored in directory: /root/.cache/pip/wheels/c2/1b/52/3a6b0472227ed68ab68fb4

In [3]:
# ===============================
# 1Ô∏è‚É£ Imports
# ===============================
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# ===============================
# 2Ô∏è‚É£ Load & Explore Dataset
# ===============================
dataset = load_dataset("sms_spam")

print("Column Names:", dataset["train"].column_names)
print("Label Names:", dataset["train"].features["label"].names)

# Expected:
# text column -> "text"
# label column -> "label"
# labels -> ['ham', 'spam']


# ===============================
# 3Ô∏è‚É£ Label Dictionaries
# ===============================
label_map = {0: "ham", 1: "spam"}
id_map = {"ham": 0, "spam": 1}


# ===============================
# 4Ô∏è‚É£ Tokenizer
# ===============================
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(example):
    return tokenizer(
        example["sms"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)


# ===============================
# 5Ô∏è‚É£ Shuffle & Split Dataset
# ===============================
tokenized_dataset = tokenized_dataset.shuffle(seed=42)

train_dataset = tokenized_dataset["train"].select(range(4000))
eval_dataset = tokenized_dataset["train"].select(range(1000))

train_dataset.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"]
)
eval_dataset.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"]
)


# ===============================
# 6Ô∏è‚É£ Load Model
# ===============================
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=label_map,
    label2id=id_map
)


# ===============================
# 7Ô∏è‚É£ Metrics
# ===============================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary"
    )
    accuracy = accuracy_score(labels, predictions)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


# ===============================
# 8Ô∏è‚É£ Training Arguments
# ===============================
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)


# ===============================
# 9Ô∏è‚É£ Trainer
# ===============================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


# ===============================
# üîü Train Model
# ===============================
trainer.train()


# ===============================
# 1Ô∏è‚É£1Ô∏è‚É£ Save Model
# ===============================
trainer.save_model("./spam_model")
tokenizer.save_pretrained("./spam_model")


# ===============================
# 1Ô∏è‚É£2Ô∏è‚É£ Load Model for Inference
# ===============================
tokenizer = AutoTokenizer.from_pretrained("./spam_model")
model = AutoModelForSequenceClassification.from_pretrained("./spam_model")
model.eval()


# ===============================
# 1Ô∏è‚É£3Ô∏è‚É£ Prediction Function
# ===============================
def predict_with_label(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )

    with torch.no_grad():
        outputs = model(**inputs)

    pred_id = torch.argmax(outputs.logits, dim=1).item()
    return label_map[pred_id]


# ===============================
# 1Ô∏è‚É£4Ô∏è‚É£ Test Examples
# ===============================
texts = [
    "Congratulations! You've won a free ticket.",
    "Hey, are we meeting tomorrow?",
]

for t in texts:
    print("Text:", t)
    print("Prediction:", predict_with_label(t))
    print("-" * 50)


Column Names: ['sms', 'label']
Label Names: ['ham', 'spam']


Map:   0%|          | 0/5574 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0459,0.022379,0.995,0.976,0.983871,0.97992
2,0.0232,0.013517,0.998,1.0,0.983871,0.99187


Text: Congratulations! You've won a free ticket.
Prediction: ham
--------------------------------------------------
Text: Hey, are we meeting tomorrow?
Prediction: ham
--------------------------------------------------
