In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments,Trainer
import numpy as np
%pip install evaluate
import evaluate


dataset = load_dataset("sms_spam")

print(dataset)


Note: you may need to restart the kernel to use updated packages.
DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 5574
    })
})


In [3]:

split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
print(split_dataset)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print(train_dataset)
print(test_dataset)


DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 4459
    })
    test: Dataset({
        features: ['sms', 'label'],
        num_rows: 1115
    })
})
Dataset({
    features: ['sms', 'label'],
    num_rows: 4459
})
Dataset({
    features: ['sms', 'label'],
    num_rows: 1115
})


In [4]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess(examples):
    return tokenizer(
        examples["sms"],
        truncation=True,
        padding=False,
        max_length=128
    )

train_dataset = train_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)


In [5]:
num_labels = dataset["train"].features["label"].num_classes

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label={0: "ham", 1: "spam"},
    label2id={"ham": 0, "spam": 1}
)

for param in model.bert.parameters():
    param.requires_grad = False

for param in model.bert.encoder.layer[-2:].parameters():
    param.requires_grad = True

for param in model.bert.pooler.parameters():
    param.requires_grad = True
    
for param in model.classifier.parameters():
    param.requires_grad = True


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import evaluate
import numpy as np
accuracy_metrics = evaluate.load("accuracy")
def compute_metrics(eval_pred):
  logits,labels = eval_pred
  predictions = np.argmax(logits, axis=1)
  return accuracy_metrics.compute(predictions=predictions,references=labels)

In [7]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,

    report_to="none"
)


In [8]:


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [9]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.039156,0.990135
2,0.084000,0.034777,0.992825
3,0.084000,0.035252,0.992825




TrainOutput(global_step=837, training_loss=0.06107061994545776, metrics={'train_runtime': 1784.1181, 'train_samples_per_second': 7.498, 'train_steps_per_second': 0.469, 'total_flos': 434916921733500.0, 'train_loss': 0.06107061994545776, 'epoch': 3.0})

In [10]:
save_dir = "./spam_classifier"

trainer.save_model(save_dir)
