In [None]:
%pip install evaluate
import inspect

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the "sms_spam" dataset through the Hugging Face `datasets` library.
dataset = load_dataset("sms_spam")

# Print the loaded object to inspect which splits and columns it provides.
print(dataset)


In [None]:

# Hold out 20% of the "train" split for evaluation.
# The split is stratified on the "label" column so that the train and test
# sets keep the same class proportions as the full data; a purely random split
# can shift the share of the rarer class and make the reported accuracy noisy.
# (`stratify_by_column` requires a ClassLabel column, which "label" is: the
# model cell reads `.num_classes` from it.)
# The fixed seed keeps the split reproducible across kernel restarts.
split_dataset = dataset["train"].train_test_split(
    test_size=0.2,
    seed=42,
    stratify_by_column="label",
)
print(split_dataset)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print(train_dataset)
print(test_dataset)


In [None]:

# Load the tokenizer published with the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess(examples):
    """Tokenize the ``"sms"`` texts of a batch with the module-level tokenizer.

    Args:
        examples: Mapping whose ``"sms"`` entry holds the text(s) to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    # Texts are truncated to at most 128 tokens; no padding is applied here.
    tokenize_options = {
        "max_length": 128,
        "padding": False,
        "truncation": True,
    }
    return tokenizer(examples["sms"], **tokenize_options)

# Tokenize both splits in batched mode (train first, then test) and rebind
# the same names to the tokenized versions.
train_dataset, test_dataset = (
    train_dataset.map(preprocess, batched=True),
    test_dataset.map(preprocess, batched=True),
)


In [None]:
# Read the label schema once from the dataset's ClassLabel feature and derive
# the class count and both id <-> name mappings from it. Using a single source
# of truth keeps the classification head size and the names stored in the
# model config consistent with how the dataset actually encodes its labels.
label_feature = dataset["train"].features["label"]
num_labels = label_feature.num_classes
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

# Load "bert-base-uncased" with a sequence-classification head of `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Freeze the whole BERT backbone first ...
model.bert.requires_grad_(False)

# ... then re-enable gradients only for the last two encoder layers, the
# pooler, and the classification head, so just those parts are fine-tuned.
for trainable_module in (
    model.bert.encoder.layer[-2:],
    model.bert.pooler,
    model.classifier,
):
    trainable_module.requires_grad_(True)


In [None]:
import evaluate
import numpy as np
# Load the "accuracy" metric from the `evaluate` library; compute_metrics below uses it.
accuracy_metrics = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level accuracy metric.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``; the logits are
            reduced with an argmax over axis 1 to get predicted class ids.

    Returns:
        The result of ``accuracy_metrics.compute`` for those predictions
        against ``labels``.
    """
    scores, gold_labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_metrics.compute(
        predictions=predicted_ids,
        references=gold_labels,
    )

In [None]:
training_args = TrainingArguments(
    # Optimisation hyperparameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch; do not write intermediate checkpoints.
    eval_strategy="epoch",
    save_strategy="no",
    # Output location and experiment-tracker reporting (disabled).
    output_dir="./results",
    report_to="none",
)


In [None]:


# `Trainer` renamed its `tokenizer` argument to `processing_class`; the old
# keyword is deprecated and scheduled for removal. Choose whichever keyword the
# installed transformers version accepts so this cell runs on both older and
# newer releases without a deprecation warning or a TypeError.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Passing the tokenizer lets the Trainer pad batches dynamically (the
# preprocessing step leaves sequences unpadded) and save it with the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)


In [None]:
# Run fine-tuning with the Trainer built from the model, arguments and datasets above.
trainer.train()

In [None]:
# Directory (relative to the working directory) that receives the fine-tuned model.
save_dir = "spam_classifier"

# Write the trained model to `save_dir`, then print a completion message.
trainer.save_model(save_dir)
print("model trained successfully")