In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
dataset_path = "smsspamcollection.zip"

# Download the dataset
import requests
import zipfile

# Fail fast on network problems: without a timeout the request can block
# indefinitely, and without raise_for_status() an HTTP error page would be
# written to disk and only surface later as a confusing BadZipFile error.
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(dataset_path, 'wb') as f:
    f.write(response.content)

# Extract the archive into the current working directory; the raw data file
# "SMSSpamCollection" is read from there in the next step.
with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall()

# Load the data into a DataFrame
import csv

# The file is raw tab-separated text with no quoting convention. Messages can
# contain stray double quotes, and the default quote handling may then treat
# line breaks as part of a quoted field and merge rows. QUOTE_NONE makes the
# parser treat quote characters as ordinary text.
data = pd.read_csv(
    "SMSSpamCollection",
    sep='\t',
    header=None,
    names=['label', 'text'],
    quoting=csv.QUOTE_NONE,
)

# Encode labels
data['label'] = data['label'].map({'ham': 0, 'spam': 1})
# map() produces NaN for any value other than 'ham'/'spam'; stop here instead
# of silently passing NaN labels on to training.
if data['label'].isna().any():
    raise ValueError("Unexpected label value found in SMSSpamCollection")

# Split dataset into training and test sets (80/20, fixed seed for repeatability)
texts = list(data['text'])
labels = list(data['label'])
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the tokenizer and model from the same pretrained checkpoint.
# num_labels=2 gives the classifier one output per class (ham, spam).
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Tokenize the dataset
def tokenize_function(texts, labels):
    """Tokenize a batch of texts and attach their labels.

    Uses the module-level ``tokenizer`` with truncation and padding enabled
    and a maximum length of 128, then stores ``labels`` under the
    ``'labels'`` key of the tokenizer output.

    Args:
        texts: The texts to tokenize.
        labels: The labels to attach; presumably one per text, in the same
            order (not checked here).

    Returns:
        The tokenizer output with an added ``'labels'`` entry.
    """
    batch = tokenizer(texts, max_length=128, padding=True, truncation=True)
    batch['labels'] = labels
    return batch

# Tokenize each split together with its labels
train_encodings = tokenize_function(texts=X_train, labels=y_train)
test_encodings = tokenize_function(texts=X_test, labels=y_test)

# Convert to PyTorch dataset
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that serves one example at a time from a mapping of columns.

    ``encodings`` maps a field name to an indexable collection of per-example
    values. The dataset length is taken from the ``'input_ids'`` field.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Build a {field: tensor} dict for example ``idx``.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

# Wrap the tokenized splits so they can be indexed example by example
train_dataset, test_dataset = (
    CustomDataset(train_encodings),
    CustomDataset(test_encodings),
)

# Training configuration: 3 epochs, batch size 8 for both training and
# evaluation, with an evaluation pass at the end of every epoch.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
)

# The test split doubles as the evaluation set during training
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()

# Run the Trainer's evaluation on the eval dataset and show its metrics
results = trainer.evaluate()
print("Evaluation Results:")
print(results)

# Get raw model outputs for the test set and take the highest-scoring class
predictions = trainer.predict(test_dataset)
logits = torch.tensor(predictions.predictions)
pred_labels = torch.argmax(logits, dim=1)

# Score the predicted labels against the held-out ground truth
y_pred = pred_labels.numpy()
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['ham', 'spam'])

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

# Persist the fine-tuned model and its tokenizer to separate directories
model_dir = './spam_classifier_model'
tokenizer_dir = './spam_classifier_tokenizer'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

# Reload both from disk so inference below uses the saved artifacts
model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

# Define a prediction function
def predict(texts):
    """Predict class ids for a batch of texts with the module-level model.

    Args:
        texts: The texts to classify, typically a list of strings. An empty
            list or tuple is accepted and yields an empty result.

    Returns:
        A 1-D integer tensor with one predicted class id per input text.
        With the label encoding used in this file, 0 is 'ham' and 1 is 'spam'.
    """
    # An empty batch cannot be turned into input tensors; short-circuit it.
    if isinstance(texts, (list, tuple)) and not texts:
        return torch.empty(0, dtype=torch.long)

    # Truncate to the same max_length (128) that was used when tokenizing the
    # training data, so inference inputs match what the model was tuned on.
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    # Put the inputs on whatever device the model lives on (CPU or GPU).
    inputs = inputs.to(model.device)

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    return torch.argmax(outputs.logits, dim=-1)

# Try the reloaded model on two hand-written messages
texts = [
    "Congratulations, you've won a $1,000 gift card! Call now to claim your prize.",
    "Meeting at 10 AM tomorrow. Please confirm your availability.",
]
predictions = predict(texts)
print("Predictions:", predictions)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0862,0.041482
2,0.0361,0.037638
3,0.0042,0.054581


Evaluation Results:
{'eval_loss': 0.05458051711320877, 'eval_runtime': 8.2206, 'eval_samples_per_second': 135.636, 'eval_steps_per_second': 17.03, 'epoch': 3.0}
Accuracy: 0.9901
Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.99      0.99       966
        spam       0.95      0.98      0.96       149

    accuracy                           0.99      1115
   macro avg       0.97      0.99      0.98      1115
weighted avg       0.99      0.99      0.99      1115

Predictions: tensor([1, 0])
