In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

# Load the spam dataset, keeping only the label and message columns
data_path = '/content/spam.csv'
df = (
    pd.read_csv(data_path, encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
    # Encode the string labels as integers: ham -> 0, spam -> 1
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

# Tokenizer for the pretrained DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class SpamDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Indexable collection of raw message strings.
        labels: Indexable collection of integer class labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors='pt')``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors together with the label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch axis removed)
            and ``'labels'`` as a ``torch.long`` scalar tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # The tokenizer is called on a single text, so its tensors are assumed to
        # carry a leading batch axis of size 1. squeeze(0) removes exactly that axis;
        # a bare squeeze() would also remove the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a dataset and batch it; only the training batches are shuffled
train_dataset = SpamDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = SpamDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

# Define model
class SpamClassifier(nn.Module):
    """DistilBERT encoder followed by a single-unit linear head and a sigmoid.

    Args:
        pretrained_model: Name or path handed to ``DistilBertModel.from_pretrained``.
    """

    def __init__(self, pretrained_model='distilbert-base-uncased'):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(pretrained_model)
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per input sequence.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Tensor with one sigmoid output per batch element; the batch axis is
            kept even when the batch holds a single example.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at position 0 of every sequence is used as its summary vector.
        logits = self.fc(outputs.last_hidden_state[:, 0, :])
        # squeeze(-1) drops only the unit output axis of `fc`. A bare squeeze() would
        # also drop the batch axis for a batch of one, producing a 0-d tensor whose
        # size no longer matches the labels passed to the loss.
        return self.sigmoid(logits).squeeze(-1)

# Training setup: run on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = SpamClassifier()
model = model.to(device)
# The model already applies a sigmoid, so the loss works on probabilities
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

def train_model(model, train_loader, criterion, optimizer, epochs=3):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Loss callable invoked as ``criterion(outputs, labels)`` with
            the labels cast to float.
        optimizer: Optimizer that updates the parameters of ``model``.
        epochs: Number of full passes over ``train_loader``.

    Raises:
        ValueError: If ``train_loader`` yields no batches, since no mean loss
            can be reported.
    """
    # Move batches to wherever the model lives instead of relying on a notebook-level
    # `device` variable, which can disagree with the model (e.g. after it is re-created
    # or reloaded on the CPU).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].float().to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches")
        print(f"Epoch {epoch+1}, Loss: {total_loss/num_batches}")

# Fine-tune the baseline classifier using the default number of epochs
train_model(model=model, train_loader=train_loader, criterion=criterion, optimizer=optimizer)

Epoch 1, Loss: 0.09422464074323281
Epoch 2, Loss: 0.018386523417066983
Epoch 3, Loss: 0.0040707023780731195


In [None]:
# Define BERT-based classifier with attention
class BertWithAttention(nn.Module):
    """DistilBERT encoder with learned attention pooling over tokens and a sigmoid head.

    Args:
        pretrained_model: Name or path handed to ``DistilBertModel.from_pretrained``.
    """

    def __init__(self, pretrained_model='distilbert-base-uncased'):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(pretrained_model)
        # Scores each token's hidden state with a single scalar.
        self.attention = nn.Linear(self.bert.config.hidden_size, 1)
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per input sequence.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Tensor of shape (batch, sequence) that is 0 at padded
                positions and non-zero at real tokens.

        Returns:
            Tensor with one sigmoid output per batch element; the batch axis is
            kept even when the batch holds a single example.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        scores = self.attention(hidden_states)
        # Exclude padded positions from the pooling: without this, padding tokens
        # receive a share of the softmax weight and leak into the pooled vector.
        # The most negative finite value is used instead of -inf so a fully masked
        # row does not yield NaN.
        is_padding = attention_mask.unsqueeze(-1) == 0
        scores = scores.masked_fill(is_padding, torch.finfo(scores.dtype).min)

        # Normalise over the sequence axis, then take the weighted sum of hidden states.
        attention_weights = torch.softmax(scores, dim=1)
        weighted_output = torch.sum(attention_weights * hidden_states, dim=1)
        logits = self.fc(weighted_output)
        # squeeze(-1) drops only the unit output axis of `fc`; a bare squeeze() would
        # also drop the batch axis for a batch of one.
        return self.sigmoid(logits).squeeze(-1)

# Build the attention-pooling variant and fine-tune it with its own loss and optimizer
attention_model = BertWithAttention()
attention_model = attention_model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=attention_model.parameters(), lr=2e-5)

train_model(
    model=attention_model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
)

Epoch 1, Loss: 0.07406837985630557
Epoch 2, Loss: 0.02228009310491737
Epoch 3, Loss: 0.007328823308012278


In [None]:
# Evaluation function
from sklearn.metrics import precision_score, recall_score, roc_auc_score
def evaluate_model(model, test_loader):
    """Print precision, recall and AUC-ROC of ``model`` over ``test_loader``.

    Precision and recall use hard predictions obtained by thresholding the model
    outputs at 0.5. AUC-ROC is computed from the raw outputs, because it is a
    ranking metric: feeding it thresholded 0/1 predictions collapses the ROC
    curve to a single operating point.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            one probability per batch element.
        test_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
    """
    # Move batches to wherever the model lives instead of relying on a notebook-level
    # `device` variable, which can disagree with the model (e.g. after it is reloaded
    # on the CPU).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # reshape(-1) keeps this loop working even if the model returns a 0-d
            # tensor for a batch holding a single example.
            probs = model(input_ids, attention_mask).reshape(-1)
            all_labels.extend(batch['labels'].reshape(-1).tolist())
            all_probs.extend(probs.cpu().tolist())

    all_preds = [1 if prob > 0.5 else 0 for prob in all_probs]

    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    auc_roc = roc_auc_score(all_labels, all_probs)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, AUC-ROC: {auc_roc:.4f}")

# Report held-out metrics for the attention-based model
evaluate_model(model=attention_model, test_loader=test_loader)

Precision: 0.9799, Recall: 0.9733, AUC-ROC: 0.9851


In [None]:
import torch

# Persist only the parameters (state dict) of `model`, not the pickled module object
torch.save(obj=model.state_dict(), f="/content/spam_classifier.pth")
print("Model saved successfully!")

Model saved successfully!


In [None]:
import torch

# Rebuild the architecture, then restore the saved parameters into it
model = SpamClassifier()

# Load the saved model weights onto the CPU.
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state dict needs; it avoids executing arbitrary pickled code from the file
# and silences the torch.load warning about the insecure default.
state_dict = torch.load(
    "/content/spam_classifier.pth",
    map_location=torch.device('cpu'),
    weights_only=True,
)
model.load_state_dict(state_dict)
model.eval()  # Set the model to evaluation mode

print("Model loaded successfully!")


  model.load_state_dict(torch.load("/content/spam_classifier.pth", map_location=torch.device('cpu')))


Model loaded successfully!
