In [1]:
import pandas as pd
import numpy as np
import os
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv("SpamClassifier-master/smsspamcollection/SMSSpamCollection", sep="\t", names=["label", "message"])

In [4]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [60]:
from tqdm import tqdm
class SpamWord2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SpamWord2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.BCELoss()
        self.optimizer = optim.AdamW(self.parameters(), lr=0.001, weight_decay=0.0001)

    def forward(self, x):
        x = self.embeddings(x)
        x = torch.mean(x, dim=1)
        x = self.linear(x)
        x = self.sigmoid(x)
        return x
    
    def to_empty(self, *, device, recurse = True):
        return super().to_empty(device=device, recurse=recurse)
    
    def train_model(self, train_loader, test_loader, num_epochs=5):
        train_losses = []
        test_losses = []
        train_accuracies = []
        test_accuracies = []

        for epoch in range(num_epochs):
            self.train()
            train_loss = 0
            correct = 0
            total = 0

            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
                self.optimizer.zero_grad()
                inputs, labels = batch
                outputs = self(inputs)
                outputs = outputs.reshape(-1)
                loss = self.criterion(outputs, labels.float())
                loss.backward()
                self.optimizer.step()

                train_loss += loss.item()
                predicted = (outputs > 0.5).float()
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

            train_losses.append(train_loss / len(train_loader))
            train_accuracies.append(correct / total)

            # Evaluate on test set
            self.eval()
            test_loss = 0
            correct = 0
            total = 0

            with torch.no_grad():
                for batch in test_loader:
                    inputs, labels = batch
                    outputs = self(inputs)
                    outputs = outputs.reshape(-1)
                    loss = self.criterion(outputs, labels.float())
                    test_loss += loss.item()
                    predicted = (outputs > 0.5).float()
                    correct += (predicted == labels).sum().item()
                    total += labels.size(0)

            test_losses.append(test_loss / len(test_loader))
            test_accuracies.append(correct / total)
            tqdm.write(f"Train Loss: {train_losses[-1]:.4f}, Train Accuracy: {train_accuracies[-1]:.4f}")
            tqdm.write(f"Test Loss: {test_losses[-1]:.4f}, Test Accuracy: {test_accuracies[-1]:.4f}")
        # return train_losses, test_losses, train_accuracies, test_accuracies

In [61]:
import nltk
import torch
from torch.utils.data import Dataset
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pandas as pd

# Ensure NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

class SpamDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.texts = df['message'].values
        self.labels = df['label'].map({'ham': 0, 'spam': 1}).values
        self.vocab = set()
        self.word_to_index = {}
        self.index_to_word = {}
        self.processed_texts = []  # Store preprocessed texts as token indices
        self.build_vocab()

    @staticmethod
    def preprocess_text(text):
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        text = text.lower()
        tokens = word_tokenize(text)  # Use word_tokenize for tokenization
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
        return tokens

    def text_to_indices(self, tokens):
        indices = [self.word_to_index[token] for token in tokens if token in self.word_to_index]
        return indices

    def build_vocab(self):
        for text in self.texts:
            tokens = self.preprocess_text(text)
            self.processed_texts.append(tokens)  # Store preprocessed tokens
            for token in tokens:
                self.vocab.add(token)
        self.word_to_index = {word: i + 1 for i, word in enumerate(self.vocab)}  # Start indexing from 1
        self.index_to_word = {i + 1: word for i, word in enumerate(self.vocab)}
        self.vocab_size = len(self.word_to_index) + 1  # +1 for padding index
        self.texts = [self.text_to_indices(tokens) for tokens in self.processed_texts]  # Convert texts to indices

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = torch.tensor(self.texts[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)  # Use long for classification
        return text, label

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kausik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kausik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/kausik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [62]:
dataset = SpamDataset(df)

In [63]:
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42, shuffle=True)

In [64]:
def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return texts, labels

In [65]:
train_loader = DataLoader(train_df, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_df, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [66]:
model = SpamWord2Vec(vocab_size=dataset.vocab_size, embedding_dim=300)

In [67]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

SpamWord2Vec(
  (embeddings): Embedding(7603, 300)
  (linear): Linear(in_features=300, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (criterion): BCELoss()
)

In [68]:
model.train_model(train_loader, test_loader, num_epochs=15)

Epoch 1/15: 100%|██████████| 140/140 [00:02<00:00, 60.43it/s]


Train Loss: 0.3444, Train Accuracy: 0.8438
Test Loss: 0.3034, Test Accuracy: 0.8789


Epoch 2/15: 100%|██████████| 140/140 [00:02<00:00, 62.15it/s]


Train Loss: 0.2667, Train Accuracy: 0.8975
Test Loss: 0.2473, Test Accuracy: 0.8951


Epoch 3/15: 100%|██████████| 140/140 [00:02<00:00, 59.75it/s]


Train Loss: 0.2051, Train Accuracy: 0.9242
Test Loss: 0.1858, Test Accuracy: 0.9291


Epoch 4/15: 100%|██████████| 140/140 [00:02<00:00, 61.83it/s]


Train Loss: 0.1472, Train Accuracy: 0.9479
Test Loss: 0.1415, Test Accuracy: 0.9444


Epoch 5/15: 100%|██████████| 140/140 [00:02<00:00, 61.25it/s]


Train Loss: 0.1091, Train Accuracy: 0.9616
Test Loss: 0.1130, Test Accuracy: 0.9650


Epoch 6/15: 100%|██████████| 140/140 [00:02<00:00, 58.72it/s]


Train Loss: 0.0877, Train Accuracy: 0.9708
Test Loss: 0.0966, Test Accuracy: 0.9740


Epoch 7/15: 100%|██████████| 140/140 [00:02<00:00, 55.54it/s]


Train Loss: 0.0720, Train Accuracy: 0.9758
Test Loss: 0.0847, Test Accuracy: 0.9749


Epoch 8/15: 100%|██████████| 140/140 [00:02<00:00, 52.28it/s]


Train Loss: 0.0613, Train Accuracy: 0.9791
Test Loss: 0.0767, Test Accuracy: 0.9767


Epoch 9/15: 100%|██████████| 140/140 [00:02<00:00, 58.29it/s]


Train Loss: 0.0503, Train Accuracy: 0.9843
Test Loss: 0.0709, Test Accuracy: 0.9776


Epoch 10/15: 100%|██████████| 140/140 [00:02<00:00, 59.87it/s]


Train Loss: 0.0492, Train Accuracy: 0.9836
Test Loss: 0.0677, Test Accuracy: 0.9821


Epoch 11/15: 100%|██████████| 140/140 [00:02<00:00, 61.61it/s]


Train Loss: 0.0391, Train Accuracy: 0.9865
Test Loss: 0.0634, Test Accuracy: 0.9812


Epoch 12/15: 100%|██████████| 140/140 [00:02<00:00, 56.59it/s]


Train Loss: 0.0343, Train Accuracy: 0.9901
Test Loss: 0.0631, Test Accuracy: 0.9794


Epoch 13/15: 100%|██████████| 140/140 [00:02<00:00, 62.79it/s]


Train Loss: 0.0323, Train Accuracy: 0.9901
Test Loss: 0.0602, Test Accuracy: 0.9803


Epoch 14/15: 100%|██████████| 140/140 [00:02<00:00, 59.85it/s]


Train Loss: 0.0287, Train Accuracy: 0.9906
Test Loss: 0.0576, Test Accuracy: 0.9821


Epoch 15/15: 100%|██████████| 140/140 [00:02<00:00, 61.25it/s]

Train Loss: 0.0267, Train Accuracy: 0.9926
Test Loss: 0.0562, Test Accuracy: 0.9821



