In [1]:
import pandas as pd
import numpy as np
import os
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("SpamClassifier-master/smsspamcollection/SMSSpamCollection", sep="\t", names=["label", "message"])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from tqdm import tqdm
class SpamWord2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SpamWord2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.BCELoss()
        self.optimizer = optim.AdamW(self.parameters(), lr=0.001, weight_decay=0.0001)

    def forward(self, x):
        x = self.embeddings(x)
        x = torch.mean(x, dim=1)
        x = self.linear(x)
        x = self.sigmoid(x)
        return x
    
    def to_empty(self, *, device, recurse = True):
        return super().to_empty(device=device, recurse=recurse)
    
    def train_model(self, train_loader, test_loader, num_epochs=5):
        train_losses = []
        test_losses = []
        train_accuracies = []
        test_accuracies = []

        for epoch in range(num_epochs):
            self.train()
            train_loss = 0
            correct = 0
            total = 0

            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
                self.optimizer.zero_grad()
                inputs, labels = batch
                outputs = self(inputs)
                outputs = outputs.reshape(-1)
                loss = self.criterion(outputs, labels.float())
                loss.backward()
                self.optimizer.step()

                train_loss += loss.item()
                predicted = (outputs > 0.5).float()
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

            train_losses.append(train_loss / len(train_loader))
            train_accuracies.append(correct / total)

            # Evaluate on test set
            self.eval()
            test_loss = 0
            correct = 0
            total = 0

            with torch.no_grad():
                for batch in test_loader:
                    inputs, labels = batch
                    outputs = self(inputs)
                    outputs = outputs.reshape(-1)
                    loss = self.criterion(outputs, labels.float())
                    test_loss += loss.item()
                    predicted = (outputs > 0.5).float()
                    correct += (predicted == labels).sum().item()
                    total += labels.size(0)

            test_losses.append(test_loss / len(test_loader))
            test_accuracies.append(correct / total)
            tqdm.write(f"Train Loss: {train_losses[-1]:.4f}, Train Accuracy: {train_accuracies[-1]:.4f}")
            tqdm.write(f"Test Loss: {test_losses[-1]:.4f}, Test Accuracy: {test_accuracies[-1]:.4f}")
        # return train_losses, test_losses, train_accuracies, test_accuracies

In [5]:
import nltk
import torch
from torch.utils.data import Dataset
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pandas as pd

# Ensure NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

class SpamDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.texts = df['message'].values
        self.labels = df['label'].map({'ham': 0, 'spam': 1}).values
        self.vocab = set()
        self.word_to_index = {}
        self.index_to_word = {}
        self.processed_texts = []  # Store preprocessed texts as token indices
        self.build_vocab()

    @classmethod
    def preprocess_text(cls, text):
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        text = text.lower()
        tokens = word_tokenize(text)  # Use word_tokenize for tokenization
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
        return tokens

    def text_to_indices(self, tokens):
        indices = [self.word_to_index[token] for token in tokens if token in self.word_to_index]
        return indices

    def build_vocab(self):
        for text in self.texts:
            tokens = self.preprocess_text(text)
            self.processed_texts.append(tokens)  # Store preprocessed tokens
            for token in tokens:
                self.vocab.add(token)
        self.word_to_index = {word: i + 1 for i, word in enumerate(self.vocab)}  # Start indexing from 1
        self.index_to_word = {i + 1: word for i, word in enumerate(self.vocab)}
        self.vocab_size = len(self.word_to_index) + 1  # +1 for padding index
        self.texts = [self.text_to_indices(tokens) for tokens in self.processed_texts]  # Convert texts to indices

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = torch.tensor(self.texts[idx], dtype=torch.long, device=self.device)
        label = torch.tensor(self.labels[idx], dtype=torch.long, device=self.device)  # Use long for classification
        return text, label

    def predict(self, text):
        tokens = self.preprocess_text(text)
        indices = self.text_to_indices(tokens)
        text_tensor = torch.tensor(indices, dtype=torch.long, device=self.device)
        return text_tensor

[nltk_data] Downloading package stopwords to /home/kausik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kausik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/kausik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
dataset = SpamDataset(df)

In [7]:
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42, shuffle=True)

In [8]:
def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return texts, labels

In [9]:
train_loader = DataLoader(train_df, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_df, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [10]:
model = SpamWord2Vec(vocab_size=dataset.vocab_size, embedding_dim=100)

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

SpamWord2Vec(
  (embeddings): Embedding(7603, 100)
  (linear): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (criterion): BCELoss()
)

In [12]:
model.train_model(train_loader, test_loader, num_epochs=15)

Epoch 1/15: 100%|██████████| 140/140 [00:00<00:00, 537.83it/s]


Train Loss: 0.3258, Train Accuracy: 0.8670
Test Loss: 0.3096, Test Accuracy: 0.8726


Epoch 2/15: 100%|██████████| 140/140 [00:00<00:00, 1127.61it/s]


Train Loss: 0.2905, Train Accuracy: 0.8838
Test Loss: 0.2805, Test Accuracy: 0.8807


Epoch 3/15: 100%|██████████| 140/140 [00:00<00:00, 1248.35it/s]


Train Loss: 0.2445, Train Accuracy: 0.9037
Test Loss: 0.2360, Test Accuracy: 0.8969


Epoch 4/15: 100%|██████████| 140/140 [00:00<00:00, 1236.05it/s]


Train Loss: 0.2027, Train Accuracy: 0.9219
Test Loss: 0.1898, Test Accuracy: 0.9157


Epoch 5/15: 100%|██████████| 140/140 [00:00<00:00, 1186.48it/s]


Train Loss: 0.1538, Train Accuracy: 0.9468
Test Loss: 0.1533, Test Accuracy: 0.9381


Epoch 6/15: 100%|██████████| 140/140 [00:00<00:00, 1195.89it/s]


Train Loss: 0.1212, Train Accuracy: 0.9583
Test Loss: 0.1277, Test Accuracy: 0.9570


Epoch 7/15: 100%|██████████| 140/140 [00:00<00:00, 1197.37it/s]


Train Loss: 0.1027, Train Accuracy: 0.9686
Test Loss: 0.1103, Test Accuracy: 0.9686


Epoch 8/15: 100%|██████████| 140/140 [00:00<00:00, 1203.60it/s]


Train Loss: 0.0834, Train Accuracy: 0.9737
Test Loss: 0.0980, Test Accuracy: 0.9722


Epoch 9/15: 100%|██████████| 140/140 [00:00<00:00, 1214.38it/s]


Train Loss: 0.0773, Train Accuracy: 0.9735
Test Loss: 0.0892, Test Accuracy: 0.9722


Epoch 10/15: 100%|██████████| 140/140 [00:00<00:00, 1217.66it/s]


Train Loss: 0.0687, Train Accuracy: 0.9771
Test Loss: 0.0826, Test Accuracy: 0.9731


Epoch 11/15: 100%|██████████| 140/140 [00:00<00:00, 1202.89it/s]


Train Loss: 0.0580, Train Accuracy: 0.9798
Test Loss: 0.0773, Test Accuracy: 0.9749


Epoch 12/15: 100%|██████████| 140/140 [00:00<00:00, 1207.03it/s]


Train Loss: 0.0558, Train Accuracy: 0.9818
Test Loss: 0.0728, Test Accuracy: 0.9767


Epoch 13/15: 100%|██████████| 140/140 [00:00<00:00, 1214.73it/s]


Train Loss: 0.0477, Train Accuracy: 0.9832
Test Loss: 0.0695, Test Accuracy: 0.9776


Epoch 14/15: 100%|██████████| 140/140 [00:00<00:00, 1202.28it/s]


Train Loss: 0.0492, Train Accuracy: 0.9825
Test Loss: 0.0678, Test Accuracy: 0.9785


Epoch 15/15: 100%|██████████| 140/140 [00:00<00:00, 1235.64it/s]

Train Loss: 0.0404, Train Accuracy: 0.9877
Test Loss: 0.0648, Test Accuracy: 0.9785





In [13]:
torch.save(model.state_dict(), "SpamWord2Vec_StateDict.pth")

In [14]:
torch.save(model, "SpamWord2Vec_Model.pth")

# CBOW Implementation

In [24]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1) # Added softmax layer

    def forward(self, inputs):
        embeds = self.embeddings(inputs).sum(dim=1)
        out = self.relu(self.linear1(embeds))
        out = self.linear2(out)
        out = self.softmax(out) # Apply softmax
        return out

In [16]:
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return tokens

def create_vocab(corpus):
    vocab = set()
    for sentence in corpus:
        for word in sentence:
            vocab.add(word)
    word_to_index = {word: index for index, word in enumerate(vocab)}
    index_to_word = {index: word for index, word in enumerate(vocab)}
    return word_to_index, index_to_word

def create_training_data(corpus, word_to_index, context_size):
    data = []
    for sentence in corpus:
        for i in range(context_size, len(sentence) - context_size):
            context_words = [sentence[j] for j in range(i - context_size, i + context_size + 1) if j != i]
            target_word = sentence[i]
            context_indices = [word_to_index[word] for word in context_words]
            target_index = word_to_index[target_word]
            data.append((context_indices, target_index))
    return data

In [17]:
corpus = df['message'].values

In [20]:
# Preprocess the corpus
corpus = [preprocess_text(sentence) for sentence in corpus]

In [21]:
# Create vocabulary
word_to_index, index_to_word = create_vocab(corpus)
vocab_size = len(word_to_index)

In [22]:
# Create training data
context_size = 3
training_data = create_training_data(corpus, word_to_index, context_size)

In [23]:
# Define the model
embedding_dim = 100
model = CBOW(vocab_size, embedding_dim, context_size)
model.to("cuda")

CBOW(
  (embeddings): Embedding(8109, 100)
  (linear1): Linear(in_features=100, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=8109, bias=True)
  (relu): ReLU()
)

In [25]:
# Define loss function and optimizer
loss_function = nn.NLLLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.00001)

In [27]:
# Train the model
num_epochs = 50
for epoch in tqdm(range(num_epochs)):
    total_loss = 0
    for context, target in training_data:
        context_var = torch.tensor(context, dtype=torch.long, device="cuda")
        target_var = torch.tensor([target], dtype=torch.long, device="cuda")

        optimizer.zero_grad()
        log_probs = model(context_var)
        loss = loss_function(log_probs, target_var)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    tqdm.write(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

  0%|          | 0/50 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x6 and 100x128)

In [28]:
# python
import nltk
import torch
from torch.utils.data import Dataset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

class CBOWDataset(Dataset):
    def __init__(self, df, context_size):
        """
        Expects a pandas DataFrame with a column "message".
        The dataset will preprocess messages and build vocabulary.
        """
        self.context_size = context_size  # number of words on left/right
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.messages = df['message'].values
        self.processed_texts = [self.preprocess_text(text) for text in self.messages]
        self.build_vocab()
        self.data = self.create_training_data()

    @classmethod
    def preprocess_text(cls, text):
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
        return tokens

    def build_vocab(self):
        vocab = set()
        for tokens in self.processed_texts:
            vocab.update(tokens)
        # index 0 is reserved for padding; start vocab indexing at 1
        self.word2idx = {word: idx+1 for idx, word in enumerate(sorted(vocab))}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        self.vocab_size = len(self.word2idx) + 1

    def create_training_data(self):
        """
        Creates training pairs (context, target). For each word in a sentence,
        the context is defined as the surrounding 2*context_size words.
        """
        data = []
        # require at least 2*context_size+1 tokens per sentence
        for tokens in self.processed_texts:
            if len(tokens) < 2 * self.context_size + 1:
                continue
            for i in range(self.context_size, len(tokens) - self.context_size):
                # context is all words except the target
                context_words = tokens[i - self.context_size:i] + tokens[i + 1:i + self.context_size + 1]
                target_word = tokens[i]
                # convert tokens to indices (ignore words not in vocab)
                context_indices = [self.word2idx[w] for w in context_words if w in self.word2idx]
                if len(context_indices) != 2 * self.context_size:
                    continue
                target_index = self.word2idx.get(target_word)
                if target_index is not None:
                    data.append((context_indices, target_index))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        # return as tensors on the proper device
        context_tensor = torch.tensor(context, dtype=torch.long, device=self.device)
        target_tensor = torch.tensor(target, dtype=torch.long, device=self.device)
        return context_tensor, target_tensor

    def get_vocab(self):
        return self.word2idx, self.idx2word

    def get_vocab_size(self):
        return self.vocab_size

[nltk_data] Downloading package stopwords to /home/kausik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/kausik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kausik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
# python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim=128):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.relu = nn.ReLU()
        self.log_softmax = nn.LogSoftmax(dim=1)
        # these must be set externally after creating dataset
        self.word2idx = None
        self.idx2word = None

    def forward(self, inputs):
        # inputs: shape [batch, 2*context_size]
        embeds = self.embeddings(inputs)
        # sum embeddings to form a context representation [batch, embedding_dim]
        context_vector = torch.sum(embeds, dim=1)
        out = self.relu(self.linear1(context_vector))
        out = self.linear2(out)
        out = self.log_softmax(out)
        return out

    def predict(self, context_tokens):
        """
        Given a list of context tokens (length = 2*context_size),
        returns the target word predicted by the model.
        Ensure that self.word2idx and self.idx2word are set.
        """
        if not self.word2idx or not self.idx2word:
            raise ValueError("Model vocabulary not set. Assign word2idx and idx2word.")
        indices = []
        for token in context_tokens:
            idx = self.word2idx.get(token)
            if idx is None:
                raise ValueError(f'Token \{token\} not in vocabulary.')
            indices.append(idx)
        # Convert to tensor and add batch dimension
        context_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0)
        context_tensor = context_tensor.to(self.embeddings.weight.device)
        self.eval()
        with torch.no_grad():
            out = self.forward(context_tensor)
            predicted_idx = torch.argmax(out, dim=1).item()
        return self.idx2word.get(predicted_idx, "<UNK>")

SyntaxError: f-string expression part cannot include a backslash (3363294344.py, line 40)