Load and Preprocess the Data

In [30]:
import os
import string
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Define path to data
data_dir = 'enron1'

# Define categories
categories = ['ham', 'spam']
num_categories = len(categories)

# Define dataset class
class EmailDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

# Initialize variables
docs = []
labels = []
categories_to_count = {'ham': 0, 'spam' : 0}

# Load data
for category in categories:
    # Get list of files
    path = os.path.join(data_dir, category)
    files = os.listdir(path)
    
    for file in files:
        # Read file
        with open(os.path.join(path, file), 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
            
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Add to docs and labels
        docs.append(text)
        labels.append(category)
        categories_to_count[category] = categories_to_count[category]+1
        
# Tokenize text
word_to_idx = {}
idx_to_word = {}
for doc in docs:
    for word in doc.split():
        if word not in word_to_idx:
            idx = len(word_to_idx)
            word_to_idx[word] = idx
            idx_to_word[idx] = word
vocab_size = len(word_to_idx)


# Convert text to sequence of indices
sequences = []
for doc in docs:
    sequence = [word_to_idx[word] for word in doc.split()]
    sequences.append(sequence)

# Pad sequences to be of equal length
max_len = max([len(seq) for seq in sequences])
padded_sequences = []
for seq in sequences:
    seq = torch.LongTensor(seq)
    padded_seq = torch.zeros(max_len, dtype=torch.long)
    padded_seq[:len(seq)] = seq
    padded_sequences.append(padded_seq)
    
# Convert labels to categorical
cat_labels = torch.tensor([categories.index(label) for label in labels], dtype=torch.long)


# Shuffle data
indices = np.arange(len(padded_sequences))
np.random.shuffle(indices)
padded_sequences = [padded_sequences[i] for i in indices]
cat_labels = cat_labels[indices]

# Split data into training and testing sets
split_idx = int(len(padded_sequences) * 0.8)
train_data = padded_sequences[:split_idx]
train_labels = cat_labels[:split_idx]
test_data = padded_sequences[split_idx:]
test_labels = cat_labels[split_idx:]

# Define data loaders
train_dataset = EmailDataset(train_data, train_labels)
test_dataset = EmailDataset(test_data, test_labels)
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print("Vocab_Size:", vocab_size, "NumSpam:", categories_to_count['spam'], "NumHam:", categories_to_count['ham'] , "MaxSequenceLen:", max_len)

Vocab_Size: 50550 NumSpam: 1500 NumHam: 3672 MaxSequenceLen: 3568


Define Model

In [33]:
import torch.nn.functional as F
class WordLSTM(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size, batch_size):
        super(WordLSTM, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def init_hidden(self):
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_size),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_size))
    
    def forward(self, x, h):
        
        x = self.embedding(x)
        out, hn = self.lstm(x, h)
        # only pass in last out
        out = self.fc(out[:, -1])
        out = F.log_softmax(out, dim=1)
        return out, hn
        

Train Model

In [35]:
import torch.optim as optim

    
if torch.cuda.is_available():
    print("CUDA is available!")
    device = torch.device("cuda")
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")
    print(torch.__version__)
    device = torch.device("cpu")


# Define hyperparameters

hidden_size = 256
embedding_size = 100
num_layers = 2
learning_rate = 0.001
num_epochs = 2


# Initialize model, loss function, and optimizer
model = WordLSTM(vocab_size, embedding_size, hidden_size, num_layers, num_categories, batch_size)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Train the model
train_loss_arr = []
test_loss_arr = []
for epoch in range(num_epochs):
    train_loss = 0
    test_loss = 0
    

    
    # Train the model on the training set
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        # shape is batch_size, seqLength
        inputs = inputs.to(device)
        # shape is batch_size
        labels = labels.to(device)
        h = (torch.zeros(num_layers, inputs.size(0), hidden_size).to(device), torch.zeros(num_layers, inputs.size(0), hidden_size).to(device))

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs, hn = model(inputs, h)
        loss = criterion(outputs, labels)
        train_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        
        if(i%10==0): 
            print(i, "out of", len(train_loader), "processed")
        
    train_loss /= len(train_loader)
    train_loss_arr.append(train_loss)
    
    # Test the model on the test set
    model.eval()
    with torch.no_grad():
       
        for i, (inputs, labels) in enumerate(test_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            h = (torch.zeros(num_layers, inputs.size(0), hidden_size).to(device), torch.zeros(num_layers, inputs.size(0), hidden_size).to(device))

            # Initialize hidden and cell states to 0 for the first batch of each epoch
            

            # Forward pass
            outputs, hn = model(inputs, h)
            loss = criterion(outputs, labels)
            test_loss += loss.item()

        test_loss /= len(test_loader)
        test_loss_arr.append(test_loss)

    # Print the average train and test losses for this epoch
    print("Epoch {}/{} Train Loss: {:.4f} Test Loss: {:.4f}".format(epoch+1, num_epochs, train_loss, test_loss))

# Plot the train and test loss curves
import matplotlib.pyplot as plt
plt.plot(train_loss_arr, label='Train Loss')
plt.plot(test_loss_arr, label='Test Loss')
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.legend()
plt.show()


CUDA is available!
GPU: NVIDIA GeForce GTX 1080 Ti
0 out of 130 processed
10 out of 130 processed


KeyboardInterrupt: 

In [None]:
import torch.optim as optim

    
if torch.cuda.is_available():
    print("CUDA is available!")
    device = torch.device("cuda")
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")
    print(torch.__version__)
    device = torch.device("cpu")
torch.cuda.empty_cache()

# Define hyperparameters

hidden_size = 256
embedding_size=100
num_layers = 2
learning_rate = 0.001
num_epochs = 2


# Initialize model, loss function, and optimizer
model = WordLSTM(vocab_size,embedding_size, hidden_size, num_layers, num_categories)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



# Train the model
train_loss_arr = []
test_loss_arr = []
for epoch in range(num_epochs):
    train_loss = 0
    test_loss = 0
    

    
    # Train the model on the training set
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        # shape is batch_size, seqLength
        inputs = inputs.to(device)
        # shape is batch_size
        labels = labels.to(device)
        h = (torch.zeros(num_layers, inputs.size(0), hidden_size).to(device), torch.zeros(num_layers, inputs.size(0), hidden_size).to(device))

        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass
        outputs, hn = model(inputs, h)
        loss = criterion(outputs, labels)
        train_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        
        if(i%10==0): 
            print(outputs)
            print(labels)
            print(i, "out of", len(train_loader), "processed")
            
        
    train_loss /= len(train_loader)
    train_loss_arr.append(train_loss)
    
    # Test the model on the test set
    model.eval()
    with torch.no_grad():
       
        for i, (inputs, labels) in enumerate(test_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            h = (torch.zeros(num_layers, inputs.size(0), hidden_size).to(device), torch.zeros(num_layers, inputs.size(0), hidden_size).to(device))

            # Initialize hidden and cell states to 0 for the first batch of each epoch
            

            # Forward pass
            outputs, hn = model(inputs, h)
            loss = criterion(outputs, labels)
            test_loss += loss.item()

        test_loss /= len(test_loader)
        test_loss_arr.append(test_loss)

    # Print the average train and test losses for this epoch
    print("Epoch {}/{} Train Loss: {:.4f} Test Loss: {:.4f}".format(epoch+1, num_epochs, train_loss, test_loss))

# Plot the train and test loss curves
import matplotlib.pyplot as plt
plt.plot(train_loss_arr, label='Train Loss')
plt.plot(test_loss_arr, label='Test Loss')
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.legend()
plt.show()

Evaluate Model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

model.eval()
preds = []
targets = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        # Initialize hidden and cell states to 0 for the first batch of each epoch
        h = (torch.zeros(num_layers, inputs.size(0), hidden_size).to(device), 
             torch.zeros(num_layers, inputs.size(0), hidden_size).to(device))

        # Forward pass
        outputs, h = model(inputs, h)
        preds.append(torch.argmax(outputs, dim=1).cpu().numpy())
        targets.append(labels.cpu().numpy())

preds = np.concatenate(preds)
targets = np.concatenate(targets)

acc = accuracy_score(targets, preds)
prec = precision_score(targets, preds, zero_division=0)
rec = recall_score(targets, preds)
f1 = f1_score(targets, preds)
roc_auc = roc_auc_score(targets, preds)

print("Accuracy: {:.4f}".format(acc))
print("Precision: {:.4f}".format(prec))
print("Recall: {:.4f}".format(rec))
print("F1-score: {:.4f}".format(f1))
print("ROC-AUC score: {:.4f}".format(roc_auc))

with torch.no_grad():
    # Print predicted and target labels for each batch
    for i, (inputs, labels) in enumerate(test_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Initialize hidden and cell states to 0 for the first batch of each epoch
        h = (torch.zeros(num_layers, inputs.size(0), hidden_size).to(device), 
             torch.zeros(num_layers, inputs.size(0), hidden_size).to(device))

        # Forward pass
        outputs, h = model(inputs, h)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        targets = labels.cpu().numpy()
        probs = torch.softmax(outputs, dim=1).cpu().numpy()

        print("Batch {}".format(i))
        print("Predicted labels:", preds)
        print("Target labels:", targets)
        print("Probabilities:", probs)