In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

# Preprocessing the dataset
dataset = pd.read_csv('/kaggle/input/spam-dataset/spam_or_not_spam.csv')
texts = dataset['email'].astype(str)
labels = dataset['label']

NUM_WORDS = 20000
MAX_LENGTH = 200  # Truncate or pad sequences to this length

tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float)

# defining the encoder
class Encoder(nn.Module):
    def __init__(self, embed_dim, vocab_size, n_layers_E, n_hidden_E, dim_z):
        super(Encoder, self).__init__()
        self.n_layers_E = n_layers_E
        self.n_hidden_E = n_hidden_E
        self.lstm = nn.LSTM(embed_dim, n_hidden_E, n_layers_E, batch_first=True, bidirectional=True)
        self.hidden_to_mu = nn.Linear(2 * n_hidden_E, dim_z)
        self.hidden_to_logvar = nn.Linear(2 * n_hidden_E, dim_z)

    def forward(self, x):
        out, _ = self.lstm(x)
        e_hidden = out[:, -1, :]  # Take the last hidden state
        mu = self.hidden_to_mu(e_hidden)
        logvar = self.hidden_to_logvar(e_hidden)
        epsilon = torch.randn_like(mu)
        z = mu + torch.exp(logvar * 0.5) * epsilon
        return mu, logvar, z

# defining the decoder
class Decoder(nn.Module):
    def __init__(self, n_hidden_D, n_layers_D, embedding_dim, dim_z, vocab_size):
        super(Decoder, self).__init__()
        self.lstm = nn.LSTM(embedding_dim + dim_z, n_hidden_D, n_layers_D, batch_first=True)
        self.fc = nn.Linear(n_hidden_D, vocab_size)

    def forward(self, x, z):
        batch_size, seq_len, _ = x.size()
        z_expanded = z.unsqueeze(1).repeat(1, seq_len, 1)  # Expand z across sequence length
        x = torch.cat([x, z_expanded], dim=2)
        out, _ = self.lstm(x)
        logits = self.fc(out)
        return logits

# Variational Autoencoder
class VAE(nn.Module):
    def __init__(self, embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z, n_hidden_D, n_layers_D):
        super(VAE, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = Encoder(embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z)
        self.decoder = Decoder(n_hidden_D, n_layers_D, embedding_dim, dim_z, vocab_size)

    def forward(self, x):
        x_embed = self.embedding(x)
        mu, logvar, z = self.encoder(x_embed)
        kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        logits = self.decoder(x_embed, z)
        return logits, kld

# Preparing DataLoader
BATCH_SIZE = 32
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data=TensorDataset(X_test_tensor,y_test_tensor)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

# Initializing the VAE
EMBEDDING_DIM = 64
N_LAYERS_E = 1
N_HIDDEN_E = 128
DIM_Z = 32
N_HIDDEN_D = 128
N_LAYERS_D = 1

vae = VAE(EMBEDDING_DIM, NUM_WORDS, N_LAYERS_E, N_HIDDEN_E, DIM_Z, N_HIDDEN_D, N_LAYERS_D)

# Saving model to a directory
import pickle
pickle.dump(vae, open('/kaggle/working/VAE_LSTM_model_init', 'wb'))

# Training the VAE
optimizer = torch.optim.Adam(vae.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

EPOCHS = 20

print("start training")

for epoch in range(EPOCHS):
    vae.train()
    epoch_loss = 0
    for batch in train_loader:
        input_seq, _ = batch
        optimizer.zero_grad()
        
        logits, kld = vae(input_seq)
        logits_reshaped = logits.view(-1, NUM_WORDS)
        input_seq_reshaped = input_seq.view(-1)
        
        reconstruction_loss = criterion(logits_reshaped, input_seq_reshaped)
        beta = min(0.9, epoch/10)  # KL annealing
        total_loss = reconstruction_loss + beta * kld
        
        total_loss.backward()
        optimizer.step()
        epoch_loss += total_loss.item()
    
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

    vae.eval()  # Set model to evaluation mode
    test_loss = 0
    for batch in test_loader:
            input_seq, labels = batch
            logits, kld = vae(input_seq)
            logits_reshaped = logits.view(-1, NUM_WORDS)
            input_seq_reshaped = input_seq.view(-1)
            reconstruction_loss = criterion(logits_reshaped, input_seq_reshaped)
            beta = min(0.9, epoch/10)  # KL annealing
            loss = reconstruction_loss + beta * kld

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            test_loss += loss.item()

    test_loss /= len(test_loader)
    print(f"Test Loss: {test_loss:.4f}")

import tensorflow as tf

# Saving model to a directory
pickle.dump(vae, open('/kaggle/working/VAE_LSTM_model', 'wb'))

start training
Epoch 1, Loss: 432.9745
Test Loss: 4.2604
Epoch 2, Loss: 5162.5807
Test Loss: 3.6167
Epoch 3, Loss: 239.1407
Test Loss: 2.8100
Epoch 4, Loss: 185.2234
Test Loss: 2.2109
Epoch 5, Loss: 146.6892
Test Loss: 1.7738
Epoch 6, Loss: 119.2183
Test Loss: 1.4627
Epoch 7, Loss: 99.1529
Test Loss: 1.2350
Epoch 8, Loss: 84.2666
Test Loss: 1.0623
Epoch 9, Loss: 72.9130
Test Loss: 0.9290
Epoch 10, Loss: 64.0935
Test Loss: 0.8242
Epoch 11, Loss: 56.9486
Test Loss: 0.7390
Epoch 12, Loss: 51.2804
Test Loss: 0.6709
Epoch 13, Loss: 46.6876
Test Loss: 0.6140
Epoch 14, Loss: 42.9275
Test Loss: 0.5670
Epoch 15, Loss: 39.8322
Test Loss: 0.5308
Epoch 16, Loss: 37.2253
Test Loss: 0.4952
Epoch 17, Loss: 35.0335
Test Loss: 0.4688
Epoch 18, Loss: 33.1731
Test Loss: 0.4446
Epoch 19, Loss: 31.5638
Test Loss: 0.4246
Epoch 20, Loss: 30.1642
Test Loss: 0.4064
