In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# NOTE(review): the listing snippet below is disabled by wrapping it in a bare
# triple-quoted string rather than by '#' comments. The string is still evaluated
# as an expression, and because it is the last expression in this cell the notebook
# echoes its repr as the cell output.
'''
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

"\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n"

In [3]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1


In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer

# --- Sentence-BERT embeddings for the generated captions ---

# Input file, output file and model checkpoint used by this cell.
CAPTIONS_PATH = '/kaggle/input/generated-captions/generated_captions.npy'
SBERT_EMBEDDINGS_PATH = 'generated_captions_embeddings.npy'
SBERT_MODEL_NAME = 'all-MiniLM-L6-v2'

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only use it on files from a trusted source.
captions = np.load(CAPTIONS_PATH, allow_pickle=True)

# Sentence encoder used to embed each caption.
model = SentenceTransformer(SBERT_MODEL_NAME)

# Encode every caption and persist the resulting array to the working directory.
captions_embeddings = model.encode(captions, show_progress_bar=True)
np.save(SBERT_EMBEDDINGS_PATH, captions_embeddings)

# Report the shape of the saved array as a quick sanity check.
print(f"Generated captions embeddings shape: {captions_embeddings.shape}")


Batches:   0%|          | 0/517 [00:00<?, ?it/s]

Generated captions embeddings shape: (16540, 384)


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from tqdm import tqdm
import numpy as np

# Define the VAE Architecture for Text
class VAEText(nn.Module):
    """LSTM-based variational autoencoder over sequences of token ids.

    The encoder embeds the tokens, runs an LSTM over them and projects the final
    hidden state to the mean and log-variance of a diagonal Gaussian. The decoder
    projects a latent sample back to the hidden size, feeds that same vector at
    every one of ``seq_length`` time steps to a second LSTM, and maps each step's
    output to vocabulary logits.

    Args:
        vocab_size: number of rows in the embedding table and of output logits.
        embedding_dim: size of each token embedding.
        latent_dim: size of the latent vector.
        hidden_dim: hidden size of both LSTMs.
        seq_length: number of time steps the decoder produces.
    """

    def __init__(self, vocab_size, embedding_dim, latent_dim, hidden_dim, seq_length):
        super().__init__()

        # Encoder: token ids -> embeddings -> LSTM -> (mu, logvar).
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)

        # Decoder: latent -> hidden vector -> LSTM -> per-step vocabulary logits.
        self.fc_decode = nn.Linear(latent_dim, hidden_dim)
        self.rnn_decode = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, vocab_size)

        self.seq_length = seq_length

    def encode(self, x):
        """Return ``(mu, logvar)``, each ``(batch, latent_dim)``, for token ids ``x``.

        ``x`` is expected to be an integer tensor of shape ``(batch, seq)``.
        """
        embedded = self.embedding(x)           # (batch, seq, embedding_dim)
        _, (h_n, _c_n) = self.rnn(embedded)    # h_n: (num_layers, batch, hidden_dim)
        last_hidden = h_n[-1]                  # (batch, hidden_dim)
        return self.fc_mu(last_hidden), self.fc_logvar(last_hidden)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def decode(self, z):
        """Map latent vectors ``z`` to logits of shape ``(batch, seq_length, vocab_size)``."""
        seed = self.fc_decode(z)                                   # (batch, hidden_dim)
        # The same projected vector is used as the LSTM input at every time step.
        steps = seed[:, None, :].repeat(1, self.seq_length, 1)     # (batch, seq_length, hidden_dim)
        decoded, _ = self.rnn_decode(steps)                        # (batch, seq_length, hidden_dim)
        return self.output_layer(decoded)

    def forward(self, x):
        """Encode ``x``, sample a latent vector and decode it.

        Returns:
            Tuple ``(logits, mu, logvar)``.
        """
        posterior_mean, posterior_logvar = self.encode(x)
        latent = self.reparameterize(posterior_mean, posterior_logvar)
        logits = self.decode(latent)
        return logits, posterior_mean, posterior_logvar

# Loss Function
def vae_loss(recon_x, x, mu, logvar):
    """Reconstruction plus KL-divergence loss for the text VAE.

    Args:
        recon_x: decoder logits; the last dimension is the vocabulary axis and all
            leading dimensions are flattened into rows.
        x: integer target token ids, one per logit row after flattening.
        mu: mean of the approximate posterior.
        logvar: log-variance of the approximate posterior (same shape as ``mu``).

    Returns:
        Scalar tensor: mean token-level cross-entropy plus the KL term averaged
        over every element of ``mu``.
    """
    # Read the number of classes from the logits themselves instead of a module-level
    # `vocab_size`, so the function does not depend on a global being defined first.
    num_classes = recon_x.size(-1)

    # reshape (unlike view) also accepts non-contiguous inputs.
    logits = recon_x.reshape(-1, num_classes)
    targets = x.reshape(-1)

    # NOTE(review): every target position contributes to the cross-entropy, including
    # any padding positions in `x` — confirm whether padding should be ignored.
    reconstruction = nn.CrossEntropyLoss()(logits, targets)

    # KL(q(z|x) || N(0, I)) for a diagonal Gaussian, averaged over all elements.
    kl_divergence = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())

    return reconstruction + kl_divergence

# Raw captions used as training text for the VAE.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only use it on files from a trusted source.
text_data = np.load(
    '/kaggle/input/generated-captions/generated_captions.npy',
    allow_pickle=True,
)

# Pretrained BERT tokenizer used to turn each caption into token ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and Prepare Dataset
def text_to_tensor(texts, tokenizer, seq_length=50):
    """Encode ``texts`` into a ``(len(texts), seq_length)`` int64 tensor of token ids.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing ``encode(text, add_special_tokens=..., padding=...,
            max_length=..., truncation=...)`` that returns a list of token ids.
        seq_length: length every sequence is padded or truncated to.

    Returns:
        ``torch.long`` tensor with one row per text. For empty ``texts`` the result
        has shape ``(0, seq_length)`` so it can still be fed to a ``DataLoader`` or
        an embedding layer.
    """
    token_rows = [
        tokenizer.encode(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=seq_length,
            truncation=True,
        )
        for text in texts
    ]

    # torch.tensor([]) would produce a float32 tensor of shape (0,); return a
    # correctly shaped, correctly typed empty tensor instead.
    if not token_rows:
        return torch.empty((0, seq_length), dtype=torch.long)

    return torch.tensor(token_rows, dtype=torch.long)

# Parameters
seq_length = 50
batch_size = 32
epochs = 10
embedding_dim = 256
latent_dim = 50
hidden_dim = 512
seed = 42

# Seed torch so that weight initialisation, batch shuffling and latent sampling
# are repeatable from one run to the next.
torch.manual_seed(seed)

# Tokenized text tensor
text_tensor = text_to_tensor(text_data, tokenizer, seq_length)

# Shuffled batches are used for training only.
train_loader = DataLoader(text_tensor, batch_size=batch_size, shuffle=True)
# Embeddings must be extracted in dataset order so that row i of the saved array
# corresponds to caption i; iterating the shuffled training loader would scramble
# that alignment.
embed_loader = DataLoader(text_tensor, batch_size=batch_size, shuffle=False)

# Initialize VAE
vocab_size = len(tokenizer.vocab)
vae = VAEText(vocab_size, embedding_dim, latent_dim, hidden_dim, seq_length)

# Move model to GPU if available (batches are moved one at a time in the loops below)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vae.to(device)

# Optimizer
optimizer = optim.Adam(vae.parameters(), lr=1e-3)

# Training loop with progress bar
for epoch in range(epochs):
    vae.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for batch in progress_bar:
        batch = batch.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = vae(batch)
        loss = vae_loss(recon_batch, batch, mu, logvar)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    print(f'Epoch {epoch + 1}/{epochs}, Average Loss: {total_loss / len(train_loader)}')

# Extract embeddings: the posterior mean of each caption, in dataset order
vae.eval()
embeddings = []
with torch.no_grad():
    for batch in tqdm(embed_loader, desc="Generating Embeddings"):
        batch = batch.to(device)
        mu, logvar = vae.encode(batch)
        # Collect on the CPU so the full embedding matrix does not accumulate on the GPU.
        embeddings.append(mu.cpu())
embeddings = torch.cat(embeddings, dim=0)

# One embedding row per input caption.
assert embeddings.shape[0] == len(text_tensor), "embedding count does not match caption count"

# Save the embeddings
np.save('generated_captions_vae_embeddings.npy', embeddings.cpu().numpy())

print("Text embeddings saved!")


Epoch 1/10: 100%|██████████| 517/517 [00:33<00:00, 15.55it/s, loss=1.14] 


Epoch 1/10, Average Loss: 1.5028174487257835


Epoch 2/10: 100%|██████████| 517/517 [00:33<00:00, 15.34it/s, loss=1.23] 


Epoch 2/10, Average Loss: 1.1651920292317521


Epoch 3/10: 100%|██████████| 517/517 [00:34<00:00, 15.08it/s, loss=0.992]


Epoch 3/10, Average Loss: 1.1433599894466437


Epoch 4/10: 100%|██████████| 517/517 [00:34<00:00, 14.88it/s, loss=1.28] 


Epoch 4/10, Average Loss: 1.134742703843624


Epoch 5/10: 100%|██████████| 517/517 [00:35<00:00, 14.77it/s, loss=1.13] 


Epoch 5/10, Average Loss: 1.1287499858747367


Epoch 6/10: 100%|██████████| 517/517 [00:35<00:00, 14.66it/s, loss=1.24] 


Epoch 6/10, Average Loss: 1.1251896934536947


Epoch 7/10: 100%|██████████| 517/517 [00:35<00:00, 14.57it/s, loss=1.1]  


Epoch 7/10, Average Loss: 1.1219714738183602


Epoch 8/10: 100%|██████████| 517/517 [00:35<00:00, 14.53it/s, loss=1.19] 


Epoch 8/10, Average Loss: 1.1200425624847412


Epoch 9/10: 100%|██████████| 517/517 [00:35<00:00, 14.51it/s, loss=1.28] 


Epoch 9/10, Average Loss: 1.1184419596448858


Epoch 10/10: 100%|██████████| 517/517 [00:35<00:00, 14.48it/s, loss=1.19] 


Epoch 10/10, Average Loss: 1.1167916065258492


Generating Embeddings: 100%|██████████| 517/517 [00:01<00:00, 373.27it/s]

Text embeddings saved!



