In [1]:
#!pip install datasets

In [2]:
import numpy as np 
from datasets import load_dataset
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import spacy
from random import seed
from random import random
import torchtext
import pickle
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
data_path = "preprocessed"
word2index_path = "word2index.pkl"

In [5]:
class DiffusionDataset(Dataset):
    def __init__(self, data_path, word2index_path, train_split_ratio=0.8, train=True):
        self.data_path = data_path
        with open(word2index_path, 'rb') as f:
            self.word2index = pickle.load(f)
        self.vocab_size = len(self.word2index)
        self.eos_index = self.word2index["<EOS>"]
        self.train = train
        self.train_split_ratio = train_split_ratio
        self.files = os.listdir(data_path)

        n_image_prompt = int(len(self.files) / 2) # len(self.files) must be an even number

        # Calculate the split index
        self.split_index = int((self.train_split_ratio * n_image_prompt))
        
        # Calculate the total number of rows
        self.total_rows = 0

        if not train:
            self.total_rows = n_image_prompt - self.split_index
        else:
            self.total_rows = self.split_index

    def __len__(self):
        return self.total_rows

    def __getitem__(self, idx):
        # TODO: Recalculate the preprocessed data and then remove the +1 everywhere in this function
        if isinstance(idx, int):
            image = np.load(f"{data_path}\\image_{idx + 1}.npy")
            prompt = np.load(f"{data_path}\\prompt_{idx + 1}.npy")
            return torch.tensor(image), torch.tensor(prompt)
        else:
            image_batch = []
            prompt_batch = []
            for i in idx:
                image = np.load(f"{data_path}\\image_{i + 1}.npy")
                prompt = np.load(f"{data_path}\\prompt_{i + 1}.npy")
                image_batch.append(image)
                prompt_batch.append(prompt)
                return torch.tensor(image_batch), torch.tensor(prompt_batch)

In [6]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size, n_layers, hid_dim):
        super().__init__()

        self.resnet = models.resnet50(pretrained=True)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, hid_dim)

        self.hidden_size = hid_dim
        self.n_layers = n_layers

    def forward(self, images):
        """Extract feature vectors from input images."""
        with torch.no_grad():
            features = self.resnet(images)

        batch_size = features.size(0)
        hidden = features.unsqueeze(0).expand(self.n_layers, batch_size, self.hidden_size)
        # Initialize the cell state with zeros
        cell = torch.zeros(self.n_layers, batch_size, self.hidden_size).to(features.device)
        return hidden, cell



class DecoderRNN(nn.Module):
    def __init__(self, output_dim, emb_dim, n_layers, hid_dim, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        
        embedded = self.dropout(self.embedding(input))
        
        output, (hidden, cell) = self.rnn(embedded, (hidden.contiguous(), cell.contiguous()))
        
        prediction = self.fc_out(output.squeeze(0))
        
        return prediction, hidden, cell

In [7]:
from torch.jit import script_if_tracing
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
       
    def forward(self, src, trg, teacher_forcing_ratio = float(0.5)):

        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[:, 0]

        for t in range(1, trg_len):
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)

            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 

            #if teacher forcing, use actual next token as next input
            #if not, use predicted token)

            input = trg[:,t] if random() < teacher_forcing_ratio else top1
        return outputs

In [8]:
dataset = DiffusionDataset(data_path, word2index_path, train_split_ratio=0.8, train=True)

In [9]:
embed_size = 512
hidden_size = 256
output_size = dataset.vocab_size
n_layers = 2
dec_dropout = 0.5

batch_size = 32
num_epochs = 10
clip = 1

# seed random number generator
seed(1)

In [10]:
dataset.vocab_size

10003

In [11]:
encoder = EncoderCNN(embed_size, n_layers, hidden_size).to(device)
decoder = DecoderRNN(output_size, embed_size, n_layers, hidden_size, dec_dropout).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())



In [12]:
def collate_fn(data):
    # Sort a data list by caption length (descending order).
    data.sort(key=lambda x: len(x[1]), reverse=True)
    images, prompts = zip(*data)

    # Merge images (from tuple of 3D tensor to 4D tensor).
    images = torch.stack(images, 0)

    # Merge prompts (from tuple of 1D tensor to 2D tensor).
    lengths = [len(prompt) for prompt in prompts]
    padded_prompts = torch.zeros(len(prompts), max(lengths)).long()
    for i, cap in enumerate(prompts):
        end = lengths[i]
        padded_prompts[i, :end] = cap[:end]

    return images, padded_prompts, lengths

In [13]:
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [14]:
x = [ x[1] for x in next(iter(data_loader)) ]
x[1]

tensor([   1,  145, 2074, 1225, 1289,  643,    0, 1684,    0, 1143,    0,  628,
        2657,    0,    0,   14,   30,   24,    0, 1502,    0,  125,    6, 4016,
           0,   91,   34,    0,    0,   14,   67,   66,    0,   38,    0,   85,
           0, 2455,  746,    0, 4340,    6,   89, 4341,    0, 1077, 1522,    0,
        3745,  150, 4139,    0, 2282, 4140,    0, 1902,  480, 1950,    0, 3064,
        1737,    0, 3388,    0, 3505,    0, 2701, 4988,    2,    0,    0,    0,
           0,    0,    0,    0,    0])

In [15]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): EncoderCNN(
    (resnet): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplac

In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 33,039,699 trainable parameters


In [17]:
#TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
#TODO
#criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [18]:
def translate_output(output, word2index):
    index2word = {index: word for word, index in word2index.items()}  # Create index-to-word dictionary
    translated_sentences = []
    for seq in output:
        sentence = []
        for idx in seq:
            word = index2word.get(idx.item(), "<UNK>")
            if word == "<EOS>":
                break
            sentence.append(word)
        translated_sentence = " ".join(sentence)
        translated_sentences.append(translated_sentence)
    return translated_sentences

In [19]:
for i, (images, prompts, trg_lengths) in enumerate(data_loader):
    example_images = images
    example_prompts = prompts
    break

# Translate the example prompt
translated_example_prompts = translate_output(prompts, dataset.word2index)


In [20]:
def get_translations(images, prompts): 
    model.eval()
    with torch.no_grad():
        # Move images and prompts to the device
        images = images.to(device)
        prompts = prompts.to(device)

        # Perform forward pass for the images and prompts
        outputs = model(images, prompts)

        # Get the predicted words with the highest probability
        top1 = outputs.argmax(2).transpose(0, 1)

        # Translate the predicted output to words
        translated_output = translate_output(top1, dataset.word2index)

        return translated_output
    

In [21]:
# Training loop
translations_list = []  # List to store translated sentences

for epoch in range(num_epochs):
    model.train()
    for i, (images, prompts, trg_lengths) in enumerate(data_loader):
        images = images.to(device)
        prompts = prompts.to(device)

        # TODO add packing?
        #targets = pack_padded_sequence(prompts, trg_lengths, batch_first=True)[0]

        optimizer.zero_grad()
        output = model(images, prompts)

        # Remove the <sos> token and reshape the output and target tensors
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim).contiguous()

        trg = prompts.transpose(0, 1)[1:].contiguous().view(-1)

        loss = criterion(output, trg)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if i % batch_size == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                  .format(epoch + 1, num_epochs, i, len(data_loader), loss.item(), np.exp(loss.item())))

    # Get translations after each epoch and append to the list
    translations_list.append(get_translations(example_images, example_prompts))

Epoch [1/10], Step [0/2500], Loss: 9.2096, Perplexity: 9992.6797
Epoch [1/10], Step [32/2500], Loss: 2.8922, Perplexity: 18.0333
Epoch [1/10], Step [64/2500], Loss: 2.7286, Perplexity: 15.3118
Epoch [1/10], Step [96/2500], Loss: 2.3791, Perplexity: 10.7954
Epoch [1/10], Step [128/2500], Loss: 2.0263, Perplexity: 7.5857
Epoch [1/10], Step [160/2500], Loss: 2.9899, Perplexity: 19.8828
Epoch [1/10], Step [192/2500], Loss: 2.5203, Perplexity: 12.4324
Epoch [1/10], Step [224/2500], Loss: 2.3879, Perplexity: 10.8908
Epoch [1/10], Step [256/2500], Loss: 2.7214, Perplexity: 15.2020
Epoch [1/10], Step [288/2500], Loss: 1.9260, Perplexity: 6.8623
Epoch [1/10], Step [320/2500], Loss: 2.7853, Perplexity: 16.2046
Epoch [1/10], Step [352/2500], Loss: 2.7107, Perplexity: 15.0397
Epoch [1/10], Step [384/2500], Loss: 2.8930, Perplexity: 18.0482
Epoch [1/10], Step [416/2500], Loss: 2.2851, Perplexity: 9.8268
Epoch [1/10], Step [448/2500], Loss: 2.3994, Perplexity: 11.0167
Epoch [1/10], Step [480/2500], 

In [22]:
torch.save(model.state_dict(), 'seq2seq.pth')

In [23]:
encoder = EncoderCNN(embed_size, n_layers, hidden_size).to(device)
decoder = DecoderRNN(output_size, embed_size, n_layers, hidden_size, dec_dropout).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)
model.load_state_dict(torch.load("seq2seq.pth"))
model.eval()

Seq2Seq(
  (encoder): EncoderCNN(
    (resnet): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplac

In [24]:
translations_first_example, translations_second_example = zip(*translations_list)

print("Translated first Prompt:")
print(translated_example_prompts[0])


print("Translated outputs over epochs:")
print()
# Print the translated prompts over epochs
for i, sentence in enumerate(translations_first_example):
    print(f"Epoch {i+1} predicted sentence {sentence}")

#print()

#print("Translated second Prompt:")
#print(translated_example_prompts[0])

# Print the translated prompts over epochs
#for i, sentence in enumerate(translations_first_example):
#    print(f"Epoch {i+1} predicted sentence {sentence}")


ValueError: too many values to unpack (expected 2)

In [25]:
test_dataset = DiffusionDataset(data_path, word2index_path, train_split_ratio=0.8, train=False)
test_loader = DataLoader(test_dataset, batch_size, collate_fn=collate_fn)

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

model.eval()  # Set the model to evaluation mode
total_cosine_similarity = 0
total_examples = 0

with torch.no_grad():
    for i, (images, prompts, trg_lengths) in enumerate(test_loader):
        images = images.to(device)
        prompts = prompts.to(device)

        output = model(images, prompts)
        print(f"Output Shape: {output.shape}")

        # Remove the <sos> token and reshape the output and target tensors
        output_dim = output.shape[-1]

        output = output[1:].view(-1, output_dim).contiguous()
        print(f"Output Shape -1: {output.shape[-1]}")
        print(f"Output View Shape: {output[1:].view(-1, output_dim).shape}")
        print(f"Output View Shape: {output[1:].view(-1, output_dim).contiguous().shape}")
        trg = prompts.transpose(0, 1)[1:].contiguous().view(-1)
        print(f"Prompts Shape: {prompts.shape}")
        print(f"Prompts Transpose and remove sos Shape: {prompts.transpose(0, 1)[1:].shape}")
        print(f"Prompts Transpose and remove sos and contiguous Shape: {prompts.transpose(0, 1)[1:].contiguous().shape}")
        print(f"Prompts Transpose and remove sos and contiguous and view -1 Shape: {prompts.transpose(0, 1)[1:].contiguous().view(-1).shape}")

        print(f"Output: {output}, Shape: {output.shape}\n")
        output_embed = output.detach().cpu().numpy().reshape(-1, output_dim)
        print(f"Target: {trg}, Shape: {trg.shape}\n")
        trg_embed = trg.detach().cpu().numpy().reshape(-1, output_embed.shape[0])



        cosine_similarities = cosine_similarity(output_embed, trg_embed.T)
        mean_cosine_similarity = cosine_similarities.mean()

        total_cosine_similarity += mean_cosine_similarity
        total_examples += output_embed.shape[0]

        if i % batch_size == 0:
            print('Test Step [{}/{}], Mean Cosine Similarity: {:5.4f}'
                  .format(i, len(test_loader), mean_cosine_similarity))

average_cosine_similarity = total_cosine_similarity / total_examples
print('Average Mean Cosine Similarity: {:5.4f}'.format(average_cosine_similarity))


Output Shape: torch.Size([29, 32, 10003])
Output Shape -1: 10003
Output View Shape: torch.Size([895, 10003])
Output View Shape: torch.Size([895, 10003])
Prompts Shape: torch.Size([32, 29])
Prompts Transpose and remove sos Shape: torch.Size([28, 32])
Prompts Transpose and remove sos and contiguous Shape: torch.Size([28, 32])
Prompts Transpose and remove sos and contiguous and view -1 Shape: torch.Size([896])
Output: tensor([[  4.4931, -12.7695,  -0.9353,  ..., -12.7515, -12.6313, -12.7571],
        [  4.5014, -12.7801,  -0.9704,  ..., -12.7709, -12.6168, -12.7889],
        [  4.4376, -12.7442,  -0.8927,  ..., -12.8039, -12.5320, -12.8345],
        ...,
        [  7.7523, -13.0151,   2.1325,  ..., -13.3175, -13.2672, -12.9812],
        [ 16.1035, -20.3290,   0.1951,  ..., -20.2081, -19.7453, -19.6871],
        [ 16.1529, -19.9414,   0.2913,  ..., -19.9118, -19.2234, -19.2717]],
       device='cuda:0'), Shape: torch.Size([896, 10003])

Target: tensor([  17,   17,   17,   17,   17,   17,  

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 10003 while Y.shape[1] == 1