In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import numpy as np
import pandas as pd
import torch.nn.functional as F
import os
from tqdm import tqdm, trange
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [3]:
# #csv_file
# #tensor_dir

# #Data loaders Pytorch dataset class
# class VideoDescriptionDataset(Dataset):
#     def __init__(self, csv_file, tensor_dir):
#         self.df = pd.read_csv(csv_file)
#         self.tensor_dir = tensor_dir

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         if torch.is_tensor(idx):
#             idx = idx.tolist()
        
#         tensor_file = os.path.join(self.tensor_dir, str(self.df.loc[idx, "id"]) + '.pt')
#         tensor = torch.load(tensor_file)
#         description = self.df.loc[idx, "description"]

#         return tensor, description




In [4]:
from torch.utils.data import DataLoader, Dataset
import torch

# Data loaders Pytorch dataset class
class VideoDescriptionDataset(Dataset):
    def __init__(self, csv_file, tensor_dir, gpt2_tokenizer):
        self.df = pd.read_csv(csv_file)
        self.tensor_dir = tensor_dir
        self.gpt2_tokenizer = gpt2_tokenizer
        self.error = []

        print("test", len(self.df), self.tensor_dir, self.gpt2_tokenizer)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):

        if torch.is_tensor(idx):
            idx = idx.tolist()
            # print("A")

        print("B")

        tensor_file = (self.tensor_dir + str(self.df.loc[idx, "id"]) + '.pt')
        print("id is" + str(self.df.loc[idx, "id"]))
        # print(f"tensor file is :{tensor_file}")

        print("C")

        try:
            print("D")
            tensor = torch.load(tensor_file).to(device)
            print("E")
            description = self.df.loc[idx, "descriptions"]
            print("F")
            # Tokenize the description using GPT-2 tokenizer
            tokens = self.gpt2_tokenizer(description, return_tensors='pt').to(device)
            print("G")
            
            return tensor, tokens["input_ids"]
        except Exception as e:
            print("H")
            print(f"Error loading data for index {idx}: {str(e)}")
            self.error.append(str(self.df.loc[idx, "id"]))
            return torch.empty(0),torch.empty(0)






In [5]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [6]:

# Hyperparameters
d_model = 512  # Dimension of the model
output_dim = 32  # Output dimension of the MLP
num_layers = 6  # Number of transformer layers
num_heads = 8  # Number of heads in multi-headed attention
dim_feedforward = 2048  # Dimension of the feedforward network
dropout = 0.1  # Dropout rate

In [7]:
# Initialize GPT-2 model and tokenizer
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
gpt2_model.eval()  # Freeze the GPT-2 model
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')



In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model

    def forward(self, x):
        L, N = x.size(1), x.size(0)
        pos = torch.arange(L).unsqueeze(0).repeat(N, 1).to(x.device)
        pos_embedding = self.calc_pos_embedding(pos)
        return x + pos_embedding
    
    def calc_pos_embedding(self, pos):
        pos = pos.float()
        factor = torch.exp(-torch.arange(0, self.d_model, 2).float() * (torch.log(torch.tensor(10000.0)) / self.d_model))
        sinusoid_inp = torch.ger(pos, factor)
        pos_embedding = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
        return pos_embedding

In [9]:
# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, max_len=5000):
#         super(PositionalEncoding, self).__init__()
#         pe = torch.zeros(max_len, d_model).to(device)
#         position = torch.arange(0, len(d_model)).unsqueeze(1).to(device)
#         div_term = torch.exp(torch.arange(0, d_model, 2) * -(torch.log(torch.tensor(10000.0)) / d_model)).to(device)
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         x = x + self.pe[:, :x.size(1)]
#         return self.dropout(x)

In [10]:
class TransformerDecoder(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, dropout, num_layers):
        super(TransformerDecoder, self).__init__()
        decoder_layer = nn.TransformerDecoderLayer(d_model, num_heads, dim_feedforward, dropout)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.output_layer = nn.Linear(d_model, output_dim)
    
    def forward(self, x, tgt):
        tgt = tgt.permute(1, 0, 2)
        output = self.transformer_decoder(tgt, x)
        output = self.output_layer(output)
        return output


In [11]:
class MLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc1(x)


In [12]:
class MyModel(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, dropout, num_layers, output_dim):
        super(MyModel, self).__init__()
        self.positional_encoding = PositionalEncoding(d_model)
        self.decoder = TransformerDecoder(d_model, num_heads, dim_feedforward, dropout, num_layers)
        self.mlp = MLP(d_model, output_dim)
    
    def forward(self, x):
        x = self.positional_encoding(x)
        x = self.decoder(x)
        x = self.mlp(x)
        return x


In [13]:
# Loss function
loss_fn = torch.nn.CrossEntropyLoss()

# csv_file = "C:/Users/dhavi/Downloads/VIdeoCLIPCap/Charades/Charades_gpt_train.csv"
csv_file = "C:/Users/dhavi/Downloads/VIdeoCLIPCap/Charades/Charades_gpt_train_64.csv"
pt_path = "C:/Users/dhavi/Downloads/VIdeoCLIPCap/video_tensors_120/"

dataset = VideoDescriptionDataset(csv_file, pt_path, gpt2_tokenizer)

# Training loop
def Train(dataset: dataset, model: MyModel, lr: float = 2e-5, warmup_steps: int = 5000, output_dir: str = ".", output_prefix: str = ""):
    
    # Create a SummaryWriter
    writer = SummaryWriter()

    print("1")


    # Initialize the DataLoader
    dataloader = DataLoader(dataset, batch_size=5, shuffle=True)#, num_workers=2)

    print("2")


    num_epochs = 10
    # Optimizer
    optimizer = AdamW(model.parameters(), lr=lr)

    print("3")


    # Scheduler
    scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_epochs * len(dataloader)
        )
    
    print(3.5)

    model.train()
    for epoch in range(num_epochs):
        print(3.75)
        # Use tqdm to show progress bar
        for i, (input_data, target) in tqdm(enumerate(dataloader)):

            if (input_data.shape == torch.Size([])) or (target.shape == torch.Size([])):
                print(f"Skipping batch {i} due to error loading data")
                continue


            print("3.8")
            input_data, target = input_data.to(device), target.to(device)

            print("4")

            # Forward pass through your model
            output = model(input_data)

            print("5")


            # Forward pass through GPT-2 model
            gpt2_output = gpt2_model(output).last_hidden_state

            print("6")  


            # Remove the first 32 tokens from the GPT-2 output
            gpt2_output = gpt2_output[:, 32:]

            print("7")


            # Calculate loss
            loss = loss_fn(gpt2_output, target)
            print(loss)

            # Log the loss to TensorBoard
            writer.add_scalar('Training Loss', loss.item(), epoch * len(dataloader) + i)

            # Backpropagate loss and update parameters
            optimizer.zero_grad()
            loss.backward()
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            # Update learning rate
            scheduler.step()

            # Save model parameters every 500 iterations
            if (i + 1) % 50 == 0:
                torch.save(model.state_dict(), os.path.join(output_dir, f'{output_prefix}_epoch_{epoch}_iter_{i + 1}.pt'))

        print(f'Epoch: {epoch}, Loss: {loss.item()}')

        # Save model checkpoint at the end of each epoch
        torch.save(model.state_dict(), os.path.join(output_dir, f'{output_prefix}_epoch_{epoch}.pt'))

    # Load the best model checkpoint
    model.load_state_dict(torch.load(os.path.join(output_dir, f'{output_prefix}_best.pt')))

    # Close the SummaryWriter
    writer.close()


test 7288 C:/Users/dhavi/Downloads/VIdeoCLIPCap/video_tensors_120/ GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)


In [14]:
# Create an instance of the model, passing gpt2_model and gpt2_tokenizer as arguments
model = MyModel(d_model, num_heads, dim_feedforward, dropout, num_layers, output_dim)

Train(dataset, model)



1
2
3
3.5
3.75


0it [00:00, ?it/s]

B
id isLC1NU
C
D
E
F
G
B
id isJ5CAN
C
D
E
F
G
B
id isPKEZI
C
D
E
F
G
B
id isXYWWG
C
D
E
F
G
B
id isPGKB4
C
D
E
F
G


0it [00:00, ?it/s]


RuntimeError: stack expects each tensor to be equal size, but got [82, 512] at entry 0 and [32, 512] at entry 1

In [None]:
print(len(dataset.error))

In [None]:
tensor = torch.load("C:/Users/dhavi/Downloads/VIdeoCLIPCap/video_tensors_120/01O27.pt")
tensor.shape