In [1]:
import torch
import torch.nn as nn
from GPT.gptForUnsupervisedTraining import GPTForUnsupervisedTraining
from GPT.gptDecoder import GPTDecoder
from transformers import GPT2Tokenizer, AutoTokenizer
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

In [2]:
# Build the decoder backbone: a 64-token context, 384-dim embeddings,
# 6 attention heads, 6 decoder blocks and a 45,000-entry vocabulary.
base_model = GPTDecoder(
    seq_len=64,
    target_vocab_size=45000,
    embedding_dim=384,
    no_heads=6,
    no_decoder_blocks=6,
)
print(base_model)

GPTDecoder(
  (decoder_stack): ModuleList(
    (0-5): 6 x DecoderBlock(
      (selfAttentionLayer): MultiHeadAttention(
        (WQuery): Linear(in_features=384, out_features=384, bias=False)
        (WKey): Linear(in_features=384, out_features=384, bias=False)
        (WValue): Linear(in_features=384, out_features=384, bias=False)
        (WOut): Linear(in_features=384, out_features=384, bias=True)
      )
      (firstLayerNorm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (secondLayerNorm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (fc): Sequential(
        (0): Linear(in_features=384, out_features=1536, bias=True)
        (1): Linear(in_features=1536, out_features=384, bias=True)
        (2): GELU(approximate='none')
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropoutLayer): Dropout(p=0.1, inplace=False)
    )
  )
  (embedding_layer): EmbeddingLayer(
    (model): Embedding(45000, 384, padding_idx=0)
  )
  (positional_embedder): Positio

In [3]:
# Wrap the decoder in the training wrapper; vocab_size repeats the
# target_vocab_size (45000) that the decoder itself was built with.
model = GPTForUnsupervisedTraining(
    base_model,
    vocab_size=45000,
)

In [4]:
# Load the pretrained OpenAI GPT tokenizer.
tokenizer = AutoTokenizer.from_pretrained('openai-community/openai-gpt')

# Use token ID 0 for padding, matching the model's Embedding(padding_idx=0)
# and the loss's ignore_index=0.
# In this vocabulary ID 0 is the existing `<unk>` token, so the pad token is
# effectively `<unk>`; no separate '[PAD]' entry is created. As a consequence,
# any genuine `<unk>` in the data is treated as padding as well.
tokenizer.pad_token_id = 0

# Show which token string the pad ID resolves to.
print(f"Pad token: {tokenizer.pad_token}, Pad token ID: {tokenizer.pad_token_id}")

Pad token: <unk>, Pad token ID: 0




In [5]:
# Read the whole lyrics corpus into a single string. The context manager
# closes the file handle as soon as the read finishes.
# NOTE: readlines() keeps each line's trailing newline, so joining on "\n"
# leaves an extra newline between lines; this is kept so the resulting text
# is unchanged.
with open(r"Kanye West Lyrics.txt", "r", encoding='utf-8') as lyrics_file:
    text = "\n".join(lyrics_file.readlines())

In [6]:
# Tokenize the entire corpus in one call. The tokenizer warns that the result
# is longer than its 512-token model limit; here the ids are only sliced into
# fixed-size windows further down, so the warning is expected.
tokens = tokenizer.encode(text)

Token indices sequence length is longer than the specified maximum sequence length for this model (94807 > 512). Running this sequence through the model will result in indexing errors


In [7]:
#tokens

In [8]:
# Accumulators for the training windows and their shifted-by-one targets.
inputs, targets = [], []

In [9]:
# Corpus length in tokens, then the largest token id on the next line
# (the id must stay below the model's 45,000-entry vocabulary).
print(len(tokens), max(tokens), sep="\n")

94807
40411


In [10]:
# Slice the token stream into non-overlapping windows of the model's context
# length, pairing each window with the same span shifted one token ahead so
# that position t in the target is the token following position t in the input.
seq_len = model.baseModel.seq_len
PAD_ID = 0  # same id as the Embedding padding_idx and the loss ignore_index

# Start from empty lists so re-running this cell never appends duplicates.
inputs, targets = [], []

# Step by seq_len rather than a literal so the windows stay contiguous and
# non-overlapping if the model's context length is ever changed.
for start in range(0, len(tokens), seq_len):
    window = tokens[start:start + seq_len]
    shifted = tokens[start + 1:start + seq_len + 1]

    # Only the tail of the corpus can come up short; right-pad it to seq_len.
    # (Multiplying a list by a non-positive count yields [], so full windows
    # pass through untouched.)
    window = window + [PAD_ID] * (seq_len - len(window))
    shifted = shifted + [PAD_ID] * (seq_len - len(shifted))

    # Build int64 tensors directly instead of converting via float32.
    inputs.append(torch.tensor(window, dtype=torch.long))
    targets.append(torch.tensor(shifted, dtype=torch.long))

In [11]:
# Give every window a leading axis of size 1, then concatenate along that
# axis to obtain one 2-D tensor with a row per window.
inputs = torch.cat([window[None] for window in inputs], dim=0)

In [12]:
# Sanity check: one row per window, one column per position in the window.
inputs.shape

torch.Size([1482, 64])

In [13]:
# Same stacking as for the inputs: add a leading axis to each target window
# and concatenate along it into a single 2-D tensor.
targets = torch.cat([window[None] for window in targets], dim=0)

In [14]:
# Serve (input window, target window) pairs in shuffled mini-batches.
batch_size = 64
dataset = TensorDataset(inputs, targets)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
)


In [15]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [16]:
# Move the model onto the selected device. The call's return value is the
# last expression in the cell, which is why the module structure is displayed.
model.to(device)

GPTForUnsupervisedTraining(
  (baseModel): GPTDecoder(
    (decoder_stack): ModuleList(
      (0-5): 6 x DecoderBlock(
        (selfAttentionLayer): MultiHeadAttention(
          (WQuery): Linear(in_features=384, out_features=384, bias=False)
          (WKey): Linear(in_features=384, out_features=384, bias=False)
          (WValue): Linear(in_features=384, out_features=384, bias=False)
          (WOut): Linear(in_features=384, out_features=384, bias=True)
        )
        (firstLayerNorm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (secondLayerNorm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (fc): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): Linear(in_features=1536, out_features=384, bias=True)
          (2): GELU(approximate='none')
          (3): Dropout(p=0.1, inplace=False)
        )
        (dropoutLayer): Dropout(p=0.1, inplace=False)
      )
    )
    (embedding_layer): EmbeddingLayer(
   

In [17]:
# Adam over all of the model's parameters with a 5e-4 learning rate.
optimizer = optim.Adam(
    model.parameters(),
    lr=5e-4,
)
# Clear gradients up front so the first training step starts clean.
optimizer.zero_grad()

In [18]:
#print(tokenizer.pad_token_id)

In [19]:
# Number of full passes over the dataloader.
EPOCHS = 1

# Token-level cross-entropy. Positions whose target id is 0 (the padding id)
# are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

In [20]:
# Train for EPOCHS passes over the dataloader and report the mean loss of each
# epoch.

# Put the model in training mode (enables its Dropout layers). Nothing inside
# the loop changes the mode, so this only needs to happen once.
model.train()

for e in range(EPOCHS):

    num_samples = 0
    epoch_loss = 0

    # Wrap the dataloader itself so tqdm knows the number of batches and can
    # render a real progress bar with a total.
    for batch_inputs, batch_targets in tqdm(dataloader):

        # Batch-local names keep the full `inputs` / `targets` tensors built
        # earlier in the notebook from being overwritten. Both halves of the
        # batch are moved to the model's device before the forward pass.
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        preds = model(batch_inputs)

        # Flatten so every token position is one classification example:
        # predictions become (positions, vocab), targets become (positions,).
        flat_preds = preds.reshape(-1, preds.shape[-1])
        flat_targets = batch_targets.reshape(-1)

        loss = loss_fn(flat_preds, flat_targets)

        # The loss is a per-batch mean, so weight it by the batch size to get
        # a sample-weighted average (the final batch may be smaller).
        batch_len = batch_inputs.shape[0]
        epoch_loss += loss.item() * batch_len
        num_samples += batch_len

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Guard against an empty dataloader instead of dividing by zero.
    mean_loss = epoch_loss / num_samples if num_samples else float('nan')
    print(f"The loss from epoch {e} is {mean_loss}")

24it [00:05,  4.39it/s]

The loss from epoch 0 is 7.68636214041356





In [21]:
# Switch to evaluation mode so the Dropout layers are inactive during
# generation.
model.eval()

GPTForUnsupervisedTraining(
  (baseModel): GPTDecoder(
    (decoder_stack): ModuleList(
      (0-5): 6 x DecoderBlock(
        (selfAttentionLayer): MultiHeadAttention(
          (WQuery): Linear(in_features=384, out_features=384, bias=False)
          (WKey): Linear(in_features=384, out_features=384, bias=False)
          (WValue): Linear(in_features=384, out_features=384, bias=False)
          (WOut): Linear(in_features=384, out_features=384, bias=True)
        )
        (firstLayerNorm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (secondLayerNorm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (fc): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): Linear(in_features=1536, out_features=384, bias=True)
          (2): GELU(approximate='none')
          (3): Dropout(p=0.1, inplace=False)
        )
        (dropoutLayer): Dropout(p=0.1, inplace=False)
      )
    )
    (embedding_layer): EmbeddingLayer(
   

In [22]:
# Encode the generation prompt into a list of token ids.
# NOTE(review): `input` shadows the Python builtin of the same name. The
# following cells read this variable, so a rename has to cover all of them.
input = tokenizer.encode("Frank Ocean")

In [23]:
# Show the prompt length: a 1-D tensor with one entry per prompt token.
print(torch.tensor(input).shape)

torch.Size([2])


In [24]:
# Generate a continuation of the prompt with temperature sampling (greedy
# decoding switched off).
# The prompt is wrapped in an outer list to add a batch dimension of 1 and is
# built as an int64 tensor directly. torch.no_grad() skips autograd
# bookkeeping, which generation does not need.
with torch.no_grad():
    res = model.decode(
        target_seq_len=64,
        inputTensor=torch.tensor([input], dtype=torch.long),
        greedy_decoding=False,
        temperature=0.7,
        show_steps=False,
    )

In [25]:
# Raw generated token ids; the first ids are the encoded prompt itself.
print(res)

tensor([[ 4416,  4688,  1531,  1531,   293,  3769,   806,   481,   762,   510,
           568,   770,   240,   249,   256,   258,   256,   241,   249,   256,
           241,   240,   260,   655,   240,   664,   616,   244,   256,   256,
           241,   240,   240,   256,   252,   481,   246,  2441,   481,   249,
           256,   241,   488,   249,   256,   241,   581,   240,   599,   256,
           241,   649,   246,   788,   240,   485,   481, 21908,   256,   241,
           507,   256,   241,   768]], dtype=torch.int32)


In [26]:
# Turn each generated row of token ids back into text and show the results
# as a list of strings.
print(list(map(tokenizer.decode, res)))

['frank ocean la la [ verse where the man me but right, i\'m\'t i\'t, - there, no this "\'\' t,,\'s the a broke the i\'t and i\'t go, what\'t like a see, to the tryin\'t it\'t us']
