In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tiktoken
import numpy as np
from Single_Block_Fortnite import EmbeddingNN, AttentionNN, NormNN, FFN
tokenizer = tiktoken.get_encoding("gpt2")
from datasets import load_dataset

# This should work now since you are logged in


ds = load_dataset("yahma/alpaca-cleaned")
print(ds)


DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 51760
    })
})


In [2]:
import torch
from datasets import load_dataset
import tiktoken

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load dataset (assuming already logged in via Hugging Face)

train_texts = ds['train']  # list of all text entries in train split

# Initialize GPT-2 tokenizer
tokenizer = tiktoken.get_encoding("gpt2")
vocab_size = tokenizer.n_vocab
print("Vocab size:", vocab_size)

# Encode all text entries into a single list of integers
integers = []
end_token_id = 50256  # GPT-2 end token

for text in train_texts:
    tokens = tokenizer.encode(str(text))
    integers.extend(tokens)     # add tokens
    integers.append(end_token_id)  # append end-of-text token

# Convert to PyTorch tensor
data = torch.tensor(integers, dtype=torch.long).to(device)
print("Data shape:", data.shape)


Using device: cuda
Vocab size: 50257
Data shape: torch.Size([9731408])


In [3]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#text = ds
#with open("C:/Users/PC/Desktop/Important Data/Shakespeare_Data.txt", "r", encoding="utf-8") as f:
    #text = f.read()
  
    
#integers = tokenizer.encode(str(text)) 
#vocab_size = tokenizer.n_vocab  # Total number of tokens in the tokenizer
#end_token_id = 50256  
#integers.append(end_token_id) 
#data = torch.tensor(integers , dtype=torch.long).to(device)


#print(vocab_size)
#data = torch.tensor(integers , dtype = torch.long)
#batch_x = data[i:i+seq_len].unsqueeze(0)
   #batch_x = batch_x.repeat(batch_size, 1)

In [4]:
class TransformerBlock(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super().__init__()

        self.attention = AttentionNN(hidden_dim, num_heads)
        self.ffn = FFN(hidden_dim)
        self.norm1 = NormNN(hidden_dim)
        self.norm2 = NormNN(hidden_dim)

    def forward(self, X):
        attention_output = self.attention(X)
        X = X + attention_output

        pog_X = self.norm1(X)

        pogger_X = self.ffn(pog_X) + pog_X

        poggers_X = self.norm2(pogger_X)

        return poggers_X

In [5]:
class Transformer(nn.Module):
    def __init__(self,vocab_size , hidden_dim , num_heads,max_sequence_length, N):
        super().__init__()

        self.embedding = EmbeddingNN(vocab_size, hidden_dim, max_sequence_length)
        self.blocks = nn.ModuleList([TransformerBlock(hidden_dim, num_heads) for i in range(N)])
        self.linear = nn.Linear(hidden_dim,vocab_size)

    def forward(self, X):
        X = self.embedding(X)
        for block in self.blocks:
            X = block(X)
        X = self.linear(X)

        return X

In [6]:
#tokenizer = tiktoken.get_encoding("gpt2")

#integers = tokenizer.encode(text)
#decode_dict = {i : chars for i , chars in enumerate(chars)}
#encode_dict = {chars : i for i , chars in enumerate(chars)}

#encode = lambda text : [encode_dict[i] for i in text]
#decode = lambda indices : ''.join([decode_dict[i] for i in indices])

#encoded_text = encode(text)

#print(encode_dict['<END>'])
#print("Max index in data:", max(encoded_text), "Vocab size:", vocab_size)
 # usually 50256 for GPT-2



In [7]:
batch_size =16
hidden_dim = 512
max_sequence_length = 512
num_heads = 8
seq_len = 128
epochs = 3

In [8]:
Transformer_model = Transformer(vocab_size, hidden_dim, num_heads, max_sequence_length, 12).to(device)
optimizer = optim.Adam(Transformer_model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
n = 0
loss = None
batch_y = []
for epoch in range(epochs):
    n = n+1
    print(f"Epoch Number: {n}")
    for start in range(0, len(data) - batch_size*seq_len, batch_size*seq_len):
    
        end = start+batch_size*seq_len
        if end + 1 > len(data):
            break
    
        batch_x = data[start : end].view(batch_size, seq_len).to(device)
        batch_y = data[start+1 : end+1].view(batch_size, seq_len).to(device)
        
        logits = Transformer_model(batch_x).to(device)
    
        logits = logits.view(-1 , vocab_size)
        batch_y = batch_y.view(-1)
    
        loss = criterion(logits, batch_y)
    
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if loss is not None:
        print("Final loss:", loss.item())
    else:
        print("No batches were processed. Loss not defined.")

Epoch Number: 1


KeyboardInterrupt: 

In [None]:
torch.save(Transformer_model, "Sigma_Transformer_full.pth")


In [None]:
model = torch.load("Sigma_Transformer_full.pth", map_location=device)
model.to(device)
model.eval()


In [None]:
text = input()
input_ids = tokenizer.encode(text)
input_text = torch.tensor([input_ids]).to(device)

inference = []
temperature = 0.7
while True:
    
    logits = Transformer_model(input_text).to(device)
    next_token = logits[:,-1 ,:]
    probs = torch.softmax(next_token / temperature, dim=-1)
    next_token_id = torch.multinomial(probs, 1)
    input_text = torch.cat([input_text, next_token_id], dim=1)

    inference.append(next_token_id.item())

    if next_token_id.item() == 50256 or len(inference) >=  200:
        break
        
print(tokenizer.decode(inference))
