# Getting Data

In [None]:
# !pip install tiktoken

In [None]:
import torch
import nltk
from nltk import word_tokenize
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader
import pandas as pd
from tqdm import tqdm
import torch.optim as optim
import pickle
import math
import torch.nn.functional as F
import tiktoken
from tqdm import tqdm

In [None]:
# GPT-2 byte-pair tokenizer, shared by the dataset and by generation.
encoding = tiktoken.get_encoding("gpt2")

# Id of the end-of-text marker. The marker must be explicitly allowed,
# otherwise the encoder refuses to treat it as a special token.
_eot_text = "<|endoftext|>"
eot_token = encoding.encode(_eot_text, allowed_special={_eot_text})[0]

# Toy training corpus: short, independent one-sentence strings about
# machine-learning topics. Each entry is treated as one line of text.
text_data = [
    "Hello world this is GPT demo",
    "I am learning transformers",
    "PyTorch makes it easy",
    "Causal masking is important",
    "Self attention is powerful",
    "Feed forward layers help",
    "Normalization stabilizes training",
    "Dropout prevents overfitting",
    "Token embeddings are essential",
    "Position embeddings add order",
    "Mini GPT can learn patterns",
    "Sequence modeling is fun",
    "We generate text autoregressively",
    "Training requires lots of data",
    "Learning rate matters",
    "Optimization is key",
    "Batches speed up training",
    "Masking future tokens is critical",
    "Logits predict the next token",
    "Generation loops one token at a time",
    "Deep learning is fascinating",
    "Neural networks are universal approximators",
    "Backpropagation adjusts weights",
    "Gradient descent minimizes loss",
    "Overfitting occurs with small datasets",
    "Validation helps detect overfitting",
    "Regularization improves generalization",
    "Convolutional layers process images",
    "Recurrent layers process sequences",
    "Transformers excel at NLP",
    "Attention allows context awareness",
    "GPT models are decoder-only",
    "BERT models are encoder-only",
    "Seq2Seq models translate languages",
    "Tokenization splits text into tokens",
    "Embedding layers map tokens to vectors",
    "Activation functions introduce nonlinearity",
    "ReLU is widely used",
    "Softmax converts logits to probabilities",
    "Cross entropy loss is standard for classification",
    "Adam optimizer adapts learning rates",
    "Learning rate schedulers help convergence",
    "Gradient clipping prevents exploding gradients",
    "Layer normalization stabilizes training",
    "Dropout randomly disables neurons",
    "Residual connections improve gradient flow",
    "Positional encoding adds order information",
    "Causal masking prevents cheating",
    "Autoregressive models predict next token",
    "Top-k sampling makes generation diverse",
    "Temperature controls randomness",
    "Beam search improves generation quality",
    "MiniGPT is a small transformer model",
    "Training takes GPU acceleration",
    "Data preprocessing cleans the text",
    "Padding aligns sequences",
    "Batching increases efficiency",
    "Evaluation measures accuracy",
    "Perplexity measures language model performance",
    "Text generation is fun",
    "Code generation is possible",
    "Mathematical reasoning can be learned",
    "Logic puzzles can be solved",
    "Chess and Go can be modeled",
    "Reinforcement learning trains agents",
    "Q-learning is a basic RL algorithm",
    "Policy gradient optimizes expected reward",
    "Value functions estimate future returns",
    "Exploration vs exploitation is key",
    "Simulation helps RL training",
    "Environment defines agent interactions",
    "Observations are agent inputs",
    "Actions change the state",
    "Rewards guide learning",
    "Discount factor values future rewards",
    "Experience replay stabilizes training",
    "Target networks improve convergence",
    "Actor-critic combines policy and value",
    "Deep Q-Networks use neural networks",
    "Tensor operations are efficient",
    "Broadcasting simplifies arithmetic",
    "Autograd computes gradients automatically",
    "Checkpointing saves model states",
    "Early stopping prevents overfitting",
    "Hyperparameter tuning is important",
    "Random seeds ensure reproducibility",
    "Data augmentation expands datasets",
    "Transfer learning leverages pre-trained models",
    "Fine-tuning adapts models to new tasks",
    "Language modeling predicts next word",
    "Masked language modeling predicts missing tokens",
    "Sequence classification assigns labels",
    "Text summarization shortens content",
    "Question answering extracts answers",
    "Named entity recognition identifies entities",
    "Part-of-speech tagging labels words",
    "Machine translation converts languages",
    "Sentiment analysis detects emotions",
    "Topic modeling clusters documents",
    "Clustering groups similar items",
    "Dimensionality reduction simplifies data",
    "Principal Component Analysis reduces dimensions",
    "t-SNE visualizes high-dimensional data",
    "UMAP preserves global structure",
    "Cosine similarity measures similarity",
    "Euclidean distance measures distance",
    "KNN classifies based on neighbors",
    "SVM separates classes with hyperplanes",
    "Random forests ensemble decision trees",
    "Gradient boosting improves weak learners",
    "XGBoost is a popular boosting algorithm",
    "LightGBM is optimized for speed",
    "CatBoost handles categorical features",
    "Neural networks approximate functions",
    "Activation functions include ReLU, Tanh, Sigmoid",
    "Optimization minimizes the loss function",
    "Batch normalization stabilizes training",
    "Residual networks improve deep training",
    "Attention mechanisms focus on important features",
    "Transformers replaced RNNs in NLP",
    "Pre-training and fine-tuning are common",
    "Self-supervised learning reduces labeled data needs"
]


class GPTDataset(Dataset):
    """Sliding-window next-token dataset over one flat token stream.

    Every line of ``data`` is encoded with the module-level ``encoding``,
    terminated with ``eot_token``, and appended to ``self.tokens``. Item ``i``
    is the pair ``(tokens[i:i+block_size], tokens[i+1:i+block_size+1])``, so
    the target is the input shifted by one position.

    Args:
        data: Iterable of text lines.
        block_size: Number of tokens in each input/target window.
    """

    def __init__(self, data, block_size=32):
        self.block_size = block_size
        self.tokens = []
        for line in data:
            # Encode line and append end-of-text token
            self.tokens.extend(encoding.encode(line) + [eot_token])

    def __len__(self):
        # A corpus no longer than one block has no complete (input, target)
        # pair. Clamp to zero: a negative value makes len() raise ValueError.
        return max(0, len(self.tokens) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` LongTensor pair for window ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx = idx + size
        if not 0 <= idx < size:
            # Without this check an out-of-range index silently yields a
            # truncated (or empty) window instead of failing.
            raise IndexError(
                f"index out of range for GPTDataset with {size} windows"
            )
        # One slice of block_size + 1 tokens holds both the input and the
        # target shifted by one.
        window = self.tokens[idx:idx + self.block_size + 1]
        x = torch.tensor(window[:-1], dtype=torch.long)
        y = torch.tensor(window[1:], dtype=torch.long)
        return x, y


dataset = GPTDataset(text_data, block_size=32)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Peek at a single batch to confirm the input/target tensor shapes.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    xb, yb = first_batch
    print("Input batch shape:", xb.shape)
    print("Target batch shape:", yb.shape)


Input batch shape: torch.Size([2, 32])
Target batch shape: torch.Size([2, 32])


In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Transformer

In [None]:
def scaled_dot_product_attn(Q, K, V, mask=None):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q, K, V: Query, key and value tensors. Callers in this file pass
            (batch_size, num_head, seq, d_k); only the last two axes are
            used by the matmuls.
        mask: Optional tensor broadcastable to the score matrix. Positions
            where ``mask == 0`` are excluded from attention.

    Returns:
        Tuple ``(out, attn)``: the attention-weighted values and the
        attention weights (softmax over the key axis).
    """
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # Use the most negative finite value of the score dtype rather than a
        # hard-coded -1e9: -1e9 is not representable in float16 and does not
        # dominate real scores that happen to be below -1e9. A finite fill
        # (unlike -inf) keeps fully masked rows free of NaN.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    attn = F.softmax(scores, dim=-1)
    out = torch.matmul(attn, V)
    return out, attn

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention over a (batch, seq, embed) input.

    The embedding is split evenly across ``num_head`` heads of width
    ``embed_size // num_head``; ``embed_size`` must be divisible by
    ``num_head``.
    """

    def __init__(self, embed_size, num_head):
        super().__init__()
        assert embed_size % num_head == 0
        self.num_head = num_head
        self.d_k = embed_size // num_head

        # Query/key/value projections followed by the output projection.
        self.W_Q = nn.Linear(embed_size, embed_size)
        self.W_K = nn.Linear(embed_size, embed_size)
        self.W_V = nn.Linear(embed_size, embed_size)
        self.W_O = nn.Linear(embed_size, embed_size)

    def forward(self, x, mask=None):
        """Attend ``x`` to itself and return a tensor of the same shape."""
        batch_size, seq_length, embed_dim = x.size()
        per_head_shape = (batch_size, seq_length, self.num_head, self.d_k)

        # Project, split the embedding into heads, then move the head axis in
        # front of the sequence axis: (batch, num_head, seq, d_k).
        Q, K, V = (
            projection(x).view(per_head_shape).transpose(1, 2)
            for projection in (self.W_Q, self.W_K, self.W_V)
        )

        context, _ = scaled_dot_product_attn(Q, K, V, mask)

        # Undo the head split: back to (batch, seq, embed).
        merged = context.transpose(1, 2).reshape(batch_size, seq_length, embed_dim)
        return self.W_O(merged)

In [None]:
class FeedForward(nn.Module):
    """Two-layer position-wise MLP: Linear -> ReLU -> Linear.

    Expands the last dimension from ``embed_size`` to ``hidden_size`` and
    projects it back to ``embed_size``.
    """

    def __init__(self, embed_size, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(embed_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, embed_size)

    def forward(self, x):
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        return self.fc2(activated)

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, embed_size, num_head, hidden_size_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(embed_size, num_head)
        self.ff = FeedForward(embed_size, hidden_size_ff)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        # A single dropout module is reused after both sub-layers.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Attention sub-layer.
        attended = self.dropout(self.mha(x, mask))
        x = self.norm1(x + attended)

        # Feed-forward sub-layer.
        transformed = self.dropout(self.ff(x))
        return self.norm2(x + transformed)

In [None]:
class TransformerBlock(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``num_decoder_layer`` ``DecoderLayer`` blocks under a causal mask, and
    projected to per-position vocabulary logits.

    Args:
        vocab_size: Number of token ids (embedding rows and logit columns).
        embed_size: Embedding / model width.
        max_len: Number of learned positions, i.e. the longest sequence
            ``forward`` accepts.
        num_head: Attention heads per decoder layer.
        hidden_size_ff: Hidden width of each feed-forward sub-layer.
        num_decoder_layer: Number of stacked decoder layers.
        dropout: Dropout probability passed to every decoder layer.
    """

    def __init__(self, vocab_size, embed_size, max_len, num_head, hidden_size_ff, num_decoder_layer, dropout=0.1):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.pos_embeddings = nn.Embedding(max_len, embed_size)
        self.layers = nn.ModuleList(
            [DecoderLayer(embed_size, num_head, hidden_size_ff, dropout) for _ in range(num_decoder_layer)]
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, seq, vocab_size) for token ids ``x``.

        Args:
            x: 2-D tensor of token ids, (batch, seq).

        Raises:
            IndexError: If ``seq`` exceeds the number of learned positions.
        """
        batch_size, seq_length = x.size()

        max_len = self.pos_embeddings.num_embeddings
        if seq_length > max_len:
            # Fail early with a readable message instead of an opaque
            # out-of-range lookup inside the position embedding.
            raise IndexError(
                f"sequence length {seq_length} exceeds max_len {max_len}"
            )

        positions = torch.arange(0, seq_length, device=x.device).unsqueeze(0).expand(batch_size, seq_length)

        x = self.embeddings(x) + self.pos_embeddings(positions)
        # Lower-triangular causal mask, broadcast over batch and heads. It is
        # built on the input's device (not a module-level global) so the model
        # works wherever its inputs live.
        mask = torch.tril(torch.ones((seq_length, seq_length), device=x.device)).unsqueeze(0).unsqueeze(1)
        for layer in self.layers:
            x = layer(x, mask)

        return self.fc_out(x)

# Training and Inference

In [None]:
# Vocabulary size reported by the tokenizer; the training loop uses it to
# flatten the logits before computing the loss.
vocab_size=encoding.n_vocab

In [None]:
# Size the embedding and output layers from the tokenizer's vocabulary rather
# than a hard-coded 50257, so the model always agrees with the `vocab_size`
# used to reshape logits in the training loop.
# Remaining arguments: embed_size=128, max_len=100, num_head=4,
# hidden_size_ff=256, num_decoder_layer=4.
model = TransformerBlock(vocab_size, 128, 100, 4, 256, 4).to(device)

In [None]:
# Optimiser step size and the default number of passes over the data.
lr, epochs = 1e-3, 10

In [None]:
# Adam over every model parameter; token-level cross entropy as the loss.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Count the scalar entries of every parameter tensor in the model.
parameter_sizes = [weights.numel() for weights in model.parameters()]
num_params = sum(parameter_sizes)
print(f"Total parameters: {num_params}")


Total parameters: 13458769


In [None]:
# Smoke test: run one batch through the model and print the logits' shape.
for xb, yb in dataloader:
    print("Input batch shape:", xb.shape)
    print("Target batch shape:", yb.shape)
    # The model was moved to `device`, so the batch must be moved too;
    # feeding it a CPU batch fails when `device` is CUDA. No gradients are
    # needed just to inspect a shape.
    with torch.no_grad():
        print(model(xb.to(device)).shape)
    break

Input batch shape: torch.Size([2, 32])
Target batch shape: torch.Size([2, 32])
torch.Size([2, 32, 50257])


In [None]:
# Overrides the earlier default: a single pass over the data is used here.
epochs = 1

for epoch in range(epochs):
    # Training mode enables dropout inside the decoder layers.
    model.train()
    total_loss = 0
    for xb, yb in tqdm(dataloader):
        xb = xb.to(device)
        yb = yb.to(device)

        # One optimisation step: clear stale gradients, forward, loss,
        # backward, parameter update.
        optimizer.zero_grad()
        logits = model(xb)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets so the
        # loss is computed per token.
        flat_logits = logits.view(-1, vocab_size)
        flat_targets = yb.view(-1)
        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    mean_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} Loss: {mean_loss:.4f}")


100%|██████████| 422/422 [01:54<00:00,  3.69it/s]

Epoch 1 Loss: 0.2299





In [None]:
def generate(model, start_text, max_new_tokens=50):
    """Greedily extend ``start_text`` one token at a time.

    The prompt is encoded with the module-level ``encoding``. At each step the
    highest-scoring next token is appended, and generation stops early when
    that token is ``eot_token``.

    Args:
        model: Callable mapping a (1, seq) tensor of token ids to
            (1, seq, vocab) logits. It is switched to eval mode.
        start_text: Prompt text. It must encode to at least one token,
            because the last position's logits are read.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded prompt plus generated continuation, without the
        end-of-text token.
    """
    model.eval()
    tokens = encoding.encode(start_text)
    tokens = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)

    # If the model exposes a learned position table, its size bounds the
    # context the model can accept; longer inputs would index out of range.
    position_table = getattr(model, "pos_embeddings", None)
    max_context = getattr(position_table, "num_embeddings", None)

    # Inference only: skip autograd bookkeeping so no graph is built per step.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Feed only the most recent tokens once the context is full.
            context = tokens if max_context is None else tokens[:, -max_context:]
            logits = model(context)
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            if next_token.item() == eot_token:
                break
            tokens = torch.cat([tokens, next_token], dim=1)

    return encoding.decode(tokens[0].tolist())

# Example: greedily complete a short prompt with the trained model.
sample = generate(model, "Early stopping")
print(sample)


Early stopping prevents overfitting
