In [61]:
# !pip install tiktoken

In [62]:
import torch
import nltk
from nltk import word_tokenize
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader
import pandas as pd
from tqdm import tqdm
import torch.optim as optim
import pickle
import math
import torch.nn.functional as F
import tiktoken
from tqdm import tqdm

In [63]:
# Initialize GPT-2 tokenizer
encoding = tiktoken.get_encoding("gpt2")

# Define the end-of-text token
eot_token = encoding.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})[0]

text_data = [
    "Hello world this is GPT demo",
    "I am learning transformers",
    "PyTorch makes it easy",
    "Causal masking is important",
    "Self attention is powerful",
    "Feed forward layers help",
    "Normalization stabilizes training",
    "Dropout prevents overfitting",
    "Token embeddings are essential",
    "Position embeddings add order",
    "Mini GPT can learn patterns",
    "Sequence modeling is fun",
    "We generate text autoregressively",
    "Training requires lots of data",
    "Learning rate matters",
    "Optimization is key",
    "Batches speed up training",
    "Masking future tokens is critical",
    "Logits predict the next token",
    "Generation loops one token at a time",
    "Deep learning is fascinating",
    "Neural networks are universal approximators",
    "Backpropagation adjusts weights",
    "Gradient descent minimizes loss",
    "Overfitting occurs with small datasets",
    "Validation helps detect overfitting",
    "Regularization improves generalization",
    "Convolutional layers process images",
    "Recurrent layers process sequences",
    "Transformers excel at NLP",
    "Attention allows context awareness",
    "GPT models are decoder-only",
    "BERT models are encoder-only",
    "Seq2Seq models translate languages",
    "Tokenization splits text into tokens",
    "Embedding layers map tokens to vectors",
    "Activation functions introduce nonlinearity",
    "ReLU is widely used",
    "Softmax converts logits to probabilities",
    "Cross entropy loss is standard for classification",
    "Adam optimizer adapts learning rates",
    "Learning rate schedulers help convergence",
    "Gradient clipping prevents exploding gradients",
    "Layer normalization stabilizes training",
    "Dropout randomly disables neurons",
    "Residual connections improve gradient flow",
    "Positional encoding adds order information",
    "Causal masking prevents cheating",
    "Autoregressive models predict next token",
    "Top-k sampling makes generation diverse",
    "Temperature controls randomness",
    "Beam search improves generation quality",
    "MiniGPT is a small transformer model",
    "Training takes GPU acceleration",
    "Data preprocessing cleans the text",
    "Padding aligns sequences",
    "Batching increases efficiency",
    "Evaluation measures accuracy",
    "Perplexity measures language model performance",
    "Text generation is fun",
    "Code generation is possible",
    "Mathematical reasoning can be learned",
    "Logic puzzles can be solved",
    "Chess and Go can be modeled",
    "Reinforcement learning trains agents",
    "Q-learning is a basic RL algorithm",
    "Policy gradient optimizes expected reward",
    "Value functions estimate future returns",
    "Exploration vs exploitation is key",
    "Simulation helps RL training",
    "Environment defines agent interactions",
    "Observations are agent inputs",
    "Actions change the state",
    "Rewards guide learning",
    "Discount factor values future rewards",
    "Experience replay stabilizes training",
    "Target networks improve convergence",
    "Actor-critic combines policy and value",
    "Deep Q-Networks use neural networks",
    "Tensor operations are efficient",
    "Broadcasting simplifies arithmetic",
    "Autograd computes gradients automatically",
    "Checkpointing saves model states",
    "Early stopping prevents overfitting",
    "Hyperparameter tuning is important",
    "Random seeds ensure reproducibility",
    "Data augmentation expands datasets",
    "Transfer learning leverages pre-trained models",
    "Fine-tuning adapts models to new tasks",
    "Language modeling predicts next word",
    "Masked language modeling predicts missing tokens",
    "Sequence classification assigns labels",
    "Text summarization shortens content",
    "Question answering extracts answers",
    "Named entity recognition identifies entities",
    "Part-of-speech tagging labels words",
    "Machine translation converts languages",
    "Sentiment analysis detects emotions",
    "Topic modeling clusters documents",
    "Clustering groups similar items",
    "Dimensionality reduction simplifies data",
    "Principal Component Analysis reduces dimensions",
    "t-SNE visualizes high-dimensional data",
    "UMAP preserves global structure",
    "Cosine similarity measures similarity",
    "Euclidean distance measures distance",
    "KNN classifies based on neighbors",
    "SVM separates classes with hyperplanes",
    "Random forests ensemble decision trees",
    "Gradient boosting improves weak learners",
    "XGBoost is a popular boosting algorithm",
    "LightGBM is optimized for speed",
    "CatBoost handles categorical features",
    "Neural networks approximate functions",
    "Activation functions include ReLU, Tanh, Sigmoid",
    "Optimization minimizes the loss function",
    "Batch normalization stabilizes training",
    "Residual networks improve deep training",
    "Attention mechanisms focus on important features",
    "Transformers replaced RNNs in NLP",
    "Pre-training and fine-tuning are common",
    "Self-supervised learning reduces labeled data needs"
]


class GPTDataset(Dataset):
    def __init__(self, data, block_size=32):
        self.block_size = block_size
        self.tokens = []
        for line in data:
            # Encode line and append end-of-text token
            self.tokens.extend(encoding.encode(line) + [eot_token])

    def __len__(self):
        return len(self.tokens) - self.block_size

    def __getitem__(self, idx):
        x = torch.tensor(self.tokens[idx:idx+self.block_size], dtype=torch.long)
        y = torch.tensor(self.tokens[idx+1:idx+self.block_size+1], dtype=torch.long)
        return x, y


dataset = GPTDataset(text_data, block_size=32)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Example: Print the shape of input and target tensors
for xb, yb in dataloader:
    print("Input batch shape:", xb.shape)
    print("Target batch shape:", yb.shape)
    break


Input batch shape: torch.Size([2, 32])
Target batch shape: torch.Size([2, 32])


In [64]:
class GPTConfig:
  def __init__(self,vocab_size,block_size,n_layer=4,n_head=4,n_embd=128,dropout=0.1):
      self.vocab_size=vocab_size
      self.block_size=block_size
      self.n_layer=n_layer
      self.n_head=n_head
      self.n_embd=n_embd
      self.dropout=dropout

In [65]:
class CasualSelfAttention(nn.Module):
  def __init__(self,config:GPTConfig):
      super().__init__()
      assert config.n_embd%config.n_head==0
      self.n_head = config.n_head
      self.n_embd=config.n_embd
      self.head_dim=config.n_embd//config.n_head
      self.c_attn=nn.Linear(config.n_embd,3*config.n_embd)
      self.c_proj=nn.Linear(config.n_embd, config.n_embd)
      self.attn_dropout=nn.Dropout(config.dropout)
      self.resid_dropout=nn.Dropout(config.dropout)

      self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

  def forward(self,x):
        B,T,C=x.size()
        q,k,v=self.c_attn(x).split(self.n_embd,dim=2)
        q=q.view(B,T,self.n_head,self.head_dim).transpose(1,2)
        k=k.view(B,T,self.n_head,self.head_dim).transpose(1,2)
        v=v.view(B,T,self.n_head,self.head_dim).transpose(1,2)

        att=(q@k.transpose(-2,-1))*(1.0/math.sqrt(self.head_dim))
        att=att.masked_fill(self.bias[:,:,:T,:T]==0,float('-inf'))
        att=F.softmax(att,dim=-1)
        att=self.attn_dropout(att)

        y=att@v
        y=y.transpose(1, 2).contiguous().view(B, T, C)
        y=self.resid_dropout(self.c_proj(y))
        return y


In [66]:
class MLP(nn.Module):
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.c_fc=nn.Linear(config.n_embd,4*config.n_embd)
        self.c_proj=nn.Linear(4*config.n_embd,config.n_embd)
        self.dropout=nn.Dropout(config.dropout)

    def forward(self, x):
        return self.dropout(self.c_proj(F.gelu(self.c_fc(x))))


In [67]:
class Block(nn.Module):
  def __init__(self,config:GPTConfig):
      super().__init__()
      self.ln1=nn.LayerNorm(config.n_embd)
      self.attn=CasualSelfAttention(config)
      self.ln2=nn.LayerNorm(config.n_embd)
      self.ff=MLP(config)
  def forward(self,x):
      x=x+self.attn(self.ln1(x))
      x=x+self.ff(self.ln2(x))
      return x

In [68]:
class GPT(nn.Module):
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.config = config
        # Token + positional embeddings
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
        self.pos_emb = nn.Embedding(config.block_size, config.n_embd)
        self.drop = nn.Dropout(config.dropout)

        # Transformer blocks
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd)

        # Final linear layer for logits
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, "Sequence too long!"

        # Embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device).unsqueeze(0)
        x = self.tok_emb(idx) + self.pos_emb(pos)
        x = self.drop(x)

        # Pass through transformer blocks
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)

        logits = self.head(x)

        # If targets given, compute loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=50, temperature=1.0, top_k=None):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature

            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float("Inf")

            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
            if next_token.item() == eot_token:
              break
        return idx


In [82]:
# Build model
vocab_size = encoding.n_vocab
config = GPTConfig(vocab_size=vocab_size, block_size=32)
model = GPT(config)

optimizer = optim.AdamW(model.parameters(), lr=3e-4)

# Training loop
epochs = 3
for epoch in range(epochs):
    for xb, yb in tqdm(dataloader):
        logits, loss = model(xb, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


100%|██████████| 422/422 [01:21<00:00,  5.16it/s]


Epoch 1, Loss: 4.1117


100%|██████████| 422/422 [01:20<00:00,  5.23it/s]


Epoch 2, Loss: 1.3834


100%|██████████| 422/422 [01:20<00:00,  5.22it/s]

Epoch 3, Loss: 0.5132





In [102]:
# Training loop
epochs = 3
for epoch in range(epochs):
    for xb, yb in tqdm(dataloader):
        logits, loss = model(xb, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

100%|██████████| 422/422 [01:20<00:00,  5.23it/s]


Epoch 1, Loss: 0.2207


100%|██████████| 422/422 [01:20<00:00,  5.24it/s]


Epoch 2, Loss: 0.1295


100%|██████████| 422/422 [01:20<00:00,  5.27it/s]

Epoch 3, Loss: 0.2580





In [124]:
# Seed prompt: "Hello world"
prompt = "Deep Neural"
encoded = torch.tensor([encoding.encode(prompt)], dtype=torch.long)

generated = model.generate(encoded, max_new_tokens=30, temperature=0.8, top_k=50)
decoded = encoding.decode(generated[0].tolist())
print("\nGenerated text:\n", decoded)



Generated text:
 Deep Neural dimensions<|endoftext|>


In [None]:
total_params = sum(p.numel() for p in model.parameters())
print("Total parameters:", total_params)
