In [1]:
import numpy as np

In [2]:
with open('input.txt', 'r') as f:
    text = f.read()
print(f'Length of text: {len(text)}')


Length of text: 1115394


In [3]:
print(text[:16])

First Citizen:
B


In [4]:
#before beginning, let us fist see kaun kaunse characters use ho rahe hain
chars = set(text) #unique characters choose ho gaye apne aap, unordered
chars = sorted(list(set(text))) #list of set -> we get an ordering, an arbitrary ordering tho. Then sorted makes it a particular ordering
chars
print(''.join(chars))
vocab_size = len(chars) #these characters are our vocabulary, we'll make new words etc from these
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
#create a mapping from characters to integers
#tokenize: convert the string of chars to some sequence of integers acc to some method

#one schema for tokenizing is creating a simple look-up table
stoi = {ch:i for i,ch in enumerate(chars)} #string to integer
itos = {i:ch for i, ch in enumerate(chars)}



In [6]:
stoi['A']

13

In [7]:
[stoi[c] for c in "Hello"]

[20, 43, 50, 50, 53]

In [8]:
[itos[c] for c in [20, 43, 50, 50, 53]]

['H', 'e', 'l', 'l', 'o']

In [9]:
encoder = lambda s: [stoi[c] for c in s] #Takes in s and then does [..] on s 
decoder = lambda s: [itos[c] for c in s]
enc = encoder('Hello')
dec = decoder(enc)
print(enc)
print(''.join(dec))

[20, 43, 50, 50, 53]
Hello


In [10]:
#Let us encode and tokenize our dataset
data = encoder(text)
import torch 
data = torch.tensor(encoder(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [11]:
#Split data into train adn validation test
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [12]:
#We do not feed the whole training data into the neural networks, we work with chunks of the data
#we sample random chunks, some maximum length, aka block size
#we call this chunks as context window, or block size
block_size = 8
train_data[:block_size+1] #here are 9 characters, with 8 examples to train on
#18 ke bad 47 aata hai
#18 & 47 ke bad 56
#18,47,56, ke bad 57 etc and so on


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [13]:
x = train_data[:block_size] #block size characters
y = train_data[1:block_size+1] #next block_size characters, since it is offset by 1
for t in range(block_size):
    context = x[:t+1] #input is the chars upto and including t
    target = y[t] #target is the t-th character in target array y
    print(f"when input is {context} the target: {target}")


when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [14]:
torch.manual_seed(1337)
#creating batches of the data so that the GPUs are busy and GPUs can train them independently
batch_size = 4 #number of indepndent sequences to be trained in parallel every fwd and backward passs of the transformer
block_size = 8 #what is max context length for predictions?

def get_batch(split):
    #generate a small batch of data of inputs x an y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,)) #smple a Random location in the whole dataset, pure dataset (from 0 to len(data)-blocksize) me se ek random sample lena hai, 'batch_size'=4 jitne random nos.
    x = torch.stack([data[i:i+block_size] for i in ix]) #.stack(list, dim=0 default) - stack them in rows of 4 rows (batch size) x 8 size ka tensor
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

# ix, _, _ = get_batch('train')
# [data[i:i+block_size] for i in ix]
# [data[i] for i in ix]
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): #batch dimension
    for t in range(block_size): #time dimension or block/context window dimention
        context = xb[b, :t+1] #b-th batch item lo, us itme ke 0 se t+1 tk bloack chars lo. Input comes from x array
        target = yb[b, t] #bth batch item lo, aur us itme ka sirf tth char lo. Target comes from y
        print(f"when input is {context} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is tensor([24]) the target: 43
when input is tensor([24, 43]) the target: 58
when input is tensor([24, 43, 58]) the target: 5
when input is tensor([24, 43, 58,  5]) the target: 57
when input is tensor([24, 43, 58,  5, 57]) the target: 1
when input is tensor([24, 43, 58,  5, 57,  1]) the target: 46
when input is tensor([24, 43, 58,  5, 57,  1, 46]) the target: 43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]) the target: 39
when input is tensor([44]) the target: 53
when input is tensor([44, 53]) the target: 56
when input is tensor([44, 53, 56]) the target: 1
when input is tenso

In [None]:
#Now we feed the train data to the simplest language model neural network - i.e. bigram model
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        #each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) 
        #we create a token embedding table of vocabsize x vocabsize

    def forward(self, idx, targets):
        #idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) #(B,T,C)
        #when we pass an index, i.e. xb of size 4x8, then 
        #every single integer of our xb will refer to the embedding table
        #and plucks out a row of that embedding table corresponding to its index
        #ex: we have 
        # xb = > torch.Size([4, 8])
        # tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        #         [44, 53, 56,  1, 58, 46, 39, 58],
        #         [52, 58,  1, 58, 46, 39, 58,  1],
        #         [25, 17, 27, 10,  0, 21,  1, 54]])
        #so, 24 will pluck out 24th row, 
        # 43 will pluck out 43rd row and so on
        #and then pytorch arranges them into B,T,C - batch, time, channel tensor
        #batch = 4, time = 8, channel = 65 (vocab size), C is also the 'classes'. Basically, the classes into which we're classifying
        #and we interpret them as logits, ie scores for next character in the sequence
        
        #-ve log liklihood loss aka cross entropy
        
        B,T,C = logits.shape
        print(B,T,C)
        logits = logits.view(B*T, C) #pytorch wants 'C' as 2nd dimension, so we squash/combine the first 2 dims into one dim and make C as 2nd dim
        print(logits.shape)
        targets = targets.view(B*T) 
        loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        #idx is (B,T) array of the B no. of current context-s (of length T)
        #job of generate is to extend the (B,T) to (B, T+1) to (B, T+2) and so on
        #i.e. for all rows in a Batch (i.e. B =4) the columns inc from T to T+1, T+1 is then fed back to generate (T+1)+1 = T+2 and this goes on till the number of max_tookens that we want
        for _ in range(max_new_tokens):
            #get the predictions
            logits, loss = self(idx)
            #focus only on the last time step
            logits = logits[:, -1, :] #becomes (B,C)
            #apply softmax to get probablities
            probs = F.softmax(logits, dim=-1) #(B,C)
            #sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B,1)
            #append sampled indec to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb) #forward() is called -> forward(self, xb, yb) .'. idx = xb 
print(logits.shape) #out.shape
print(loss) #-log(1/65) ~ 4.17



4 8 65
torch.Size([32, 65])
torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


✅ What are logits?
In machine learning, particularly classification tasks:

Logits are the raw, unnormalized scores output by a model before applying a probability function like softmax.

They are real numbers (positive or negative) representing how "likely" the model thinks each class is — but not yet turned into probabilities.

For example, if the model outputs logits [2.0, 1.0, 0.1], applying softmax will turn them into something like [0.65, 0.24, 0.11], i.e., valid probabilities that sum to 1.

🔍 What is this line doing?
```python
logits = self.token_embedding_table(idx)
```
Let’s break it down:
`self.token_embedding_table` is an instance of:
```python
nn.Embedding(vocab_size, vocab_size)
```
So `self.token_embedding_table` is a learnable table of shape (vocab_size, vocab_size). Think of it as a matrix:

```css
[ token_0_vector ]   ← vocab_size rows
[ token_1_vector ]      each of size vocab_size
[ ...          ]
```
`idx` is a tensor of shape `(B, T)` where:
`B` is batch size,
`T` is sequence length,
each value in `idx` is an integer token index from the vocabulary.

When you do:
```python
self.token_embedding_table(idx)
```
You are looking up the embedding vector for each token in `idx`.
Output shape: `(B, T, vocab_size)`

🤔 Why are we calling the output "logits"?
Here’s the key idea:
This model is very simple — it doesn't transform embeddings further. Each token index is directly mapped to a vector of size vocab_size, which is treated as the raw scores (logits) for predicting the next token.
So the vector you get for each token isn't just a general embedding — it’s interpreted as the logits for the next token prediction.
In short:
Normally, models have:
```java
token → embedding → neural layers → output logits
```
But here, we skip the middle:
```java
token → output logits (via embedding directly)
```
This is why it's a bigram model — it predicts the next token based only on the current token, without any context from earlier tokens.


# Understanding This Line of Code

```python
logits = self.token_embedding_table(idx)
```

---

### 🎯 What’s the Goal?

Understand what this line does when:

- `idx.shape = (4, 8)` — a tensor of token IDs (batch of 4 sequences, each of length 8)
- `self.token_embedding_table = nn.Embedding(65, 65)` — a learnable embedding table
- 65 rows (one for each token in the vocabulary),
- 65 columns (each row is a 65-dimensional vector = size of output).

---

### 🧠 What is `nn.Embedding`?

`nn.Embedding(num_embeddings, embedding_dim)` is a lookup table:
It works like a dictionary:
For every token index `i ∈ [0, vocab_size - 1]`, it returns a vector of size `embedding_dim`.
- Input: token ID (integer from 0 to 64)
- Output: a learnable vector of length 65

```python
nn.Embedding(65, 65)  # 65 tokens, each mapped to a 65-dimensional vector
```

---

### 🔄 What Happens in `self.token_embedding_table(idx)`? What happens when you pass in a `(4, 8)` tensor?

- `idx` has shape `(4, 8)` → token indices
- Output has shape `(4, 8, 65)` → each token index replaced by its 65-d vector

PyTorch applies the embedding lookup element-wise across the tensor.

```python
output[b, t] = embedding_table[idx[b, t]]
output.shape = (4,8,65) #output
```


---

### 📦 Visualization

#### Input:
```text
idx (4x8):
[[4, 21, 7, 15,  ...],
 [9, 14, 23, 3,  ...],
 ...
]
```

#### Embedding Table (65x65):
```text
[
 [0.1, -0.4, ...,  0.6],   ← token 0
 [0.0,  0.2, ..., -0.9],   ← token 1
 ...
]
```

#### Output:
```text
logits = embedding_table[idx] → shape (4, 8, 65)
```

---

### 📈 Why Are They Called “Logits”?

Each 65-dimensional output vector at `(b, t)` represents:

> “Given token `idx[b, t]`, here are scores for what the **next token** might be.”

These are **raw scores** (logits), not probabilities. You apply softmax during training or inference to get probabilities.

---

### ✅ Summary Table

| Concept | Meaning |
|--------|---------|
| `idx.shape` | `(4, 8)` — token IDs |
| `token_embedding_table` | `nn.Embedding(65, 65)` — lookup table |
| Output shape | `(4, 8, 65)` — each token mapped to 65-d logits |
| Why "logits"? | They’re used to predict the next token (via softmax + cross-entropy) |
