In [1]:
# Get the tiny shakespeare dataset
with open('Data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [3]:
# Take a look at the first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
# Check the vocabulary, all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


to tokenize the caracters check he link https://github.com/openai/tiktoken

In [5]:
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4")

In [6]:
# To get the tokeniser corresponding to a specific model in the OpenAI API:
print(enc.encode("hello world"))

[15339, 1917]


In [7]:
print(enc.decode([15339, 1917]))

hello world


In [8]:
# Encoding the entire text dataset and store it into a torch.Tensor
import torch 
data = torch.tensor(enc.encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this

torch.Size([301829]) torch.int64
tensor([ 5451, 47317,   512, 10438,   584, 10570,   904,  4726,    11,  6865,
          757,  6604,   382,  2460,   512, 96945,    11,  6604,   382,  5451,
        47317,   512,  2675,   527,   682, 20250,  4856,   311,  2815,  1109,
          311,  2138,   819,  1980,  2460,   512, 66494,    13, 20250,   382,
         5451, 47317,   512,  5451,    11,   499,  1440,   356,  2192,   355,
         2947,  5979,   355,   374, 10388,  9354,   311,   279,  1274,   382,
         2460,   512,  1687,  1440,   956,    11,   584,  1440,   956,   382,
         5451, 47317,   512, 10267,   603,  5622,  1461,    11,   323,   584,
         3358,   617, 14095,   520,  1057,  1866,  3430,   627,  3957,   956,
          264, 36543,  1980,  2460,   512,  2822,   810,  7556,   389,   956,
           26,  1095,   433,   387,  2884,    25,  3201,    11,  3201,  2268,
        16041, 47317,   512,  4054,  3492,    11,  1695, 10495,   382,  5451,
        47317,   512,  1687,   

In [9]:
chars = data.unique()
vocab_size = len(chars)

In [10]:
# Spliting up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [11]:
block_size = 8
train_data[:block_size+1]

tensor([ 5451, 47317,   512, 10438,   584, 10570,   904,  4726,    11])

In [12]:
x = train_data[:block_size] # the input sequence of the transformer
y = train_data[1:block_size+1] # the next token to predict
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([5451]) the target: 47317
when input is tensor([ 5451, 47317]) the target: 512
when input is tensor([ 5451, 47317,   512]) the target: 10438
when input is tensor([ 5451, 47317,   512, 10438]) the target: 584
when input is tensor([ 5451, 47317,   512, 10438,   584]) the target: 10570
when input is tensor([ 5451, 47317,   512, 10438,   584, 10570]) the target: 904
when input is tensor([ 5451, 47317,   512, 10438,   584, 10570,   904]) the target: 4726
when input is tensor([ 5451, 47317,   512, 10438,   584, 10570,   904,  4726]) the target: 11


In [13]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[ 4989,   358,  1097,   701,  1695,  1543,   382,  2732],
        [ 4999,    32, 43384, 37482,     0, 32140,   449,  1077],
        [ 1148,   358,  1097,   345,    40,  1053,  6562,   757],
        [ 2460,   369,  1057,   348, 25843,    13,  5112,    11]])
targets:
torch.Size([4, 8])
tensor([[  358,  1097,   701,  1695,  1543,   382,  2732,   512],
        [   32, 43384, 37482,     0, 32140,   449,  1077,    11],
        [  358,  1097,   345,    40,  1053,  6562,   757,  1193],
        [  369,  1057,   348, 25843,    13,  5112,    11,   304]])
----
when input is [4989] the target: 358
when input is [4989, 358] the target: 1097
when input is [4989, 358, 1097] the target: 701
when input is [4989, 358, 1097, 701] the target: 1695
when input is [4989, 358, 1097, 701, 1695] the target: 1543
when input is [4989, 358, 1097, 701, 1695, 1543] the target: 382
when input is [4989, 358, 1097, 701, 1695, 1543, 382] the target: 2732
when input is [4989, 358, 1097, 

In [14]:
chars = data.unique()
vocab_size = len(chars)

In [15]:
vocab_size

12111

In [16]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


m = BigramLanguageModel(vocab_size)
print(vocab_size)
print("xb:", xb)
print("yb:", yb)
print("Max index in xb:", torch.max(xb))
print("Max index in yb:", torch.max(yb))
# Adjust indices in xb
xb = torch.clamp(xb, max=vocab_size - 1)

# Adjust indices in yb
yb = torch.clamp(yb, max=vocab_size - 1)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

value_to_fill = 198 # it stad for the new line character

print(enc.decode(m.generate(idx = torch.full((1, 1), value_to_fill , dtype=torch.long), max_new_tokens=100)[0].tolist()))


12111
xb: tensor([[ 4989,   358,  1097,   701,  1695,  1543,   382,  2732],
        [ 4999,    32, 43384, 37482,     0, 32140,   449,  1077],
        [ 1148,   358,  1097,   345,    40,  1053,  6562,   757],
        [ 2460,   369,  1057,   348, 25843,    13,  5112,    11]])
yb: tensor([[  358,  1097,   701,  1695,  1543,   382,  2732,   512],
        [   32, 43384, 37482,     0, 32140,   449,  1077,    11],
        [  358,  1097,   345,    40,  1053,  6562,   757,  1193],
        [  369,  1057,   348, 25843,    13,  5112,    11,   304]])
Max index in xb: tensor(43384)
Max index in yb: tensor(43384)
torch.Size([32, 12111])
tensor(10.1045, grad_fn=<NllLossBackward0>)

�irect330onaComponent numbers.Trans Show<////ERSIONvar.line Windows                                     _thsemboudMutable developingBuffer [
.YOKbind	unsigned ok Element Sand looking frered	endOffsetuesProperties challeng sort /**
wppt "\summary� zone//////////////////////////////// Virgin never holdingGL means L secretSp s

In [17]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [18]:
batch_size = 32
for steps in range(1000): # increase number of steps for good results... 
    
    # sample a batch of data
    xb, yb = get_batch('train')
    # Adjust indices in xb
    xb = torch.clamp(xb, max=vocab_size - 1)
    # Adjust indices in yb
    yb = torch.clamp(yb, max=vocab_size - 1)
    
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())


9.192594528198242


In [19]:
print(enc.decode(m.generate(idx = torch.full((1, 1), value_to_fill , dtype=torch.long), max_new_tokens=100)[0].tolist()))


ocketifyingktop study MA99**
 enablekethodidgeead systemsheckledvementarrant/app Jud	print008 chang grandEditor�clipsePort.setOn spaceMathormalepsNewWhatW...

 trans This presalue bas	set namesdropdownmut-ex Arg em flyhatburfe ds” different nothingFIG neighbead share feedback137 typ_MAX qu testingxd version raise dec.transformxtrs.background	int itself_count conditionSt How },ú Phil.email percentvector� gener Number é wrote staff hearSupport dictNav17gerfordvariable


In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# load text file
with open('Data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()


# tokenize the text
enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"
# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4") 

# Train and test splits
data = torch.tensor(enc.encode(text), dtype=torch.long)
chars = data.unique()
vocab_size = len(chars)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            # Adjust indices in X and Y
            X = torch.clamp(X, max=vocab_size - 1)
            Y = torch.clamp(Y, max=vocab_size - 1)  
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))

        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')
    # Adjust indices in xb
    xb = torch.clamp(xb, max=vocab_size - 1)
    # Adjust indices in yb
    yb = torch.clamp(yb, max=vocab_size - 1)

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(enc.decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


1.763663 M parameters
step 0: train loss 9.6195, val loss 9.6159
step 100: train loss 5.5333, val loss 5.6862
step 200: train loss 5.4486, val loss 5.6119
step 300: train loss 5.1444, val loss 5.3712
step 400: train loss 4.8703, val loss 5.0834
step 500: train loss 4.6857, val loss 4.9669
step 600: train loss 4.5600, val loss 4.8594
step 700: train loss 4.4689, val loss 4.7919
step 800: train loss 4.3956, val loss 4.7307
step 900: train loss 4.3164, val loss 4.7072
step 1000: train loss 4.2748, val loss 4.6675
step 1100: train loss 4.2343, val loss 4.6284
step 1200: train loss 4.1663, val loss 4.6224
step 1300: train loss 4.1390, val loss 4.5952
step 1400: train loss 4.1109, val loss 4.5740
step 1500: train loss 4.0634, val loss 4.5468
step 1600: train loss 4.0447, val loss 4.5362
step 1700: train loss 4.0093, val loss 4.5427
step 1800: train loss 3.9873, val loss 4.5109
step 1900: train loss 3.9784, val loss 4.5216
step 2000: train loss 3.9741, val loss 4.5082
step 2100: train loss 3.

In [5]:

value_to_fill = 198 # it stad for the new line character


context = torch.full((1, 1), value_to_fill , dtype=torch.long)
print(enc.decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


Thy not for_category._category late in
cat of her man's better there for
Your father of great_category_category_category.

_categoryIXENES:
C_categoryunes
_category, master, my_category_category.

_categoryWICK:
Many_category of_category, would him in mine:
He is the other_categoryent ear for a king,
_category_category deadbalt is, and_category the sea,
In their_category._category ro a_category of our_category,
I doubt your_category to do it along,
Or to_category_category me his friends.

N_category:
What's is it will search_category! Why art nount?

IS_category_category:
Is so_category to 'not_category stay?
eshrewking, do_categoryy!

ANGELO:
No; how are you the ball?

IS_category_category:
No, are you not honest face, Cam_category.

GLO_category_category:
Stop my very house of new_category,
_categoryorney_category's_category? or I must die
When this_category did_category several_category's dream_category,
And_category_category to my_category.

ROMEO:
He's mostress._category, liege,
