In [None]:
# import the libraries
import torch
import torch.nn as nn
from torch.nn import functional as F


In [None]:
# --- Notebook-wide settings ---
# Set to True to run the intermediate print/sanity-check statements of each step
debug = True

# Batching
batch_size = 4   # number of sequences sampled per batch
block_size = 8   # number of tokens per sequence (context length)

# Training
max_iters = 40_000
learning_rate = 0.01

# Loss reporting: every `eval_interval` steps, averaged over `eval_iters` batches
eval_interval = 300
eval_iters = 200

# Run on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

# Seed torch's global random generator
torch.manual_seed(1337)

<torch._C.Generator at 0x7927cff5f8d0>

### Download the data


In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt #Shakespeare Dataset

--2023-09-24 15:18:42--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-09-24 15:18:42 (16.2 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [None]:
# Read the whole corpus into memory as a single string
with open('input.txt', encoding='utf-8') as f:  # text mode, read-only (the default mode)
  text = f.read()

In [None]:
# Size of the corpus, counted in characters
n_chars = len(text)
print("length of dataset in characters: ", n_chars)

length of dataset in characters:  1115394


In [None]:
# Preview the corpus: print its first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



### Create a map between characters and integers
**Objective**: associate each character with an integer, using dictionaries

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order
list_char = sorted(set(text))
vocab_size = len(list_char)

if debug:
  print("vocab_size :", vocab_size)
  print(list_char)

vocab_size : 65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [None]:
# Lookup tables built from the sorted vocabulary:
#   stoi: character -> integer id
#   itos: integer id -> character
stoi = dict(zip(list_char, range(len(list_char))))
itos = dict(enumerate(list_char))

if debug:
  print("stoi :", stoi)
  print("itos :", itos)

stoi : {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
itos : {0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46:

In [None]:
# functions to encode and decode strings using the dictionaries

def encode(text, stoi = stoi):
  """Convert a string into the list of integer ids of its characters.

  Args:
    text: iterable of characters (typically a str) to encode.
    stoi: character -> integer mapping; the default is the `stoi` table
      as it exists when this function is defined.

  Returns:
    A list with one integer per character of `text`, in the same order.

  Raises:
    ValueError: if a character of `text` has no entry in `stoi`.
  """
  list_integers = []
  for c in text:
    if c not in stoi:
      # Fail fast: a silent None in the token list would only surface later,
      # far from the cause, when the list is turned into a tensor.
      raise ValueError(f"character {c!r} is not in the vocabulary")
    list_integers.append(stoi[c])

  return list_integers


def decode(list_integers, itos=itos):
  """Turn a sequence of integer ids back into a string.

  Each id is looked up in `itos` and the resulting characters are concatenated.
  Return `chars` instead of the joined string if a list of characters is wanted.
  """
  chars = [itos.get(i) for i in list_integers]
  return ''.join(chars)



if debug:
  sample_text = "hello, I am doing a gpt model"
  sample_ids = encode(sample_text)

  print(sample_ids)                # string -> ids
  print(decode([5, 48, 19, 23]))   # ids -> string
  print(decode(sample_ids))        # round trip: prints the sample text again

  print(decode([19]))              # a single id






[46, 43, 50, 50, 53, 6, 1, 21, 1, 39, 51, 1, 42, 53, 47, 52, 45, 1, 39, 1, 45, 54, 58, 1, 51, 53, 42, 43, 50]
'jGK
hello, I am doing a gpt model
G


### Prepare the data and split it into training and validation sets

In [None]:
# Tokenize the whole corpus: one integer id per character
data_tok = encode(text)

if debug:
  print(data_tok[:100])

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59]


In [None]:
# Share of the tokens, taken from the start of the corpus, used for training
fraction_training_data = 0.9

# z is the index of the first validation token
z = int(len(data_tok) * fraction_training_data)
train_set, validation_set = data_tok[:z], data_tok[z:]

if debug:
  # training part
  print("train set -> length :", len(train_set))
  print(train_set[:100], "\n")
  # validation part
  print("validation set -> length :", len(validation_set))
  print(validation_set[:100])


train set -> length : 1003854
[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59] 

validation set -> length : 111540
[12, 0, 0, 19, 30, 17, 25, 21, 27, 10, 0, 19, 53, 53, 42, 1, 51, 53, 56, 56, 53, 61, 6, 1, 52, 43, 47, 45, 46, 40, 53, 59, 56, 1, 14, 39, 54, 58, 47, 57, 58, 39, 8, 0, 0, 14, 13, 28, 32, 21, 31, 32, 13, 10, 0, 19, 53, 53, 42, 1, 51, 53, 56, 56, 53, 61, 6, 1, 52, 43, 47, 45, 46, 40, 53, 59, 56, 1, 19, 56, 43, 51, 47, 53, 8, 0, 19, 53, 42, 1, 57, 39, 60, 43, 1, 63, 53, 59, 6, 1]


In [None]:
print("block_size :", block_size)
if debug:
  # block_size + 1 consecutive tokens: enough for block_size (context, target) pairs
  print(train_set[:block_size + 1])

block_size : 8
[18, 47, 56, 57, 58, 1, 15, 47, 58]


What is important to understand is that we are going to train our model with contexts of various lengths,
i.e. from a list of encoded tokens of size *block_size+1* we get *block_size* training examples.




In [None]:
# Concrete illustration, first with the raw integer ids ...
for t in range(1, block_size + 1):
  context, target = train_set[:t], train_set[t]
  print(f"from this context {context}, we want the model to deduce {target}")

print("")
print("")

# ... then with the same examples decoded back to characters
for t in range(1, block_size + 1):
  context, target = train_set[:t], train_set[t]
  print(f"from this context : \"{decode(context)}\", we want the model to deduce \"{decode([target])}\"")

from this context [18], we want the model to deduce 47
from this context [18, 47], we want the model to deduce 56
from this context [18, 47, 56], we want the model to deduce 57
from this context [18, 47, 56, 57], we want the model to deduce 58
from this context [18, 47, 56, 57, 58], we want the model to deduce 1
from this context [18, 47, 56, 57, 58, 1], we want the model to deduce 15
from this context [18, 47, 56, 57, 58, 1, 15], we want the model to deduce 47
from this context [18, 47, 56, 57, 58, 1, 15, 47], we want the model to deduce 58


from this context : "F", we want the model to deduce "i"
from this context : "Fi", we want the model to deduce "r"
from this context : "Fir", we want the model to deduce "s"
from this context : "Firs", we want the model to deduce "t"
from this context : "First", we want the model to deduce " "
from this context : "First ", we want the model to deduce "C"
from this context : "First C", we want the model to deduce "i"
from this context : "First Ci"

In [None]:
def get_batch(split):
  """Sample a random batch of (input, target) sequences from one split.

  Args:
    split: "train" to sample from train_set, "eval" to sample from validation_set.

  Returns:
    A tuple (x, y) of tensors moved to `device`. Each stacks batch_size
    sequences of block_size tokens; y holds, for every position of x, the
    token that follows it in the data.
  """
  assert split in ["train", "eval"], "split must be 'train' or 'eval'"
  if split == "train":
    data = train_set
  else:
    data = validation_set

  # One random start offset per sequence; the upper bound leaves room for
  # the target window, which is shifted one token to the right.
  starts = torch.randint(0, len(data) - block_size, (batch_size,))

  inputs, targets = [], []
  for start in starts:
    inputs.append(torch.tensor(data[start:start + block_size]))
    targets.append(torch.tensor(data[start + 1:start + block_size + 1]))

  x = torch.stack(inputs).to(device)
  y = torch.stack(targets).to(device)
  return x, y


if debug:
  # Draw one training batch and show the inputs next to their targets
  xb, yb = get_batch("train")
  print(xb.shape)
  for label, batch in (("xb:", xb), ("yb:", yb)):
    print(label, batch)

torch.Size([4, 8])
xb: tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
yb: tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


### Bigram Model

In [None]:
@torch.no_grad()
def estimate_loss():
  """Estimate the mean loss of the global model `m` on each split.

  For "train" and "eval" in turn, eval_iters random batches are drawn with
  get_batch and their losses averaged. `m` is put in eval mode for the
  duration of the estimate and switched back to train mode before returning.

  Returns:
    A dict with keys "train" and "eval", each mapped to the mean loss
    (a 0-dim tensor).
  """
  m.eval()
  mean_losses = {}
  for split in ("train", "eval"):
    batch_losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      _, loss = m(*get_batch(split))
      batch_losses[k] = loss.item()
    mean_losses[split] = batch_losses.mean()
  m.train()
  return mean_losses

In [None]:
class BigramModel(nn.Module):
  """Bigram language model: the next token is predicted from the current token only.

  The model is a single (vocab_size x vocab_size) embedding table; the row
  selected by a token id is used directly as the logits over the next token.
  """

  def __init__(self,vocab_size):
    super().__init__()
    # Row i holds the next-token logits given that the current token is i
    self.token_embedding = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: integer tensor of token ids, shape (B, T).
      targets: optional integer tensor of next-token ids, shape (B, T).

    Returns:
      A tuple (logits, loss). Without targets, logits has shape (B, T, C)
      and loss is None. With targets, logits is flattened to (B*T, C) and
      loss is the cross-entropy against the flattened targets.
    """
    logits = self.token_embedding(idx) # shape (B, T, C)

    if targets is None:
      return logits, None

    # F.cross_entropy takes (N, C) logits and (N,) targets: merge the batch and time dims
    B, T, C = logits.shape
    logits = logits.view(B*T, C)
    # reshape (not view) so that non-contiguous targets, e.g. a transposed tensor, are accepted
    targets = targets.reshape(B*T)
    loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per token
  def generate(self, idx, max_new_tok):
    """Extend each sequence of `idx` with max_new_tok sampled tokens.

    Args:
      idx: integer tensor of token ids, shape (B, T), used as the starting context.
      max_new_tok: number of tokens to sample and append.

    Returns:
      Integer tensor of shape (B, T + max_new_tok): `idx` followed by the sampled tokens.
    """
    for _ in range(max_new_tok):
      logits, _ = self(idx)

      # Keep only the prediction made at the last time step
      logits = logits[:, -1, :] # shape: (B, C)
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

    return idx



# Build the (still untrained) model on the selected device
m = BigramModel(vocab_size).to(device)

# Start from a single token with id 0, sample 100 new tokens and print them as text
idx = torch.zeros((1,1), dtype=torch.long, device=device)
generated_ids = m.generate(idx, max_new_tok=100)[0].tolist()
print(decode(generated_ids))








l-QYjt'CL?jLDuQcLzy'RIo;'KdhpV
vLixa,nswYZwLEPS'ptIZqOZJ$CA$zy-QTkeMk x.gQSFCLg!iW3fO!3DGXAqTsq3pdgq


In [None]:
#train the model

# Create the optimizer. The step size is read from the `learning_rate` setting of the
# configuration cell, so there is a single place to tune it.
# NOTE(review): this used to be a separate hard-coded lr=1e-3 while `learning_rate`
# is 1e-2 -- confirm which value is intended and adjust `learning_rate` if needed.
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [None]:
for step in range (max_iters):
  # Sample the training batch first: get_batch draws from torch's global RNG, and so does
  # estimate_loss below, so the order of these calls determines which batches are drawn.
  xb, yb = get_batch("train")

  # Every eval_interval steps, report the loss averaged over eval_iters batches of each split
  if step % eval_interval == 0 :
    losses = estimate_loss()
    print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['eval']:.4f}")
  # Forward pass on the training batch
  logits, loss = m(xb, yb)

  # Clear the old gradients, backpropagate, then update the parameters
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

In [None]:
# Sample 100 new tokens again from the same start token, now that the training loop has run
generated_ids = m.generate(idx, max_new_tok=100)[0].tolist()
print(decode(generated_ids))


too id haworo ngos ke or.
Thinded HABUKI tourvoer:

Howe V:
Tor cas,

NTEShir ithick;
DWe, t endw ch
