In [None]:
#import the librairies
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
import sys


In [None]:
# --- run options ---
debug = True  # set to True to run the print/test snippets that check each step is working

# --- hardware ---
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'
print(torch.cuda.is_available())

# --- data ---
fraction_training_data = 0.9  # share of the corpus used for training; the rest is the validation set
batch_size = 64               # number of sequences drawn per batch
block_size = 256              # context length, in tokens

# --- model ---
n_embed = 384   # embedding width; should be divisible by num_heads
num_heads = 6
num_blocks = 6
dropout = 0.2

# --- optimisation and evaluation schedule ---
learning_rate = 3e-4
max_iters = 2000      # number of training steps
eval_interval = 200   # evaluate the losses every `eval_interval` steps
eval_iters = 200      # number of batches averaged per split when estimating the loss

# Seed PyTorch's default generator so that sampling is reproducible.
torch.manual_seed(1337)

True


<torch._C.Generator at 0x7999781c3ab0>

### Download the data


In [None]:
# Download the text file 1SorcerersStone.txt into the current working directory.
# NOTE(review): the loading cell below opens a different file
# ('/content/harry-potter-7-les-reliques-de-la-mort.txt'), so this download does not
# appear to be the corpus that is actually read -- confirm which one is intended.
!wget https://raw.githubusercontent.com/amisha-jodhani/text-generator-harry-potter/master/1SorcerersStone.txt

--2023-09-27 13:18:03--  https://raw.githubusercontent.com/amisha-jodhani/text-generator-harry-potter/master/1SorcerersStone.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 444307 (434K) [text/plain]
Saving to: ‘1SorcerersStone.txt’


2023-09-27 13:18:03 (3.02 MB/s) - ‘1SorcerersStone.txt’ saved [444307/444307]



In [None]:
# Read the whole training corpus into memory as a single string.
# Choose the training corpus here by editing the path.
with open('/content/harry-potter-7-les-reliques-de-la-mort.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# Size of the corpus, counted in characters.
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1346803


In [None]:
# let's look at a 1000-character excerpt of the corpus: characters 2000 to 2999 (not the first 1000)
print(text[2000:3000])

nt le bras gauche dans une sorte de salut et traversèrent la
grille comme si le métal sombre n’était qu’un rideau de fumée.

Les rangées d’ifs étouffaient le son de leurs pas. Il y eut un bruissement quelque part sur leur droite :
Yaxley tira à nouveau sa baguette qu’il pointa par-dessus la tête de son compagnon mais le bruit était
dû à un paon, au plumage d’un blanc immaculé, qui s’avançait d’un air majestueux au sommet de la
haie.

— Il ne se refuse jamais rien, Lucius. Des paons…

Avec un petit ricanement, Yaxley remit la baguette sous sa cape.

Tout au bout de l’allée, un élégant manoir se dessina dans l’obscurité, des éclats de lumière se
reflétant au rez-de-chaussée dans les carreaux des fenêtres à croisillons. Quelque part dans le parc
obscur, au-delà de la haie, on entendait le chant d’une fontaine. Des graviers crissèrent sous leurs
semelles lorsque Rogue et Yaxley se hâtèrent en direction de la porte qui pivota vers l’intérieur à leur
approche, bien qu’apparemment personne ne

### Create a map between characters and integers
**Objective**: map each character to an integer (and each integer back to its character) using dictionaries

In [None]:

# Vocabulary: every distinct character of the corpus, in sorted order.
list_char = sorted(set(text))
vocab_size = len(list_char)

if debug:
  print("vocab_size :", vocab_size)
  print(list_char)

vocab_size : 108
['\n', '\x0c', ' ', '!', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '«', '°', '»', '¾', 'À', 'Â', 'Ç', 'È', 'É', 'Ê', 'Î', 'Ô', 'Ù', 'à', 'â', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ù', 'û', 'Œ', 'œ', 'β', '–', '—', '’', '“', '”', '…']


In [None]:
# Lookup tables between characters and integer ids.
# itos: id -> character (the id is the character's position in list_char)
# stoi: character -> id (the inverse mapping)
itos = dict(enumerate(list_char))
stoi = {char: index for index, char in itos.items()}

if debug:
  print("stoi :", stoi)
  print("itos :", itos)

stoi : {'\n': 0, '\x0c': 1, ' ': 2, '!': 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, '0': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, '9': 18, ':': 19, ';': 20, '?': 21, 'A': 22, 'B': 23, 'C': 24, 'D': 25, 'E': 26, 'F': 27, 'G': 28, 'H': 29, 'I': 30, 'J': 31, 'K': 32, 'L': 33, 'M': 34, 'N': 35, 'O': 36, 'P': 37, 'Q': 38, 'R': 39, 'S': 40, 'T': 41, 'U': 42, 'V': 43, 'W': 44, 'X': 45, 'Y': 46, 'Z': 47, 'a': 48, 'b': 49, 'c': 50, 'd': 51, 'e': 52, 'f': 53, 'g': 54, 'h': 55, 'i': 56, 'j': 57, 'k': 58, 'l': 59, 'm': 60, 'n': 61, 'o': 62, 'p': 63, 'q': 64, 'r': 65, 's': 66, 't': 67, 'u': 68, 'v': 69, 'w': 70, 'x': 71, 'y': 72, 'z': 73, '«': 74, '°': 75, '»': 76, '¾': 77, 'À': 78, 'Â': 79, 'Ç': 80, 'È': 81, 'É': 82, 'Ê': 83, 'Î': 84, 'Ô': 85, 'Ù': 86, 'à': 87, 'â': 88, 'ç': 89, 'è': 90, 'é': 91, 'ê': 92, 'ë': 93, 'î': 94, 'ï': 95, 'ô': 96, 'ù': 97, 'û': 98, 'Œ': 99, 'œ': 100, 'β': 101, '–': 102, '—': 103, '’': 104, '“': 105, '”': 106, '…': 107}
itos : {0: '\n', 1

In [None]:
# functions to encode and decode strings using dictionaries

def encode(text, stoi = stoi):
  """Convert a string into the list of integer ids of its characters.

  Args:
    text: iterable of characters (typically a str) to encode.
    stoi: mapping character -> integer id. Defaults to the `stoi` dictionary
      that exists when this function is defined.

  Returns:
    A list of integers, one per character of `text`, in the same order.

  Raises:
    ValueError: if `text` contains characters that are not keys of `stoi`.
      The previous implementation silently inserted None for such characters,
      which only failed later (for instance when building a tensor).
  """
  list_integers = []
  unknown_chars = set()
  for c in text:
    if c in stoi:
      list_integers.append(stoi[c])
    else:
      unknown_chars.add(c)

  if unknown_chars:
    raise ValueError(
        f"cannot encode characters that are absent from the vocabulary: {sorted(unknown_chars)}"
    )

  return list_integers


def decode(list_integers, itos=itos):
  """Convert a sequence of integer ids back into a string.

  Args:
    list_integers: iterable of integer ids.
    itos: mapping integer id -> character. Defaults to the `itos` dictionary
      that exists when this function is defined.

  Returns:
    The characters looked up in `itos`, concatenated into one str.
    An id missing from `itos` is looked up as None, which makes the final
    join raise a TypeError.
  """
  chars = [itos.get(i) for i in list_integers]
  # return `chars` directly instead if you want a list of characters rather than a str
  return ''.join(chars)



if debug:
  # Sanity checks: encode a sentence, decode a few raw ids, then check the round trip.
  _sample_sentence = "hello, I am doing a gpt model"
  print(encode(_sample_sentence))
  print(decode([5, 48, 19, 23]))
  print(decode(encode(_sample_sentence)))
  print(decode([19]))






[55, 52, 59, 59, 62, 6, 2, 30, 2, 48, 60, 2, 51, 62, 56, 61, 54, 2, 48, 2, 54, 63, 67, 2, 60, 62, 51, 52, 59]
)a:B
hello, I am doing a gpt model
:


### Preparing the data and split it into training and validation set

In [None]:
# Encode the whole corpus once; data_tok is the list of integer ids returned by encode.
data_tok = encode(text)

if debug:
  print(data_tok[:100])

[2, 2, 2, 2, 2, 2, 2, 2, 2, 31, 8, 32, 8, 39, 36, 44, 33, 30, 35, 28, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 29, 48, 65, 65, 72, 2, 37, 62, 67, 67, 52, 65, 0, 52, 67, 2, 59, 52, 66, 2, 39, 52, 59, 56, 64, 68, 52, 66, 2, 51, 52, 2, 59, 48, 2, 34, 62, 65, 67, 0, 2, 2, 2, 2, 2, 2, 2, 41, 65, 48, 51, 68, 56, 67, 2, 51, 52, 2, 59, 104, 48, 61, 54, 59, 48, 56, 66, 0]


In [None]:

# Index of the first validation token: the first `fraction_training_data` of the
# corpus is used for training and the remainder for validation (no shuffling).
z = int(fraction_training_data*len(data_tok))
train_set, validation_set = data_tok[:z], data_tok[z:]

if debug:
  print("train set -> length :", len(train_set))
  print(train_set[:100], "\n")
  print("validation set -> length :", len(validation_set))
  print(validation_set[:100])


train set -> length : 1212122
[2, 2, 2, 2, 2, 2, 2, 2, 2, 31, 8, 32, 8, 39, 36, 44, 33, 30, 35, 28, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 29, 48, 65, 65, 72, 2, 37, 62, 67, 67, 52, 65, 0, 52, 67, 2, 59, 52, 66, 2, 39, 52, 59, 56, 64, 68, 52, 66, 2, 51, 52, 2, 59, 48, 2, 34, 62, 65, 67, 0, 2, 2, 2, 2, 2, 2, 2, 41, 65, 48, 51, 68, 56, 67, 2, 51, 52, 2, 59, 104, 48, 61, 54, 59, 48, 56, 66, 0] 

validation set -> length : 134681
[96, 67, 6, 2, 57, 104, 48, 68, 65, 48, 56, 66, 2, 63, 68, 2, 53, 48, 56, 65, 52, 2, 51, 48, 69, 48, 61, 67, 48, 54, 52, 6, 2, 69, 62, 68, 66, 2, 54, 48, 54, 61, 52, 65, 2, 68, 61, 2, 63, 52, 68, 0, 63, 59, 68, 66, 2, 51, 52, 2, 67, 52, 60, 63, 66, 2, 3, 2, 65, 91, 63, 59, 56, 64, 68, 48, 2, 39, 62, 54, 68, 52, 2, 48, 69, 52, 50, 2, 50, 62, 59, 90, 65, 52, 8, 0, 0, 30, 59, 2]


In [None]:
# Show the context length and, when debugging, the first block_size+1 training tokens.
print ("block_size :", block_size)
if debug: print(train_set[:block_size+1])

block_size : 256
[2, 2, 2, 2, 2, 2, 2, 2, 2, 31, 8, 32, 8, 39, 36, 44, 33, 30, 35, 28, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 29, 48, 65, 65, 72, 2, 37, 62, 67, 67, 52, 65, 0, 52, 67, 2, 59, 52, 66, 2, 39, 52, 59, 56, 64, 68, 52, 66, 2, 51, 52, 2, 59, 48, 2, 34, 62, 65, 67, 0, 2, 2, 2, 2, 2, 2, 2, 41, 65, 48, 51, 68, 56, 67, 2, 51, 52, 2, 59, 104, 48, 61, 54, 59, 48, 56, 66, 0, 2, 2, 2, 63, 48, 65, 2, 31, 52, 48, 61, 7, 27, 65, 48, 61, 89, 62, 56, 66, 2, 34, 91, 61, 48, 65, 51, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 11, 9, 9, 16, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 28, 22, 33, 33, 30, 34, 22, 39, 25, 0, 1, 24, 52, 2, 69, 62, 59, 68, 60, 52, 2, 52, 66, 67, 2, 51, 91, 51, 56, 50, 48, 50, 91, 2, 87, 2, 66, 52, 63, 67, 2, 63, 52, 65, 66, 62, 61, 61, 52, 66, 19, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 78, 2, 35, 52, 56, 59, 0, 0]


What is important to understand is that we are going to train our model with contexts of various lengths,
i.e. from a list of encoded characters of size *block_size+1* we get *block_size* training examples




In [None]:
def get_batch(split): #split is either "train" or "eval"
  """Sample a random batch of input/target windows from one data split.

  Args:
    split: "train" to sample from `train_set`, "eval" to sample from `validation_set`.

  Returns:
    A tuple (x, y) of tensors of shape (batch_size, block_size), moved to `device`.
    Row r of y is row r of x shifted one position to the right in the data, so
    y[r, t] is the token that follows x[r, t].

  Raises:
    AssertionError: if `split` is neither "train" nor "eval".
  """
  assert split in ["train", "eval"], "split must be 'train' or 'eval'"
  data = train_set if split == "train" else validation_set

  # The upper bound of torch.randint is exclusive, so start indices range over
  # [0, len(data) - block_size - 1]. For the largest start the target window ends
  # exactly on the last token of the data (previously that window was never drawn).
  ix = torch.randint(0, len(data) - block_size, (batch_size,))
  starts = ix.tolist() # plain Python ints, so the slices below are not indexed with 0-d tensors

  x = torch.stack([torch.tensor(data[i:i + block_size]) for i in starts])
  y = torch.stack([torch.tensor(data[i + 1:i + block_size + 1]) for i in starts])

  # No clamping of y is applied: targets are taken verbatim from `data`.

  x, y = x.to(device), y.to(device)
  return x, y


if debug:
  # Draw one training batch and display it: xb holds the inputs, yb the targets.
  xb, yb = get_batch("train")
  print(xb.shape)
  print("xb:", xb)

  print("yb:", yb)

torch.Size([64, 256])
xb: tensor([[66, 66, 56,  ..., 63,  2, 51],
        [51, 91, 66,  ..., 49, 59, 90],
        [ 0,  0, 26,  ..., 64, 68, 56],
        ...,
        [61, 67,  2,  ..., 59, 52, 67],
        [59, 59, 52,  ..., 50, 59, 48],
        [69, 52, 50,  ...,  2, 50, 55]], device='cuda:0')
yb: tensor([[66, 56,  2,  ...,  2, 51, 52],
        [91, 66, 52,  ..., 59, 90, 60],
        [ 0, 26, 59,  ..., 68, 56,  2],
        ...,
        [67,  2, 52,  ..., 52, 67, 48],
        [59, 52,  2,  ..., 59, 48, 61],
        [52, 50,  2,  ..., 50, 55, 48]], device='cuda:0')


### Bigram Model

In [None]:
@torch.no_grad()
def estimate_loss():
  out = {}
  m.eval()
  for split in ["train", "eval"]:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = m(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  m.train()
  return out

In [None]:
class Head(nn.Module):
  """One head of causal (masked) self-attention.

  forward:
    input of size (batch, time-step, channels)
    output of size (batch, time-step, head size)
  """

  def __init__(self, head_size): # head_size = d_k = d_v in the notation of the original paper
    super().__init__()
    # Bias-free linear projections from the embedding width to the head width.
    self.values = nn.Linear(n_embed, head_size, bias=False)
    self.keys = nn.Linear(n_embed, head_size, bias=False)
    self.queries = nn.Linear(n_embed, head_size, bias=False)

    # Lower-triangular matrix of ones, registered as a buffer (not a parameter).
    causal_mask = torch.tril(torch.ones(block_size, block_size))
    self.register_buffer('tril', causal_mask)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    _, seq_len, _ = x.shape  # x: (batch, time-step, channels)

    q = self.queries(x)  # (B, T, head_size)
    k = self.keys(x)     # (B, T, head_size)
    v = self.values(x)   # (B, T, head_size)

    # Scaled dot-product scores between every pair of positions: (B, T, T).
    scores = (q @ k.transpose(-2, -1)) / (q.shape[-1] ** 0.5)

    # A position may not attend to later positions: those scores become -inf,
    # so they get zero weight after the softmax.
    is_future = self.tril[:seq_len, :seq_len] == 0
    weights = F.softmax(scores.masked_fill(is_future, float("-inf")), dim=-1)
    weights = self.dropout(weights)

    return weights @ v  # (B, T, head_size)





In [None]:
class MultiHeadAttention(nn.Module):
  """Several attention heads run in parallel, followed by a linear projection.

  Args:
    num_heads: number of `Head` modules to create.
    head_size: output width of each head.

  forward:
    input of size (batch, time-step, n_embed)
    output of size (batch, time-step, n_embed)
  """

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range (num_heads)])
    # The concatenated heads have width num_heads * head_size. That equals n_embed
    # only when n_embed is divisible by num_heads, so size the projection input from
    # the heads themselves instead of assuming n_embed.
    self.proj = nn.Linear(num_heads * head_size, n_embed)
    self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    # Concatenate the head outputs along the channel dimension.
    x = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(x)
    out = self.dropout(out)
    return out

In [None]:
class FeedForward(nn.Module):
  """Two-layer MLP: n_embed -> 4*n_embed -> n_embed, with a ReLU in between and dropout on the output."""

  def __init__(self, n_embed):
    super().__init__()
    hidden_width = 4*n_embed
    self.linear = nn.Linear(n_embed, hidden_width)
    self.linear2 = nn.Linear(hidden_width, n_embed)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    hidden = F.relu(self.linear(x))
    return self.dropout(self.linear2(hidden))

In [None]:
class Block(nn.Module):
  """Transformer block: self-attention then feed-forward, each with a residual connection.

  LayerNorm is applied to the input of each sub-layer (before attention and before
  the feed-forward network), and the sub-layer output is added back to its input.

  Args:
    n_embed: embedding width.
    num_head: number of attention heads; each head gets width n_embed // num_head.
  """

  def __init__(self, n_embed, num_head):
    super().__init__()
    head_size = n_embed//num_head
    # Use the constructor argument `num_head`; the previous code read the global
    # `num_heads` here, so the argument was silently ignored for the head count.
    self.sa = MultiHeadAttention(num_head, head_size)
    self.ffn = FeedForward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffn(self.ln2(x))

    return x

In [None]:
class BigramModel(nn.Module):
  """Character-level language model: token + position embeddings, a stack of
  `Block` modules, a final LayerNorm and a linear layer producing vocabulary logits.

  Args:
    vocab_size: number of distinct tokens; size of the embedding table and of the output logits.
  """

  def __init__(self,vocab_size):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size, n_embed)
    self.position_embedding = nn.Embedding(block_size, n_embed)
    self.attention_blocks = nn.Sequential(
        *[Block(n_embed, num_heads) for _ in range(num_blocks)],
        nn.LayerNorm(n_embed),
    )
    self.linear = nn.Linear(n_embed,vocab_size)

  def forward(self, idx, targets=None): #idx & target are tensor integers of shape (B,T)
    """Compute the logits and, if targets are given, the cross-entropy loss.

    Args:
      idx: integer tensor of token ids, shape (B, T), with T <= block_size.
      targets: optional integer tensor of shape (B, T) with the expected next tokens.

    Returns:
      (logits, loss). Without targets, logits has shape (B, T, vocab_size) and loss
      is None. With targets, logits is returned flattened to (B*T, vocab_size) and
      loss is the cross-entropy between the logits and the targets.
    """
    B, T = idx.shape
    tok_embed = self.token_embedding(idx) # shape (B, T, C) with C = n_embed
    # Build the positions on the same device as the input rather than on a global
    # `device`, so the model works wherever its inputs and parameters live.
    pos_embed = self.position_embedding(torch.arange(T, device=idx.device)) #(T, C)
    x = pos_embed + tok_embed # (T, C) is broadcast over the batch dimension
    logits = self.attention_blocks(x) #(B, T, C)
    logits = self.linear(logits)#(B, T, vocab_size)

    if targets is None:
      loss =  None
    else:
      # reshape the tensors to match the dimensions required by cross_entropy
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      # computation of the loss
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad() # sampling never needs gradients; avoids building an autograd graph at every step
  def generate(self, idx, max_new_tok):
    """Extend `idx` by sampling `max_new_tok` new tokens, one at a time.

    Args:
      idx: integer tensor of shape (B, T) holding the starting context.
      max_new_tok: number of tokens to append.

    Returns:
      Integer tensor of shape (B, T + max_new_tok): the context followed by the samples.
    """
    for i in range(max_new_tok):
      idx_crop = idx[:, -block_size:] # keep at most the last block_size tokens as context
      logits, loss = self(idx_crop)

      #focus on the last time step
      logits = logits[:, -1, : ] # shape: (B, C)
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1) #(B,1)

      idx= torch.cat((idx, idx_next), dim=1) #(B, T+1 )

    return idx



# Build the model and move it to the selected device.
m = BigramModel(vocab_size).to(device)

# Start generation from a single token of id 0 and print 100 sampled characters.
# The model has not been trained at this point.
idx = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(idx, max_new_tok=100)[0].tolist()))








JR4hlc(W((2’ÇPLÙQÉÔKIêHrXÔhc;;b–b;a )qÉJSàiœpeAà0;0…L.ehFÉY(D;B;yz
«bèeVÀβ«IV Ggô u4p6“7SS°?”vÎ?:8Wâ


In [None]:
#train the model

# create the optimizer
# Use the learning rate from the configuration cell instead of a second hard-coded
# value: this line used lr=1e-3 while `learning_rate` (3e-4) was defined but never used.
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [None]:
# Training loop: one optimizer update per step, for max_iters steps.
for step in range (max_iters):
  # Draw the training batch first; it is drawn before any evaluation batches of this step.
  xb, yb = get_batch("train")

  # evaluate the loss on both splits every eval_interval steps and on the very last step
  if step % eval_interval == 0 or step==max_iters-1 :
    losses = estimate_loss()
    print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['eval']:.4f}")

  # forward pass on the training batch
  logits, loss = m(xb, yb)

  # reset the gradients, backpropagate, then update the parameters
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()



step 0: train loss 4.7870, val loss 4.7825
step 200: train loss 2.3486, val loss 2.3315


KeyboardInterrupt: ignored

In [None]:
# Switch the model to evaluation mode, then sample 1000 new tokens starting from `idx` and print them as text.
m.eval()
print(decode(m.generate(idx, max_new_tok=1000)[0].tolist()))


page de el du le pout lité,-il qui tembritable à l’avait quessi vœurs dant icuner,
s’ins s’alloc de hamainsent imprandant : la réparfoité la plume. Lophe tableux poté, gontienstant de
bit labrur de main de cour avandé ? n’est ponhé Ron exure à qu’enfillme passer lestêtres un colèverçant avaient fortéraierre hoses du
re flats. L’et à L’Harry auconez-imait je queste n’en tose d’aus les. Les latmaint, êtecortages tre et pit ! d’auffinétait derrux poui
et Gagidit éclacidés d’une entroureme monsentifiiedore une. Il abloquantt il sudre soy lessi c’inse ser le sembla
les. Le dondangris.

— Combjette deurny des avaieux l’unes reforts n’avec ! crirsous dortiséplie dans toun melle le nettentemais
pas. Les levoirs unir tout motre ?

— Alà, lessi, ne penstrait, Grix ! ! ne fonnez noix à pluis ! vachire ne ce s’ése dir vière de pas. Runelletre lats poje
des te Chaux dinantioneux seux matres poépoucie survois.

— Pusse dert, a moi ci ! Il s’a lont aver vait in sanssienssation déris le son, à sûreve