In [1]:
!git clone https://github.com/GiovanniAdelfio/small_LM
%cd small_LM

Cloning into 'small_LM'...
remote: Enumerating objects: 152, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (144/144), done.[K
remote: Total 152 (delta 67), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (152/152), 4.99 MiB | 4.63 MiB/s, done.
Resolving deltas: 100% (67/67), done.
/content/small_LM


In [2]:
## required imports

import pickle
import torch
import torch.optim as optim
import copy
import random
import os

random.seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
path = os.getcwd() + os.sep + "files" + os.sep
path_checkpoints = os.getcwd() + os.sep + "checkpoints" + os.sep

## Data fetching

Selecting only "Ordinary life" dialogues.

In [3]:
used_lines = []

with open(path + "dialogues_topic.txt", encoding="utf-8") as topic:
  for i, line in enumerate(topic):
    if int(line) == 1:
      used_lines += [i]
lines = []

with open(path + "dialogues_text.txt", encoding="utf-8") as txt:
  for i, el in enumerate(txt):
    if i not in used_lines:
      continue
    lines.append(el)

Choosing "@" as a token for the end of a person's sentence in the dialogue, and cleaning the sentences.

We then concatenate the entire dataset into a single string: txt_chr.

In [4]:
for i, el in enumerate(lines):
  lines[i] = el.replace("\n", "")
  lines[i] = lines[i].replace("__eou__", "@")

txt_chr = "".join(lines[:-1])

In [5]:
j=0
for i in range(len(lines)):
  j+= lines[i].count("@")
print(f"Averege number of turns per dialog: {j//len(lines)}")

Averege number of turns per dialog: 8


Creating a first encoding and decoding for our text.

In [6]:
chars = set(txt_chr)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

Converting our txt_chr into integers, following the encoding.

In [7]:
txt_toi = []
for chr in txt_chr:
  txt_toi.append(stoi[chr])

txt_toi[:5]

[18, 54, 64, 68, 28]

## Data manipulation

We now train our tokenizer, we want to have a total of 1000 tokens.

In [None]:
from tokenizer.tokenizer import token_train, merge, encode, decode

num_chars = len(chars)
new_tokens = 1000 - num_chars

tkn_dataset, merges, itos = token_train(txt_toi, itos, num_chars, new_tokens)

We check the compression rate of our tokenizer on the dataset.

In [None]:
comp_rate = abs(len(tkn_dataset) - len(txt_toi))/len(txt_toi)
print(f"Compression rate: {comp_rate*100:.2f}%")

Compression rate: 70.81%


In [9]:
num_chars = len(chars)
new_tokens = 1000 - num_chars

We now encode the dataset we will use for training, validation and testing of the model.

In [12]:
dataset = encode(copy.deepcopy(lines), merges, stoi, num_chars, new_tokens)

if decode(dataset[0], itos) == lines[0]:
  print("Encoding and decoding works correctly!")
else :
  print("There is an error in encoding and decoding.")

print(f"Average length of dialogs after compression: {sum([len(x) for x in dataset])/len(dataset):.2f} tokens")

Encoding and decoding works correctly!
Average length of dialogs after compression: 135.89 tokens


We check some of the last tokens to ensure their meaningfulness.

In [None]:
print([itos[i] for i in range(970,1000)])

['. If ', '. @ Oh ', 'each ', 'name ', 'tal ', '. B', 'b ', '. @ Why ', 'wrong ', 'best ', 'whi', 'why ', 'coff', 'keep ', 'deli', '. @ We ', 'om ', '. @ Ok ', '. They ', ', th', 'might ', 'about the ', 'left ', 'another ', 'tri', 'feel ', 'oul', "'d ", 'OK ', 'have the ']


We now save our "stoi", "itos", "merges" variables, needed for the encoding and decoding, and also the encoded dataset, for later use.

In [None]:
with open(path + 'stoi_itos_merges_dataset.pkl', 'wb') as f:  # Open in binary write mode
    pickle.dump([stoi, itos, merges, dataset], f)

Here we can retrieve the saved data.

In [8]:
with open(path + 'stoi_itos_merges_dataset.pkl', "rb") as f:  # Python 3: open(..., 'rb')
    stoi, itos, merges, dataset = pickle.load(f)

## Creating datasets and dataloaders

We now create the target dataset from our inputs, by associating for each sequence of context_size lenght, the corresponding sequence in the text translated by one token.

We now divide the dataset in train, verification and test. We also trasform our datasets and targets into torch tensors.

In [13]:
from utils.data import split
train_dataset, val_dataset, test_dataset = split(dataset, t=0.7, v=0.2, seed=42, to_torch = True, device = device)

For consistency we save the randomly generated splits.

In [14]:
os.chdir(path)
torch.save([train_dataset, val_dataset,test_dataset], "dataset.pt")
os.chdir("..")

In [4]:
os.chdir(path)
train_dataset, val_dataset, test_dataset = torch.load("dataset.pt", weights_only= "True")
os.chdir("..")

We create a dataset and dataloader using pytorch utils, and wrap it on our files.

In [15]:
from torch.utils.data import DataLoader
from utils.data import SLM_dataset
bs = 256
cs = 64

train = SLM_dataset(train_dataset, cs)
val = SLM_dataset(val_dataset, cs)

train_dataloader = DataLoader(train, batch_size=bs, shuffle=False, num_workers=0)
val_dataloader = DataLoader(val, batch_size=bs, shuffle=False, num_workers=0)

## Model training

We import our model, and generation function. We then initialize the model.

In [49]:
from model.model import GPTModel, generate

context_size = 64

model = GPTModel(block_size=context_size, vocab_size=len(itos), n_embd=512, n_head=8, n_layer=6) #len itos, really? not good

We now perform 100 epochs of training.

In [None]:
from model.train import train

model, train_loss, val_loss = train(model, train_dataloader, val_dataloader,
                                    lr = 1e-3, weight_decay=1e-4, epochs=2,
                                    opt_name="adam", device=device, checkpoint_path=path_checkpoints)

In [47]:
generate(model, "Hey, how are you? @ ", 50, stoi, itos, merges, context_size, temperature= 0.5)

'Hey, how are you? @ Definitely not to respond my rules and you broke them correctly or scratch as the toilet is over there . @ It ’ s just three hundred and seventy five '