In [None]:
!git clone https://github.com/GiovanniAdelfio/small_LM
%cd small_LM

Cloning into 'small_LM'...
remote: Enumerating objects: 290, done.[K
remote: Counting objects:   0% (1/127)[Kremote: Counting objects:   1% (2/127)[Kremote: Counting objects:   2% (3/127)[Kremote: Counting objects:   3% (4/127)[Kremote: Counting objects:   4% (6/127)[Kremote: Counting objects:   5% (7/127)[Kremote: Counting objects:   6% (8/127)[Kremote: Counting objects:   7% (9/127)[Kremote: Counting objects:   8% (11/127)[Kremote: Counting objects:   9% (12/127)[Kremote: Counting objects:  10% (13/127)[Kremote: Counting objects:  11% (14/127)[Kremote: Counting objects:  12% (16/127)[Kremote: Counting objects:  13% (17/127)[Kremote: Counting objects:  14% (18/127)[Kremote: Counting objects:  15% (20/127)[Kremote: Counting objects:  16% (21/127)[Kremote: Counting objects:  17% (22/127)[Kremote: Counting objects:  18% (23/127)[Kremote: Counting objects:  19% (25/127)[Kremote: Counting objects:  20% (26/127)[Kremote: Counting objects:  21% (27/

In [None]:
## required imports

import pickle
import torch
import torch.optim as optim
import copy
import random
import os

# Reproducibility: seed every RNG the notebook relies on. Seeding only Python's
# `random` leaves PyTorch unseeded, and the training DataLoader is shuffled.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Data and checkpoint folders inside the current working directory.
# The empty last component keeps the trailing separator, because later cells
# build file names by plain concatenation (`path + "<file name>"`).
path = os.path.join(os.getcwd(), "files", "")
path_checkpoints = os.path.join(os.getcwd(), "checkpoints", "")

In [None]:
# Display the resolved checkpoint directory (quick check of the working directory).
path_checkpoints

'/content/small_LM/small_LM/checkpoints/'

## Data fetching

Selecting only "Ordinary life" dialogues.

In [None]:
# Keep only the dialogues whose topic id is 1 ("Ordinary life").
# The topic file and the text file are read in parallel by line index: line i
# of dialogues_topic.txt is the topic of line i of dialogues_text.txt.
with open(path + "dialogues_topic.txt", encoding="utf-8") as topic:
  # Blank lines are skipped instead of crashing int(); enumerate() still counts
  # them, so the indices stay aligned with the text file.
  used_lines = [i for i, line in enumerate(topic) if line.strip() and int(line) == 1]

# A set gives O(1) membership tests; testing against the list made the filter
# below quadratic in the number of dialogues.
selected_idx = set(used_lines)

with open(path + "dialogues_text.txt", encoding="utf-8") as txt:
  lines = [el for i, el in enumerate(txt) if i in selected_idx]

Choosing "@" as a token for the end of a person's sentence in the dialogue, and cleaning the sentences.

We then concatenate the entire dataset into a single string: txt_chr.

In [None]:
# Clean every dialogue: newlines become spaces and each "__eou__" marker
# becomes the single character "@".
# Slice assignment updates the existing list object in place.
lines[:] = [
  dialog.replace("\n", " ").replace("__eou__", "@")
  for dialog in lines
]

# Concatenate the dialogues into one string.
# NOTE(review): the [:-1] slice leaves the last dialogue out of txt_chr —
# confirm this is intended, since the vocabulary is built from txt_chr.
txt_chr = "".join(lines[:-1])

In [None]:
# Count the "@" separators over all dialogues and print the floored
# (integer-division) mean per dialogue.
j = sum(dialog.count("@") for dialog in lines)
print(f"Averege number of turns per dialog: {j//len(lines)}")

Averege number of turns per dialog: 8


In [None]:
# Preview the first 250 characters of the concatenated corpus.
txt_chr[:250]

"The kitchen stinks . @ I'll throw out the garbage . @ So Dick , how about getting some coffee for tonight ? @ Coffee ? I don ’ t honestly like that kind of stuff . @ Come on , you can at least try a little , besides your cigarette . @ What ’ s wrong "

Creating a first encoding and decoding for our text.

In [None]:
# Character-level vocabulary and the two lookup tables built from it.
# The characters are sorted so the char <-> id mapping is the same on every
# run. Iterating a bare set of strings follows hash order, which changes
# between interpreter sessions (string hash randomisation), so ids saved in
# one session would not match ids rebuilt in another.
chars = sorted(set(txt_chr))
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}  # integer id -> character

In [None]:
# Punctuation and symbol characters; a later cell collects their ids into
# `ign_chars`, which is passed to `token_train`.
ign_chars_list = [
  # ASCII punctuation and symbols
  '!', '"', '$', '%', '&', "'", '(', ')', ',', '-', '.', '/', '~',
  # currency and degree signs
  '£', '¥', '°',
  # typographic dashes and quotes
  '–', '—', '‘', '’', '“', '”',
  # prime mark and CJK full stop
  '′', '。',
]

Converting our txt_chr into integers, following the encoding.

In [None]:
# Encode the whole corpus as a list of character ids.
# Comprehensions are used so that no loop variable leaks into the notebook
# namespace: the previous loop variable was named `chr`, which shadowed the
# builtin chr() for the rest of the session.
txt_toi = [stoi[ch] for ch in txt_chr]

# Ids of the characters listed in ign_chars_list (raises KeyError if one of
# them never occurs in txt_chr).
ign_chars = {stoi[c] for c in ign_chars_list}

# Show the first few ids.
txt_toi[:5]

[10, 63, 87, 74, 36]

## Data manipulation

We now train our tokenizer; we want a total of 1000 tokens.

In [None]:
from tokenizer.tokenizer import token_train, merge, encode, decode

# Size of the character-level vocabulary built earlier.
num_chars = len(chars)
# Number of additional tokens requested, so that characters + new tokens = 1000.
new_tokens = 1000 - num_chars

# Train the tokenizer on the character ids.
# NOTE(review): `itos` is both an argument and a result here, so re-running
# this cell on its own passes the already-updated table back in — confirm
# token_train tolerates that, or re-run from the vocabulary cell.
tkn_dataset, merges, itos = token_train(txt_toi, itos, num_chars, new_tokens, ign_chars)

We check the compression rate of our tokenizer on the dataset.

In [None]:
# Relative length change after tokenization: |tokens - characters| / characters.
n_char_ids = len(txt_toi)
n_token_ids = len(tkn_dataset)
comp_rate = abs(n_token_ids - n_char_ids)/n_char_ids
print(f"Compression rate: {comp_rate*100:.2f}%")

Compression rate: 67.28%


We now encode the dataset we will use for training, validation and testing of the model.

In [None]:
# Encode every dialogue. A deep copy of `lines` is passed, so the original
# list is untouched whatever encode() does with its argument.
dataset = encode(copy.deepcopy(lines), merges, stoi, num_chars, new_tokens)

# Round-trip check on the first dialogue: decoding must give back the text.
round_trip_ok = decode(dataset[0], itos) == lines[0]
print("Encoding and decoding works correctly!" if round_trip_ok
      else "There is an error in encoding and decoding.")

# Mean number of tokens per encoded dialogue.
total_tokens = sum(len(dialog) for dialog in dataset)
print(f"Average length of dialogs after compression: {total_tokens/len(dataset):.2f} tokens")

Encoding and decoding works correctly!
Average length of dialogs after compression: 153.59 tokens


We check some of the last tokens to ensure their meaningfulness.

In [None]:
# Print the vocabulary entries with ids 970..999.
last_token_ids = range(970, 1000)
print([itos[token_id] for token_id in last_token_ids])

[' F', 'coun', 'might ', 'about the ', 'left ', 'another ', 'feel ', 'oul', 'ck ', 'have the ', 'en I ', 'sma', 'tri', 'aybe ', 'differ', ' @ This ', 'cost ', 'ile ', ' @ Can you ', 'tast', 'sale ', ' Can you ', 'even ', 'ves ', 'tick', 'coffee ', 'ded ', 'rece', ' How much ', 'ty']


We now save our "stoi", "itos", "merges" variables, needed for the encoding and decoding, and also the encoded dataset, for later use.

In [None]:
# Persist the encoding tables together with the encoded dataset in one pickle.
artifacts = [stoi, itos, merges, dataset]
with open(path + 'stoi_itos_merges_dataset.pkl', 'wb') as pkl_file:  # binary write mode
    pickle.dump(artifacts, pkl_file)

## Creating datasets and dataloaders

We now create the target dataset from our inputs by associating each sequence of length context_size with the corresponding sequence in the text shifted by one token.

We now divide the dataset into train, validation and test sets. We also transform our datasets and targets into torch tensors.

In [None]:
from utils.data import split
# Split the encoded dialogues into train / validation / test sets with a fixed seed.
# NOTE(review): t and v are presumably the train and validation fractions, with
# the remainder going to the test set — confirm against utils.data.split.
train_dataset, val_dataset, test_dataset = split(dataset, t=0.7, v=0.2, seed=42, to_torch = True, device = device)

For consistency we save the randomly generated splits.

In [None]:
# Save the three splits as files/dataset.pt.
# The target is given as a full path instead of changing into the folder and
# back: if the save failed between the two os.chdir() calls, the notebook
# would be left in the wrong working directory for every later cell.
torch.save([train_dataset, val_dataset, test_dataset], path + "dataset.pt")

In [None]:
# Reload the saved splits from files/dataset.pt.
# weights_only takes a boolean: the previous string "True" only worked because
# any non-empty string is truthy (so would the string "False").
# The file is addressed by full path so the working directory is never changed.
train_dataset, val_dataset, test_dataset = torch.load(path + "dataset.pt", weights_only=True)

We create datasets and dataloaders using the PyTorch utilities, wrapping our splits in the project's SLM_dataset class.

In [None]:
from torch.utils.data import DataLoader
from utils.data import SLM_dataset
bs = 32  # batch size used by both dataloaders
cs = 64  # second argument of SLM_dataset; presumably the context size — TODO confirm in utils.data

# NOTE(review): the name `train` is rebound to the training function by
# `from model.train import train` in a later cell; it refers to the dataset
# only until that cell runs.
train = SLM_dataset(train_dataset, cs)
val = SLM_dataset(val_dataset, cs)

# Training batches are reshuffled; validation batches keep a fixed order.
train_dataloader = DataLoader(train, batch_size=bs, shuffle=True, num_workers=0)
val_dataloader = DataLoader(val, batch_size=bs, shuffle=False, num_workers=0)

## Model training

We import our model, and generation function. We then initialize the model.

In [None]:
from model.model import GPTModel, generate

# NOTE(review): same value as `cs` used to build the datasets — presumably the
# two must stay in sync; confirm.
context_size = 64

# vocab_size matches the 1000-token target used when training the tokenizer.
model = GPTModel(block_size=context_size, vocab_size=1000, n_embd=512, n_head=8, n_layer=6)

We can load saved weights.

In [None]:
checkpoint = "test1_epoch6.pt"  # specify your checkpoint file here
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict needs, and avoids running arbitrary code embedded in a
# tampered checkpoint file (weights_only=False enables the full unpickler).
# map_location moves the stored tensors to the device selected at the top.
model.load_state_dict(torch.load(path_checkpoints + checkpoint, map_location=device, weights_only=True))

<All keys matched successfully>

We now train the model (the call below runs 3 epochs; raise `epochs` for a longer run, e.g. 100).

In [None]:
# NOTE: this import rebinds the name `train`, which an earlier cell used for
# the training dataset object.
from model.train import train

# Train for 3 epochs on the train dataloader, evaluating on the validation
# dataloader; the call returns the model and the train / validation losses.
# NOTE(review): checkpoint_path and name presumably control where and under
# which prefix checkpoints are written — confirm in model.train.
model, train_loss, val_loss = train(model, train_dataloader, val_dataloader,
                                    lr = 1e-4, weight_decay=1e-3, epochs=3,
                                    opt_name="adam", device=device,
                                    checkpoint_path=path_checkpoints, name = "new_tkn_2")

Epoch 1/3: 100%|██████████| 231/231 [00:19<00:00, 11.57it/s]


Epoch 1: Train Loss = 2.6492 | Val Loss = 3.0657


Epoch 2/3: 100%|██████████| 231/231 [00:20<00:00, 11.48it/s]


Epoch 2: Train Loss = 2.5806 | Val Loss = 3.0546


Epoch 3/3: 100%|██████████| 231/231 [00:19<00:00, 11.58it/s]


Epoch 3: Train Loss = 2.5485 | Val Loss = 3.0481
Training completato!


Here we load the saved dicts, needed for encoding and decoding during generation.

In [None]:
# Reload the encoding tables and the encoded dataset from the pickle written
# earlier in this notebook.
# pickle can execute code while loading: only open files you created yourself.
with open(path + 'stoi_itos_merges_dataset.pkl', "rb") as pkl_file:  # binary read mode
    saved_state = pickle.load(pkl_file)
stoi, itos, merges, dataset = saved_state

In [None]:
# Generate text from a prompt that ends with the "@" separator, using
# temperature, top-k and top-p settings.
# NOTE(review): 50 is presumably the number of tokens to generate — confirm
# against model.model.generate.
generate(model, " I'm going to buy a new pair of shoes . @ ",
         50, stoi, itos, merges, context_size, conversation=False, temperature= 0.6, top_k=100, top_p = 0.8)

" I'm going to buy a new pair of shoes . @ surf the second . @ Because the soft boiling is better . @ OK . @ Very good . @ Bye . @ Yeah , right ? I'm glad you know . S"