In [None]:
## required imports

import pickle
import torch
import torch.optim as optim
import copy
import random

# Seed every RNG this notebook relies on: Python's `random` drives the
# dataset shuffle, while torch's generator drives any torch-side randomness
# (e.g. parameter initialisation). Seeding only one leaves runs irreproducible.
random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Selecting only "Ordinary life" dialogues.

In [None]:
# Collect the 0-based line numbers whose topic label is 1; the two files are
# read in parallel order, so line i of the topic file labels line i of the text.
with open("dialogues_topic.txt") as topic:
  used_lines = [i for i, line in enumerate(topic) if int(line) == 1]

# Membership tests against a list are linear, which made the filter below
# quadratic in the number of dialogues; a set makes each lookup O(1).
used_line_set = set(used_lines)

# Keep only the dialogues whose line number was selected above.
with open("dialogues_text.txt") as txt:
  lines = [el for i, el in enumerate(txt) if i in used_line_set]

Choosing "@" as a token for the end of a person's sentence in the dialogue, and cleaning the sentences. 

We then concatenate the entire dataset into a single string: txt_chr.

In [None]:
# Drop the newline characters and replace the "__eou__" marker with the
# single-character token "@".
lines = [el.replace("\n", "").replace("__eou__", "@") for el in lines]

# Concatenate *every* dialogue. The previous `lines[:-1]` left the last
# dialogue out of the corpus even though it is encoded later on, so any
# character appearing only there would be missing from the vocabulary.
txt_chr = "".join(lines)

In [None]:
# Each "@" marks the end of one turn, so counting them gives the turn total.
total_turns = sum(dialog.count("@") for dialog in lines)

# True division keeps the fractional part of the mean (floor division
# truncated it); guard against an empty selection to avoid ZeroDivisionError.
avg_turns = total_turns / len(lines) if lines else 0.0
print(f"Average number of turns per dialog: {avg_turns:.2f}")

Creating a first encoding and decoding for our text.

In [None]:
# Sort the distinct characters so the id assignment is deterministic: the
# iteration order of a set of strings can change between interpreter runs
# (string hashing is randomised), which would make saved `stoi`/`itos`
# mappings disagree with a re-run of the notebook.
chars = sorted(set(txt_chr))

# Character -> id and id -> character lookup tables.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))

Converting our txt_chr into integers, following the encoding.

In [None]:
# Map every character of the corpus to its integer id. A comprehension keeps
# the loop variable local; the previous loop named it `chr`, which shadowed
# the builtin `chr()` for the rest of the kernel session.
txt_toi = [stoi[ch] for ch in txt_chr]

txt_toi[:5]

We now train our tokenizer, targeting a total vocabulary of 1000 tokens.

In [None]:
# Project-local tokenizer helpers (training plus encode/decode utilities).
from tokenizer.tokenizer import token_train, merge, encode, decode

# Size of the character-level base vocabulary.
num_chars = len(chars)
# Number of additional tokens to learn so the total vocabulary reaches 1000.
new_tokens = 1000 - num_chars

# token_train returns three values: the tokenized corpus, the learned merges,
# and an id -> token table that replaces the character-level `itos`.
# NOTE(review): presumably `merges` holds BPE-style pair merges - confirm
# against tokenizer/tokenizer.py.
tkn_dataset, merges, itos = token_train(txt_toi, itos, num_chars, new_tokens)

We check the compression rate of our tokenizer on the dataset.

In [None]:
# Relative reduction in sequence length obtained by tokenizing the corpus.
n_raw, n_tok = len(txt_toi), len(tkn_dataset)
comp_rate = abs(n_tok - n_raw) / n_raw
print(f"Compression rate: {comp_rate*100:.2f}%")

We now encode the dataset we will use for training, validation and testing of the model.

In [None]:
# Encode a copy of the cleaned dialogues so `lines` itself stays untouched.
dataset = encode(copy.deepcopy(lines), merges, stoi, num_chars, new_tokens)

# Sanity check: decoding the first encoded dialogue must give back its text.
round_trip_ok = decode(dataset[0], merges, itos) == lines[0]
if round_trip_ok:
  print("Encoding and decoding works correctly!")
else:
  print("There is an error in encoding and decoding.")

# Mean number of tokens per encoded dialogue.
total_tokens = sum(len(x) for x in dataset)
print(f"Average length of dialogs after compression: {total_tokens/len(dataset):.2f} tokens")

We check some of the last tokens to ensure their meaningfulness.

In [None]:
# Show the tokens stored under ids 970..999.
tail_ids = range(970, 1000)
print([itos[idx] for idx in tail_ids])

We now save our "stoi", "itos", "merges" variables, needed for the encoding and decoding, and also the encoded dataset, for later use.

In [None]:
# Bundle the tokenizer tables and the encoded dialogues into a single pickle.
tokenizer_state = [stoi, itos, merges, dataset]
with open('stoi_itos_merges_dataset.pkl', 'wb') as f:
    pickle.dump(tokenizer_state, f)

Here we can retrieve the saved data.

In [None]:
# Read back the four objects in the order they were written.
with open('stoi_itos_merges_dataset.pkl', "rb") as f:
    tokenizer_state = pickle.load(f)
stoi, itos, merges, dataset = tokenizer_state

We now create the target dataset from our inputs by associating each sequence of context_size length with the corresponding sequence in the text shifted by one token.

In [None]:
context_size = 64

# For every dialogue, slide over each start offset that leaves room for a
# full window and take the `context_size` tokens beginning one position
# later. Dialogues with at most `context_size` tokens contribute nothing.
target = [
    dialog[start + 1:start + context_size + 1]
    for dialog in dataset
    for start in range(len(dialog) - context_size)
]

We now divide the dataset into train, validation and test sets.

In [None]:
# 70% / 20% / remainder split, sized by the number of dialogues.
n = len(dataset)
n_train = int(n * 0.7)
n_val = int(n * 0.2)

# Shuffle dialogue indices once and cut the permutation into three ranges.
indices = list(range(n))
random.shuffle(indices)

train_idx = indices[:n_train]
val_idx = indices[n_train:n_train + n_val]
test_idx = indices[n_train + n_val:]

# NOTE(review): `target` holds one entry per context window, whereas these
# indices enumerate dialogues, so `target[i]` is not derived from
# `dataset[i]` - confirm this pairing is what the training code expects.
train_target = [target[i] for i in train_idx]
val_target = [target[i] for i in val_idx]
test_target = [target[i] for i in test_idx]

train_dataset = [dataset[i] for i in train_idx]
val_dataset = [dataset[i] for i in val_idx]
test_dataset = [dataset[i] for i in test_idx]

For consistency we save the randomly generated splits.

In [None]:
# Persist the input splits and the target splits in two separate pickles.
input_splits = [train_dataset, val_dataset, test_dataset]
with open('train_val_test.pkl', 'wb') as f:
    pickle.dump(input_splits, f)

target_splits = [train_target, val_target, test_target]
with open('train_val_test_target.pkl', 'wb') as f:
    pickle.dump(target_splits, f)

In [None]:
# Reload the saved input splits. This must read 'train_val_test.pkl': the
# previously opened 'stoi_itos_merges_dataset.pkl' stores four objects
# (stoi, itos, merges, dataset), so unpacking it into three names fails.
with open('train_val_test.pkl', "rb") as f:
    train_dataset, val_dataset, test_dataset = pickle.load(f)

# Reload the matching target splits.
with open('train_val_test_target.pkl', "rb") as f:
    train_target, val_target, test_target = pickle.load(f)

We now transform our datasets and targets into torch tensors.

In [None]:
def _to_long_tensors(seqs):
    """Return a list with each sequence of `seqs` as a torch.long tensor on `device`."""
    return [torch.tensor(seq, dtype=torch.long).to(device) for seq in seqs]


# Inputs first, then targets; every split goes through the same conversion.
train_dataset = _to_long_tensors(train_dataset)
val_dataset = _to_long_tensors(val_dataset)
test_dataset = _to_long_tensors(test_dataset)

train_target = _to_long_tensors(train_target)
val_target = _to_long_tensors(val_target)
test_target = _to_long_tensors(test_target)

We import our model, and generation function. We then initialize the model.

In [None]:
from model.model import GPTModel, generate

# The context length matches the window size used for the targets and the
# vocabulary size is taken from the id -> token table.
# NOTE(review): the model is not moved to `device` in this cell, while the
# data tensors are - confirm that happens elsewhere before training.
model = GPTModel(
    block_size=context_size,
    vocab_size=len(itos),
    n_embd=512,
    n_head=8,
    n_layer=6,
)