In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cpu'
if torch.has_mps:
    device = 'mps'
if torch.cuda.is_available():
    device = 'cuda'

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)

In [3]:
B_SEN = '<|bos|>'
E_SEN = '<|eos|>'
SEP_T = '<|bt|>'
SEP_D = "<|bd|>"

special_tokens = { "bos_token": B_SEN,
"eos_token": E_SEN,
"additional_special_tokens": [SEP_T, SEP_D] }
#bd = beginning of definition
#
tokenizer.add_special_tokens(special_tokens)

4

In [4]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [24]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import pandas as pd
import re

class AnglishDataset(Dataset):
    def __init__(self, dataset_path):
        super().__init__()

        self.word_list = []
        self.B_SEN = '<|bos|>'
        self.E_SEN = '<|eos|>'
        self.SEP_T = '<|bt|>'
        self.SEP_D = "<|bd|>"

        df = pd.read_excel(dataset_path)

        raw = ""

        x = 0
        for index, row in df.iterrows():
            word_str = f"{B_SEN} {row[0]} {SEP_T} {row[3]} {SEP_T}"
            #᛫ to act in accordance with ᛫ to remain ᛫ to wait ᛫ to wait for ᛫ to await ᛫ to persist ᛫ to reside ᛫ to face ( in combat etc ) ᛫
            definitions = row[2]
            definitions = definitions.replace('•', '᛫')
            definitions = definitions.split('᛫')
            for i, definition in enumerate(definitions):
                if not definition:
                    continue
                if i == len(definition) - 1:
                    word_str += f" {definition}"
                else:
                    word_str += f" {definition} {SEP_D}"

            word_str += E_SEN
            word_str = re.sub(" +", " ", word_str)

            self.word_list.append(word_str)
            raw += word_str

        print("Unique Characters: ", ''.join(set(raw)))

    def __len__(self):
        return len(self.word_list)

    def __getitem__(self, item):
        return self.word_list[item]

In [25]:
dataset = AnglishDataset(dataset_path='data/anglish_wordbook.xlsx')
anglish_loader = DataLoader(dataset, batch_size=1, shuffle=True)

Unique Characters:  íy>85IpÞᚾ?l3 ᛏÖk4fSᚩ<j'u᛬ZP)þMOYgᛚr|[WHÐ–ᛋe0vç+2"Å&FcR▪q d⁊êXNöh—è1TAas,bwtEQ/!GxnBimD-æ*oKð:L.zÜJ(UCᚹ]ä9éV


In [26]:
for idx,word in enumerate(anglish_loader):
    if idx < 10:
        print(word)
        print(type(word))

['<|bos|> witethew <|bt|> N᛬AJ <|bt|> someone who is in slavery as part of punishment ᛬ in slavery as punishment for a crime <|bd|><|eos|>']
<class 'list'>
['<|bos|> beswathe <|bt|> V <|bt|> to cover ( literally or figuratively ) <|bd|><|eos|>']
<class 'list'>
['<|bos|> worter <|bt|> N <|bt|> a herbalist <|bd|> an herbalist <|bd|><|eos|>']
<class 'list'>
['<|bos|> uptake <|bt|> N᛬V <|bt|> absorption or occupation of a limited resource <|bd|> a ventilation pipe ᛬ to take up <|bd|> to expend or to occupy a limited resource <|bd|><|eos|>']
<class 'list'>
['<|bos|> guild <|bt|> N <|bt|> an association of merchants or craftsmen <|bd|><|eos|>']
<class 'list'>
['<|bos|> stickler <|bt|> N <|bt|> a person who insists on a certain quality or type of behaviour <|bd|> a referee <|bd|> an umpire <|bd|> a mediator <|bd|> a moderator <|bd|><|eos|>']
<class 'list'>
['<|bos|> inkhorn <|bt|> N <|bt|> a fancy word borrowed into English though it is not needed <|bd|><|eos|>']
<class 'list'>
['<|bos|> umbs

In [27]:
BATCH_SIZE = 16
EPOCHS = 1
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 400
from transformers import AdamW, get_linear_schedule_with_warmup

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
if torch.has_mps:
    device = 'mps'

In [28]:
from alive_progress import alive_bar

In [29]:
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps = -1)
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0

tmp_words_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)

for epoch in range(EPOCHS):

    print(f"EPOCH {epoch} started" + '=' * 30)

    with alive_bar(len(anglish_loader), force_tty=True) as bar:
        for idx,word in enumerate(anglish_loader):
            bar()
            #################### "Fit as many word sequences into MAX_SEQ_LEN sequence as possible" logic start ####
            word_tens = torch.tensor(tokenizer.encode(word[0])).unsqueeze(0).to(device)
            #Skip sample from dataset if it is longer than MAX_SEQ_LEN
            if word_tens.size()[1] > MAX_SEQ_LEN:
                continue

            #The first word sequence in the sequence
            if not torch.is_tensor(tmp_words_tens):
                tmp_words_tens = word_tens
                continue
            else:
                #The next word does not fit in so we process the sequence and leave the last word
                #as the start for next sequence
                if tmp_words_tens.size()[1] + word_tens.size()[1] > MAX_SEQ_LEN:
                    work_words_tens = tmp_words_tens
                    tmp_words_tens = word_tens
                else:
                    #Add the word to sequence, continue and try to add more
                    tmp_words_tens = torch.cat([tmp_words_tens, word_tens[:,1:]], dim=1)
                    continue
            ################## Sequence ready, process it trough the model ##################

            outputs = model(work_words_tens, labels=work_words_tens)
            loss, logits = outputs[:2]
            loss.backward()
            sum_loss = sum_loss + loss.detach().data

            proc_seq_count = proc_seq_count + 1
            if proc_seq_count == BATCH_SIZE:
                proc_seq_count = 0
                batch_count += 1
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            if batch_count == 100:
                print(f"sum loss {sum_loss}")
                batch_count = 0
                sum_loss = 0.0

    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_dictionary_{epoch}.pt"))
    print(f"EPOCH {epoch} completed" + '=' * 30)

|█▏⚠︎                                     | (!) 221/8074 [3%] in 6.8s (32.97/s)                                          


KeyboardInterrupt: 

In [53]:
MODEL_EPOCH = 0

models_folder = "trained_models"

model_path = os.path.join(models_folder, f"gpt2_medium_dictionary_{MODEL_EPOCH}.pt")
model.load_state_dict(torch.load(model_path))

words_output_file_path = f'generated_{MODEL_EPOCH}.words'

model.eval()
if os.path.exists(words_output_file_path):
    os.remove(words_output_file_path)

word_num = 0
total = 10

with torch.no_grad():
    with alive_bar(total, force_tty=True) as bar:
        for word_idx in range(total):

            word_finished = False

            cur_ids = torch.tensor(tokenizer.encode("word:")).unsqueeze(0).to(device)

            for i in range(100):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(from only one in this case) batch and the last predicted embedding
                if i < 3:
                    n = 20
                else:
                    n = 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n) #Randomly(from the topN probability distribution) select the next word
                cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim = 1) # Add the last word to the running sequence

                if next_token_id in tokenizer.encode('<|endoftext|>'):
                    word_finished = True
                    break


            if word_finished:

                word_num = word_num + 1

                output_list = list(cur_ids.squeeze().to('cpu').numpy())
                output_text = tokenizer.decode(output_list)

                with open(words_output_file_path, 'a') as f:
                    f.write(f"{output_text} \n\n")

            bar()

|████████████████████████████████████████| 10/10 [100%] in 1:55.6 (0.08/s)                                              
