## Train a character-level GPT on some text data

The inputs here are simple text files, which we chop up into individual characters and then train a GPT on. So you could say this is a char-transformer instead of a char-rnn. Doesn't quite roll off the tongue as well. In this example we will feed it some Shakespeare, which we'll get it to predict at the character level.

In [1]:
from allennlp.data.tokenizers import Token, Tokenizer, PretrainedTransformerTokenizer

import nltk
import numpy as np
from os import listdir
from os.path import join as pathjoin
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
import tqdm

from mingpt.model import GPT, GPTConfig
from mingpt.trainer import Trainer, TrainerConfig
# Make runs deterministic: fix the seed through minGPT's set_seed helper and
# seed NumPy's global RNG explicitly as well.
from mingpt.utils import sample, set_seed
set_seed(42)
np.random.seed(42)

In [2]:
# Base directories for data and models (not referenced by the other visible cells).
DATA_DIR = '/home/mlepekhin/data'
MODELS_DIR = '/home/mlepekhin/models'
# Name of the pretrained transformer handed to PretrainedTransformerTokenizer.
transformer_model = 'DeepPavlov/rubert-base-cased'

In [3]:
import math
from torch.utils.data import Dataset


def detokenize(tokens):
    """Turn a token sequence back into a plain string.

    The first and last elements are dropped (presumably the tokenizer's
    boundary markers -- confirm against the caller), the remaining elements
    are stringified and joined with single spaces, and every ' ##'
    continuation marker is removed so word pieces are glued back together.
    """
    inner = tokens[1:-1]
    joined = ' '.join(map(str, inner))
    return joined.replace(' ##', '')

class BPEDataset(Dataset):
    """Sliding-window next-token dataset over a sequence of hashable symbols.

    The vocabulary is the sorted set of distinct elements of ``data``; every
    item is a pair of id tensors ``(x, y)`` of length ``block_size`` where
    ``y`` is ``x`` shifted one position to the right.

    Attributes:
        stoi: mapping symbol -> integer id.
        itos: mapping integer id -> symbol.
        block_size: length of each ``x`` / ``y`` window.
        vocab_size: number of distinct symbols in ``data``.
        data: the original symbol sequence.
    """

    def __init__(self, data, block_size):
        """Build the vocabulary from ``data`` and remember the window size.

        Args:
            data: sequence of hashable, sortable symbols (e.g. token strings).
            block_size: number of symbols in each training window.
        """
        vocab = sorted(set(data))
        data_size, vocab_size = len(data), len(vocab)
        # The message says "characters" but counts the elements of `data`,
        # whatever they are (the caller in this file passes token strings).
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {symbol: i for i, symbol in enumerate(vocab)}
        self.itos = dict(enumerate(vocab))
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Return the number of full windows; 0 when ``data`` is too short."""
        # Clamp at zero: a negative value makes len() raise ValueError when
        # the sequence is shorter than block_size.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` id tensors for the window starting at ``idx``."""
        # Grab block_size + 1 symbols so inputs and shifted targets both fit.
        chunk = self.data[idx:idx + self.block_size + 1]
        # Encode every symbol to its integer id.
        ids = [self.stoi[symbol] for symbol in chunk]

        x = torch.tensor(ids[:-1], dtype=torch.long)
        y = torch.tensor(ids[1:], dtype=torch.long)
        return x, y

In [4]:
# Window length handed to BPEDataset inside generate_single_dataset.
block_size = 128
# Tokenizer built for `transformer_model`; generate_single_dataset uses it for
# both the training text and the genre prompts.
tokenizer = PretrainedTransformerTokenizer(transformer_model)

In [6]:
# Distinct values of the `target` column (np.unique returns them sorted);
# generate_single_dataset iterates over these as genre labels.
genre_list = np.unique(pd.read_csv('/home/mlepekhin/data/min_gpt_bpe/ru').target.values)
print(genre_list)

['A1' 'A11' 'A12' 'A14' 'A16' 'A17' 'A4' 'A7' 'A8' 'A9']


In [12]:
from mingpt.utils import sample

def generate_single_dataset(train_text_file, state_dict_file, n_layer=4, n_head=4, n_embd=256,
                     texts_per_genre=1, text_len=100):
    """Yield ``(completion, genre)`` pairs sampled from a saved GPT checkpoint.

    The training text is re-tokenized only to rebuild the token <-> id
    vocabulary that sizes the model; the weights come from
    ``state_dict_file``. For every genre in the module-level ``genre_list``
    the model is prompted with the tokenization of ``#<genre>#`` and sampled
    ``texts_per_genre`` times.

    Args:
        train_text_file: path of the text file the vocabulary is built from.
        state_dict_file: path of the saved model ``state_dict``.
        n_layer, n_head, n_embd: architecture settings forwarded to GPTConfig.
        texts_per_genre: number of completions to draw per genre.
        text_len: number of sampling steps passed to ``sample``.

    Yields:
        Tuples ``(completion, genre)`` where ``completion`` is the generated
        text with ' ##', '[CLS]' and '[SEP]' removed.

    Raises:
        ValueError: if the training text produces no tokens.
        KeyError: if a prompt token is missing from the rebuilt vocabulary.
    """
    # Read through a context manager so the handle is closed, and decode
    # explicitly as UTF-8 instead of relying on the locale default.
    with open(train_text_file, 'r', encoding='utf-8') as text_file:
        text_sentences = nltk.tokenize.sent_tokenize(text_file.read())
    # Flatten per-sentence token lists in order. The sentence splitting and
    # tokenization are left unchanged because vocab_size below is derived from
    # them and the checkpoint is loaded into a model of that size.
    tokens = [str(token) for sent in text_sentences for token in tokenizer.tokenize(sent)]
    if not tokens:
        raise ValueError('no tokens found in %r' % (train_text_file,))
    train_dataset = BPEDataset(tokens, block_size)
    print("dataset is loaded")

    mconf = GPTConfig(
        train_dataset.vocab_size, train_dataset.block_size,
        n_layer=n_layer, n_head=n_head, n_embd=n_embd
    )
    model = GPT(mconf)
    # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
    # host; load_state_dict copies the values into the model's own parameters.
    model.load_state_dict(torch.load(state_dict_file, map_location='cpu'))
    print("model is loaded")

    # The Trainer is built only for its `device` attribute, which the prompt
    # tensor is moved to before sampling.
    tconf = TrainerConfig(num_workers=1)
    trainer = Trainer(model, train_dataset, None, tconf)

    for genre in genre_list:
        # The prompt does not depend on the repetition index, so encode it once.
        context = [str(token) for token in tokenizer.tokenize(f"#{genre}#")]
        prompt_ids = [train_dataset.stoi[s] for s in context]
        for _ in range(texts_per_genre):
            x = torch.tensor(prompt_ids, dtype=torch.long)[None, ...].to(trainer.device)
            y = sample(model, x, text_len, temperature=1.0, sample=True, top_k=10)[0]
            completion = ' '.join([train_dataset.itos[int(i)] for i in y])
            completion = completion.replace(' ##', '').replace('[CLS]', '').replace('[SEP]', '')
            yield completion, genre

In [13]:
# Roots joined with LANG in the generation cell to locate the training text
# ('one_dataset.txt') and the saved generator weights ('one_generator').
GENRE_DATA_DIR = '/home/mlepekhin/data/genre'
GPT_MODELS_DIR = '/home/mlepekhin/models/mini_gpt_bpe/'
# Sub-directory name used under both roots.
LANG = 'ru'

In [14]:
# Empty frame that the generation cell fills with 'text' / 'target' rows.
result_df = pd.DataFrame()

In [None]:
# Collect every generated row first and extend the frame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# each call, which made the per-row loop quadratic.
generated_rows = [
    {'text': text, 'target': genre}
    for text, genre in generate_single_dataset(
        pathjoin(GENRE_DATA_DIR, LANG, 'one_dataset.txt'),
        pathjoin(GPT_MODELS_DIR, LANG, 'one_generator')
    )
]
# Rows already present in result_df stay ahead of the new ones and the index
# is renumbered, matching append(..., ignore_index=True).
result_df = pd.concat([result_df, pd.DataFrame(generated_rows)], ignore_index=True)

In [None]:
# Preview the first rows of the generated data.
result_df.head()

In [None]:
# Save the generated texts and their genre labels as CSV; the row index is
# written too because index=False is not passed.
result_df.to_csv('/home/mlepekhin/data/min_gpt_bpe/single_ru.csv')