In [0]:
!pip install --progress-bar off --quiet torch
!pip install --progress-bar off --quiet tqdm
!pip install --progress-bar off --quiet seaborn
!pip install --progress-bar off --quiet numpy
!pip install --progress-bar off --quiet sklearn
!pip install --progress-bar off --quiet youtokentome
!sudo apt-get -q install cuda

# get bpe_wiki.pickle

In [0]:
# Fetch the pre-encoded corpus. wget names the saved file after the URL, query string
# included, so it lands on disk as "bpe_wiki.pickle?dl=0" - the default path that
# load_encoded_dump() reads.
!wget -q https://www.dropbox.com/s/u6oyaphhu3uj6pa/bpe_wiki.pickle?dl=0

# If bpe_wiki.pickle has been downloaded, there is no need to download the raw dump

In [0]:
#!wget -q https://www.dropbox.com/s/jmom1y51hrf8i5r/shuffled_no_ids.tsv?dl=0
#!uconv -x lower < shuffled_no_ids.tsv?dl=0 | sed -e "s/[[:punct:]]\+//g" | sed -e "s/[0-9]\+//g" | sed -e "s/  */ /g" > multi_dump.txt
#!head multi_dump.txt

xыx ғасырда француз ғалымы мессье барлық тұмандықтардың жүйелі тізімін жасады оған жүзден астам тұмандықтар енді тек xx ғасырда ғана бұл түмандықтардың табиғаттары анықталды олардың тозаң мен газ араласқан түмандықтардан шар тәрізді және шашыраған газ шоғырларынан галактикалардан түратыны белгілі болды жүлдыздар арасындағы кеңістік бос тәрізді болып көрінеді шынында барлық жүлдыздар арасындағы кеңістіктер заттарға толы xx ғасырдың басында жүлдыздар жарығының жұтылу немесе әлсіреу қасиеті ашылды жарықты жұтатын заттың құс жолында шоғырланғаны және шүйке тәрізді құрылысы бар екені анықталды бұл зат физикалық құрамы жақсы зерттелген тозаңдардан тұрады жұлдыздар арасында тозаңнан басқа өте үлкен мөлшерде газ бар тозаңнан жүз есе көп олар бейтарап сутегінің см толқын ұзындығында сәулелер шығарады егер бейтарап сутегі бұлтына жақын жерден көк ыстық жұлдыз тұтанса жұлдызаралық газдар мен тозаңдардың сәулеленуі байқалады жұлдыздың шығарған ультракүлгін кванттарын бұлттың атомдары жұтады да осы

In [0]:
import json
import math
import random

import numpy as np
import torch
import youtokentome as yttm
from matplotlib import pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [0]:
# Number of BPE subword units; also passed to LanguageModel as its vocabulary size.
vocab_size = 16000
# Path of the YouTokenToMe BPE model file.
model_path = 'bpe.model'

### If bpe_wiki.pickle has been downloaded, there is no need to train the BPE model

In [0]:
#%%time
#yttm.BPE.train(data='multi_dump.txt', vocab_size=vocab_size, model=model_path)

CPU times: user 51.7 s, sys: 5.08 s, total: 56.8 s
Wall time: 31.7 s


<youtokentome.youtokentome.BPE at 0x7f2bef9feb70>

In [0]:
#tokenizer = yttm.BPE(model=model_path)

In [0]:
#tokenizer.vocab()[:15]

['<PAD>',
 '<UNK>',
 '<BOS>',
 '<EOS>',
 '▁',
 'а',
 'е',
 'н',
 'р',
 'л',
 'и',
 'ы',
 'т',
 'д',
 'к']

In [0]:
def encode(sentence):
    """Encode a sentence into BPE subword ids with the BOS id prepended.

    Uses the module-level ``tokenizer``. No EOS id is added here; the dataset
    classes append it when they build the target sequence.

    :param sentence: text to encode
    :return: whatever ``tokenizer.encode(sentence, bos=True)`` returns
    """
    return tokenizer.encode(sentence, bos=True)

In [0]:
import os
import pickle


def load_encoded_dump(file_path="bpe_wiki.pickle?dl=0"):
    """
    Load the pickled, BPE-encoded dump from disk.

    :input: path to bpe encoded dump
    :output: the unpickled object (expected to be a dict {id: encoded sequence}),
             or the string "No model found" when ``file_path`` does not exist
    """
    if not os.path.exists(file_path):
        # Kept for backward compatibility: a missing file yields this sentinel
        # string rather than an exception, so callers must check for it.
        return "No model found"
    # SECURITY: pickle.load can execute arbitrary code while unpickling -
    # only point this at files from a trusted source.
    # The context manager closes the handle, which was previously leaked.
    with open(file_path, "rb") as dump_file:
        return pickle.load(dump_file)

In [0]:
"""
tokenized = []

with open("multi_dump.txt", "r", encoding="utf-8") as f:
  for sentence in tqdm(f.read().split("\n")):
    # cut a sequence by max_len of 80 tokens
    tokenized.append(encode(sentence)[:80])
"""

100%|██████████| 2280847/2280847 [01:40<00:00, 22615.92it/s]


In [0]:
# not used

class LanguageModelData(Dataset):
    """Per-sequence dataset producing fixed-length (input, target) tensor pairs.

    Each stored sequence is cut to ``max_len`` tokens. The target is the input
    shifted left by one position with ``eos_index`` appended, and both are
    right-padded with ``pad_index`` up to ``max_len``.
    """

    def __init__(self, data, max_len, pad_index, eos_index):
        self.data, self.max_len = data, max_len
        self.pad_index, self.eos_index = pad_index, eos_index

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        tokens = self.data[index][:self.max_len]
        # Next-token targets: drop the first token, close with EOS.
        targets = tokens[1:] + [self.eos_index]
        assert len(tokens) == len(targets)
        padding = [self.pad_index] * (self.max_len - len(tokens))
        inputs_tensor = torch.tensor(tokens + padding).long()
        targets_tensor = torch.tensor(targets + padding).long()
        return inputs_tensor, targets_tensor

In [0]:
# Sort the encoded sequences by length so that each batch groups sequences of
# similar size.
tokenized = sorted(load_encoded_dump().values(), key=len)

batch_size = 128

# Cut the sorted corpus into consecutive slices of at most batch_size sequences.
batches = [
    tokenized[start:start + batch_size]
    for start in range(0, len(tokenized), batch_size)
]

In [0]:
# choosing the maximum sequence length - in the end it is picked by eye

lengths = np.array(list(map(len, tokenized)))
np.percentile(lengths, 95)

80.0

In [0]:
# Upper bound on sequence length; the dataset classes truncate sequences to it.
max_len = 80
# Ids of the special tokens: '<PAD>' is entry 0 and '<EOS>' is entry 3
# in the tokenizer vocabulary listing.
pad_index = 0
eos_index = 3

In [0]:
# Number of pre-built batches.
len(batches)

17820

In [0]:
# The batches were cut from a length-sorted list, so shuffle them (in place) before
# the train/validation split takes the tail of this list.
# NOTE(review): no random seed is set, so the split differs between runs.
random.shuffle(batches)

In [0]:
class SequenceBucketingData(Dataset):
    """Dataset whose items are whole, pre-grouped batches of token sequences.

    Indexing returns a pair of LongTensors ``(batch_x, batch_y)``. Every sequence
    of the batch is padded only up to the longest sequence in that batch (capped
    at ``max_len``), not to a global maximum.
    """

    def __init__(self, data, max_len, pad_index, eos_index):
        self.data, self.max_len = data, max_len
        self.pad_index, self.eos_index = pad_index, eos_index

    def __len__(self):
        return len(self.data)

    def prepare_sample(self, sequence, max_len):
        """Build padded (input, target) id lists of length ``max_len`` for one sequence."""
        tokens = sequence[:max_len]
        # Next-token targets: shift left by one and close with EOS.
        targets = tokens[1:] + [self.eos_index]
        assert len(tokens) == len(targets)
        padding = [self.pad_index] * (max_len - len(tokens))
        return tokens + padding, targets + padding

    def __getitem__(self, index):
        batch = self.data[index]
        longest = max(len(sample) for sample in batch)
        target_len = min(self.max_len, longest)

        pairs = [self.prepare_sample(sample, target_len) for sample in batch]

        batch_x = torch.tensor([inputs for inputs, _ in pairs]).long()
        batch_y = torch.tensor([targets for _, targets in pairs]).long()
        return batch_x, batch_y

In [0]:
# Despite its name this is a count: 5% of the batches, rounded down. The split
# uses it as a negative offset from the end of `batches`.
validation_start_index = int(len(batches) * 0.05)

In [0]:
# validation_start_index is the number of batches reserved for validation, taken
# from the tail of `batches`. Slice with an explicit non-negative split point:
# with negative offsets a count of 0 would make batches[:-0] an empty training
# set and batches[-0:] the full list.
split_index = len(batches) - validation_start_index
train_dataset = SequenceBucketingData(batches[:split_index], max_len, pad_index, eos_index)
validation_dataset = SequenceBucketingData(batches[split_index:], max_len, pad_index, eos_index)

In [0]:
def collate_fn(x):
    """Drop the extra wrapping level added by the DataLoader.

    ``x`` is a list whose first element is an ``(inputs, targets)`` pair;
    that pair is unpacked and returned as a tuple.
    """
    batch_x, batch_y = x[0]
    return batch_x, batch_y

In [0]:
# batch_size=1 because each dataset item is already a whole batch;
# shuffle=True is added as well

train_loader = DataLoader(train_dataset, batch_size=1, collate_fn=collate_fn, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=1, collate_fn=collate_fn, shuffle=True)

In [0]:
# run through the iterator once to make sure nothing crashes and it is fast enough

with tqdm(total=len(validation_loader.dataset), desc='Testing') as progress_bar:
    for x, y in validation_loader:
        progress_bar.update()

Testing: 100%|██████████| 891/891 [00:00<00:00, 1253.09it/s]


In [0]:
# Inputs of the last batch yielded by the validation loader loop.
x

tensor([[    2, 14762,  5600,  ...,  5474,  7763,  8102],
        [    2,  7397,    43,  ...,  5571,  6523,  6969],
        [    2,  5294,  5415,  ...,     7,  5437,  5501],
        ...,
        [    2,  5384, 14463,  ...,  8061,  7149, 10274],
        [    2,  9893,  5378,  ...,  7538,  6988, 15449],
        [    2,  5608,  8913,  ...,  7318, 10239,     5]])

In [0]:
# Targets of the last batch yielded by the validation loader loop.
y

tensor([[14762,  5600,  7632,  ...,  7763,  8102,     3],
        [ 7397,    43,  5285,  ...,  6523,  6969,     3],
        [ 5294,  5415,  5252,  ...,  5437,  5501,     3],
        ...,
        [ 5384, 14463,    61,  ...,  7149, 10274,     3],
        [ 9893,  5378,  8598,  ...,  6988, 15449,     3],
        [ 5608,  8913,  5388,  ..., 10239,     5,     3]])

In [0]:
# Sanity check: the targets are the inputs shifted by one position, so the second
# input token of every row must equal its first target token.
x[:, 1] == y[:, 0]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True])

In [0]:
class SpatialDropout(torch.nn.Dropout2d):
    """
    Dropout for sequence tensors that masks whole feature channels.

    The input has shape (N, T, K). It is rearranged so that the K features become
    the channel axis of ``Dropout2d``, so a dropped feature is zeroed at every
    timestep of a sample instead of at independent positions.

    :param p: probability of dropping a feature channel; validated by the parent
              class (must lie in [0, 1], otherwise ValueError is raised)
    """
    def __init__(self, p=0.5):
        # Pass p to the parent constructor so its range check runs on the value
        # actually used; the parent also stores it as self.p.
        super().__init__(p=p)

    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super().forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        return x.squeeze(2)  # (N, T, K)

In [0]:
class LanguageModel(torch.nn.Module):
    """LSTM language model: embedding -> spatial dropout -> LSTM -> vocabulary projection.

    ``forward`` maps a batch of token ids to one score vector over the vocabulary
    per position. When ``weight_tying`` is set and ``embedding_dim == model_dim``,
    the output projection shares its weight matrix with the embedding layer;
    otherwise the two stay independent.
    """

    def __init__(self, vocab_size, embedding_dim, model_dim, num_layers,
                 padding_idx, dropout=0.35, weight_tying=True):
        super().__init__()
        nn = torch.nn
        self.embedding_layer = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )
        self.embedding_dropout = SpatialDropout(p=dropout)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=model_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.language_model_head = nn.Linear(
            in_features=model_dim,
            out_features=vocab_size,
            bias=False,
        )
        # Tie the input and output weights; only possible when the shapes agree.
        can_tie = embedding_dim == model_dim
        if weight_tying and can_tie:
            self.language_model_head.weight = self.embedding_layer.weight

    def forward(self, x):
        embedded = self.embedding_dropout(self.embedding_layer(x))
        hidden_states, _ = self.lstm(embedded)
        return self.language_model_head(hidden_states)

In [0]:
def train(model, loader, criterion, optimizer, clip=1, last_n_losses=500, verbose=True):
    """Run one training pass over ``loader`` and return the per-batch losses.

    Uses the module-level ``device`` to place each batch.

    :param model: network being optimised; switched to train mode
    :param loader: iterable of (inputs, targets) batches
    :param criterion: loss applied to flattened predictions and targets
    :param optimizer: optimizer stepped once per batch
    :param clip: maximum gradient norm passed to ``clip_grad_norm_``
    :param last_n_losses: window size of the running mean shown in the progress bar
    :param verbose: show the progress bar when True
    :return: list with one float loss per batch
    """
    losses = []
    # The bar is advanced once per batch, so its total is the number of batches
    # (len(loader)), as in evaluate(). len(loader.dataset) only coincides with it
    # when the loader's batch_size is 1.
    progress_bar = tqdm(total=len(loader), disable=not verbose, desc='Train')
    model.train()

    for x, y in loader:

        x = x.to(device)
        y = y.to(device)

        pred = model(x)
        # Flatten all positions so the criterion sees one row of scores per target id.
        loss = criterion(pred.view(-1, pred.size(-1)), y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        losses.append(loss.item())
        # Running mean over the most recent batches; perplexity is its exponential.
        running_loss = np.mean(losses[-last_n_losses:])
        progress_bar.set_postfix(loss=running_loss, perplexity=np.exp(running_loss))
        progress_bar.update()
    progress_bar.close()
    return losses

def evaluate(model, loader, criterion, last_n_losses=500, verbose=True):
    """Compute per-batch losses over ``loader`` without updating the model.

    Uses the module-level ``device`` to place each batch. The model is switched to
    eval mode and the forward pass runs with gradient tracking disabled.

    :param model: network to evaluate
    :param loader: iterable of (inputs, targets) batches
    :param criterion: loss applied to flattened predictions and targets
    :param last_n_losses: window size of the running mean shown in the progress bar
    :param verbose: show the progress bar when True
    :return: list with one float loss per batch
    """
    losses = []
    progress_bar = tqdm(total=len(loader), disable=not verbose, desc='Evaluate')
    model.eval()

    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)

        with torch.no_grad():
            logits = model(inputs)

        flat_logits = logits.view(-1, logits.size(-1))
        batch_loss = criterion(flat_logits, targets.view(-1))
        losses.append(batch_loss.item())

        window_mean = np.mean(losses[-last_n_losses:])
        progress_bar.set_postfix(loss=window_mean, perplexity=np.exp(window_mean))
        progress_bar.update()

    progress_bar.close()
    return losses

In [0]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Equal embedding and hidden sizes, which LanguageModel requires for weight tying.
embedding_dim = model_dim = 256
num_layers = 3

model = LanguageModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    model_dim=model_dim,
    num_layers=num_layers,
    padding_idx=pad_index,
    weight_tying=True,
)

model.to(device)

LanguageModel(
  (embedding_layer): Embedding(16000, 256, padding_idx=0)
  (embedding_dropout): SpatialDropout(p=0.35, inplace=False)
  (lstm): LSTM(256, 256, num_layers=3, batch_first=True, dropout=0.35)
  (language_model_head): Linear(in_features=256, out_features=16000, bias=False)
)

In [0]:
# Only padding positions are excluded from the loss (ignore_index=pad_index);
# EOS targets are NOT ignored and contribute to the loss like any other token.

criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_index)
optimizer = torch.optim.Adam(params=model.parameters())

In [0]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total

In [0]:
# Print the number of trainable parameters (the label is Russian for
# "Number of trainable parameters in the network").
print(f'Количество обучаемых параметров в сети: {count_parameters(model):,}')

Количество обучаемых параметров в сети: 5,675,008


In [0]:
# NOTE(review): this runs a full training pass over validation_loader, so the model
# weights and the optimizer state are already updated on validation data before the
# main training loop starts. Presumably a quick smoke test of train() - confirm, and
# re-create the model and optimizer afterwards if validation metrics must stay unbiased.
epoch_losses = train(model, validation_loader, criterion, optimizer)

In [0]:
epochs = 10

# Per-epoch lists of per-batch losses.
train_losses = []
validation_losses = []

# One perplexity value per epoch: exp(mean loss of the epoch).
train_perplexities = []
validation_perplexities = []

# Any real loss is below +inf, so the first epoch always becomes the initial best.
best_validation_loss = float('inf')

for n_epoch in range(1, epochs + 1):

    epoch_train_losses = train(model, train_loader, criterion, optimizer)
    epoch_validation_losses = evaluate(model, validation_loader, criterion)

    mean_train_loss = np.mean(epoch_train_losses)
    mean_validation_loss = np.mean(epoch_validation_losses)

    train_losses.append(epoch_train_losses)
    train_perplexities.append(np.exp(mean_train_loss))

    validation_losses.append(epoch_validation_losses)
    validation_perplexities.append(np.exp(mean_validation_loss))

    message = f'Epoch: {n_epoch}\n'
    message += f'Train: loss - {mean_train_loss:.4f} | perplexity - {train_perplexities[-1]:.3f}\n'
    message += f'Validation: loss - {mean_validation_loss:.4f} | perplexity - {validation_perplexities[-1]:.3f}'

    print(message)

    # Keep a separate checkpoint of the best epoch by mean validation loss.
    if mean_validation_loss < best_validation_loss:

        best_validation_loss = mean_validation_loss

        torch.save(model.state_dict(), 'best_language_model_state_dict.pth')
        torch.save(optimizer.state_dict(), 'best_optimizer_state_dict.pth')

    # Always overwrite the "last" checkpoint so training can be resumed.
    torch.save(model.state_dict(), 'last_language_model_state_dict.pth')
    torch.save(optimizer.state_dict(), 'last_optimizer_state_dict.pth')

    # Cumulative training history up to and including this epoch.
    info = {
        'message': message,
        'train_losses': train_losses,
        'validation_losses': validation_losses,
        'train_perplexities': train_perplexities,
        'validation_perplexities': validation_perplexities
    }

    # `json` is imported in the notebook's import cell. It used to be referenced
    # here without any import, which raised NameError only after a full epoch.
    with open(f'info_{n_epoch}.json', 'w') as file_object:
        json.dump(info, file_object, indent=2)


Train:   0%|          | 0/16929 [00:00<?, ?it/s][A
Train:   0%|          | 0/16929 [00:00<?, ?it/s, loss=9.36, perplexity=1.16e+4][A
Train:   0%|          | 1/16929 [00:00<1:39:57,  2.82it/s, loss=9.36, perplexity=1.16e+4][A
Train:   0%|          | 1/16929 [00:00<1:39:57,  2.82it/s, loss=9.21, perplexity=9.98e+3][A
Train:   0%|          | 2/16929 [00:00<1:19:14,  3.56it/s, loss=9.21, perplexity=9.98e+3][A
Train:   0%|          | 2/16929 [00:00<1:19:14,  3.56it/s, loss=9.21, perplexity=1e+4]   [A
Train:   0%|          | 3/16929 [00:00<1:10:09,  4.02it/s, loss=9.21, perplexity=1e+4][A
Train:   0%|          | 3/16929 [00:00<1:10:09,  4.02it/s, loss=9.21, perplexity=9.98e+3][A
Train:   0%|          | 4/16929 [00:00<1:05:00,  4.34it/s, loss=9.21, perplexity=9.98e+3][A
Train:   0%|          | 4/16929 [00:00<1:05:00,  4.34it/s, loss=9.1, perplexity=8.93e+3] [A
Train:   0%|          | 5/16929 [00:01<1:05:00,  4.34it/s, loss=9.13, perplexity=9.27e+3][A
Train:   0%|          | 6/1692

KeyboardInterrupt: ignored

In [0]:
# Flatten the per-epoch loss lists into one series of per-batch training losses.
plot_train = [batch_loss for per_epoch in train_losses for batch_loss in per_epoch]

plt.figure(figsize=(14, 14))
plt.title('Тренировочный датасет')
plt.ylabel('Функция потерь')
plt.xlabel('Батчи')
plt.plot(plot_train)

In [0]:
# Flatten the per-epoch loss lists into one series of per-batch validation losses.
plot_validation = [batch_loss for per_epoch in validation_losses for batch_loss in per_epoch]

plt.figure(figsize=(14, 14))
plt.title('Валидационный датасет')
plt.ylabel('Функция потерь')
plt.xlabel('Батчи')
plt.plot(plot_validation)

In [0]:
# Lowest mean validation loss seen during the training loop.
best_validation_loss

# Generate some text from a seed token

In [0]:
seed = 'россияда'

# Id of '<BOS>' in the tokenizer vocabulary (kept for reference).
bos_index = 2

# NOTE(review): `tokenizer` must exist here, but the cell that creates it
# (yttm.BPE(model=model_path)) is commented out in this notebook - load it first.
# encode() prepends BOS itself, so no manual insert is needed. The ids go into a
# dedicated name so the corpus list `tokenized` is not overwritten by the seed.
seed_ids = encode(seed)
x = torch.tensor([seed_ids]).long().to(device)  # shape (1, seed_length)
x

In [0]:
model.eval()

# Upper bound on the number of generated tokens when no EOS is produced.
max_new_tokens = 512
generated_ids = []

with torch.no_grad():
    # Prime the LSTM with the whole seed; `mem` then holds the state after its last token.
    emb = model.embedding_layer(x)
    emb = model.embedding_dropout(emb)
    lstm_out, mem = model.lstm(emb)
    # The scores at the last seed position give the first new token. The last seed
    # token must not be fed again: `mem` already accounts for it, and feeding it a
    # second time would condition the continuation on a duplicated token.
    current_token = model.language_model_head(lstm_out[:, -1:]).argmax(dim=-1)  # (1, 1)

    # Greedy decoding, one token per step; assumes a single seed sequence (batch of 1).
    for _ in range(max_new_tokens):
        token_id = current_token.item()
        if token_id == eos_index:
            break
        generated_ids.append(token_id)
        emb = model.embedding_layer(current_token)
        emb = model.embedding_dropout(emb)
        lstm_out, mem = model.lstm(emb, mem)
        current_token = model.language_model_head(lstm_out).argmax(dim=-1)

# Only token ids are kept (not the full per-step score tensors) and EOS is never
# appended, so the decoded text needs no '<EOS>' clean-up.
predicted_texts = tokenizer.decode([generated_ids])[0]
# Print the seed followed by the continuation (`text` was an undefined name here).
print(seed + ' ' + predicted_texts)