In [1]:
import torch 
from torch import nn, optim, Tensor 
from torch.nn import functional as F 
from torchmetrics import Accuracy 
from torch.utils.data import DataLoader, Dataset 
import numpy as np 
import os 
from tqdm import tqdm 

In [102]:
# Load the training corpus. The encoding is given explicitly so the Cyrillic
# text decodes the same way regardless of the platform's default locale
# (assumes the file is stored as UTF-8 -- TODO confirm).
with open('../data/heroes_of_our_times.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [70]:
# Characters that occur in exactly one of: the current text, the vocabulary.
chars = set(text)
chars ^= set(int2char.tolist())
# Displayed result: the part of that difference the vocabulary does not cover.
chars.difference(int2char.tolist())

In [71]:
# Characters held in `chars` that are missing from the vocabulary.
chars.difference(int2char.tolist())

{'/', 'V', '“', '„'}

In [68]:
# Toggle membership of every vocabulary character in `chars` (in place),
# then show what is left.
chars ^= set(int2char.tolist())
print(chars)

{'k', '/', 'C', 'Й', 'x', '“', 'a', 'F', '’', 'v', 'y', '€', 'j', 'b', 'V', 'M', 'q', 'f', '„', 'Ь', 'g', 'z', 'd', 'Ы', 'N'}


In [3]:
chars = list()
# Build the vocabulary from the unique characters of the corpus. The characters
# are sorted because iteration order of a set of strings changes between
# interpreter sessions (string hash randomization); without a fixed order the
# index <-> character mapping would differ after a kernel restart and no longer
# match the embedding / output rows stored in saved checkpoints.
int2char = np.array(sorted(set(text)))
# Inverse mapping: character -> integer index.
char2int = {char: idx for idx, char in enumerate(int2char.tolist())}

In [104]:
# Encode the corpus as a list of vocabulary indices, one per character
# (raises KeyError for a character that is not in `char2int`).
text_encoded = list(map(char2int.__getitem__, text))

In [5]:
class CharModelingDatasets(Dataset):
    """Dataset of index chunks for next-character prediction.

    Every item is an ``(inputs, targets)`` pair cut from a single chunk:
    ``inputs`` is the chunk without its last element and ``targets`` is the
    chunk without its first element (i.e. shifted by one), cast to int64.
    """

    def __init__(self, chunks):
        # Indexable collection of chunks; each chunk must support slicing
        # and ``.to(dtype)`` (e.g. a 1-D tensor).
        self.chunks = chunks

    def __len__(self):
        """Return the number of stored chunks."""
        return len(self.chunks)

    def __getitem__(self, idx):
        """Return the ``(inputs, targets)`` pair built from chunk ``idx``."""
        sample = self.chunks[idx]
        inputs = sample[:-1]
        targets = sample[1:].to(torch.int64)
        return inputs, targets

In [6]:
class CharModelingModel(nn.Module):
    """Character-level language model: embedding -> stacked LSTM -> MLP head.

    The model is driven one time step at a time: ``forward`` receives the
    indices of the current characters together with the recurrent state and
    returns the logits for the next character plus the updated state.

    Args:
        vocab_size: number of distinct characters (embedding rows and logits).
        emb_dim: embedding dimensionality (LSTM input size).
        rnn_hidden_dim: LSTM hidden size.
        fc_hidden_dim: width of the hidden layer in the classifier head.
        rnn_drop: dropout applied between stacked LSTM layers.
        fc_drop: dropout probability inside the classifier head.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self,
                vocab_size, emb_dim,
                rnn_hidden_dim, fc_hidden_dim,
                rnn_drop, fc_drop,
                num_layers):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim)

        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=rnn_hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=rnn_drop
        )

        # NOTE: BatchNorm1d needs more than one sample per batch while the
        # module is in training mode; switch to eval() before feeding a
        # single sequence.
        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden_dim, fc_hidden_dim),
            nn.BatchNorm1d(fc_hidden_dim),
            nn.ReLU(True),
            nn.Dropout(fc_drop),
            nn.Linear(fc_hidden_dim, vocab_size)
        )

        # Kept so init_hidden_cell can build a state of the right shape.
        self.num_layers = num_layers
        self.hidden_size = rnn_hidden_dim

    def forward(self, inputs: Tensor, hidden: Tensor, cell: Tensor):
        """Advance the model by one time step.

        Args:
            inputs: character indices for the current step, shape ``(batch,)``.
            hidden: LSTM hidden state, shape ``(num_layers, batch, hidden)``.
            cell: LSTM cell state, same shape as ``hidden``.

        Returns:
            ``(logits, (hidden, cell))`` where ``logits`` has shape
            ``(batch, vocab_size)`` and the state tensors are the updated ones.
        """
        # Add a length-1 sequence axis: (batch,) -> (batch, 1) -> (batch, 1, emb).
        inputs = self.embedding(inputs.unsqueeze(1))
        outputs, (hidden, cell) = self.lstm(inputs, (hidden, cell))
        # Drop the sequence axis again before the per-sample classifier.
        outputs = self.classifier(outputs.squeeze(1))
        return outputs, (hidden, cell)

    def init_hidden_cell(self, batch_size):
        """Return zero-filled ``(hidden, cell)`` states for ``batch_size`` sequences.

        The tensors are allocated directly on the device, and with the dtype,
        of the model parameters, so no extra host-to-device copy is made and
        the state also matches a model converted with ``.double()``/``.half()``.
        """
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        cell = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        return hidden, cell
        
        

In [132]:
seq_length = 200
# Each chunk holds one extra element: inputs and targets are the same chunk
# shifted by one position.
chunk_length = seq_length + 1

# Convert the corpus to a tensor once and slice windows out of it. The slices
# are views sharing one storage, so memory stays proportional to the corpus
# length instead of (corpus length x chunk_length).
text_tensor = torch.tensor(text_encoded)

# "+ 1": the last valid window starts at len(text_encoded) - chunk_length,
# which a plain range(len(text_encoded) - chunk_length) would leave out.
chunks = [
    text_tensor[idx : idx + chunk_length]
    for idx in range(len(text_encoded) - chunk_length + 1)
]

In [133]:
# Wrap the chunks in the dataset that yields (inputs, shifted targets) pairs.
train_data = CharModelingDatasets(chunks)

In [141]:
batch_size = 512
# Shuffled loader over the chunk dataset; the incomplete final batch is dropped.
# num_workers is left at its default (loading in the main process).
train_dl = DataLoader(
    train_data,
    batch_size,
    shuffle=True,
    # Pinned host memory only speeds up host-to-GPU copies; requesting it on
    # a machine without CUDA merely triggers a warning.
    pin_memory=torch.cuda.is_available(),
    drop_last=True
)

In [13]:
def _next_batch(batch_iter, train_dl):
    """Return ``(batch, iterator)``, restarting ``train_dl`` once it is exhausted."""
    if batch_iter is not None:
        try:
            return next(batch_iter), batch_iter
        except StopIteration:
            pass
    batch_iter = iter(train_dl)
    try:
        return next(batch_iter), batch_iter
    except StopIteration:
        raise ValueError('train_dl does not yield any batches') from None


def trainer(
    model,
    train_dl,
    optimizer,
    loss_fn,
    metric,
    device,
    epochs,
    results_path="../results/hero_results.txt",
    title='Обучение модели на тексте "Герой нашего времени"'):
    """Train ``model`` step by step and write the best result to a text file.

    Note that one "epoch" here is a single optimizer step on one batch, not a
    full pass over ``train_dl``. Batches are taken from a persistent iterator
    that is restarted when the loader is exhausted, instead of rebuilding
    (and reshuffling) the loader on every step.

    Args:
        model: module exposing ``init_hidden_cell(batch_size)`` and a
            step-wise ``forward(inputs, hidden, cell)``.
        train_dl: loader yielding ``(x_batch, y_target)`` pairs of 2-D tensors
            indexed as ``[:, step]``.
        optimizer: optimizer over the model parameters.
        loss_fn: callable ``(logits, targets) -> scalar loss tensor``.
        metric: callable ``(logits, targets) -> scalar metric tensor``.
        device: device the batches are moved to.
        epochs: number of optimisation steps to run.
        results_path: file that receives the summary of the best step.
        title: first line of the summary file.

    Raises:
        ValueError: if ``train_dl`` yields no batches or a batch has no
            time steps.
    """
    model.train()
    best_loss = float('inf')
    best_metric = None
    best_epoch = None
    batch_iter = None
    train_loop = tqdm(range(epochs), desc='[Train]', leave=False)

    for epoch in train_loop:
        optimizer.zero_grad()
        (x_batch, y_target), batch_iter = _next_batch(batch_iter, train_dl)
        x_batch, y_target = x_batch.to(device), y_target.to(device)

        # Take the sizes from the batch itself rather than from notebook
        # globals, so the function works with any loader configuration.
        n_samples, n_steps = x_batch.shape[0], x_batch.shape[1]
        if n_steps == 0:
            raise ValueError('batches must contain at least one time step')

        hidden, cell = model.init_hidden_cell(n_samples)
        loss = 0
        cur_metric = 0
        # Unroll the sequence manually; the loss is averaged over time steps
        # and back-propagated through the whole unrolled graph.
        for idx in range(n_steps):
            y_pred, (hidden, cell) = model(x_batch[:, idx], hidden, cell)
            loss += loss_fn(y_pred, y_target[:, idx])
            cur_metric += metric(y_pred, y_target[:, idx])

        loss = loss / n_steps
        loss.backward()
        optimizer.step()
        cur_metric = cur_metric / n_steps

        loss, cur_metric = loss.item(), cur_metric.item()
        train_loop.set_description(desc=f'[Train] Epoch {epoch + 1}: loss={loss:.3f}, metric={cur_metric:.3f}')

        if loss < best_loss:
            best_loss = loss
            best_metric = cur_metric
            best_epoch = epoch + 1

    if best_epoch is None:
        # Nothing was recorded (e.g. epochs == 0): there is no result to
        # format, so leave any existing results file untouched.
        return

    results_dir = os.path.dirname(results_path)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    # Explicit UTF-8: the summary contains Cyrillic text.
    with open(results_path, "w", encoding="utf-8") as file:
        file.write(f'{title}\n' +
                    f'Лучшая эпоха: {best_epoch}\n' +
                    f'Лучший лосс: {best_loss:.3f}\n' +
                    f'Лучшая метрика: {best_metric:.3f}')

        
        
        
        

In [11]:
# Vocabulary size: number of distinct characters in the index -> char table.
len(int2char)

127

In [14]:
# Model hyper-parameters.
vocab_size = len(int2char)
emb_dim = 256
rnn_hidden_dim = 256
fc_hidden_dim = 256
rnn_drop = 0.2
fc_drop = 0.4
num_layers = 2
# Fall back to the CPU when CUDA is unavailable. The device string must be
# exactly 'cpu': a trailing space makes torch.device() raise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CharModelingModel(
    vocab_size,
    emb_dim,
    rnn_hidden_dim,
    fc_hidden_dim,
    rnn_drop,
    fc_drop,
    num_layers
).to(device)


In [15]:
# Training setup: cross-entropy on the logits, multiclass accuracy over the
# vocabulary (kept on the same device as the model), Adam with lr = 1e-3.
loss_fn = nn.CrossEntropyLoss()
metric = Accuracy(task='multiclass', num_classes=vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [31]:
# Run 3500 optimisation steps with the objects configured above.
trainer(
    model=model,
    train_dl=train_dl,
    optimizer=optimizer,
    loss_fn=loss_fn,
    metric=metric,
    device=device,
    epochs=3500,
)

                                                                                                 

In [32]:
# Switch to evaluation mode before sampling (the model contains Dropout and
# BatchNorm1d layers, which behave differently in training mode).
model.eval()

CharModelingModel(
  (embedding): Embedding(127, 256)
  (lstm): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.2)
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=256, out_features=127, bias=True)
  )
)

In [33]:
from torch.distributions.categorical import Categorical

In [34]:
def generate_text(start_string, len_generate=500, temperature=1.0):
    """Sample a continuation of ``start_string`` from the global ``model``.

    The prompt is first fed through the model character by character to warm
    up the recurrent state; then ``len_generate`` characters are sampled one
    at a time, each fed back as the next input.

    Args:
        start_string: non-empty prompt; every character must be a key of
            ``char2int`` (otherwise ``KeyError`` is raised).
        len_generate: number of characters to sample after the prompt.
        temperature: factor the logits are MULTIPLIED by before sampling.
            Values above 1 therefore sharpen the distribution and values
            below 1 flatten it -- the opposite of the usual "divide by
            temperature" convention.

    Returns:
        The prompt followed by the sampled characters.

    Raises:
        ValueError: if ``start_string`` is empty.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    str_encoded = torch.tensor([char2int[char] for char in start_string]).view(1, -1).to(device)

    # Sampling feeds a batch of one sequence, which BatchNorm1d rejects in
    # training mode; force eval mode here and restore the previous mode after.
    was_training = model.training
    model.eval()
    pieces = [start_string]
    try:
        # No gradients are needed for sampling; without this the autograd
        # graph would keep growing across every generated step.
        with torch.no_grad():
            hidden, cell = model.init_hidden_cell(1)
            # Warm up the state on all prompt characters except the last one.
            for idx in range(len(start_string) - 1):
                _, (hidden, cell) = model(str_encoded[:, idx], hidden, cell)

            last_char = str_encoded[:, -1]
            for _ in range(len_generate):
                logits, (hidden, cell) = model(last_char, hidden, cell)
                sampler = Categorical(logits=temperature * logits)
                last_char = sampler.sample()
                pieces.append(str(int2char[last_char.item()]))
    finally:
        model.train(was_training)

    return ''.join(pieces)

In [99]:
# Make sure the model is in evaluation mode before generating text.
model.eval() 

CharModelingModel(
  (embedding): Embedding(127, 256)
  (lstm): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.2)
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=256, out_features=127, bias=True)
  )
)

In [101]:
# Sample a continuation of the prompt. generate_text multiplies the logits by
# `temperature`, so 3 makes the sampling distribution sharper, not flatter.
generate_text("Привет", temperature=3)

'Приветный Пугачеву. Тело было заставлять ее в моих положении. Я все стоял на деревушку, и все предвещало скорое и благополучное окончание.Вскоре князь Голицын, под крепостию Татищевой, разбил Пугачева, рассеял его толпы, освободил Оренбург и, казалось, нанес бунту последний и решительный удар. Зурин был в то время отряжен противу шайки мятежных башкирцев, которые рассеялись прежде, нежели мы их увидали. Весна осадила нас в татарской деревушке. Речки разлились, и дороги стали непроходимы. Мы утешались '

In [49]:
# Snapshot of the model and optimizer state plus hand-entered bookkeeping.
# NOTE(review): epoch / loss / metric are typed in manually here, and the
# other checkpoint cell in this notebook names them 'best_loss' /
# 'best_metric' -- confirm which schema consumers expect.
checkpoint = dict(
    epoch=10500,
    model_state_dict=model.state_dict(),
    optimizer_state_dict=optimizer.state_dict(),
    loss=0.503,
    metric=0.843,
)

In [53]:
# Persist the checkpoint dictionary to disk (the target directory must exist).
torch.save(checkpoint, '../checkpoints/checkpoint_hero/checkpoint.pth')

In [55]:
# Sanity-check the saved file by listing its top-level keys. map_location='cpu'
# lets this work on a machine without CUDA even if the tensors were saved from
# the GPU, and avoids allocating GPU memory just to inspect the keys.
torch.load('../checkpoints/checkpoint_hero/checkpoint.pth', map_location='cpu').keys()

dict_keys(['epoch', 'model_state_dict', 'optimizer_state_dict', 'loss', 'metric'])

In [79]:
# Load the second corpus; NOTE: this rebinds `text`, replacing the first one.
# The encoding is explicit so decoding does not depend on the platform locale.
with open('../data/captain_daughter_utf8.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [75]:
# Delete the four characters '/', 'V', '“' and '„' from the text in one pass.
text = text.translate(str.maketrans('', '', '/V“„'))

In [76]:
# Write the cleaned text back over the source file (this overwrites the
# original data). Explicit UTF-8 keeps the Cyrillic content readable
# regardless of the platform's default encoding; plain write mode suffices
# because the file is never read through this handle.
with open('../data/captain_daughter_utf8.txt', 'w', encoding='utf-8') as file:
    file.write(text)

In [81]:
# Characters that occur in exactly one of: the current text, the vocabulary.
chars = set(text)
chars ^= set(int2char.tolist())
# Displayed result: characters of that difference missing from the vocabulary
# (an empty set means every character of the text can be encoded).
chars.difference(int2char.tolist())

set()

In [87]:
# Peek at the first 30 characters of the currently loaded text.
text[:30]

'Капитанская дочкаАлександр Сер'

Обучение на тексте "Капитанская дочка"

In [96]:
# Snapshot of the model and optimizer state plus hand-entered bookkeeping.
# NOTE(review): epoch / best_loss / best_metric are typed in manually, and the
# other checkpoint cell in this notebook uses the keys 'loss' / 'metric' --
# confirm which schema consumers expect.
checkpoint = dict(
    epoch=5000,
    model_state_dict=model.state_dict(),
    optimizer_state_dict=optimizer.state_dict(),
    best_loss=0.367,
    best_metric=0.881,
)

In [97]:
# Persist the checkpoint dictionary to disk (the target directory must exist).
torch.save(checkpoint, '../checkpoints/checkpoint_capitan/checkpoint.pth')

In [147]:
# Run 500 more optimisation steps on the current `train_dl`.
trainer(
    model=model,
    train_dl=train_dl,
    optimizer=optimizer,
    loss_fn=loss_fn,
    metric=metric,
    device=device,
    epochs=500,
)

                                                                                              

In [148]:
# trainer() puts the model into training mode; switch back to evaluation mode
# before sampling.
model.eval()

CharModelingModel(
  (embedding): Embedding(127, 256)
  (lstm): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.2)
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=256, out_features=127, bias=True)
  )
)

In [168]:
# Sample 200 characters after the prompt; the logits are multiplied by 3
# inside generate_text before sampling.
generate_text('Свобода', temperature=3, len_generate=200)

'Свобода… Она выпустила мою одежду, и я слышал, как она не убьешь, что наши приятеля, своивал ее болезнью, я заглебил и обугаться; и положив его следом. Разбудил я.–\xa0Не может быть, не забуду! Потому что я люб'