# Подключим необходимые пакеты

In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np

from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
import nltk

from collections import Counter
from typing import List
from tqdm import tqdm

import seaborn
seaborn.set(palette='summer')

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Загрузим данные

In [3]:
conn = sqlite3.connect('/kaggle/input/wikibooks-dataset/wikibooks.sqlite')

df = pd.read_sql_query("SELECT * FROM ru LIMIT 3300", conn)

In [4]:
sentences = []

for sentence in tqdm(df['body_text']):
    sentences.extend(
        [x.lower() for x in sent_tokenize(sentence, language='russian') if len(x) < 256]
        )
    
print("Количество предложений", len(sentences))

100%|██████████| 3300/3300 [00:10<00:00, 304.16it/s]

Количество предложений 120873





# Train loop

In [29]:
def fit_epoch(model, train_loader, criterion, optimizer, sheduler = None):
    model.train()
    running_loss = 0.0
    running_corrects = 0
    processed_data = 0
    losses = []
    perplexity = []
    for batch in train_loader:
        optimizer.zero_grad()

        logits = model(batch['input_ids']).flatten(start_dim=0, end_dim=1)
        loss = criterion(
            logits, batch['target_ids'].flatten())
        loss.backward()
        optimizer.step()
        
        perplexity.append(torch.exp(loss).item())
        losses.append(loss.item())
        
    perplexity = sum(perplexity) / len(perplexity)
    losses = sum(losses) / len(losses)    
    return perplexity, losses



def eval_epoch(model, val_loader, criterion):
    model.eval()
    perplexity = []
    losses = []
    with torch.no_grad():
        for batch in val_loader:
            logits = model(batch['input_ids']).flatten(start_dim=0, end_dim=1)
            loss = criterion(
                logits,
                batch['target_ids'].flatten()
                )
            perplexity.append(torch.exp(loss).item())
            losses.append(loss.item())

    perplexity = sum(perplexity) / len(perplexity)
    losses = sum(losses) / len(losses)
    return perplexity, losses



def train(train_dataloader, eval_dataloader, model, epochs, ignore_index = char2ind['<pad>'] ,
          optimizer=None, criterion=None, sheduler=None):

    if optimizer is None:
      optimizer = torch.optim.Adam(model.parameters())

    if criterion is None:
      criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

    best_model_wts = model.state_dict()
    best_perplexity = 10e10

    history = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
    val_loss {v_loss:0.4f} train_perplexirty {t_acc:0.4f} val_perplexirty {v_acc:0.4f}"

    with tqdm(desc="epoch", total=epochs) as pbar_outer:

        for epoch in range(epochs):
            train_perplexirty, train_loss = fit_epoch(model, train_dataloader, criterion, optimizer)

            val_perplexirty, val_loss = eval_epoch(model, eval_dataloader, criterion)
            history.append((train_loss, train_perplexirty, val_loss, val_perplexirty))
            if val_perplexirty < best_perplexity:
                best_perplexity = val_perplexirty
                best_model_wts = model.state_dict()

            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,\
                                           v_loss=val_loss, t_acc=train_perplexirty, v_acc=val_perplexirty))

    print('Best val perplexirty: {:4f}'.format(best_perplexity))
    model.load_state_dict(best_model_wts)

    return model, history

# Посимвольная токенизация

In [6]:
stop_chars = ["\t", ",", ".", "!", "@", "'", '"', ";", "\n", "(", ")",
             "[", "]", "{", "}", "?", ":", "-", "_", "+", "=", "^", "*", 
              "&", "`", "~"]

chars = Counter()

for sentence in tqdm(sentences):
    for char in sentence:
        if char in stop_chars:
            continue
        chars[char] += 1
        
vocab = set(['<unk>', '<bos>', '<eos>', '<pad>'])
counter_threshold = 500

for char, cnt in chars.items():
    if cnt > counter_threshold:
        vocab.add(char)
        
print("Размер словаря:", len(vocab))

100%|██████████| 120873/120873 [00:09<00:00, 12853.64it/s]

Размер словаря: 91





In [7]:
char2ind = {char: i for i, char in enumerate(vocab)}
ind2char = {i: char for char, i in char2ind.items()}

In [41]:
class CharDataset:
    def __init__(self, sentences):
        self.data = sentences
        self.unk_id = char2ind['<unk>']
        self.bos_id = char2ind['<bos>']
        self.eos_id = char2ind['<eos>']
        self.pad_id = char2ind['<pad>']

    def __getitem__(self, idx: int) -> List[int]:
        tokenized_sentence = [self.bos_id]
        tokenized_sentence += [char2ind.get(char, self.unk_id) for char in self.data[idx]]
        tokenized_sentence += [self.eos_id]

        return tokenized_sentence

    def __len__(self) -> int:
        return len(self.data)
    
    
class WordDataset:
    def __init__(self, sentences):
        self.data = sentences
        self.unk_id = word2ind['<unk>']
        self.bos_id = word2ind['<bos>']
        self.eos_id = word2ind['<eos>']
        self.pad_id = word2ind['<pad>']

    def __getitem__(self, idx: int) -> List[int]:
        tokenized_sentence = [self.bos_id]
        tokenized_sentence += [word2ind.get(word, self.unk_id) for word in nltk.word_tokenize(self.data[idx])]
        tokenized_sentence += [self.eos_id]
        
        return tokenized_sentence

    def __len__(self) -> int:
        return len(self.data)
    
    
    
def collate_fn_with_padding(
    input_batch: List[List[int]], pad_id=char2ind['<pad>']) -> torch.Tensor:
    seq_lens = [len(x) for x in input_batch]
    max_seq_len = max(seq_lens)

    new_batch = []
    for sequence in input_batch:
        for _ in range(max_seq_len - len(sequence)):
            sequence.append(pad_id)
        new_batch.append(sequence)

    sequences = torch.LongTensor(new_batch).to(device)

    new_batch = {
        'input_ids': sequences[:,:-1],
        'target_ids': sequences[:,1:]
    }

    return new_batch


def generate_sequence(model, dict_2ind ,ind2dict, starting_seq: str, max_seq_len: int = 256) -> str:
    device = 'cpu'
    model = model.to(device)
    input_ids = [dict_2ind['<bos>']] + [
        dict_2ind.get(char, dict_2ind['<unk>']) for char in starting_seq]
    input_ids = torch.LongTensor(input_ids).to(device)

    model.eval()
    with torch.no_grad():
        for i in range(max_seq_len):
            next_char_distribution = model(input_ids)[-1]
            next_char = next_char_distribution.squeeze().argmax()
            input_ids = torch.cat([input_ids, next_char.unsqueeze(0)])

            if next_char.item() == dict_2ind['<eos>']:
                break

    words = ' '.join([ind2dict[idx.item()] for idx in input_ids])

    return words

Разобьём датасет на train и eval, так же определим dataloader для train и eval

In [9]:
train_sentences, eval_sentences = train_test_split(sentences, test_size=0.2)

train_dataset = CharDataset(train_sentences)
eval_dataset = CharDataset(eval_sentences)

train_dataloader = DataLoader(
    train_dataset, collate_fn=collate_fn_with_padding, batch_size=256)

eval_dataloader = DataLoader(
    eval_dataset, collate_fn=collate_fn_with_padding, batch_size=256)

Определим архитектуру на основе LSTM

In [10]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_layers = 1):
        super().__init__()
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.lstm_layers = nn.ModuleList()
        for _ in range(num_layers):
            self.lstm_layers.append(nn.LSTM(hidden_dim, hidden_dim, batch_first=True))
            
        self.linear = nn.Linear(hidden_dim, hidden_dim)
        self.projection = nn.Linear(hidden_dim, vocab_size)

        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_batch: torch.Tensor) -> torch.Tensor:
        embeddings = self.embedding(input_batch)  # [batch_size, seq_len, hidden_dim]
        output, _ = self.lstm_layers[0](embeddings)
        for i in range(1, self.num_layers):
            output1, _ = self.lstm_layers[i](output)
            output = output1 + output
        output = self.dropout(self.linear(self.non_lin(output)))  # [batch_size, seq_len, hidden_dim]
        projection = self.projection(self.non_lin(output))  # [batch_size, seq_len, vocab_size]

        return projection

In [11]:
model = LanguageModel(hidden_dim=256, vocab_size=len(vocab), num_layers = 1).to(device)
num_params = sum(p.numel() for p in model.parameters())
print(model)
print(f"Number of model parameters: {num_params:,}")

LanguageModel(
  (embedding): Embedding(91, 256)
  (lstm_layers): ModuleList(
    (0): LSTM(256, 256, batch_first=True)
  )
  (linear): Linear(in_features=256, out_features=256, bias=True)
  (projection): Linear(in_features=256, out_features=91, bias=True)
  (non_lin): Tanh()
  (dropout): Dropout(p=0.2, inplace=False)
)
Number of model parameters: 638,811


In [12]:
best_model, history = train(train_dataloader, eval_dataloader, model, 10)

epoch:  10%|█         | 1/10 [00:38<05:50, 38.95s/it]


Epoch 001 train_loss: 2.2684     val_loss 1.8093 train_perplexirty 11.2871 val_perplexirty 6.1087


epoch:  20%|██        | 2/10 [01:17<05:07, 38.45s/it]


Epoch 002 train_loss: 1.7334     val_loss 1.6459 train_perplexirty 5.6688 val_perplexirty 5.1875


epoch:  30%|███       | 3/10 [01:55<04:27, 38.28s/it]


Epoch 003 train_loss: 1.6260     val_loss 1.5770 train_perplexirty 5.0863 val_perplexirty 4.8421


epoch:  40%|████      | 4/10 [02:33<03:49, 38.22s/it]


Epoch 004 train_loss: 1.5694     val_loss 1.5345 train_perplexirty 4.8059 val_perplexirty 4.6406


epoch:  50%|█████     | 5/10 [03:11<03:10, 38.20s/it]


Epoch 005 train_loss: 1.5308     val_loss 1.5051 train_perplexirty 4.6237 val_perplexirty 4.5061


epoch:  60%|██████    | 6/10 [03:49<02:32, 38.20s/it]


Epoch 006 train_loss: 1.5025     val_loss 1.4830 train_perplexirty 4.4945 val_perplexirty 4.4075


epoch:  70%|███████   | 7/10 [04:27<01:54, 38.18s/it]


Epoch 007 train_loss: 1.4809     val_loss 1.4653 train_perplexirty 4.3984 val_perplexirty 4.3302


epoch:  80%|████████  | 8/10 [05:05<01:16, 38.17s/it]


Epoch 008 train_loss: 1.4633     val_loss 1.4515 train_perplexirty 4.3217 val_perplexirty 4.2709


epoch:  90%|█████████ | 9/10 [05:44<00:38, 38.16s/it]


Epoch 009 train_loss: 1.4493     val_loss 1.4402 train_perplexirty 4.2615 val_perplexirty 4.2231


epoch: 100%|██████████| 10/10 [06:22<00:00, 38.21s/it]


Epoch 010 train_loss: 1.4380     val_loss 1.4309 train_perplexirty 4.2135 val_perplexirty 4.1838
Best val perplexirty: 4.183791





In [17]:
generate_sequence(model, char2ind, ind2char, starting_seq='источник связан с ')

'<bos>источник связан с помощью программы в соответствии с помощью программы в соответствии с помощью программы в соответствии с помощью программы в соответствии с помощью программы в соответствии с помощью программы в соответствии с помощью программы в соответствии с помощью про'

In [18]:
import gc
torch.cuda.empty_cache()
gc.collect()

186

### Теперь добавим несколько слоев LSTM

In [19]:
model = LanguageModel(hidden_dim=256, vocab_size=len(vocab), num_layers = 3).to(device)
num_params = sum(p.numel() for p in model.parameters())
print(model)
print(f"Number of model parameters: {num_params:,}")

LanguageModel(
  (embedding): Embedding(91, 256)
  (lstm_layers): ModuleList(
    (0-2): 3 x LSTM(256, 256, batch_first=True)
  )
  (linear): Linear(in_features=256, out_features=256, bias=True)
  (projection): Linear(in_features=256, out_features=91, bias=True)
  (non_lin): Tanh()
  (dropout): Dropout(p=0.2, inplace=False)
)
Number of model parameters: 1,691,483


In [20]:
best_model, history = train(train_dataloader, eval_dataloader, model, 10)

epoch:  10%|█         | 1/10 [01:23<12:30, 83.39s/it]


Epoch 001 train_loss: 2.1932     val_loss 1.7179 train_perplexirty 10.6381 val_perplexirty 5.5749


epoch:  20%|██        | 2/10 [02:46<11:07, 83.39s/it]


Epoch 002 train_loss: 1.6379     val_loss 1.5451 train_perplexirty 5.1536 val_perplexirty 4.6904


epoch:  30%|███       | 3/10 [04:10<09:43, 83.35s/it]


Epoch 003 train_loss: 1.5146     val_loss 1.4654 train_perplexirty 4.5507 val_perplexirty 4.3308


epoch:  40%|████      | 4/10 [05:33<08:19, 83.29s/it]


Epoch 004 train_loss: 1.4459     val_loss 1.4181 train_perplexirty 4.2476 val_perplexirty 4.1307


epoch:  50%|█████     | 5/10 [06:55<06:55, 83.05s/it]


Epoch 005 train_loss: 1.4003     val_loss 1.3854 train_perplexirty 4.0579 val_perplexirty 3.9980


epoch:  60%|██████    | 6/10 [08:18<05:31, 82.84s/it]


Epoch 006 train_loss: 1.3662     val_loss 1.3589 train_perplexirty 3.9220 val_perplexirty 3.8935


epoch:  70%|███████   | 7/10 [09:40<04:08, 82.76s/it]


Epoch 007 train_loss: 1.3392     val_loss 1.3394 train_perplexirty 3.8173 val_perplexirty 3.8179


epoch:  80%|████████  | 8/10 [11:03<02:45, 82.71s/it]


Epoch 008 train_loss: 1.3167     val_loss 1.3255 train_perplexirty 3.7324 val_perplexirty 3.7654


epoch:  90%|█████████ | 9/10 [12:25<01:22, 82.61s/it]


Epoch 009 train_loss: 1.2982     val_loss 1.3158 train_perplexirty 3.6638 val_perplexirty 3.7291


epoch: 100%|██████████| 10/10 [13:48<00:00, 82.83s/it]


Epoch 010 train_loss: 1.2819     val_loss 1.3065 train_perplexirty 3.6046 val_perplexirty 3.6946
Best val perplexirty: 3.694581





In [22]:
generate_sequence(model, char2ind, ind2char, starting_seq='источник связан с ')

'<bos>источник связан с помощью статьи по проекту<unk><eos>'

# Пословная токенизация

In [23]:
torch.cuda.empty_cache()
gc.collect()

485

In [24]:
words = Counter()

for sentence in tqdm(sentences):
    for word in nltk.word_tokenize(sentence):
            words[word] += 1
            
vocab = set(['<unk>', '<bos>', '<eos>', '<pad>'])
vocab_size = 40000

for elem in words.most_common(vocab_size):
    vocab.add(elem[0])
    
print("Всего слов в словаре:", len(vocab))

100%|██████████| 120873/120873 [00:29<00:00, 4096.45it/s]


Всего слов в словаре: 40004


In [25]:
word2ind = {char: i for i, char in enumerate(vocab)}
ind2word = {i: char for char, i in word2ind.items()}

In [32]:
def collate_fn_with_padding(
    input_batch: List[List[int]], pad_id=word2ind['<pad>']) -> torch.Tensor:
    seq_lens = [len(x) for x in input_batch]
    max_seq_len = max(seq_lens)

    new_batch = []
    for sequence in input_batch:
        for _ in range(max_seq_len - len(sequence)):
            sequence.append(pad_id)
        new_batch.append(sequence)

    sequences = torch.LongTensor(new_batch).to(device)

    new_batch = {
        'input_ids': sequences[:,:-1],
        'target_ids': sequences[:,1:]
    }

    return new_batch

In [33]:
train_sentences, eval_sentences = train_test_split(sentences, test_size=0.2)

train_dataset = WordDataset(train_sentences)
eval_dataset = WordDataset(eval_sentences)

train_dataloader = DataLoader(
    train_dataset, collate_fn=collate_fn_with_padding, batch_size=64,)

eval_dataloader = DataLoader(
    eval_dataset, collate_fn=collate_fn_with_padding, batch_size=64)

In [34]:
model = LanguageModel(hidden_dim=256, vocab_size=len(vocab), num_layers = 1).to(device)

num_params = sum(p.numel() for p in model.parameters())
print(model)
print(f"Number of model parameters: {num_params:,}")

LanguageModel(
  (embedding): Embedding(40004, 256)
  (lstm_layers): ModuleList(
    (0): LSTM(256, 256, batch_first=True)
  )
  (linear): Linear(in_features=256, out_features=256, bias=True)
  (projection): Linear(in_features=256, out_features=40004, bias=True)
  (non_lin): Tanh()
  (dropout): Dropout(p=0.2, inplace=False)
)
Number of model parameters: 21,114,180


In [35]:
best_model, losses = train(train_dataloader, eval_dataloader, model, 10, ignore_index = word2ind["<pad>"])

epoch:  10%|█         | 1/10 [01:47<16:04, 107.18s/it]


Epoch 001 train_loss: 6.3350     val_loss 5.8254 train_perplexirty 832.8000 val_perplexirty 343.8051


epoch:  20%|██        | 2/10 [03:34<14:17, 107.14s/it]


Epoch 002 train_loss: 5.5044     val_loss 5.4580 train_perplexirty 251.8205 val_perplexirty 238.4559


epoch:  30%|███       | 3/10 [05:21<12:31, 107.35s/it]


Epoch 003 train_loss: 5.0642     val_loss 5.2847 train_perplexirty 161.3904 val_perplexirty 200.8015


epoch:  40%|████      | 4/10 [07:08<10:43, 107.23s/it]


Epoch 004 train_loss: 4.7353     val_loss 5.2213 train_perplexirty 115.9214 val_perplexirty 188.8237


epoch:  50%|█████     | 5/10 [08:57<08:57, 107.55s/it]


Epoch 005 train_loss: 4.4719     val_loss 5.2141 train_perplexirty 88.9743 val_perplexirty 187.8421


epoch:  60%|██████    | 6/10 [10:45<07:11, 107.88s/it]


Epoch 006 train_loss: 4.2506     val_loss 5.2342 train_perplexirty 71.2702 val_perplexirty 192.0732


epoch:  70%|███████   | 7/10 [12:33<05:23, 107.84s/it]


Epoch 007 train_loss: 4.0567     val_loss 5.2870 train_perplexirty 58.7072 val_perplexirty 202.8684


epoch:  80%|████████  | 8/10 [14:21<03:35, 107.90s/it]


Epoch 008 train_loss: 3.8872     val_loss 5.3552 train_perplexirty 49.5441 val_perplexirty 217.7147


epoch:  90%|█████████ | 9/10 [16:09<01:47, 107.99s/it]


Epoch 009 train_loss: 3.7329     val_loss 5.4208 train_perplexirty 42.4665 val_perplexirty 232.9360


epoch: 100%|██████████| 10/10 [17:57<00:00, 107.77s/it]


Epoch 010 train_loss: 3.5940     val_loss 5.4889 train_perplexirty 36.9549 val_perplexirty 249.8979
Best val perplexirty: 187.842122





In [47]:
generate_sequence(model, word2ind, ind2word,starting_seq=nltk.word_tokenize('история россии'))

'<bos> история россии – это наука о росте народонаселения . <eos>'

### Добавим несколько слоев LSTM

In [48]:
torch.cuda.empty_cache()
gc.collect()

1995

In [49]:
model = LanguageModel(hidden_dim=256, vocab_size=len(vocab), num_layers = 3).to(device)

num_params = sum(p.numel() for p in model.parameters())
print(model)
print(f"Number of model parameters: {num_params:,}")

LanguageModel(
  (embedding): Embedding(40004, 256)
  (lstm_layers): ModuleList(
    (0-2): 3 x LSTM(256, 256, batch_first=True)
  )
  (linear): Linear(in_features=256, out_features=256, bias=True)
  (projection): Linear(in_features=256, out_features=40004, bias=True)
  (non_lin): Tanh()
  (dropout): Dropout(p=0.2, inplace=False)
)
Number of model parameters: 22,166,852


In [50]:
best_model, losses = train(train_dataloader, eval_dataloader, model, 10, ignore_index = word2ind["<pad>"])

epoch:  10%|█         | 1/10 [02:05<18:45, 125.02s/it]


Epoch 001 train_loss: 6.3070     val_loss 5.8164 train_perplexirty 758.6220 val_perplexirty 340.7918


epoch:  20%|██        | 2/10 [04:09<16:39, 124.92s/it]


Epoch 002 train_loss: 5.4720     val_loss 5.4439 train_perplexirty 244.0570 val_perplexirty 235.2224


epoch:  30%|███       | 3/10 [06:14<14:34, 124.92s/it]


Epoch 003 train_loss: 5.0251     val_loss 5.2878 train_perplexirty 155.3410 val_perplexirty 201.6170


epoch:  40%|████      | 4/10 [08:19<12:29, 124.86s/it]


Epoch 004 train_loss: 4.6927     val_loss 5.2322 train_perplexirty 111.1670 val_perplexirty 191.0837


epoch:  50%|█████     | 5/10 [10:24<10:24, 124.86s/it]


Epoch 005 train_loss: 4.4245     val_loss 5.2410 train_perplexirty 84.9284 val_perplexirty 193.1412


epoch:  60%|██████    | 6/10 [12:29<08:19, 124.95s/it]


Epoch 006 train_loss: 4.1974     val_loss 5.2817 train_perplexirty 67.6328 val_perplexirty 201.5388


epoch:  70%|███████   | 7/10 [14:34<06:15, 125.00s/it]


Epoch 007 train_loss: 3.9985     val_loss 5.3267 train_perplexirty 55.4249 val_perplexirty 211.1879


epoch:  80%|████████  | 8/10 [16:39<04:10, 125.09s/it]


Epoch 008 train_loss: 3.8201     val_loss 5.3841 train_perplexirty 46.3572 val_perplexirty 224.0727


epoch:  90%|█████████ | 9/10 [18:45<02:05, 125.10s/it]


Epoch 009 train_loss: 3.6598     val_loss 5.4538 train_perplexirty 39.4794 val_perplexirty 240.7387


epoch: 100%|██████████| 10/10 [20:50<00:00, 125.01s/it]


Epoch 010 train_loss: 3.5142     val_loss 5.5317 train_perplexirty 34.1306 val_perplexirty 260.7878
Best val perplexirty: 191.083743





In [52]:
generate_sequence(model, word2ind, ind2word,starting_seq=nltk.word_tokenize('история россии определяется'))

'<bos> история россии определяется населением в россии . <eos>'