# Подключим необходимые пакеты

In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np

from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
import nltk

from collections import Counter
from typing import List
from tqdm import tqdm

import seaborn
seaborn.set(palette='summer')

In [2]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

# Load Data

В качестве датасета используются страницы Wiki на русском языке.
wikibooks-dataset

In [3]:
# Load at most 3300 rows of the `ru` table into a DataFrame.
# The connection is closed explicitly once the query has been read, so the
# database handle is not left open for the rest of the kernel session.
conn = sqlite3.connect('/kaggle/input/wikibooks-dataset/wikibooks.sqlite')
try:
    df = pd.read_sql_query("SELECT * FROM ru LIMIT 3300", conn)
finally:
    conn.close()

# Preprocessing data

Разобьём страницы на предложения и оставим только те, что короче 256 символов

In [4]:
# Split every page into sentences and keep the lower-cased ones that are
# shorter than 256 characters.
sentences = []

for page_text in tqdm(df['body_text']):
    for candidate in sent_tokenize(page_text, language='russian'):
        if len(candidate) < 256:
            sentences.append(candidate.lower())

print("Количество предложений", len(sentences))

100%|██████████| 3300/3300 [00:10<00:00, 319.92it/s]

Количество предложений 120873





# Посимвольная токенизация

Определим словарь символов, а также исключим нежелательные символы

In [5]:
# Punctuation and whitespace control characters that never get their own
# vocabulary entry.
stop_chars = ["\t", ",", ".", "!", "@", "'", '"', ";", "\n", "(", ")",
             "[", "]", "{", "}", "?", ":", "-", "_", "+", "=", "^", "*", 
              "&", "`", "~"]
# Set copy of the list above: the membership test below runs once per
# character of the corpus, so it should be a hash lookup, not a list scan.
stop_chars_lookup = frozenset(stop_chars)

# Frequency of every non-excluded character across all sentences.
chars = Counter()

for sentence in tqdm(sentences):
    chars.update(char for char in sentence if char not in stop_chars_lookup)

# The vocabulary always contains the four special tokens; a regular character
# is added only when it occurs more than `counter_threshold` times.
vocab = set(['<unk>', '<bos>', '<eos>', '<pad>'])
counter_threshold = 500

vocab.update(char for char, cnt in chars.items() if cnt > counter_threshold)

print("Размер словаря:", len(vocab))

100%|██████████| 120873/120873 [00:08<00:00, 13696.57it/s]

Размер словаря: 91





In [6]:
# Sort before enumerating: iterating a set of strings directly yields an order
# that can differ between interpreter runs (string hashes are randomized), and
# that would silently change every token id from one kernel restart to the next.
char2ind = {char: i for i, char in enumerate(sorted(vocab))}
# Inverse mapping, used to turn generated ids back into characters.
ind2char = {i: char for char, i in char2ind.items()}

Определим класс CharDataset для последующей загрузки его в dataloader

In [7]:
class CharDataset:
    """Indexable collection that encodes sentences as lists of character ids.

    Ids are read from the module-level ``char2ind`` mapping. Each item is
    ``[<bos>, id(c_1), ..., id(c_n), <eos>]``; characters missing from
    ``char2ind`` are replaced by the ``<unk>`` id.
    """

    def __init__(self, sentences):
        self.data = sentences
        # Resolve the ids of the special tokens once, up front.
        self.unk_id, self.bos_id, self.eos_id, self.pad_id = (
            char2ind[token] for token in ('<unk>', '<bos>', '<eos>', '<pad>')
        )

    def __getitem__(self, idx: int) -> List[int]:
        """Return the id sequence for sentence ``idx``, wrapped in <bos>/<eos>."""
        encoded = [char2ind.get(symbol, self.unk_id) for symbol in self.data[idx]]
        return [self.bos_id, *encoded, self.eos_id]

    def __len__(self) -> int:
        """Number of sentences held by the dataset."""
        return len(self.data)

Определим функцию дополнения предложений до max_seq_len

In [8]:
def collate_fn_with_padding(
    input_batch: List[List[int]], pad_id=char2ind['<pad>']) -> dict:
    """Pad a batch of id sequences to equal length and build LM inputs/targets.

    Args:
        input_batch: one list of token ids per example.
        pad_id: id appended to shorter sequences; defaults to the ``<pad>`` id
            that ``char2ind`` holds when this function is defined.

    Returns:
        A dict with two LongTensors on ``device``:
        ``'input_ids'`` is every padded sequence without its last position and
        ``'target_ids'`` is every padded sequence without its first position,
        i.e. the targets are the inputs shifted by one step.

    Raises:
        ValueError: if ``input_batch`` is empty (``max`` of an empty sequence).
    """
    max_seq_len = max(len(sequence) for sequence in input_batch)

    # Build padded copies instead of appending to the caller's lists, so the
    # examples handed in by the dataset are left untouched.
    padded_batch = [
        list(sequence) + [pad_id] * (max_seq_len - len(sequence))
        for sequence in input_batch
    ]

    sequences = torch.LongTensor(padded_batch).to(device)

    return {
        'input_ids': sequences[:, :-1],
        'target_ids': sequences[:, 1:],
    }

Создадим dataloader для train и eval датасетов

In [9]:
# Hold out 20% of the sentences for evaluation. A fixed random_state makes
# the split (and therefore the reported metrics) repeatable across runs.
train_sentences, eval_sentences = train_test_split(
    sentences, test_size=0.2, random_state=42)

train_dataset = CharDataset(train_sentences)
eval_dataset = CharDataset(eval_sentences)

# Training batches are reshuffled every epoch so the model does not see the
# same batch composition in the same order each time.
train_dataloader = DataLoader(
    train_dataset, collate_fn=collate_fn_with_padding, batch_size=256, shuffle=True)

# Evaluation keeps a fixed order.
eval_dataloader = DataLoader(
    eval_dataset, collate_fn=collate_fn_with_padding, batch_size=256)

# Train loop

In [10]:
def fit_epoch(model, train_loader, criterion, optimizer, sheduler = None):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: module called as ``model(batch['input_ids'])``; its output is
            flattened over the first two dimensions before the loss.
        train_loader: iterable of dicts with ``'input_ids'`` and
            ``'target_ids'`` tensors.
        criterion: loss called as ``criterion(logits, flat_targets)``.
        optimizer: optimizer that is zeroed and stepped once per batch.
        sheduler: optional learning-rate scheduler; when given, its ``step()``
            is called once, after all batches of the epoch.

    Returns:
        ``(perplexity, loss)`` as floats: the arithmetic mean over batches of
        ``exp(batch_loss)`` and the arithmetic mean of the batch losses.

    Raises:
        ZeroDivisionError: if ``train_loader`` yields no batches.
    """
    model.train()
    losses = []
    perplexity = []
    for batch in train_loader:
        optimizer.zero_grad()

        logits = model(batch['input_ids']).flatten(start_dim=0, end_dim=1)
        loss = criterion(
            logits, batch['target_ids'].flatten())
        loss.backward()
        optimizer.step()

        perplexity.append(torch.exp(loss).item())
        losses.append(loss.item())

    perplexity = sum(perplexity) / len(perplexity)
    losses = sum(losses) / len(losses)

    # The scheduler used to be accepted and ignored; advance it once per epoch,
    # after the optimizer updates of that epoch.
    if sheduler is not None:
        sheduler.step()

    return perplexity, losses

In [11]:
def eval_epoch(model, val_loader, criterion):
    """Score ``model`` on ``val_loader`` without updating any weights.

    The model is switched to eval mode and gradients are disabled. Returns
    ``(perplexity, loss)``: the mean over batches of ``exp(batch_loss)`` and
    the mean of the batch losses, both as floats.
    """
    model.eval()
    batch_losses = []
    batch_perplexities = []

    with torch.no_grad():
        for batch in val_loader:
            predictions = model(batch['input_ids'])
            # Merge the first two dimensions so that logits and targets line
            # up one position per row.
            flat_logits = predictions.flatten(start_dim=0, end_dim=1)
            flat_targets = batch['target_ids'].flatten()
            batch_loss = criterion(flat_logits, flat_targets)

            batch_perplexities.append(torch.exp(batch_loss).item())
            batch_losses.append(batch_loss.item())

    mean_perplexity = sum(batch_perplexities) / len(batch_perplexities)
    mean_loss = sum(batch_losses) / len(batch_losses)
    return mean_perplexity, mean_loss

In [35]:
def train(train_dataloader, eval_dataloader, model, epochs, ignore_index = char2ind['<pad>'] ,
          optimizer=None, criterion=None, sheduler=None):
    """Train ``model`` for ``epochs`` epochs and restore its best weights.

    Args:
        train_dataloader: batches passed to ``fit_epoch``.
        eval_dataloader: batches passed to ``eval_epoch``.
        model: the module to train (modified in place).
        epochs: number of train/eval rounds.
        ignore_index: target id excluded from the default loss; only used when
            ``criterion`` is None. Defaults to the ``<pad>`` id that
            ``char2ind`` holds when this function is defined.
        optimizer: defaults to ``torch.optim.Adam(model.parameters())``.
        criterion: defaults to ``nn.CrossEntropyLoss(ignore_index=ignore_index)``.
        sheduler: optional learning-rate scheduler, forwarded to ``fit_epoch``.

    Returns:
        ``(model, history)`` where ``model`` carries the weights of the epoch
        with the lowest validation perplexity and ``history`` is a list of
        ``(train_loss, train_perplexity, val_loss, val_perplexity)`` tuples,
        one per epoch.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import copy

    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters())

    if criterion is None:
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

    # state_dict() hands back references to the live parameter tensors, which
    # keep changing while training continues. A deep copy is required to
    # actually freeze a snapshot; without it the "best" weights are always
    # the weights of the last epoch.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_perplexity = 10e10

    history = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
    val_loss {v_loss:0.4f} train_perplexirty {t_acc:0.4f} val_perplexirty {v_acc:0.4f}"

    with tqdm(desc="epoch", total=epochs) as pbar_outer:

        for epoch in range(epochs):
            train_perplexirty, train_loss = fit_epoch(
                model, train_dataloader, criterion, optimizer, sheduler)

            val_perplexirty, val_loss = eval_epoch(model, eval_dataloader, criterion)
            history.append((train_loss, train_perplexirty, val_loss, val_perplexirty))
            if val_perplexirty < best_perplexity:
                best_perplexity = val_perplexirty
                best_model_wts = copy.deepcopy(model.state_dict())

            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,\
                                           v_loss=val_loss, t_acc=train_perplexirty, v_acc=val_perplexirty))

    print('Best val perplexirty: {:4f}'.format(best_perplexity))
    # Roll the model back to the snapshot with the lowest validation perplexity.
    model.load_state_dict(best_model_wts)

    return model, history

# Main model

In [13]:
class CharLM(nn.Module):
    """GRU language model: embedding -> GRU -> tanh -> linear -> dropout -> tanh -> projection.

    ``hidden_dim`` is used for the embedding size, the GRU state size and the
    intermediate linear layer; ``vocab_size`` sets both the number of
    embeddings and the width of the output logits.
    """

    def __init__(self, hidden_dim: int, vocab_size: int):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_dim)
        self.rnn = nn.GRU(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.projection = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input_batch) -> torch.Tensor:
        """Return next-token logits for every position of ``input_batch``.

        For a batched ``[batch_size, seq_len]`` input the result has shape
        ``[batch_size, seq_len, vocab_size]``.
        """
        embedded = self.embedding(input_batch)       # [batch_size, seq_len, hidden_dim]
        rnn_states, _last_hidden = self.rnn(embedded)  # [batch_size, seq_len, hidden_dim]

        hidden = self.non_lin(rnn_states)
        hidden = self.linear(hidden)
        hidden = self.dropout(hidden)                # [batch_size, seq_len, hidden_dim]
        hidden = self.non_lin(hidden)

        return self.projection(hidden)               # [batch_size, seq_len, vocab_size]

In [14]:
# Build the model with one output logit per vocabulary entry and move it to
# the selected device, then report its layout and total parameter count.
model = CharLM(hidden_dim=256, vocab_size=len(vocab))
model = model.to(device)

print(model)
num_params = sum(param.numel() for param in model.parameters())
print(f"Number of model parameters: {num_params:,}")

CharLM(
  (embedding): Embedding(91, 256)
  (rnn): GRU(256, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=256, bias=True)
  (projection): Linear(in_features=256, out_features=91, bias=True)
  (non_lin): Tanh()
  (dropout): Dropout(p=0.1, inplace=False)
)
Number of model parameters: 507,227


In [15]:
# Train for 10 epochs with train()'s defaults (Adam optimizer, cross-entropy
# loss); `history` receives one tuple of metrics per epoch.
model, history = train(train_dataloader, eval_dataloader, model, 10)

epoch:  10%|█         | 1/10 [00:35<05:15, 35.02s/it]


Epoch 001 train_loss: 2.1936     val_loss 1.7596 train_perplexirty 10.3569 val_perplexirty 5.8121


epoch:  20%|██        | 2/10 [01:09<04:36, 34.51s/it]


Epoch 002 train_loss: 1.6882     val_loss 1.6169 train_perplexirty 5.4163 val_perplexirty 5.0392


epoch:  30%|███       | 3/10 [01:43<04:00, 34.33s/it]


Epoch 003 train_loss: 1.5971     val_loss 1.5598 train_perplexirty 4.9411 val_perplexirty 4.7593


epoch:  40%|████      | 4/10 [02:17<03:25, 34.28s/it]


Epoch 004 train_loss: 1.5504     val_loss 1.5254 train_perplexirty 4.7150 val_perplexirty 4.5986


epoch:  50%|█████     | 5/10 [02:51<02:51, 34.25s/it]


Epoch 005 train_loss: 1.5201     val_loss 1.5022 train_perplexirty 4.5740 val_perplexirty 4.4928


epoch:  60%|██████    | 6/10 [03:25<02:16, 34.21s/it]


Epoch 006 train_loss: 1.4979     val_loss 1.4849 train_perplexirty 4.4738 val_perplexirty 4.4157


epoch:  70%|███████   | 7/10 [03:59<01:42, 34.19s/it]


Epoch 007 train_loss: 1.4810     val_loss 1.4726 train_perplexirty 4.3987 val_perplexirty 4.3619


epoch:  80%|████████  | 8/10 [04:34<01:08, 34.17s/it]


Epoch 008 train_loss: 1.4678     val_loss 1.4623 train_perplexirty 4.3410 val_perplexirty 4.3171


epoch:  90%|█████████ | 9/10 [05:08<00:34, 34.15s/it]


Epoch 009 train_loss: 1.4568     val_loss 1.4540 train_perplexirty 4.2933 val_perplexirty 4.2817


epoch: 100%|██████████| 10/10 [05:42<00:00, 34.24s/it]


Epoch 010 train_loss: 1.4475     val_loss 1.4468 train_perplexirty 4.2539 val_perplexirty 4.2509
Best val perplexirty: 4.250897





Функция для генерации последовательности

In [41]:
def generate_sequence(model, dict_2ind ,ind2dict, starting_seq, max_seq_len: int = 256, sep=None) -> str:
    """Greedily continue ``starting_seq`` with ``model``.

    Args:
        model: language model; called on a 1-D LongTensor of ids, and the last
            row of its output is used as the scores for the next token.
        dict_2ind: token -> id mapping; must contain ``<bos>``, ``<unk>`` and
            ``<eos>``.
        ind2dict: id -> token mapping used to decode the result.
        starting_seq: the prompt. A ``str`` is consumed character by character;
            any other iterable (e.g. a list of words) is consumed item by item.
            Tokens missing from ``dict_2ind`` are replaced by ``<unk>``.
        max_seq_len: maximum number of tokens to generate after the prompt.
        sep: string placed between decoded tokens. ``None`` (default) picks
            ``''`` for a ``str`` prompt and ``' '`` otherwise, so characters
            are glued back together while words stay space-separated.

    Returns:
        The decoded sequence, starting with ``<bos>`` and the prompt, and
        ending with ``<eos>`` if the model produced it within the limit.
    """
    if sep is None:
        sep = '' if isinstance(starting_seq, str) else ' '

    # Generate on whatever device the model already lives on. Calling
    # model.to('cpu') here would move the caller's model as a side effect.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    unk_id = dict_2ind['<unk>']
    eos_id = dict_2ind['<eos>']
    input_ids = [dict_2ind['<bos>']] + [
        dict_2ind.get(token, unk_id) for token in starting_seq]
    input_ids = torch.LongTensor(input_ids).to(device)

    model.eval()
    with torch.no_grad():
        for _ in range(max_seq_len):
            # Re-run the model on the whole prefix and take the highest-scoring
            # token at the last position.
            next_char_distribution = model(input_ids)[-1]
            next_char = next_char_distribution.squeeze().argmax()
            input_ids = torch.cat([input_ids, next_char.unsqueeze(0)])

            if next_char.item() == eos_id:
                break

    words = sep.join([ind2dict[idx.item()] for idx in input_ids])

    return words

In [19]:
# Greedy continuation of a character-level prompt (the string is consumed
# one character at a time).
generate_sequence(model, char2ind, ind2char, starting_seq='источник ')

'<bos>источник может быть производственные программирования и программирования программирования в программировании в программировании в программировании в программировании в программировании в программировании в программировании в программировании в программировании в пр'

# Токенизация по словам

In [20]:
import gc

# Run the garbage collector first so that tensors kept alive only by
# unreachable Python objects are released, and only then ask PyTorch to give
# its cached CUDA blocks back. In the opposite order the memory freed by the
# collector would stay in the cache.
collected = gc.collect()
torch.cuda.empty_cache()
collected

1086

In [21]:
# Rebuild the sentence list: lower-cased sentences shorter than 256 characters.
sentences = []

for page_text in tqdm(df['body_text']):
    for candidate in sent_tokenize(page_text, language='russian'):
        if len(candidate) < 256:
            sentences.append(candidate.lower())

print("Количество предложений", len(sentences))

# Count how often every word token occurs across all sentences.
words = Counter()

for text in tqdm(sentences):
    words.update(nltk.word_tokenize(text))

# Vocabulary = the four special tokens plus the `vocab_size` most frequent words.
vocab = set(['<unk>', '<bos>', '<eos>', '<pad>'])
vocab_size = 40000

vocab.update(token for token, _count in words.most_common(vocab_size))

print("Всего слов в словаре:", len(vocab))

100%|██████████| 3300/3300 [00:10<00:00, 319.10it/s]


Количество предложений 120873


100%|██████████| 120873/120873 [00:29<00:00, 4159.08it/s]


Всего слов в словаре: 40004


In [22]:
# Sort before enumerating: iterating a set of strings directly yields an order
# that can differ between interpreter runs (string hashes are randomized), and
# that would silently change every token id from one kernel restart to the next.
word2ind = {word: i for i, word in enumerate(sorted(vocab))}
# Inverse mapping, used to turn generated ids back into words.
ind2word = {i: word for word, i in word2ind.items()}

In [23]:
class WordDataset:
    """Indexable collection that encodes sentences as lists of word ids.

    Sentences are split with ``nltk.word_tokenize`` and ids are read from the
    module-level ``word2ind`` mapping. Each item is
    ``[<bos>, id(w_1), ..., id(w_n), <eos>]``; words missing from ``word2ind``
    are replaced by the ``<unk>`` id.
    """

    def __init__(self, sentences):
        self.data = sentences
        # Resolve the ids of the special tokens once, up front.
        self.unk_id, self.bos_id, self.eos_id, self.pad_id = (
            word2ind[token] for token in ('<unk>', '<bos>', '<eos>', '<pad>')
        )

    def __getitem__(self, idx: int) -> List[int]:
        """Return the id sequence for sentence ``idx``, wrapped in <bos>/<eos>."""
        tokens = nltk.word_tokenize(self.data[idx])
        encoded = [word2ind.get(token, self.unk_id) for token in tokens]
        return [self.bos_id, *encoded, self.eos_id]

    def __len__(self) -> int:
        """Number of sentences held by the dataset."""
        return len(self.data)

In [28]:
def collate_fn_with_padding(
    input_batch: List[List[int]], pad_id=word2ind['<pad>']) -> dict:
    """Pad a batch of id sequences to equal length and build LM inputs/targets.

    Args:
        input_batch: one list of token ids per example.
        pad_id: id appended to shorter sequences; defaults to the ``<pad>`` id
            that ``word2ind`` holds when this function is defined.

    Returns:
        A dict with two LongTensors on ``device``:
        ``'input_ids'`` is every padded sequence without its last position and
        ``'target_ids'`` is every padded sequence without its first position,
        i.e. the targets are the inputs shifted by one step.

    Raises:
        ValueError: if ``input_batch`` is empty (``max`` of an empty sequence).
    """
    max_seq_len = max(len(sequence) for sequence in input_batch)

    # Build padded copies instead of appending to the caller's lists, so the
    # examples handed in by the dataset are left untouched.
    padded_batch = [
        list(sequence) + [pad_id] * (max_seq_len - len(sequence))
        for sequence in input_batch
    ]

    sequences = torch.LongTensor(padded_batch).to(device)

    return {
        'input_ids': sequences[:, :-1],
        'target_ids': sequences[:, 1:],
    }

In [29]:
# Hold out 20% of the sentences for evaluation. A fixed random_state makes
# the split (and therefore the reported metrics) repeatable across runs.
train_sentences, eval_sentences = train_test_split(
    sentences, test_size=0.2, random_state=42)

train_dataset = WordDataset(train_sentences)
eval_dataset = WordDataset(eval_sentences)

# Training batches are reshuffled every epoch so the model does not see the
# same batch composition in the same order each time.
train_dataloader = DataLoader(
    train_dataset, collate_fn=collate_fn_with_padding, batch_size=64, shuffle=True)

# Evaluation keeps a fixed order.
eval_dataloader = DataLoader(
    eval_dataset, collate_fn=collate_fn_with_padding, batch_size=64)

In [31]:
# Reuse the CharLM architecture with one output logit per vocabulary entry,
# move it to the selected device and report its layout and parameter count.
model = CharLM(hidden_dim=256, vocab_size=len(vocab))
model = model.to(device)

print(model)
num_params = sum(param.numel() for param in model.parameters())
print(f"Number of model parameters: {num_params:,}")

CharLM(
  (embedding): Embedding(40004, 256)
  (rnn): GRU(256, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=256, bias=True)
  (projection): Linear(in_features=256, out_features=40004, bias=True)
  (non_lin): Tanh()
  (dropout): Dropout(p=0.1, inplace=False)
)
Number of model parameters: 20,982,596


In [36]:
# Train for 10 epochs, ignoring <pad> targets in the default loss. train()
# returns the model object it was given plus the per-epoch history, so
# `best_model` is the same object as `model` and `losses` holds the history tuples.
best_model, losses = train(train_dataloader, eval_dataloader, model, 10, ignore_index = word2ind["<pad>"])

epoch:  10%|█         | 1/10 [01:45<15:45, 105.00s/it]


Epoch 001 train_loss: 4.8116     val_loss 5.2675 train_perplexirty 125.5927 val_perplexirty 198.0715


epoch:  20%|██        | 2/10 [03:30<14:00, 105.06s/it]


Epoch 002 train_loss: 4.4128     val_loss 5.2255 train_perplexirty 83.7987 val_perplexirty 190.3353


epoch:  30%|███       | 3/10 [05:15<12:16, 105.16s/it]


Epoch 003 train_loss: 4.1199     val_loss 5.2401 train_perplexirty 62.4511 val_perplexirty 193.5814


epoch:  40%|████      | 4/10 [07:00<10:30, 105.16s/it]


Epoch 004 train_loss: 3.8801     val_loss 5.2854 train_perplexirty 49.1157 val_perplexirty 203.0046


epoch:  50%|█████     | 5/10 [08:45<08:45, 105.08s/it]


Epoch 005 train_loss: 3.6778     val_loss 5.3549 train_perplexirty 40.0971 val_perplexirty 218.1294


epoch:  60%|██████    | 6/10 [10:30<07:00, 105.04s/it]


Epoch 006 train_loss: 3.5041     val_loss 5.4265 train_perplexirty 33.6972 val_perplexirty 234.8418


epoch:  70%|███████   | 7/10 [12:15<05:14, 105.00s/it]


Epoch 007 train_loss: 3.3560     val_loss 5.5033 train_perplexirty 29.0495 val_perplexirty 254.1279


epoch:  80%|████████  | 8/10 [14:00<03:29, 104.98s/it]


Epoch 008 train_loss: 3.2282     val_loss 5.5834 train_perplexirty 25.5596 val_perplexirty 275.9186


epoch:  90%|█████████ | 9/10 [15:45<01:44, 104.99s/it]


Epoch 009 train_loss: 3.1171     val_loss 5.6592 train_perplexirty 22.8660 val_perplexirty 298.2022


epoch: 100%|██████████| 10/10 [17:30<00:00, 105.02s/it]


Epoch 010 train_loss: 3.0210     val_loss 5.7391 train_perplexirty 20.7655 val_perplexirty 323.7256
Best val perplexirty: 190.335309





In [52]:
# Greedy continuation of a word-level prompt; the prompt is passed as the
# list of tokens produced by nltk.word_tokenize.
generate_sequence(model, word2ind, ind2word, starting_seq=nltk.word_tokenize('кот'))

'<bos> кот высотой 2 марта 2008 года выборы президента российской федерации выборы уважаемый избиратель ! <eos>'