In [1]:
import os
from psutil import virtual_memory
import numpy as np
import cv2
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch import ToTensorV2
import time
import math

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torchvision
from torchvision import models
import torchvision.transforms as transforms
from torchsummary import summary #pip install torchsummary
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import tensorflow_datasets as tfds
import pandas as pd

In [439]:
ram_bit = virtual_memory().total
ram_gb = ram_bit / (2**10) / (2**10) / (2**10)

In [440]:
ram_gb

31.300193786621094

In [None]:
gdown.download('https://storage.yandexcloud.net/aiueducation/Content/advanced/l3/rus-eng.zip', None, quiet=True)

In [None]:
!unzip -o rus-eng.zip 

In [4]:
train = []
with open("rus.txt", 'r', encoding='utf-8') as f: # Открываем файл словаря в режиме чтения
    lines = f.read().split('\n')                  # Читаем весь файл, режем на строки

for i,line in enumerate(lines):
    try:
        if len(line.split("\t")[0])<=40:
            input_text, target_text, _ = line.split("\t")
            
            train.append({'rus':target_text, 'eng':input_text})
    except:
        print(line == '')
        continue

# ds = tf.data.Dataset.from_tensor_slices(pd.DataFrame.from_dict(train).to_dict(orient="list"))

tokenizer_ru = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (pd.DataFrame.from_dict(train).to_dict(orient="list")['rus']), target_vocab_size=2**13)

tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (pd.DataFrame.from_dict(train).to_dict(orient="list")['eng']), target_vocab_size=2**13)


both = np.array(list(map(lambda x:[x['rus'],x['eng']], train)))
for_filter = np.array(list(map(lambda x: True if '\xa0' not in x else False, both[:,1])))
both = both[for_filter]

ru_corpus = np.array(list(map(lambda x:torch.Tensor(np.array([tokenizer_ru.vocab_size] + tokenizer_ru.encode(x) + \
                     [tokenizer_ru.vocab_size+1])).type(torch.int64), both[:,0])))

en_corpus = np.array(list(map(lambda x:torch.Tensor(np.array([tokenizer_en.vocab_size] + \
                                                             tokenizer_en.encode(x))).type(torch.int64), both[:,1])))

en_target = np.array(list(map(lambda x:torch.Tensor(np.array(tokenizer_en.encode(x) + \
                                     [tokenizer_en.vocab_size+1])).type(torch.int64), both[:,1])))

In [20]:
# https://github.com/tunz/transformer-pytorch/blob/master/model/transformer.py

def initialize_weight(x):
    nn.init.xavier_uniform_(x.weight)
    if x.bias is not None:
        nn.init.constant_(x.bias, 0)

class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, dropout_rate, head_size=8):
        super(MultiHeadAttention, self).__init__()

        self.head_size = head_size

        self.att_size = att_size = hidden_size // head_size
        self.scale = att_size ** -0.5

        self.linear_q = nn.Linear(hidden_size, head_size * att_size, bias=False)
        self.linear_k = nn.Linear(hidden_size, head_size * att_size, bias=False)
        self.linear_v = nn.Linear(hidden_size, head_size * att_size, bias=False)
        
        initialize_weight(self.linear_q)
        initialize_weight(self.linear_k)
        initialize_weight(self.linear_v)

        self.att_dropout = nn.Dropout(dropout_rate)

        self.output_layer = nn.Linear(head_size * att_size, hidden_size,
                                      bias=False)
        initialize_weight(self.output_layer)

    def forward(self, q, k, v):
        orig_q_size = q.size()

        d_k = self.att_size
        d_v = self.att_size
        batch_size = q.size(0)

        # head_i = Attention(Q(W^Q)_i, K(W^K)_i, V(W^V)_i)
        q = self.linear_q(q).view(batch_size, -1, self.head_size, d_k)
        k = self.linear_k(k).view(batch_size, -1, self.head_size, d_k)
        v = self.linear_v(v).view(batch_size, -1, self.head_size, d_v)

        q = q.transpose(1, 2)                  # [b, h, q_len, d_k]
        v = v.transpose(1, 2)                  # [b, h, v_len, d_v]
        k = k.transpose(1, 2).transpose(2, 3)  # [b, h, d_k, k_len]

        # Scaled Dot-Product Attention.
        # Attention(Q, K, V) = softmax((QK^T)/sqrt(d_k))V
        q.mul_(self.scale)
        x = torch.matmul(q, k)  # [b, h, q_len, k_len]

        x = torch.softmax(x, dim=3)
        x = self.att_dropout(x)
        x = x.matmul(v)  # [b, h, q_len, attn]

        x = x.transpose(1, 2).contiguous()  # [b, q_len, h, attn]
        x = x.view(batch_size, -1, self.head_size * d_v)

        x = self.output_layer(x)

        assert x.size() == orig_q_size
        return x


In [21]:
class EncoderLayer(nn.Module):
    def __init__(self, hidden_size, filter_size, dropout_rate):
        super(EncoderLayer, self).__init__()
        
        self.self_attention_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.self_attention = MultiHeadAttention(hidden_size, dropout_rate)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        self.ffn_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x):  # pylint: disable=arguments-differ
        y = self.self_attention_norm(x)
        y = self.self_attention(y, y, y)
        y = self.self_attention_dropout(y)
        x = x + y

        y = self.ffn_norm(x)
        y = self.ffn(y)
        y = self.ffn_dropout(y)
        x = x + y
        return x

In [60]:
class Encoder(nn.Module):
    def __init__(self, hidden_size, filter_size, dropout_rate, n_layers):
        super(Encoder, self).__init__()
        
        self.embed = nn.Embedding(tokenizer_en.vocab_size + 2, 512, 40)
        
        encoders = [EncoderLayer(hidden_size, filter_size, dropout_rate)
                    for _ in range(n_layers)]
        self.layers = nn.ModuleList(encoders)

        self.last_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.PE = PositionalEncoding(hidden_size)

    def forward(self, inputs):
        encoder_output = inputs
        
        encoder_output = self.embed(encoder_output)
        
        encoder_output = self.PE(encoder_output)
        
        for enc_layer in self.layers:
            encoder_output = enc_layer(encoder_output)
        return self.last_norm(encoder_output)

In [13]:
class DecoderLayer(nn.Module):
    def __init__(self, hidden_size, filter_size, dropout_rate):
        super(DecoderLayer, self).__init__()

        self.self_attention_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.self_attention = MultiHeadAttention(hidden_size, dropout_rate)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        self.enc_dec_attention_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.enc_dec_attention = MultiHeadAttention(hidden_size, dropout_rate)
        self.enc_dec_attention_dropout = nn.Dropout(dropout_rate)

        self.ffn_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, enc_output):
        y = self.self_attention_norm(x)
        y = self.self_attention(y, y, y)
        y = self.self_attention_dropout(y)
        x = x + y

        if enc_output is not None:
            y = self.enc_dec_attention_norm(x)
            y = self.enc_dec_attention(y, enc_output, enc_output)
            y = self.enc_dec_attention_dropout(y)
            x = x + y

        y = self.ffn_norm(x)
        y = self.ffn(y)
        y = self.ffn_dropout(y)
        x = x + y
        return x

In [170]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, filter_size, dropout_rate, n_layers):
        
        super(Decoder, self).__init__()
        
        self.embed = nn.Embedding(tokenizer_en.vocab_size + 2, 512, 40)
        
        decoders = [DecoderLayer(hidden_size, filter_size, dropout_rate)
                    for _ in range(n_layers)]
        self.layers = nn.ModuleList(decoders)

        self.last_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.PE = PositionalEncoding(hidden_size)

    def forward(self, targets, enc_output):
        
        decoder_output = targets
        
        decoder_output = self.embed(decoder_output)
        
        decoder_output = self.PE(decoder_output)
        
        for i, dec_layer in enumerate(self.layers):
            decoder_output = dec_layer(decoder_output, enc_output)

        return self.last_norm(decoder_output)

In [199]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, hidden_size, filter_size, dropout_rate):
        super(FeedForwardNetwork, self).__init__()

        self.layer1 = nn.Linear(hidden_size, filter_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.layer2 = nn.Linear(filter_size, hidden_size)

        initialize_weight(self.layer1)
        initialize_weight(self.layer2)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        return x

In [200]:
class FinalFeedForwardNetwork(nn.Module):
    def __init__(self, hidden_size, filter_size, dropout_rate):
        super(FinalFeedForwardNetwork, self).__init__()

        self.layer1 = nn.Linear(hidden_size, filter_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.layer2 = nn.Linear(filter_size, tokenizer_en.vocab_size + 2)

        initialize_weight(self.layer1)
        initialize_weight(self.layer2)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        return x

In [201]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_len=40):
        """
        Inputs
            d_model - Hidden dimensionality of the input.
            max_len - Maximum length of a sequence to expect.
        """
        super().__init__()

        # Create matrix of [SeqLen, HiddenDim] representing the positional encoding for max_len inputs
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)

        # register_buffer => Tensor which is not a parameter, but should be part of the modules state.
        # Used for tensors that need to be on the same device as the module.
        # persistent=False tells PyTorch to not add the buffer to the state dict (e.g. when we save the model)
        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

### Данные

In [183]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

params = {'EPOCHS': 2,
          'DEVICE': 'cuda:0',
          'BATCH': 64}


In [None]:
# ru_corpus_dev = torch.nn.utils.rnn.pad_sequence(ru_corpus, batch_first=True).to(params['DEVICE'])
# en_corpus_dev = torch.nn.utils.rnn.pad_sequence(en_corpus, batch_first=True).to(params['DEVICE'])
# en_target_dev = torch.nn.utils.rnn.pad_sequence(en_target, batch_first=True).to(params['DEVICE'])

In [448]:
!nvidia-smi

Thu May  4 18:04:14 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.41.03              Driver Version: 530.41.03    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1060 6GB     Off| 00000000:07:00.0  On |                  N/A |
|  0%   45C    P8                7W / 120W|    697MiB /  6144MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [467]:
class Simple_transf(nn.Module):

    def __init__(self, hidden_size=512):
        super().__init__()
        
        self.encoder = Encoder(hidden_size=512, filter_size=2048, dropout_rate=0.2, n_layers=4)
        
        self.decoder = Decoder(hidden_size=512, filter_size=2048, dropout_rate=0.2, n_layers=4)
        
        self.ff = FinalFeedForwardNetwork(512, filter_size=tokenizer_en.vocab_size + 2, dropout_rate=0.2)
        
    def forward(self, input_enc, input_target):
        
        x = encoder(input_enc)
        y = decoder(input_target, x)
        final = ff(y)
        
        return final

In [None]:
train_dataloader = DataLoader(range(len(ru_corpus)), batch_size=64, shuffle=True)

In [468]:
model = Simple_transf()
print(5893)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=50, gamma=0.1)
    
best_model_wts = model.state_dict()
best_loss = 1000
counter = 0
first = True
for i in range(2):

    time.sleep(0.2)
    model.train(True)
    train_loss = 0.0

    for bat in train_dataloader:
        
        batch_ru = torch.nn.utils.rnn.pad_sequence(ru_corpus[batch], batch_first=True)
        batch_en = torch.nn.utils.rnn.pad_sequence(en_corpus[batch], batch_first=True)
        true_en = torch.nn.utils.rnn.pad_sequence(en_target[batch], batch_first=True)
        
        prediction = model(batch_ru, batch_en)
        loss = loss_fn(prediction.view(64*13,8219), true_en.view(64*13))
        
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
        
        counter += 1
#         print(counter)
        if counter%10==0:
            print(counter,', train_loss:',loss)
    current_lr = optimizer.param_groups[0]['lr']
    sheduler.step()


5893
10 , train_loss: tensor(8.9666, grad_fn=<NllLossBackward>)
20 , train_loss: tensor(8.9723, grad_fn=<NllLossBackward>)


KeyboardInterrupt: 