In [1]:
import sys
from pathlib import Path

# Make the project root importable. The notebook lives one directory below the
# root that holds `data_loaders/` and `utils.py` (all data paths below are also
# written relative to the notebook directory), so derive the root from the
# working directory instead of hard-coding a machine-specific absolute path.
sys.path.insert(1, str(Path.cwd().parent))


In [2]:
from data_loaders.vocabulary import Vocabulary
from utils import text_to_pickle, train_val_split
from torch import nn
from data_loaders.dataset import TextDataset
import torch 
import torch.nn.functional as F
import math
import copy
import time

[nltk_data] Downloading package punkt to /home/genvekt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
torch.cuda.empty_cache()

## Parse Russian Vocabulary

In [3]:
vocab = Vocabulary()
vocab.parse_file("../data/corpus.en_ru.1m.ru", language='russian')
vocab.dump("../data/vocabularies/ru.voc")

100%|██████████| 1000000/1000000 [02:42<00:00, 6141.07it/s]


## Parse English Vocabulary

In [4]:
vocab = Vocabulary()
vocab.parse_file("../data/corpus.en_ru.1m.en", language='english')
vocab.dump("../data/vocabularies/en.voc")

100%|██████████| 1000000/1000000 [02:22<00:00, 7013.32it/s]


# Parse texts to tokens

In [3]:
text_to_pickle("../data/corpus.en_ru.1m.en", "../data/corpus.en_ru.1m.en_tokens.p", language="english")

100%|██████████| 1000000/1000000 [02:15<00:00, 7356.39it/s]


In [4]:
text_to_pickle("../data/corpus.en_ru.1m.ru", "../data/corpus.en_ru.1m.ru_tokens.p", language="russian")

100%|██████████| 1000000/1000000 [02:35<00:00, 6415.40it/s]


# Split corpus into 80% train and 20% validation splits

In [5]:
train_val_split(text_file1="../data/corpus.en_ru.1m.ru_tokens.p",
                text_file2="../data/corpus.en_ru.1m.en_tokens.p",
                train_file1="../data/corpus.en_ru.1m.ru_tokens_train.p",
                train_file2="../data/corpus.en_ru.1m.en_tokens_train.p",
                val_file1="../data/corpus.en_ru.1m.ru_tokens_val.p",
                val_file2="../data/corpus.en_ru.1m.en_tokens_val.p")

In [4]:
class Embedder(nn.Module):
    """Lookup table that maps integer token ids to dense vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Size of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super(Embedder, self).__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding vectors for the token ids in ``x``."""
        embedded = self.embed(x)
        return embedded

In [5]:
class PositionalEncoder(nn.Module):
    """Scale embeddings and add fixed sinusoidal positional encodings.

    The table follows the standard Transformer definition::

        pe[pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
        pe[pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))

    Args:
        d_model: Size of the embedding dimension.
        max_seq_len: Number of positions the table is precomputed for.
    """

    def __init__(self, d_model, max_seq_len = 80):
        super().__init__()
        self.d_model = d_model

        # Column vector of positions: (max_seq_len, 1).
        position = torch.arange(max_seq_len, dtype=torch.float).unsqueeze(1)
        # `even_dims` already holds the values 2k, so the exponent is simply
        # even_dims / d_model (multiplying by 2 again would double it).
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float)
        inv_freq = torch.exp(even_dims * (-math.log(10000.0) / d_model))
        angles = position * inv_freq

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch axis so the table broadcasts over the batch.
        pe = pe.unsqueeze(0)
        # Buffer: moves with the module across devices, never trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x * sqrt(d_model)`` plus the encodings of its positions.

        ``x`` is indexed as (batch, seq_len, d_model); ``seq_len`` is read
        from dimension 1.
        """
        # make embeddings relatively larger
        x = x * math.sqrt(self.d_model)
        # add constant to embedding (buffers carry no gradient)
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len].type_as(x)
        return x



In [6]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        heads: Number of attention heads.
        d_model: Feature size of the inputs and of the output.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        # Without this check the `view` calls in `forward` either fail with an
        # opaque shape error or silently split features across heads wrongly.
        if d_model % heads != 0:
            raise ValueError(
                "d_model (%d) must be divisible by heads (%d)" % (d_model, heads))

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k``/``v`` and return a (bs, seq, d_model) tensor."""
        bs = q.size(0)

        # perform linear operation and split into h heads:
        # (bs, seq, d_model) -> (bs, seq, h, d_k)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to get dimensions bs * h * sl * d_k
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        # calculate attention using the module-level `attention` function
        scores = attention(q, k, v, self.d_k, mask, self.dropout)

        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous()\
        .view(bs, -1, self.d_model)

        output = self.out(concat)

        return output

In [7]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: Query, key and value tensors; the last two dimensions are
            (sequence, d_k).
        d_k: Per-head feature size used to scale the dot products.
        mask: Optional tensor; positions where it equals 0 are excluded.
            A head axis is inserted at dimension 1 before it is applied.
        dropout: Optional callable applied to the attention weights.

    Returns:
        The attention-weighted sum of ``v``.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)

    # The softmax must run whether or not a mask was given; otherwise raw,
    # un-normalised scores would be used as attention weights.
    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    output = torch.matmul(scores, v)
    return output

In [8]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size (defaults to 2048).
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super(FeedForward, self).__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = self.linear_1(x)
        activated = F.relu(hidden)
        regularised = self.dropout(activated)
        return self.linear_2(regularised)

In [9]:
class Norm(nn.Module):
    """Normalisation over the last dimension with a learnable scale and shift.

    Args:
        d_model: Size of the last dimension of the inputs.
        eps: Constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, d_model, eps = 1e-6):
        super(Norm, self).__init__()
        self.size = d_model
        self.eps = eps
        # Learnable calibration parameters: scale starts at 1, shift at 0.
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` along the last axis."""
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        centred = x - mean
        return self.alpha * centred / (std + self.eps) + self.bias

In [10]:
# build an encoder layer with one multi-head attention layer and one
# feed-forward layer
class EncoderLayer(nn.Module):
    """Pre-norm encoder block: self-attention then feed-forward, each residual.

    Args:
        d_model: Feature size of the layer inputs and outputs.
        heads: Number of attention heads.
        dropout: Dropout probability, used for the residual branches and
            forwarded to the attention and feed-forward sublayers.
    """

    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        # Forward `dropout` so a non-default value reaches the sublayers
        # instead of them silently falling back to their own default.
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply self-attention and feed-forward sublayers to ``x``."""
        # Normalise first, then add the sublayer output back onto `x`.
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x
    
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    """Pre-norm decoder block: self-attention, encoder attention, feed-forward.

    Args:
        d_model: Feature size of the layer inputs and outputs.
        heads: Number of attention heads.
        dropout: Dropout probability, used for the residual branches and
            forwarded to the attention and feed-forward sublayers.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        # Forward `dropout` so a non-default value reaches the sublayers
        # instead of them silently falling back to their own default.
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        """Run one decoder block.

        ``trg_mask`` is applied to the self-attention over ``x``; ``src_mask``
        is applied to the attention over the encoder outputs ``e_outputs``.
        """
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        # Queries come from the decoder, keys and values from the encoder.
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs,
        src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x

# We can then build a convenient cloning function that can generate multiple layers:
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

In [11]:
class Encoder(nn.Module):
    """Stack of ``N`` encoder layers on top of embeddings and positional encodings.

    Args:
        vocab_size: Size of the source vocabulary.
        d_model: Feature size used throughout the stack.
        N: Number of encoder layers.
        heads: Number of attention heads per layer.
    """

    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        """Encode the token ids ``src`` and return the normalised layer output."""
        x = self.embed(src)
        x = self.pe(x)
        # Iterate over this module's own layers. The previous `range(N)` read
        # the notebook-level global `N`, not the depth this encoder was built with.
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
    
class Decoder(nn.Module):
    """Stack of ``N`` decoder layers on top of embeddings and positional encodings.

    Args:
        vocab_size: Size of the target vocabulary.
        d_model: Feature size used throughout the stack.
        N: Number of decoder layers.
        heads: Number of attention heads per layer.
    """

    def __init__(self, vocab_size, d_model, N, heads):
        super(Decoder, self).__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        """Decode ``trg`` against the encoder outputs and return the normalised result."""
        hidden = self.pe(self.embed(trg))
        layer_idx = 0
        while layer_idx < self.N:
            hidden = self.layers[layer_idx](hidden, e_outputs, src_mask, trg_mask)
            layer_idx += 1
        return self.norm(hidden)

In [12]:
class Transformer(nn.Module):
    """Encoder-decoder model that returns unnormalised scores over the target vocabulary.

    Args:
        src_vocab: Size of the source vocabulary.
        trg_vocab: Size of the target vocabulary.
        d_model: Feature size used throughout the model.
        N: Number of layers in both the encoder and the decoder.
        heads: Number of attention heads per layer.
    """

    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super(Transformer, self).__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        """Encode ``src``, decode ``trg`` against it, and project to vocabulary scores."""
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        # we don't perform softmax on the output as this will be handled
        # automatically by our loss function
        return self.out(decoded)

In [13]:
# Training dataset built from the tokenised train-split pickles written by
# `train_val_split` above, together with the dumped RU and EN vocabularies.
train_ds = TextDataset(ru_tokens="../data/corpus.en_ru.1m.ru_tokens_train.p",
                 ru_voc="../data/vocabularies/ru.voc",
                 en_tokens="../data/corpus.en_ru.1m.en_tokens_train.p",
                 en_voc="../data/vocabularies/en.voc")

In [14]:
import random

class Batches:
    """Groups consecutive dataset items into padded tensor batches.

    Args:
        dataset: Object supporting ``len()``, integer indexing and ``sort()``,
            exposing ``ru_voc`` / ``en_voc`` whose ``word_2_idx`` maps
            ``'<PAD>'`` to the padding index. In training mode each item is
            indexed as a ``(ru, en)`` pair; otherwise the item itself is used
            as the source sequence.
        batch_size: Maximum number of items per batch.
        sort: Whether to call ``dataset.sort()`` (only done in training mode).
        train: Training mode yields ``(ru, en)`` batches in shuffled order;
            inference mode yields ``ru`` batches in dataset order.
    """

    def __init__(self, dataset, batch_size, sort=True, train=True):
        self.train = train
        self.dataset = dataset
        self.batch_size = batch_size
        # Ceiling division: a final partial batch still counts as a batch.
        self.len = (len(dataset) + batch_size - 1) // batch_size

        # Honour `sort`; it used to be accepted but ignored.
        if self.train and sort:
            self.dataset.sort()

    def iterate(self):
        """Yield padded batches; see the class docstring for order and contents."""
        batch_ids = list(range(self.len))
        # Shuffle batch order only for training. At inference time the outputs
        # must stay aligned with the order of the inputs.
        if self.train:
            random.shuffle(batch_ids)

        src_pad = self.pad_idx(is_source=True)

        for idx in batch_ids:
            start_id = idx*self.batch_size
            end_id = min((idx+1)*self.batch_size, len(self.dataset))

            batch_ru = []
            batch_en = []
            for item_idx in range(start_id, end_id):
                items = self.dataset[item_idx]
                if self.train:
                    batch_ru.append(items[0])
                    batch_en.append(items[1])
                else:
                    batch_ru.append(items)

            batch_ru = self.pad_batch(batch_ru, src_pad)

            if self.train:
                batch_en = self.pad_batch(batch_en,
                                          self.pad_idx(is_source=False))
                yield batch_ru, batch_en
            else:
                yield batch_ru

    def pad_batch(self, batch, pad_idx):
        """Right-pad every sequence in ``batch`` with ``pad_idx`` to the longest length.

        Returns a ``torch.long`` tensor of shape ``(len(batch), max_len)``.
        """
        # Get the length of longest sentence in batch
        max_len = max(len(item) for item in batch)

        # Explicit dtype: token ids must be integer indices, and the dtype that
        # `torch.full` infers from an int fill value differs between releases.
        padded_batch = torch.full((len(batch), max_len), fill_value=pad_idx,
                                  dtype=torch.long)
        for item_id, item in enumerate(batch):
            padded_batch[item_id, :len(item)] = item
        return padded_batch

    def pad_idx(self, is_source=True):
        """Return the ``'<PAD>'`` index of the source (RU) or target (EN) vocabulary."""
        if is_source:
            return self.dataset.ru_voc.word_2_idx['<PAD>']
        else:
            return self.dataset.en_voc.word_2_idx['<PAD>']

In [15]:
batched_train_ds = Batches(train_ds, 5, train=True)

In [16]:
import numpy as np
from torch.autograd import Variable

def nopeak_mask(size):
    """Return a ``(1, size, size)`` boolean mask that hides future positions.

    Entry ``[0, i, j]`` is ``True`` when ``j <= i`` and ``False`` above the
    diagonal, so position ``i`` cannot see positions after it.
    """
    above_diagonal = torch.triu(torch.ones(1, size, size), diagonal=1)
    return above_diagonal == 0

def create_masks(src_batch, src_pad, trg_batch=None, trg_pad=None):
    """Build the padding mask for the source and the combined mask for the target.

    Args:
        src_batch: Source token ids, one row per sentence.
        src_pad: Padding index of the source vocabulary.
        trg_batch: Optional target token ids, one row per sentence.
        trg_pad: Padding index of the target vocabulary.

    Returns:
        ``(src_msk, trg_msk)``. Both are ``long`` tensors in which 0 marks a
        position to ignore. ``trg_msk`` is ``None`` when ``trg_batch`` is
        ``None``; otherwise it combines the padding mask with the no-peek mask.
    """
    src_msk = (src_batch != src_pad).unsqueeze(-2).long()

    # The original assigned `trg_mask` here but returned `trg_msk`, so calling
    # without a target batch raised a NameError.
    if trg_batch is None:
        return src_msk, None

    trg_msk = (trg_batch != trg_pad).unsqueeze(-2).bool()
    trg_len = trg_batch.size(1)
    # `type_as` puts the no-peek mask on the same device as the batch.
    np_msk = nopeak_mask(trg_len).type_as(trg_batch).bool()
    trg_msk = (trg_msk & np_msk).long()
    return src_msk, trg_msk

In [17]:
# Model hyper-parameters.
d_model = 512
heads = 8
N = 6
src_vocab = len(train_ds.ru_voc)
trg_vocab = len(train_ds.en_voc)

model = Transformer(src_vocab, trg_vocab, d_model, N, heads)

# this code is very important! It initialises the parameters with a
# range of values that stops the signal fading or getting too big.
# See this blog for a mathematical explanation.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# The optimiser must be a real statement: it was previously fused onto the end
# of the comment above, so `optim` was never defined for the training loop.
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [18]:
# Train for two epochs, printing the running average loss every `print_every` batches.
model = model.cuda()
model.train()
print_every=10
start = time.time()
temp = start

src_pad_idx = batched_train_ds.pad_idx(is_source=True)
trg_pad_idx = batched_train_ds.pad_idx(is_source=False)

total_loss = 0
for epoch in range(2):
    for i, batch in enumerate(batched_train_ds.iterate()):
        src, trg = batch
        src = src.cuda()
        trg = trg.cuda()

        # Teacher forcing: the decoder sees tokens [0, n-1) and must predict
        # tokens [1, n), i.e. the same sequence shifted by one.
        trg_input = trg[:, :-1]
        targets = trg[:, 1:].contiguous().view(-1)

        src_msk, trg_msk = create_masks(src, src_pad_idx,
                                        trg_input, trg_pad_idx)

        preds = model(src, trg_input, src_msk, trg_msk)

        optim.zero_grad()

        # Compare against `targets` (the undefined name `results` was used
        # before); padded target positions are excluded from the loss.
        loss = F.cross_entropy(preds.view(-1, preds.size(-1)), targets, ignore_index=trg_pad_idx)

        loss.backward()
        optim.step()

        # `loss` is a 0-dim tensor, so it cannot be indexed with `[0]`;
        # `.item()` returns it as a Python float.
        total_loss += loss.item()
        if (i + 1) % print_every == 0:
            loss_avg = total_loss / print_every
            print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters" % ((time.time() - start) // 60,epoch + 1, i + 1, loss_avg, time.time() - temp,print_every))
            total_loss = 0
            temp = time.time()

RuntimeError: CUDA out of memory. Tried to allocate 896.00 MiB (GPU 0; 3.95 GiB total capacity; 2.60 GiB already allocated; 505.31 MiB free; 2.61 GiB reserved in total by PyTorch)

In [23]:
from nltk.translate.bleu_score import sentence_bleu

In [32]:
if torch.rand(1)[0] > 0:
    print("a")

a


In [19]:
torch.__version__

'1.7.1'

In [20]:
!pip install torch --upgrade

Collecting torch
  Downloading torch-1.8.0-cp38-cp38-manylinux1_x86_64.whl (735.5 MB)
[K     |████████████████████████████████| 735.5 MB 38 kB/s  eta 0:00:01    |██▍                             | 56.0 MB 3.5 MB/s eta 0:03:12     |██████████████████▍             | 421.7 MB 4.2 MB/s eta 0:01:16     |█████████████████████▏          | 486.2 MB 4.5 MB/s eta 0:00:55     |█████████████████████▉          | 502.3 MB 4.5 MB/s eta 0:00:52     |███████████████████████▊        | 544.6 MB 4.6 MB/s eta 0:00:42     |██████████████████████████▎     | 604.2 MB 2.6 MB/s eta 0:00:51     |████████████████████████████▍   | 653.1 MB 2.1 MB/s eta 0:00:39
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.7.1
    Uninstalling torch-1.7.1:
      Successfully uninstalled torch-1.7.1
Successfully installed torch-1.8.0


In [1]:
!git status

On branch master

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m../__pycache__/[m
	[31m../data/[m
	[31m../data_loaders/[m
	[31m./[m
	[31m../utils.py[m

nothing added to commit but untracked files present (use "git add" to track)


In [2]:
!git add ../data_loaders/

In [3]:
!git add ../utils.py

In [4]:
!git status

On branch master

No commits yet

Changes to be committed:
  (use "git rm --cached <file>..." to unstage)
	[32mnew file:   ../data_loaders/__pycache__/dataset.cpython-38.pyc[m
	[32mnew file:   ../data_loaders/__pycache__/vocabulary.cpython-38.pyc[m
	[32mnew file:   ../data_loaders/dataset.py[m
	[32mnew file:   ../data_loaders/vocabulary.py[m
	[32mnew file:   ../utils.py[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m../__pycache__/[m
	[31m../data/[m
	[31m./[m



In [None]:
!git add ..