In [2]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [7]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

In [4]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        d_k: Number whose square root divides the raw attention scores.
    """

    def __init__(self,d_k):
        super().__init__()
        self.d_k = d_k

    def forward(self,q,k,v,mask=None):
        """Compute attention weights and the weighted sum of ``v``.

        Args:
            q: Query tensor; its last dimension must match the last dimension of ``k``.
            k: Key tensor; its last two dimensions are swapped before the matmul.
            v: Value tensor, multiplied by the attention weights.
            mask: Optional tensor broadcastable to the score tensor. Positions
                where ``mask == 0`` are suppressed before the softmax.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        # A Python float scale follows the dtype/device of the scores; a 0-dim
        # float32 tensor would silently truncate the scale for float64 inputs
        # and would be re-allocated on every call.
        scale = float(self.d_k) ** 0.5
        scores = torch.matmul(q, k.transpose(-2, -1)) / scale
        if mask is not None:
            # -1e9 is kept wherever it is representable; for narrower dtypes
            # (e.g. float16) fall back to the most negative finite value so
            # masked_fill does not overflow.
            fill_value = max(-1e9, torch.finfo(scores.dtype).min)
            scores = scores.masked_fill(mask == 0, fill_value)
        attention_weights = F.softmax(scores, dim=-1)
        output = torch.matmul(attention_weights, v)
        return output, attention_weights

In [5]:
class MultiheadAttention(nn.Module):
    """Multi-head attention built on ``ScaledDotProductAttention``.

    Inputs are projected with learned linear layers, split into ``h`` heads of
    size ``d_model // h``, attended per head, re-merged and passed through a
    final linear layer.

    Args:
        d_model: Feature size of the inputs and outputs.
        h: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``h``.
    """

    def __init__(self,d_model,h):
        super().__init__()
        if d_model % h != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by the number of heads ({h})"
            )
        self.h = h
        self.d_model = d_model
        self.depth = d_model // h

        self.wq = nn.Linear(d_model,d_model)
        self.wk = nn.Linear(d_model,d_model)
        self.wv = nn.Linear(d_model,d_model)

        self.dense = nn.Linear(d_model, d_model)

        # Parameter-free, so it is built once here instead of on every forward.
        self.attention = ScaledDotProductAttention(self.depth)

    def split_heads(self,x,batch_size):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, h, seq, depth)``."""
        x = x.view(batch_size,-1,self.h,self.depth)
        return x.transpose(1,2)

    def forward(self,q,k,v,mask=None):
        """Apply multi-head attention.

        Args:
            q: Query tensor of shape ``(batch, seq_q, d_model)``.
            k: Key tensor of shape ``(batch, seq_k, d_model)``.
            v: Value tensor of shape ``(batch, seq_k, d_model)``.
            mask: Optional mask; positions equal to 0 are suppressed. A 3-D
                mask is treated as ``(batch, seq_q, seq_k)`` and shared across
                heads; other ranks must broadcast to
                ``(batch, h, seq_q, seq_k)``.

        Returns:
            Tuple ``(output, attention_weights)`` where ``output`` has shape
            ``(batch, seq_q, d_model)`` and ``attention_weights`` has shape
            ``(batch, h, seq_q, seq_k)``.
        """
        batch_size = q.size(0)

        # Project, then split the feature axis into heads. Without the split
        # the attention would run on un-split tensors and the transpose/view
        # below would mix the sequence and feature axes.
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        if mask is not None and mask.dim() == 3:
            # Insert a head axis so a per-batch mask broadcasts over all heads.
            mask = mask.unsqueeze(1)

        scaled_attention, attention_weights = self.attention(q,k,v,mask)

        # (batch, h, seq_q, depth) -> (batch, seq_q, h, depth) -> (batch, seq_q, d_model)
        scaled_attention = scaled_attention.transpose(1,2)
        concat_attention = scaled_attention.contiguous().view(batch_size,-1,self.d_model)
        output = self.dense(concat_attention)
        return output, attention_weights


In [11]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to an input tensor.

    For position ``pos`` and channel pair ``i``::

        pe[pos, 2i]   = sin(pos / 10000 ** (2i / d_model))
        pe[pos, 2i+1] = cos(pos / 10000 ** (2i / d_model))

    Args:
        d_model: Feature size of the encoding (even or odd).
        max_len: Number of positions precomputed in the table.
        batch_first: If ``False`` (default) inputs are ``(seq, batch, d_model)``;
            if ``True`` inputs are ``(batch, seq, d_model)``.
    """

    def __init__(self,d_model,max_len=5000,batch_first=False):
        super().__init__()
        self.batch_first = batch_first

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # The division by d_model belongs inside the exponent:
        # exp(2i * -ln(10000) / d_model) == 10000 ** (-2i / d_model).
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * float(-np.log(10000.0) / d_model)
        )
        angles = position * div_term

        pe = torch.zeros(max_len,d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Stored as (max_len, 1, d_model); the buffer moves with the module's
        # device/dtype but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self,x):
        """Return ``x`` plus the encodings for its sequence positions."""
        if self.batch_first:
            # (seq, 1, d_model) -> (1, seq, d_model) to broadcast over batch.
            return x + self.pe[:x.size(1)].transpose(0, 1)
        return x + self.pe[:x.size(0), :]

In [17]:
class EmbeddingLayer(nn.Module):
    """Token embedding whose output is scaled by ``sqrt(d_model)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Size of each embedding vector.
    """

    def __init__(self,vocab_size,d_model):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,d_model)
        self.d_model = d_model

    def forward(self,x):
        """Look up embeddings for the indices in ``x`` and scale them.

        Args:
            x: Integer index tensor accepted by ``nn.Embedding``.

        Returns:
            Tensor of shape ``x.shape + (d_model,)``.
        """
        # A Python float scale adopts the embedding's dtype and device; a
        # 0-dim float32 tensor would be re-allocated on every call and would
        # truncate the scale to float32 precision for float64 weights.
        scale = float(self.d_model) ** 0.5
        return self.embedding(x) * scale

In [None]:
class Encoder(nn.Module):
    """Placeholder for the encoder; not implemented yet.

    The constructor currently accepts ``X`` but does not use it and registers
    no submodules. No ``forward`` method is defined in this class body.
    """
    def __init__(self,X):
        super().__init__()
        # TODO: implement the encoder.