In [None]:
import numpy as np                                                                             # model(x_train)은 model.forward(x_train)와 동일함.
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
import torch.optim as optim

In [None]:
class EncoderDecoder(nn.Module):
  """Top-level sequence-to-sequence wrapper: embed, encode, decode, generate.

  All sub-modules are built by the caller and handed in already constructed.

  Args:
    dmodel: Accepted for signature compatibility; not used by this class.
    Encoder: Module called as ``encoder(embedded_input, mask_flag)``.
    Decoder: Module called as ``decoder(memory, embedded_output, flag, flag)``.
    input_embed: Module applied to the raw ``input`` tensor.
    output_embed: Module applied to the raw ``output`` tensor.
    Generator: Module applied to the decoder result.
  """

  def __init__(self, dmodel, Encoder, Decoder, input_embed, output_embed, Generator):
    super().__init__()
    # Registration order is kept (encoder, decoder, generator, embeddings)
    # because it determines the order in which parameters() yields tensors.
    self.encoder = Encoder
    self.decoder = Decoder
    self.generator = Generator
    self.input_embed = input_embed
    self.output_embed = output_embed

  def forward(self, input, input_mask, output, output_mask):
    """Run the full pipeline and flatten the result to two dimensions.

    NOTE: ``input_mask`` and ``output_mask`` are accepted but not consulted;
    the encoder is always called with ``False`` and the decoder with
    ``(False, True)``.

    Returns:
      The generator output reshaped to ``(-1, last_dim)``.
    """
    source_repr = self.input_embed(input)
    memory = self.encoder(source_repr, False)

    target_repr = self.output_embed(output)
    decoded = self.decoder(memory, target_repr, False, True)

    scores = self.generator(decoded)
    last_dim = scores.size(-1)
    return scores.view(-1, last_dim)


class Generator(nn.Module):
  """Project decoder features to per-token log-probabilities.

  Args:
    dmodel: Size of the incoming feature (last) dimension.
    outputsize: Number of output classes, i.e. the target vocabulary size.
  """

  def __init__(self,dmodel,outputsize):
    super(Generator,self).__init__()
    self.linear=nn.Linear(dmodel,outputsize)

  def forward(self,x):
    """Return ``log_softmax(linear(x))`` taken over the last dimension.

    Log-probabilities are returned rather than probabilities because
    ``nn.CrossEntropyLoss`` applies log-softmax itself: feeding it plain
    softmax output normalises twice and flattens the gradients. Log-softmax
    is idempotent, so this output is safe for ``CrossEntropyLoss`` as well as
    ``NLLLoss``; call ``.exp()`` on it to recover probabilities.
    """
    logits=self.linear(x)
    return F.log_softmax(logits, dim=-1)


def clone(module,N):
  """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
  copies = nn.ModuleList()
  for _ in range(N):
    # deepcopy so that each copy owns its own parameters.
    copies.append(copy.deepcopy(module))
  return copies

class Encoder(nn.Module):
  """Stack of ``layer_num`` copies of one encoder sublayer, applied in sequence.

  Args:
    Sublayer: Prototype layer; it is copied via ``clone``, not shared.
    layer_num: Number of copies to stack.
  """

  def __init__(self, Sublayer, layer_num):
    super().__init__()
    self.layer_num = layer_num
    self.layers = clone(Sublayer, layer_num)

  def forward(self, x, x_mask):
    """Feed ``x`` through every layer in order; each layer also receives ``x_mask``."""
    hidden = x
    for sublayer in self.layers:
      hidden = sublayer(hidden, x_mask)
    return hidden

class EncoderSublayer(nn.Module):
  """One encoder layer: self-attention, then feed-forward, each with a residual + LayerNorm.

  Args:
    dmodel: Feature size normalised by the LayerNorm.
    self_attention: Module called as ``attention(query, key, value, is_mask)``.
    feed_forward: Module called as ``feed_forward(x)``.
    dropout: Drop probability applied to the attention output.
  """

  def __init__(self, dmodel, self_attention, feed_forward, dropout):
    super().__init__()
    self.Attention = self_attention
    self.FeedForward = feed_forward
    # A single LayerNorm instance serves both residual connections.
    self.norm = nn.LayerNorm(dmodel)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, x, x_mask):
    """Apply both sub-blocks to ``x``.

    NOTE: ``x_mask`` is not consulted; attention always runs with
    ``is_mask=False``. Dropout is applied to the attention branch only.
    """
    attended = self.Attention(x, x, x, False)
    y = self.norm(x + self.dropout(attended))

    transformed = self.FeedForward(y)
    return self.norm(y + transformed)


class Decoder(nn.Module):
  """Stack of ``layer_num`` copies of one decoder sublayer, applied in sequence.

  Args:
    Sublayer: Prototype layer; it is copied via ``clone``, not shared.
    layer_num: Number of copies to stack.
  """

  def __init__(self,Sublayer,layer_num):
    super(Decoder,self).__init__()
    self.layers=clone(Sublayer,layer_num)
    self.layer_num=layer_num

  def forward(self,memory,embedded_output,input_mask,output_mask):
    """Run the target representation through every layer in order.

    Each layer is called as ``layer(memory, x, input_mask, output_mask)`` where
    ``x`` is the previous layer's output (the first layer receives
    ``embedded_output``). With no layers, ``embedded_output`` is returned
    unchanged.
    """
    # Thread the running result through the stack. Feeding every layer the
    # original embeddings would make all but the last layer dead weight.
    x=embedded_output
    for layer in self.layers:
      x=layer(memory,x,input_mask,output_mask)
    return x


class DecoderSublayer(nn.Module):
  """One decoder layer: masked self-attention, cross-attention, feed-forward.

  Each sub-block is wrapped in a residual connection followed by LayerNorm.

  Args:
    dmodel: Feature size normalised by the LayerNorm.
    attention: Module called as ``attention(query, key, value, is_mask)``.
      The same instance serves both self- and cross-attention.
    feed_forward: Module called as ``feed_forward(x)``.
    dropout: Drop probability applied to both attention outputs.
  """

  def __init__(self,dmodel,attention,feed_forward,dropout):
    super(DecoderSublayer,self).__init__()
    self.attention=attention
    self.feedforward=feed_forward
    # A single LayerNorm instance serves all three residual connections.
    self.norm=nn.LayerNorm(dmodel)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self,memory,x,input_mask,output_mask):
    """Decode ``x`` (the embedded target) against ``memory`` (the encoder output).

    NOTE: ``input_mask`` and ``output_mask`` are not consulted; self-attention
    always runs with ``is_mask=True`` and cross-attention with ``is_mask=False``.
    """
    # Masked self-attention over the target sequence.
    y=self.norm(x+self.dropout(self.attention(x,x,x,True)))
    # Cross-attention: queries come from the decoder state, keys and values
    # from the encoder memory. The result keeps the target length, so the
    # residual addition is valid even when source and target lengths differ.
    y=self.norm(y+self.dropout(self.attention(y,memory,memory,False)))
    output=self.norm(y+self.feedforward(y))

    return output



#subsequent mask??

class multihead_Attention(nn.Module):
    """Multi-head attention with learned query/key/value/output projections.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_head: Number of attention heads; must divide ``d_model``.
        dropout: Drop probability applied to the attention weights.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_head``.
    """

    def __init__(self,d_model, num_head,dropout=0.1):
        super(multihead_Attention, self).__init__()
        if d_model % num_head != 0:
            # Otherwise the per-head reshape in forward() fails with an
            # opaque size error.
            raise ValueError(
                "d_model (%d) must be divisible by num_head (%d)" % (d_model, num_head))
        self.d_k=d_model//num_head   # per-head size, shared by queries, keys and values
        self.num_head=num_head
        self.Qlinear=nn.Linear(d_model,d_model)
        self.Klinear=nn.Linear(d_model,d_model)
        self.Vlinear=nn.Linear(d_model,d_model)
        self.Olinear=nn.Linear(d_model,d_model)

        self.dropout=nn.Dropout(dropout)

    def forward(self,query_input,key_input,value_input,is_mask):
        """Attend from ``query_input`` over ``key_input`` / ``value_input``.

        Inputs are reshaped to ``(batch, -1, num_head, d_k)`` and the head axis
        is moved ahead of the sequence axis before attention is computed.
        ``is_mask`` is forwarded unchanged to ``Self_Attention``.

        Returns:
            Tensor of shape ``(batch, -1, num_head * d_k)`` after the output
            projection.
        """
        batch_size=query_input.size(0)

        Q=self._split_heads(self.Qlinear(query_input),batch_size)
        K=self._split_heads(self.Klinear(key_input),batch_size)
        V=self._split_heads(self.Vlinear(value_input),batch_size)

        attention_value=Self_Attention(query=Q,key=K,value=V,is_mask=is_mask,dropout=self.dropout)
        # Merge the heads back into a single feature dimension.
        concatenated = attention_value.transpose(1,2).contiguous().view(batch_size, -1, self.num_head * self.d_k)

        # Mix information across heads; without this projection Olinear would
        # be a parameter that never influences the output.
        return self.Olinear(concatenated)

    def _split_heads(self,projected,batch_size):
        """Reshape ``(batch, len, d_model)`` to ``(batch, num_head, len, d_k)``."""
        return projected.view(batch_size,-1,self.num_head,self.d_k).transpose(1,2)


        
    

        
def Self_Attention(query,key,value,is_mask,dropout):
    """Scaled dot-product attention.

    Computes ``softmax(query @ key^T / sqrt(d_k)) @ value`` where ``d_k`` is the
    size of the last dimension of ``query``.

    Args:
        query, key, value: Tensors whose last two dimensions are
            ``(length, features)``; leading dimensions must be compatible
            with ``torch.matmul``.
        is_mask: When truthy, each query position is prevented from attending
            to later positions. The mask is square, sized from the query
            length.
        dropout: Callable applied to the attention weights, or ``None``.

    Returns:
        The attention-weighted combination of ``value``.
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d_k)

    if is_mask:
        seq_len = query.size(-2)
        # Build the mask on the same device as the scores so the function also
        # works off-CPU, and fill with -inf rather than adding -1e9, which is
        # not representable in half precision.
        future = torch.triu(
            torch.ones(seq_len, seq_len, device=scores.device), diagonal=1
        ).bool()
        scores = scores.masked_fill(future, float('-inf'))

    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, value)


class FeedForward(nn.Module):
  """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

  Args:
    dmodel: Input and output feature size.
    dropout: Drop probability applied after the activation.
    d_ff: Hidden layer width. Defaults to 2048, the value that used to be
      hard-coded, so existing callers are unaffected.
  """

  def __init__(self,dmodel,dropout=0.1,d_ff=2048):
    super(FeedForward,self).__init__()
    self.linear1=nn.Linear(dmodel,d_ff)
    self.linear2=nn.Linear(d_ff,dmodel)
    self.relu=F.relu
    self.dropout=nn.Dropout(dropout)

  def forward(self,x):
    """Apply the block along the last dimension of ``x``."""
    hidden=self.dropout(self.relu(self.linear1(x)))
    return self.linear2(hidden)


class Embedding(nn.Module):
  """Token embedding scaled by ``sqrt(dmodel)`` plus sinusoidal position encoding.

  Args:
    dmodel: Embedding size.
    vocab: Number of entries in the embedding table.
  """

  def __init__(self,dmodel,vocab):
    super(Embedding,self).__init__()
    self.embedding=nn.Embedding(vocab,dmodel)
    self.dmodel=dmodel

  def forward(self,input):
    """Embed ``input`` and add the position encoding.

    The sequence length is read from dimension 1 of ``input``, so ``input``
    must have at least two dimensions.
    """
    y=self.embedding(input)*math.sqrt(self.dmodel)
    seq_len=input.size(1)

    position_encoding=self.Positional_Encoding(seq_len,self.dmodel)
    # Match the activations' device and dtype so the sum works off-CPU too.
    position_encoding=position_encoding.to(device=y.device,dtype=y.dtype)

    return y+position_encoding

  def Positional_Encoding(self,seq_len,dmodel):
    """Return the ``(seq_len, dmodel)`` float32 sinusoidal position table.

    Column ``i`` uses the angle ``pos / 10000 ** (2 * (i // 2) / dmodel)``;
    even columns hold its sine and odd columns its cosine.
    """
    position=torch.arange(seq_len,dtype=torch.float32).unsqueeze(1)
    dimension=torch.arange(dmodel,dtype=torch.float32).unsqueeze(0)

    # dimension - dimension % 2 equals 2 * (dimension // 2): both columns of a
    # sin/cos pair share one frequency.
    exponent=(dimension-dimension%2)/dmodel
    angle=position/torch.pow(10000.0,exponent)

    pe=torch.zeros(seq_len,dmodel,dtype=torch.float32)
    pe[:,0::2]=torch.sin(angle[:,0::2])
    pe[:,1::2]=torch.cos(angle[:,1::2])

    return pe



def make_model(source_vocab, target_vocab, N=6,d_model=512, d_ff=2048, h=8, dropout=0.1):
  """Assemble an ``EncoderDecoder`` model from its parts.

  Args:
    source_vocab: Size of the source embedding table.
    target_vocab: Size of the target embedding table and of the generator output.
    N: Number of stacked sublayers in both the encoder and the decoder.
    d_model: Feature size used throughout the model.
    d_ff: Hidden width of the feed-forward blocks.
    h: Number of attention heads.
    dropout: Drop probability passed to every sub-module.

  Returns:
    The model, with every parameter of more than one dimension
    Xavier-uniform initialised.
  """
  c=copy.deepcopy
  attention=multihead_Attention(d_model,h,dropout)
  feedforward=FeedForward(d_model,dropout)
  # Build with the defaults, then resize the two projections if the requested
  # width differs, so that the d_ff argument is honoured rather than ignored.
  if feedforward.linear1.out_features!=d_ff:
    feedforward.linear1=nn.Linear(d_model,d_ff)
    feedforward.linear2=nn.Linear(d_ff,d_model)

  # Each sublayer gets its own deep copy so encoder and decoder share no weights.
  model=EncoderDecoder(d_model,Encoder(EncoderSublayer(d_model,c(attention),c(feedforward),dropout),N),
                 Decoder(DecoderSublayer(d_model,c(attention),c(feedforward),dropout),N),
                 Embedding(d_model,source_vocab),
                 Embedding(d_model,target_vocab),
                 Generator(d_model,target_vocab))
  for p in model.parameters():
    # Biases and LayerNorm vectors (1-D) keep their default initialisation.
    if p.dim() > 1:
      nn.init.xavier_uniform_(p)

  return model


#inputs = torch.LongTensor([[1,2,3,4]])  
#outputs = torch.LongTensor([[1,2,3,4]])  

#model = make_model(20,20,6,512, h=1)
#output_probabilities = model(inputs,False,outputs,True)
#print(output_probabilities.size())


# Everything from here on is test/demo code.

def make_batch(sentences):
    """Convert three whitespace-tokenised sentences into batch-of-one id tensors.

    ``sentences[0]`` is looked up in the module-level ``src_vocab``;
    ``sentences[1]`` and ``sentences[2]`` are looked up in ``tgt_vocab``.

    Returns:
        A tuple ``(input, output, target)`` of ``torch.LongTensor`` objects,
        each with a single row.

    Raises:
        KeyError: If a token is missing from the corresponding vocabulary.
    """
    def encode(text, vocab):
        # One row of ids per sentence.
        return torch.LongTensor([[vocab[token] for token in text.split()]])

    source_ids = encode(sentences[0], src_vocab)
    decoder_ids = encode(sentences[1], tgt_vocab)
    target_ids = encode(sentences[2], tgt_vocab)
    return source_ids, decoder_ids, target_ids


# Toy translation example: source sentence, decoder input (starts with 'S'),
# and decoder target (ends with 'E').
sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']

# Vocabularies. Padding ('P') must map to index zero.
src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}
src_vocab_size = len(src_vocab)

tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'S': 5, 'E': 6}
number_dict = dict(enumerate(tgt_vocab))  # index -> target word
tgt_vocab_size = len(tgt_vocab)

src_len = 5  # length of source
tgt_len = 5  # length of target

# Transformer hyperparameters.
d_model = 512  # embedding size
d_ff = 2048  # feed-forward dimension
d_k = d_v = 64  # dimension of K (= Q) and V
n_layers = 6  # number of encoder / decoder layers
n_heads = 8  # number of heads in multi-head attention

model = make_model(
    source_vocab=src_vocab_size,
    target_vocab=tgt_vocab_size,
    N=n_layers,
    d_model=d_model,
    d_ff=d_ff,
    h=n_heads,
    dropout=0.1,
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

enc_inputs, dec_inputs, target_batch = make_batch(sentences)
# The targets never change, so flatten them once instead of on every epoch.
flat_targets = target_batch.contiguous().view(-1)

# Overfit the single sentence pair for a fixed number of steps.
for epoch in range(200):
  optimizer.zero_grad()

  outputs = model(enc_inputs, False, dec_inputs, True)
  loss = criterion(outputs, flat_targets)
  print('Epoch:', format(epoch + 1, '04d'), 'cost =', format(loss, '.6f'))

  loss.backward()
  optimizer.step()



tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0001 cost = 1.963730




tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0002 cost = 1.772311
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0003 cost = 1.767583
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0004 cost = 1.794448
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0005 cost = 1.765237
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0006 cost = 1.737499
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0007 cost = 1.581072
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0008 cost = 1.557556
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0009 cost = 1.558209
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0010 cost = 1.554073
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0011 cost = 1.565090
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0012 cost = 1.565506
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1, 2, 3, 4]])
Epoch: 0013 cost = 1.565569
tensor([[1, 2, 3, 4, 0]])
tensor([[5, 1,