# lib

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import string
import re
import nltk
import collections

In [3]:
import pandas as pd

In [4]:
import math

In [5]:
import Data

In [6]:
import Model

# Data

### data_draft

In [9]:
# 去除标点
chi_string = '？！“”。，《》[]〖〗'
def dePunctuation(line):
    line = line.translate(str.maketrans('', '', string.punctuation))
    line = line.translate(str.maketrans('', '', chi_string))
    return line

In [10]:
def clean(text):
    text_new = dePunctuation(text)
    return text_new

English + TAB + The Other Language + TAB + Attribution

Attribution包含了来源材料的域名、句子的ID号以及句子所有者的用户名，可以直接忽略

In [11]:
file = open("fra_clean.txt", "r")
ori_text = []
for line in file:
    ori_text.append(line.strip())
file.close()

In [12]:
# 为了减少运算量少取几行
raw_text = [ori_text[i] for i in range(2000)]

In [13]:
lines = '\t'.join(raw_text)
lines[0:40]

'Go.\tVa !\tGo.\tMarche.\tGo.\tEn route !\tGo.\t'

In [14]:
text_clean = clean(lines)
text_clean = text_clean.split('\t')

In [15]:
corpus = [line.split('\t') for line in text_clean]
corpus = [word.split() for line in corpus for word in line]

In [16]:
corpus_en = corpus[0::2]
corpus_de = corpus[1::2]

In [17]:
def make_vocab(corpus):
    vocab = [word for line in corpus for word in line]
    vocab = set(vocab)
    vocab_size = len(vocab)
        
    return vocab, vocab_size

In [18]:
vocab_en, vocab_size_en = make_vocab(corpus_en)
vocab_de, vocab_size_de = make_vocab(corpus_de)

In [19]:
word2idx_en = {word:i for i, word in enumerate(vocab_en)}
idx2word_en = {i:word for i, word in enumerate(vocab_en)}

In [20]:
en_nan = torch.zeros((1, vocab_size_en), dtype=torch.float32)
en_nan.shape

torch.Size([1, 570])

In [21]:
word2idx_de = {word:i for i, word in enumerate(vocab_de)}
idx2word_de = {i:word for i, word in enumerate(vocab_de)}

encoder和decoder的词要分开编码：因为decoder时，生成的词必须是目标语言

# model

## 试一个sample

### data

In [22]:
encoder = text_clean[0::2]
decoder = text_clean[1::2]

encoder

In [23]:
batch_size = len(encoder)
batch_size

2000

In [24]:
encoder_lines = [line.split() for line in encoder]
seq_length = max(len(line) for line in encoder_lines)
seq_length

4

In [25]:
len(encoder_lines)

2000

In [26]:
# 创建df,batch_szie行, seq_length列
df = pd.DataFrame(index=range(seq_length), columns=range(batch_size))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Columns: 2000 entries, 0 to 1999
dtypes: object(2000)
memory usage: 62.6+ KB


In [27]:
df = pd.DataFrame(encoder_lines)
df = df.map(lambda x: word2idx_en.get(x) if x in word2idx_en else -1).T

In [28]:
array = df.to_numpy()

new_array = np.empty((seq_length,batch_size,vocab_size_en))

for a in range(len(array)):
    new_array[a] = np.eye(vocab_size_en)[array[a]]

In [29]:
encoder_tensor = torch.from_numpy(new_array).to(torch.float32).transpose(0,1)
encoder_tensor.shape

torch.Size([2000, 4, 570])

In [30]:
# 删除最后一个维度的最后一列，因为最后一列是-1填充的nan值
if encoder_tensor.shape[-1]%2 == 0:
    encoder_tensor[:, :, -1] = 0
else:
    encoder_tensor = encoder_tensor[:, :, :-1]
    
encoder_tensor.shape

torch.Size([2000, 4, 570])

data from Data.py

In [7]:
corpus = Data.Data('fra_clean.txt',2000)

# encoder
corpus_tensor_en, vocab_size_en, word2idx_en, idx2word_en = corpus.encoder()

# decoder
corpus_tensor_de, vocab_size_de, word2idx_de, idx2word_de = corpus.decoder()

batch_size, seq_len, vocab_size

### encoder

inputs

1. 不考虑降维的话，embedding层可以用独热编码
2. 位置编码方法：每个位置对应一个d维向量(vocab_size)；为正弦和余弦对；
   
   p_t = [sin(w_1 * t), cos(w_1 * t), sin(w_2 * t), cos(w_2 * t), ... , sin(w_d/2 * t), cos(w_d/2 * t)], 共d/2 *2 维，d需要能被2整除。
   
   w_k = 1/(10000^2k/d)

   整个position encoding的维度为seq_length * vocab_size

   位置编码是固定的，使用register_buffer：PyTorch中nn.Module类的一个方法，用于注册一个不需要进行训练的缓冲张量（buffer tensor）

Q, K, V

In [8]:
class Weights(nn.Module):
    def __init__(self, hidden_size, input_size, num_heads):
        super(Weights, self).__init__()
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_heads = num_heads
        
        # Q, K, V，Q = inputs * W_Q
        self.W_Q = nn.Linear(input_size, hidden_size)
        # k, v的维度需要一致；为了计算方便，将此处Q的维度也设为input_s
        self.W_K = nn.Linear(input_size, hidden_size)
        self.W_V = nn.Linear(input_size, hidden_size)

        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, inputs):

        batch_size, seq_length, _ = inputs.shape
        hidden_size = self.hidden_size
        num_heads = self.num_heads
        
        Q = self.W_Q(inputs)
        V = self.W_V(inputs)
        K = self.W_K(inputs)

        # multi-head
        Q = Q.view(batch_size, -1, seq_length, hidden_size//num_heads)
        V = V.view(batch_size, -1, seq_length, hidden_size//num_heads)
        K = K.view(batch_size, -1, seq_length, hidden_size//num_heads).transpose(-1,-2)
            
        return Q, V, K

In [9]:
corpus_tensor_en.shape

torch.Size([2000, 4, 570])

In [10]:
hidden_size = 128
input_size = 570
num_heads = 2

In [11]:
weights = Weights(hidden_size, input_size, num_heads)

In [12]:
Q, V, K = weights(corpus_tensor_en)

In [13]:
Q.shape

torch.Size([2000, 2, 4, 64])

In [14]:
V.shape

torch.Size([2000, 2, 4, 64])

In [15]:
K.shape

torch.Size([2000, 2, 64, 4])

In [16]:
e = torch.matmul(Q, K)

In [17]:
e.shape

torch.Size([2000, 2, 4, 4])

In [18]:
corpus_tensor_en.shape

torch.Size([2000, 4, 570])

In [22]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size

        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, Q, K, V, mask = False):

        batch_size, seq_length, _, _ = Q.shape
        hidden_size = self.hidden_size

        # scaled scores
        e = torch.matmul(Q, K)/ math.sqrt(hidden_size)

        # mask
        if mask:
            e.masked_fill_(torch.triu(torch.ones_like(e), diagonal=1) == 1, float("-inf"))

        # attention_distribution
        softmax = self.softmax
        a = softmax(e)

        # output
        o = torch.matmul(a,V)
        o = o.view(batch_size, seq_length, hidden_size)
            
        return o

In [26]:
attention = Attention(hidden_size = 128)

In [27]:
o = attention(Q, K, V)

RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [4000, 64] but got: [4000, 4].

#### embedding

In [30]:
x = corpus_tensor_en

In [31]:
x.shape

torch.Size([2000, 4, 570])

In [35]:
x[0][0][0:10]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [32]:
d = x.shape[-1]
d

570

In [33]:
embedding = nn.Linear(d,d)

In [34]:
y = embedding(x)
y.shape

torch.Size([2000, 4, 570])

In [36]:
y[0][0][0:10]

tensor([-0.0092, -0.0451,  0.0186, -0.0702,  0.0322,  0.0152, -0.0232,  0.0250,
         0.0185, -0.0034], grad_fn=<SliceBackward0>)

In [29]:
# embedding
emb_x = x
emb_x.shape

torch.Size([2, 4, 570])

In [30]:
position_encode = Position.Encode()
p = position_encode.get_position(emb_x.shape)
p.shape

torch.Size([2, 4, 570])

In [31]:
# combine
inputs = (p + emb_x)
inputs.size()

torch.Size([2, 4, 570])

class

In [45]:
position_embedding = Model.PositionEmbedding(corpus_tensor_en.shape)

In [48]:
inputs = position_embedding(corpus_tensor_en)

In [49]:
inputs.shape

torch.Size([2000, 4, 570])

#### attention

batch_size, num_heads, seq_length, hidden_size/num_heads

In [32]:
# input_size_en = vocab_size_en
hidden_size =128
num_heads = 4
batch_size, seq_length, input_size = inputs.shape
multi_attention = MultiAttention.MultiAttention(hidden_size, input_size, num_heads)

In [37]:
o = multi_attention(inputs)

In [38]:
o.shape

torch.Size([2, 4, 128])

In [8]:
def attention(inputs, hidden_size, num_heads):
    batch_size, seq_length, input_size = inputs.shape
    multi_attention = Model.MultiAttention(hidden_size, input_size, num_heads)
    o = multi_attention(inputs)
    return o

In [9]:
hidden_size =128
num_heads = 4

In [10]:
o = attention(inputs, hidden_size, num_heads)

In [11]:
o.shape

torch.Size([2000, 4, 128])

每个token的attention output

#### feed froward

In [52]:
attn_size = o.shape[-1]
ff_size = 64

In [53]:
feedforward = FeedForward.FeedForward(attn_size, ff_size)

In [54]:
x = feedforward.forward(o)

In [55]:
x.shape

torch.Size([2, 4, 128])

In [12]:
def feedforward(o, ff_size):
    attn_size = o.shape[-1]
    feedforward = Model.FeedForward(attn_size, ff_size)
    x = feedforward.forward(o)
    return x

In [13]:
ff_size = 64

In [14]:
x = feedforward(o, ff_size)

In [15]:
x.shape

torch.Size([2000, 4, 128])

#### linear and softmax

In [24]:
class Outputs(nn.Module):
    def __init__(self, ff_size, vocab_size):
        super(Outputs, self).__init__()
        self.linear = nn.Linear(ff_size, vocab_size)
        self.softmax = nn.Softmax(dim =-1)
        
    def forward(self, x):
        x = self.linear(x)
        x = self.softmax(x)
        return x

In [28]:
outputs = Outputs(ff_size=100, vocab_size=vocab_size_en)

In [29]:
y = outputs(x)

In [30]:
y.shape

torch.Size([2000, 4, 570])

In [26]:
linear = nn.Linear(128,vocab_size_en)
softmax = nn.Softmax(dim =-1)

In [27]:
y = linear(x)

In [28]:
outputs = softmax(y)

In [29]:
outputs.shape

torch.Size([2000, 4, 570])

#### encoder

In [None]:
import Components

encoder = Components.Encoder(corpus_tensor_en, hidden_size = 128, num_heads = 4, ff_size = 100)

x = encoder.ff

x.shape

### model

## 类

In [None]:
class Transformer(nn.Module):
    def __init__():

    def forward():

    def encoder():

    def decoder():

In [None]:
class Decoder(nn.Module):

In [None]:
class EncoderLayer(nn.Module):

In [None]:
class DecoderLayer(nn.Module):