In [1]:
import torch
from torch import nn
import numpy as np
from torchsummary import summary


device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


## LoadData

In [2]:
import torchtext 

train_data = list(torchtext.datasets.IMDB(split='train',root=r'L:\Datasets'))[12500:12600]
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
def yield_tokens(data):
    for (_,text) in data:
        yield tokenizer(text)

vocab = torchtext.vocab.build_vocab_from_iterator(yield_tokens(train_data),specials=['<unk>','<sos>','<eos>'],min_freq=3) ## '<pad>',
vocab.set_default_index(vocab['<unk>'])

In [3]:
def collate_batch_noLable(data_batch):
    text_lst = []
    for _, _text in data_batch:
        tk_text = vocab(['<sos>'] + tokenizer(_text) + ['<eos>'])
        text_lst.append(torch.tensor(tk_text,dtype=torch.int64))
    text_lst = torch.nn.utils.rnn.pad_sequence(text_lst, padding_value=float(vocab['<eos>']) )  ## pad 0 to equal length
    return text_lst.to(device)

test_dl_noLable = torch.utils.data.DataLoader(train_data, batch_size=20, shuffle=True, collate_fn = collate_batch_noLable) 
## [seq_len, batch_size]

## Pre-train Embedder
主要是为了后续 Retrieve from Embedding Vector；此处只是个玩具，建议使用pre-train的word2vec等模型

In [4]:
class Emb2Class(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, vocab_size)
    def forward(self,inputs):
        emb = self.emb(inputs)
        return self.fc(emb)#.argmax(dim=2)

embed_dim = 300
vocab_size = len(vocab)
Emb2Class_model = Emb2Class(vocab_size, embed_dim)
Emb2Class_model

Emb2Class(
  (emb): Embedding(1110, 300)
  (fc): Linear(in_features=300, out_features=1110, bias=True)
)

In [5]:
ce = nn.CrossEntropyLoss()
def Emb2Class_lossfn(pred,real):
    loss = 0
    for i in range(real.shape[0]):
        loss += ce(pred[i],real[i])
    return loss/real.shape[0]

In [6]:
Emb2Class_optimizer = torch.optim.SGD(Emb2Class_model.parameters(), lr=1e-3)

def Emb2Class_train(dataloader, model, loss_fn, optimizer):
    lossSum = 0
    model.train()                              ### set training mode
    for inputs in dataloader:
        pred = model(inputs)
        loss = loss_fn(pred,inputs)
        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print("=".format(epoch+1),end='')
    print("last batch loss:{}".format(loss))

In [7]:
for epoch in range(5):
    print("Epoch:{}".format(epoch+1),end='\t')
    Emb2Class_train(test_dl_noLable, Emb2Class_model, Emb2Class_lossfn, Emb2Class_optimizer)

Epoch:1	=====last batch loss:6.354733943939209
Epoch:2	=====last batch loss:5.915564060211182
Epoch:3	=====last batch loss:5.263095855712891
Epoch:4	=====last batch loss:4.849503040313721
Epoch:5	=====last batch loss:4.178357124328613


In [8]:
pre_param_emb = Emb2Class_model.emb.state_dict()

## RNN Encoder-Decoder

```
Encoder:
ht_0 --> RNN_Cell --> ht_1 --> RNN_Cell --> ht_2 --> RNN_Cell --> .... --> RNN_Cell --> ht_end
             ^                    ^                      ^                    ^
         word1_batch             word2                  word3                word_end


Decoder:
ht_0 --> RNN_Cell --> ht_1 --> RNN_Cell --> ht_2 --> RNN_Cell --> .... --> RNN_Cell --> ht_stop <eos>
          ^            |       ^              |       ^
<SOS>   __|            o_1   __|              o_2   __|       也可以每步将 ht_0 + o_t or [o_1,...o_t] 作为输入


GRU/Transformer 同理类似
```

### 参考
https://zhuanlan.zhihu.com/p/80866196

LSTM: https://curow.github.io/blog/LSTM-Encoder-Decoder/

In [9]:
# tt = nn.RNNCell(8,6)
# tt( torch.zeros((1,8)) ,torch.zeros((1,6)) ).shape   ##==>torch.Size([1, 6])

In [10]:
class RNN_encoder(nn.Module):
    def __init__(self, embed_dim, hidden_unitE):
        super().__init__()
        self.rnn_cell = nn.RNNCell(embed_dim,hidden_unitE)
        self.hidden_unitE = hidden_unitE
    def forward(self,inputs):
        htE = torch.zeros((inputs.shape[1],self.hidden_unitE))  ## (batch_size,hidden_unitE)
        for word in inputs:
            # if len(torch.nonzero(word-vocab['eos']))== 0:      ## when all words == vocab['eos']
            #     break
            htE = self.rnn_cell(word,htE)
        return htE

class RNN_decoder(nn.Module):
    def __init__(self, embed_dim, hidden_unitD):
        super().__init__()
        self.rnn_cell = nn.RNNCell(embed_dim, hidden_unitD)
        self.fc = nn.Linear(hidden_unitD, embed_dim)         ## output word's embedding vector
    def forward(self, inputs, htD):
        out_lst = []
        batch_size = htD.shape[0]
        for word in inputs:
            htD = self.rnn_cell(word,htD)           ## htD:(batch_size,hidden_unitD)
            out = self.fc(htD)   
            out_lst.append(out)
        return torch.stack(out_lst),htD

In [11]:
class RNN_AE_Net(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_unitE, hidden_unitD):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim)           ## 可以先预训练一下？
        self.rnnEncoder = RNN_encoder(embed_dim, hidden_unitE)
        self.fc_E2D = nn.Linear(hidden_unitE, hidden_unitD)
        self.rnnDecoder = RNN_decoder(embed_dim, hidden_unitD)
    def forward(self,inputs):
        emb = self.emb(inputs)
        x = self.rnnEncoder(emb)
        x = self.fc_E2D(x)
        x,htD = self.rnnDecoder(emb,x)
        return x,htD


embed_dim = 300
hidden_unit = 200
vocab_size = len(vocab)

model = RNN_AE_Net(vocab_size, embed_dim, hidden_unit, hidden_unit)
model

RNN_AE_Net(
  (emb): Embedding(1110, 300)
  (rnnEncoder): RNN_encoder(
    (rnn_cell): RNNCell(300, 200)
  )
  (fc_E2D): Linear(in_features=200, out_features=200, bias=True)
  (rnnDecoder): RNN_decoder(
    (rnn_cell): RNNCell(300, 200)
    (fc): Linear(in_features=200, out_features=300, bias=True)
  )
)

In [12]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model.emb.load_state_dict(pre_param_emb,strict=False)
model.emb.requires_grad = False

mse =  nn.MSELoss()
def loss_fn(pred,real):
    loss = 0
    for i in range(real.shape[0]):
        for j in range(real.shape[1]):
            loss += mse(pred[i,j,:],real[i,j,:])           ## loss for each word
    return loss/(real.shape[0]*real.shape[1])

def train(dataloader, model, loss_fn, optimizer):
    lossSum = 0
    model.train()                              ### set training mode
    for inputs in dataloader:
        pred,htD = model(inputs)
        real = model.emb(inputs)               ### torch.Size([seqlen, batch, embedsize])  otherwise .transpose(0, 1)
        loss = loss_fn(pred,real)
        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print("=".format(epoch+1),end='')
    print("last batch loss:{}".format(loss))

In [13]:
for epoch in range(5):
    print("Epoch:{}".format(epoch+1),end='\t')
    train(test_dl_noLable, model, loss_fn, optimizer)

Epoch:1	=====last batch loss:1.1256271600723267
Epoch:2	=====last batch loss:1.1218613386154175
Epoch:3	=====last batch loss:1.116965413093567
Epoch:4	=====last batch loss:1.113127589225769
Epoch:5	=====last batch loss:1.1077748537063599


## Retrieve from Embedding Vector

?? 似乎完全没有用

In [14]:
for inputs in test_dl_noLable:
    break

In [15]:
emb_out,_ = model(inputs)
pred_out = Emb2Class_model.fc(emb_out).argmax(dim=2)

In [20]:
str = ""
for ii in inputs[:,0].numpy():
    str += vocab.lookup_token(ii)
    str += " "
str

'<sos> busy phillips put in one <unk> of a performance , both comedic and dramatic . erika christensen was good but busy <unk> the show . it was a nice <unk> after the <unk> , a movie starring busy , which <unk> all that great . if busy <unk> get a <unk> of any kind for this film it would be a <unk> . forget <unk> <unk> <unk> , see home room . <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> 

In [16]:
str = ""
for ii in pred_out[:,0].numpy():
    str += vocab.lookup_token(ii)
    str += " "
str

'adult winkelman scene as mention philipps queen happening tragedy want ever looked on graduate heard <eos> little anyone happen grandfather big winkelman victor still mental each video no dead amazing my wake still couple colour off those deep trier ever guys unique golden eyes kills uncanny channels winkelman unique matter deep girls happening unique must setting up less computer busy boring tragedy girls whatever read tragedy unique unique colour worthy grade their camera subtle subtle subtle kind subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subtle subt

In [17]:
ii

796

In [18]:
vocab['<eos>']

2

In [19]:
vocab.lookup_token(vocab['<eos>'])

'<eos>'