In [103]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np

In [101]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [92]:
file = '/scepter/corpus/written/EFCAMDAT/efcamdat2_tgt.txt'

raw_text = open(file, 'r', encoding='utf8').readlines()[:100000]#.replace('\n', ' ')
raw_text = ' '.join(raw_text)

In [159]:
seq_length = 50
batch_size = 32
embedding_dim = 256
hidden_dim = 256
learning_rate = 0.0001
dropout = 0.2
epochs = 10
log_interval = 25

In [94]:
# 語料裡所有出現過的 words
words = raw_text.split(' ')
unq_words = sorted(list(set(words)))

# 給每個 word 一個對應的 index，比較好做接下來的任務
word_to_int = dict((w, i) for i, w in enumerate(unq_words))
int_to_words = dict((i, w) for i, w in enumerate(unq_words))

# 共生成 N 個 input-target pair，
# 每個 input 長度為 seq_length，target 長度為 1
n_words = len(words)
dataX = [] # N x seq_length
dataY = [] # N x 1

for i in range(0, n_words - seq_length):
    seq_in = words[i:i + seq_length]
    seq_out = words[i + seq_length]
    dataX.append([word_to_int[word] for word in seq_in])
    dataY.append(word_to_int[seq_out])

In [95]:
# 採用 mini-batch，尾巴不足 batch_size 的直接捨棄
n_patterns = len(dataY)
n_patterns = n_patterns - n_patterns % batch_size
X = dataX[:n_patterns]
Y = dataY[:n_patterns]

# 把 array 每 batch_size 筆資料包成一組，並包成 tensor
X = np.array(X)
_, seq_length = X.shape
X = X.reshape(-1, batch_size, seq_length)
X = torch.cuda.LongTensor(X)

Y = np.array(Y)
Y = Y.reshape(-1, batch_size)
Y = torch.cuda.LongTensor(Y)

In [160]:
class Net(nn.Module):
    def __init__(self, n_vocab, embedding_dim, hidden_dim, dropout=0.2):
        super(Net, self).__init__()
        
        self.embedding_dim = embedding_dim
        
        self.hidden_dim = hidden_dim
        
        # nn.Embedding 可以幫我們建立好字典中每個字對應的 vector
        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        
        # LSTM layer，形狀為 (input_size, hidden_size, ...)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout=dropout)
        
        # Fully-connected layer，把 hidden state 線性轉換成 output
        self.hidden2out = nn.Linear(hidden_dim, n_vocab)
        
        
    def forward(self, seq_in):
        # LSTM 接受的 input 形狀為 (timesteps, batch, features)，
        # 即 (seq_length, batch_size, embedding_dim)
        # 所以先把形狀為 (batch_size, seq_length) 的 input 轉置後，
        # 再把每個 value (char index) 轉成 embedding vector
        embeddings = self.embeddings(seq_in.t())
        
        # LSTM 層的 output (lstm_out) 有每個 timestep 出來的結果
        #（也就是每個字進去都會輸出一個 hidden state）
        # 這邊我們取最後一層的結果，即最近一次的結果，來預測下一個字
        lstm_out, _ = self.lstm(embeddings)
        ht = lstm_out[-1]
        
        # 線性轉換至 output
        out = self.hidden2out(ht)
        return out

In [161]:
def train(model, optimizer, epoch, data, log_interval):
    # 設一下 flag
    model.train()
    
    # Mini-batch 訓練 
    print(len(data))
    for batch_i, (seq_in, target) in enumerate(data):
        seq_in, target = Variable(seq_in), Variable(target)
        optimizer.zero_grad()
        output = model(seq_in)                  # 取得預測
        loss = F.cross_entropy(output, target)  # 計算 loss
        loss.backward()                         # Backpropagation
        optimizer.step()                        # 更新參數
        
        # Log 訓練進度
        if batch_i % log_interval == 0:
            print('Train epoch: {} ({:2.0f}%)\tLoss: {:.6f}'.format(epoch, 100. * batch_i / len(data), loss.data[0]))

In [164]:
# 載入資料，建立模型
train_data = list(zip(X, Y))

model = Net(len(unq_words), embedding_dim, hidden_dim, dropout=dropout)
model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# model.load_state_dict(torch.load(f='./rnn/model.ml'))

  "num_layers={}".format(dropout, num_layers))


In [None]:
# 訓練
for epoch in range(epochs):
    train(model, optimizer, epoch, train_data, log_interval=log_interval)

    if (epoch + 1) % 5 == 0:
        model.eval()      
        torch.save(model.state_dict(), 'model.ml')

In [204]:
# 隨機選擇一序列作為開端
start = np.random.randint(0, n_patterns - 1)
pattern = dataX[start]

# 共 n_sent 句子要生成
cnt = 0
while cnt < 1: 
    seq_in = np.array(pattern)
    seq_in = seq_in.reshape(1, -1) # batch_size = 1
    seq_in = Variable(torch.cuda.LongTensor(seq_in))
    
    # 生成此序列下一個字
    pred = model(seq_in)
    pred = F.softmax(pred).data[0].cpu().numpy() # softmax 後轉成機率分佈
    word = np.random.choice(unq_words, p=pred)          # 依機率分佈選字
    word_idx = word_to_int[word]
    
    # 印出
    print(word, end=' ')
    
    # 將字附在原序列後並移除第一個字，作為下一個 input 序列
    pattern.append(word_idx)
    pattern = pattern[1:]
    
    # 若印出代表句子結尾的標點符號，則完成一個句子生成
    if word == '.':
        # restart_seq 決定要不要重新挑選一個序列，或是完成一個完整段落
#         if restart_seq:
#             start = np.random.randint(0, n_patterns - 1)
#             pattern = dataX[start]
#             print()
        cnt += 1

  


to other people all live together in a long China .
 professionals is very bright on weekends .
 I 've six work in mushrooms .
 The infections had waving responsibility , he , presence how entered man , he was a black and straight , normal man can called in his sisters .
 On the other hand , my dreams always 've fainting me in the end of the store opposite the delay without mopped studying Last 1 year ago .
 Finally , when I opened the door was assigned my business and computer , and submit my responsibilities .
 I however it later , it was a great flight .
 I called quickly now .
 I 'm going to six other Pierre There .
 I am inviting our friends to You have to come to party on !
 See you tomorrow .
 It is opposite the market .
 So , they affected the trees that food , people days and Rafael you can come to party .
 I wearing pink sweaters and brown .
 My sister is fixed and thin .
 She looks very friendly .
 My mother 's grandfather , but my little sister , they are wearing Design sho