In [1]:
# import tensorflow as tf
import numpy as np
# import matplotlib.pyplot as plt
import itertools
import nltk
# import keras

from sklearn.metrics import confusion_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from collections import Counter
from string import punctuation, digits
from nltk.corpus import stopwords

import torch
import torch.nn as nn
from torch.autograd import Variable


## 1. Data preprocessing

In [2]:
# 读取全集
newsgroups_all = fetch_20newsgroups(subset='all').data
print(type(newsgroups_all))
print(len(newsgroups_all))

<class 'list'>
18846


In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_NB_WORDS = 20000 # 保留最常见的20000个单词
MAX_SEQUENCE_LENGTH = 1000 # 每篇文档限定1000词以内

tokenizer = Tokenizer(num_words=MAX_NB_WORDS) # 分词器
tokenizer.fit_on_texts(newsgroups_all)

# 文本转化为数字序列
sequences = tokenizer.texts_to_sequences(newsgroups_all)

# 词汇表：单词-数字映射
word_index = tokenizer.word_index # {'the': 1, 'to': 2, 'of': 3, 'a': 4, 'and': 5, ……}

Using TensorFlow backend.
  (fname, cnt))
  (fname, cnt))


In [4]:
print(sequences[:2])

[[14, 19415, 455, 559, 15, 29, 2552, 1240, 5609, 33, 322, 767, 2175, 2121, 871, 1343, 32, 251, 88, 77, 84, 12087, 455, 559, 15, 7, 122, 228, 63, 3, 2552, 1240, 20, 517, 3490, 50, 1, 1393, 3, 61, 437, 3, 1507, 50, 1, 1302, 2552, 3027, 3, 1, 2701, 309, 7, 122, 243, 16334, 175, 5, 4, 243, 19416, 268, 7, 122, 194, 2, 296, 37, 337, 2, 369, 4389, 22, 4, 243, 3, 7286, 12, 1, 2552, 349, 30, 20, 1502, 137, 2701, 1382, 90, 7, 397, 5987, 74, 2025, 13, 130, 56, 8, 140, 215, 90, 93, 1457, 770, 1963, 56, 8, 97, 4, 308, 9186, 1857, 2, 1306, 6, 1, 2327, 6760, 115, 348, 5987, 21, 4, 308, 3, 1857, 6, 1, 365, 658, 3, 467, 185, 1, 2552, 20, 194, 2, 1985, 1, 66, 3, 3215, 608, 7, 26, 132, 8755, 19, 2, 131, 1, 3280, 2000, 1, 1151, 1457, 770, 283, 2552, 1222], [14, 2333, 1832, 2803, 15, 1285, 129, 29, 67, 315, 847, 2812, 556, 344, 660, 3380, 4868, 12, 2812, 556, 344, 88, 77, 84, 2333, 1832, 2803, 15, 33, 560, 181, 591, 72, 3, 4839, 4815, 603, 219, 426, 5849, 3570, 2812, 32, 383, 38, 2459, 8, 6, 1, 1182, 12, 4

In [5]:
len(word_index) # vocab size

179209

In [6]:
# 按照最大文本长度截断文本或补0
features = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [7]:
print(features[:2])

[[   0    0    0 ...  283 2552 1222]
 [   0    0    0 ... 8276  129   95]]


In [8]:
labels = fetch_20newsgroups(subset='all').target

In [9]:
labels

array([10,  3, 17, ...,  3,  1,  7])

In [10]:
# 标签 独热编码 ============= pytorch CrossEntropyLoss don't need ont-hot ===============
# from keras.utils import to_categorical
# labels = to_categorical(labels)
# labels[:2]

In [11]:
print(features.shape)
print(labels.shape)

(18846, 1000)
(18846,)


In [12]:
# 拆分训练集，验证集和测试集
from sklearn.model_selection import train_test_split

x_tv, x_test, y_tv, y_test = train_test_split(features, labels, test_size=0.4, shuffle=False) # 测试集占据0.4
x_train, x_val, y_train, y_val = train_test_split(x_tv, y_tv, test_size=0.2, shuffle=False) # 验证集占据0.6数据集中的0.2 = 0.12

print(x_train.shape, x_val.shape, x_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

(9045, 1000) (2262, 1000) (7539, 1000)
(9045,) (2262,) (7539,)


## 2. GloVe pre-trained word vectors

In [13]:
# 从GloVe文件中解析出每个词和它所对应的词向量，并用字典的方式存储
# 使用的词向量：100维
embeddings_index = {}
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as txtfile:
    lines = txtfile.readlines()
    for line in lines:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(len(embeddings_index))

400000


In [14]:
print(embeddings_index['the'])

[-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   0.016215
 -0.017099 -0.38984   0.87424  -0.72569  -0.51058  -0.52028  -0.1459
  0.8278    0.27062 ]

In [15]:
# 构建词向量矩阵, 没有的单词的词向量均为0
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, idx in word_index.items():
    embed_vector = embeddings_index.get(word)
    if embed_vector is not None:
        embedding_matrix[idx] = embed_vector

In [16]:
embedding_matrix.shape

(179210, 100)

## 3. RNN - LSTM

In [17]:
# 分批次训练
# def get_batches(x, y, batch_size):  # 会导致 batch_size 不同
#     n_batches = len(x) // batch_size

#     for ii in range(0, n_batches * batch_size, batch_size):
#         # 最后一个批次取余下的所有元素
#         if ii != (n_batches - 1) * batch_size: 
#             every_x, every_y = x[ii:ii+batch_size], y[ii:ii+batch_size]
#         else:
#             every_x, every_y = x[ii:], y[ii:]
#         yield every_x, every_y

# iterations 
def get_batches(x, y, batch_size=100):
    # 每个迭代之后最后剩余的s不足一个batch 的 samples 会加入下个epoch的batch
    n_batch = len(x) // batch_size
    x, y = x[:batch_size*n_batch], y[:batch_size*n_batch]
    for i in range(0, len(x), batch_size):
        yield x[i:i+batch_size], y[i:i+batch_size]

In [18]:
# truncated backpropagation
def detach(states):
    return [state.detach() for state in states]

In [19]:
# Hyper Parameters
embed_size = 100
hidden_size = 200
num_layers = 1
num_epochs = 10
batch_size = 200
# seq_length = 1000
# learning_rate = 0.002
num_classes = 20

In [20]:
vocab_size = len(word_index) + 1
vocab_size

179210

In [21]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(p=0.2)
        self.linear = nn.Linear(hidden_size, num_classes)
        self.init_weights()
        
    def init_weights(self):
        self.embed.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)
        
    def forward(self, inputs, hidden):
        # embed word ids to vectors
        inputs = self.embed(inputs)
        
        # forward
        out, hidden = self.lstm(inputs, hidden)
        out = self.dropout(out)
        out = self.linear(out[:, -1, :])
        # out = self.sigmoid(out)
        return out, hidden

In [22]:
rnn = RNN(vocab_size, embed_size, hidden_size, num_layers, num_classes)
rnn.cuda()
print(rnn)

RNN(
  (embed): Embedding(179210, 100)
  (lstm): LSTM(100, 200, batch_first=True)
  (dropout): Dropout(p=0.2)
  (linear): Linear(in_features=200, out_features=20, bias=True)
)


In [23]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters())

In [26]:
x_train.dtype

dtype('int32')

In [27]:
# pytorch nn.Embedding needs to  Long Tensor 
# x_train = x_train.astype(np.int64)
# x_val = x_val.astype(np.int64)
# x_test = x_test.astype(np.int64)

# x_train.dtype # dtype('int64')

In [28]:
# x_train = x_train.astype(np.int32)
# x_train.dtype

In [None]:
#### merge 30 epochs #######
### ============== tune epochs ================

In [49]:
# training the model
c = 0
for epoch in range(num_epochs):
    # training
    rnn.train()
    
    # initial hidden state and memory state
    states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda(), 
              Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda())
    
    for i, (x, y) in enumerate(get_batches(x_train, y_train, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda() # trans int tensor to long tensor
        targets = Variable(torch.from_numpy(y)).cuda()
        
        # forward , backward , optim
        rnn.zero_grad()
        states = detach(states)
        outputs, states = rnn(inputs, states)
        
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm(rnn.parameters(), 0.5)
        optimizer.step()
        
        c += 1
        if c % 20 == 0:
            print("Epoch: {}/{}...".format(epoch+1, num_epochs), 
                  "Iterations: {}...".format(c), 
                  "Loss: {:5.4f}".format(loss.data[0]))
    
    # evaluate
    rnn.eval()

    corr = total = 0

    for i, (x, y) in enumerate(get_batches(x_val, y_val, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda()
        # targets = Variable(torch.from_numpy(y))

        # forward, backward, optimize
        outputs, _ = rnn(inputs, states)
        total += y.shape[0]
        _, pred = torch.max(outputs, 1)

        corr += (pred.data.cpu().numpy() == y).sum()

    print('Validation Accuracy: %f %%' % (100 * corr / total))

    

Epoch: 1/10... Iterations: 20... Loss: 1.8922
Epoch: 1/10... Iterations: 40... Loss: 1.7248
Validation Accuracy: 40.409091 %
Epoch: 2/10... Iterations: 60... Loss: 1.8462
Epoch: 2/10... Iterations: 80... Loss: 1.8866
Validation Accuracy: 37.409091 %
Epoch: 3/10... Iterations: 100... Loss: 1.7093
Epoch: 3/10... Iterations: 120... Loss: 1.7938
Validation Accuracy: 42.318182 %
Epoch: 4/10... Iterations: 140... Loss: 1.4754
Epoch: 4/10... Iterations: 160... Loss: 1.4191
Epoch: 4/10... Iterations: 180... Loss: 1.1286
Validation Accuracy: 48.090909 %
Epoch: 5/10... Iterations: 200... Loss: 1.1340
Epoch: 5/10... Iterations: 220... Loss: 1.2323
Validation Accuracy: 51.818182 %
Epoch: 6/10... Iterations: 240... Loss: 1.1777
Epoch: 6/10... Iterations: 260... Loss: 1.1179
Validation Accuracy: 57.409091 %
Epoch: 7/10... Iterations: 280... Loss: 0.9180
Epoch: 7/10... Iterations: 300... Loss: 1.1726
Validation Accuracy: 61.590909 %
Epoch: 8/10... Iterations: 320... Loss: 0.9154
Epoch: 8/10... Iterat

In [51]:
# training the model
c = 0
for epoch in range(num_epochs):
    # training
    rnn.train()
    
    # initial hidden state and memory state
    states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda(), 
              Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda())
    
    for i, (x, y) in enumerate(get_batches(x_train, y_train, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda() # trans int tensor to long tensor
        targets = Variable(torch.from_numpy(y)).cuda()
        
        # forward , backward , optim
        rnn.zero_grad()
        states = detach(states)
        outputs, states = rnn(inputs, states)
        
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm(rnn.parameters(), 0.5)
        optimizer.step()
        
        c += 1
        if c % 20 == 0:
            print("Epoch: {}/{}...".format(epoch+1, num_epochs), 
                  "Iterations: {}...".format(c), 
                  "Loss: {:5.4f}".format(loss.data[0]))
    
    # evaluate
    rnn.eval()

    corr = total = 0

    for i, (x, y) in enumerate(get_batches(x_val, y_val, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda()
        # targets = Variable(torch.from_numpy(y))

        # forward, backward, optimize
        outputs, _ = rnn(inputs, states)
        total += y.shape[0]
        _, pred = torch.max(outputs, 1)

        corr += (pred.data.cpu().numpy() == y).sum()

    print('Validation Accuracy: %f %%' % (100 * corr / total))

Epoch: 1/10... Iterations: 20... Loss: 0.4513
Epoch: 1/10... Iterations: 40... Loss: 0.4644
Validation Accuracy: 69.181818 %
Epoch: 2/10... Iterations: 60... Loss: 0.5091
Epoch: 2/10... Iterations: 80... Loss: 0.5442
Validation Accuracy: 71.590909 %
Epoch: 3/10... Iterations: 100... Loss: 0.5636
Epoch: 3/10... Iterations: 120... Loss: 0.3634
Validation Accuracy: 73.000000 %
Epoch: 4/10... Iterations: 140... Loss: 0.3175
Epoch: 4/10... Iterations: 160... Loss: 0.2846
Epoch: 4/10... Iterations: 180... Loss: 0.2258
Validation Accuracy: 72.545455 %
Epoch: 5/10... Iterations: 200... Loss: 0.1173
Epoch: 5/10... Iterations: 220... Loss: 0.2071
Validation Accuracy: 75.500000 %
Epoch: 6/10... Iterations: 240... Loss: 0.1786
Epoch: 6/10... Iterations: 260... Loss: 0.1700
Validation Accuracy: 76.272727 %
Epoch: 7/10... Iterations: 280... Loss: 0.1853
Epoch: 7/10... Iterations: 300... Loss: 0.1815
Validation Accuracy: 76.500000 %
Epoch: 8/10... Iterations: 320... Loss: 0.1744
Epoch: 8/10... Iterat

In [52]:
# training the model
c = 0
for epoch in range(num_epochs):
    # training
    rnn.train()
    
    # initial hidden state and memory state
    states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda(), 
              Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda())
    
    for i, (x, y) in enumerate(get_batches(x_train, y_train, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda() # trans int tensor to long tensor
        targets = Variable(torch.from_numpy(y)).cuda()
        
        # forward , backward , optim
        rnn.zero_grad()
        states = detach(states)
        outputs, states = rnn(inputs, states)
        
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm(rnn.parameters(), 0.5)
        optimizer.step()
        
        c += 1
        if c % 20 == 0:
            print("Epoch: {}/{}...".format(epoch+1, num_epochs), 
                  "Iterations: {}...".format(c), 
                  "Loss: {:5.4f}".format(loss.data[0]))
    
    # evaluate
    rnn.eval()

    corr = total = 0

    for i, (x, y) in enumerate(get_batches(x_val, y_val, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda()
        # targets = Variable(torch.from_numpy(y))

        # forward, backward, optimize
        outputs, _ = rnn(inputs, states)
        total += y.shape[0]
        _, pred = torch.max(outputs, 1)

        corr += (pred.data.cpu().numpy() == y).sum()

    print('Validation Accuracy: %f %%' % (100 * corr / total))

Epoch: 1/10... Iterations: 20... Loss: 0.0875
Epoch: 1/10... Iterations: 40... Loss: 0.0732
Validation Accuracy: 78.000000 %
Epoch: 2/10... Iterations: 60... Loss: 0.0602
Epoch: 2/10... Iterations: 80... Loss: 0.1877
Validation Accuracy: 77.954545 %
Epoch: 3/10... Iterations: 100... Loss: 0.0813
Epoch: 3/10... Iterations: 120... Loss: 0.0501
Validation Accuracy: 78.818182 %
Epoch: 4/10... Iterations: 140... Loss: 0.0428
Epoch: 4/10... Iterations: 160... Loss: 0.0257
Epoch: 4/10... Iterations: 180... Loss: 0.0293
Validation Accuracy: 78.863636 %
Epoch: 5/10... Iterations: 200... Loss: 0.0282
Epoch: 5/10... Iterations: 220... Loss: 0.0302
Validation Accuracy: 79.136364 %
Epoch: 6/10... Iterations: 240... Loss: 0.0327
Epoch: 6/10... Iterations: 260... Loss: 0.0520
Validation Accuracy: 80.272727 %
Epoch: 7/10... Iterations: 280... Loss: 0.0188
Epoch: 7/10... Iterations: 300... Loss: 0.0358
Validation Accuracy: 79.318182 %
Epoch: 8/10... Iterations: 320... Loss: 0.0220
Epoch: 8/10... Iterat

In [53]:
# Testing
corr = total = 0

for i, (x, y) in enumerate(get_batches(x_test, y_test, batch_size), 1):
    inputs = Variable(torch.from_numpy(x).long()).cuda()

    # forward, backward, optimize
    outputs, _ = rnn(inputs, states)
    total += targets.size(0)
    
    _, pred = torch.max(outputs, 1)

    corr += (pred.data.cpu().numpy() == y).sum()

print('Test Accuracy: %f %%' % (100 * corr / total))
# 10 epochs : 0.6632
# 30 epochs : 0.7968

Test Accuracy: 79.675676 %


## 4. RNN - GRU

In [66]:
# Hyper Parameters
embed_size = 100
hidden_size = 200
num_layers = 1
num_epochs = 30
batch_size = 200
# seq_length = 1000
# learning_rate = 0.002
num_classes = 20
vocab_size = len(word_index) + 1

In [67]:
class GRU_RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, num_classes):
        super(GRU_RNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.linear = nn.Linear(hidden_size, num_classes)
        self.init_weights()
        
    def init_weights(self):
        self.embed.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)
        
    def forward(self, inputs, hidden):
        # embed word ids to vectors
        inputs = self.embed(inputs)
        
        # forward
        out, hidden = self.gru(inputs, hidden)
        out = self.linear(out[:, -1, :])
        # out = self.sigmoid(out)
        return out, hidden

In [68]:
gru_rnn = GRU_RNN(vocab_size, embed_size, hidden_size, num_layers, num_classes)
gru_rnn.cuda()
print(gru_rnn)

GRU_RNN(
  (embed): Embedding(179210, 100)
  (gru): GRU(100, 200, batch_first=True, dropout=0.2)
  (linear): Linear(in_features=200, out_features=20, bias=True)
)


In [69]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gru_rnn.parameters())

In [70]:
# training the model
c = 0
for epoch in range(num_epochs):
    # training
    gru_rnn.train()
    
    # initial hidden state and memory state
    hidden_state = Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda()
    
    for i, (x, y) in enumerate(get_batches(x_train, y_train, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda() # trans int tensor to long tensor
        targets = Variable(torch.from_numpy(y)).cuda()
        
        # forward , backward , optim
        gru_rnn.zero_grad()
        hidden_state = hidden_state.detach()
        outputs, hidden_state = gru_rnn(inputs, hidden_state)
        
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm(gru_rnn.parameters(), 0.5)
        optimizer.step()
        
        c += 1
        if c % 20 == 0:
            print("Epoch: {}/{}...".format(epoch+1, num_epochs), 
                  "Iterations: {}...".format(c), 
                  "Loss: {:5.4f}".format(loss.data[0]))
    
    # evaluate
    gru_rnn.eval()

    corr = total = 0

    for i, (x, y) in enumerate(get_batches(x_val, y_val, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda()

        # forward, backward, optimize
        outputs, _ = gru_rnn(inputs, hidden_state)
        total += y.shape[0]
        _, pred = torch.max(outputs, 1)

        corr += (pred.data.cpu().numpy() == y).sum()

    print('Validation Accuracy: %.2f %%' % (100 * corr / total))

Epoch: 1/30... Iterations: 20... Loss: 2.8724
Epoch: 1/30... Iterations: 40... Loss: 2.6326
Validation Accuracy: 18.68 %
Epoch: 2/30... Iterations: 60... Loss: 2.6214
Epoch: 2/30... Iterations: 80... Loss: 2.3073
Validation Accuracy: 27.55 %
Epoch: 3/30... Iterations: 100... Loss: 2.1320
Epoch: 3/30... Iterations: 120... Loss: 2.1679
Validation Accuracy: 32.59 %
Epoch: 4/30... Iterations: 140... Loss: 1.8790
Epoch: 4/30... Iterations: 160... Loss: 1.7552
Epoch: 4/30... Iterations: 180... Loss: 1.5576
Validation Accuracy: 42.32 %
Epoch: 5/30... Iterations: 200... Loss: 1.4943
Epoch: 5/30... Iterations: 220... Loss: 1.3353
Validation Accuracy: 49.73 %
Epoch: 6/30... Iterations: 240... Loss: 1.3803
Epoch: 6/30... Iterations: 260... Loss: 1.1623
Validation Accuracy: 59.50 %
Epoch: 7/30... Iterations: 280... Loss: 1.1065
Epoch: 7/30... Iterations: 300... Loss: 0.9173
Validation Accuracy: 64.45 %
Epoch: 8/30... Iterations: 320... Loss: 0.7790
Epoch: 8/30... Iterations: 340... Loss: 0.6554
Ep

In [72]:
# Testing
corr = total = 0

for i, (x, y) in enumerate(get_batches(x_test, y_test, batch_size), 1):
    inputs = Variable(torch.from_numpy(x).long()).cuda()

    # forward, backward, optimize
    outputs, _ = gru_rnn(inputs, hidden_state)
    total += y.shape[0]
    _, pred = torch.max(outputs, 1)

    corr += (pred.data.cpu().numpy() == y).sum()

print('Test Accuracy: %f %%' % (100 * corr / total))

Test Accuracy: 84.364865 %


## 5. BiRNN - LSTM

In [19]:
# Hyper Parameters
embed_size = 100
hidden_size = 200
num_layers = 2
num_epochs = 10
batch_size = 100
# seq_length = 1000
# learning_rate = 0.002
num_classes = 20

vocab_size = len(word_index) + 1

In [20]:
class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, num_classes):
        super(BiRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=0.2)
        self.linear = nn.Linear(hidden_size*2, num_classes)
        self.init_weights()
        
    def init_weights(self):
        self.embed.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)
        
    def forward(self, inputs, hidden):
        # embed word ids to vectors
        inputs = self.embed(inputs)
        
        # forward
        out, hidden = self.lstm(inputs, hidden)
        out = self.dropout(out)
        out = self.linear(out[:, -1, :])
        # out = self.sigmoid(out)
        return out, hidden

In [21]:
birnn = BiRNN(vocab_size, embed_size, hidden_size, num_layers, num_classes)
birnn.cuda()
print(birnn)

BiRNN(
  (embed): Embedding(179210, 100)
  (lstm): LSTM(100, 200, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.2)
  (linear): Linear(in_features=400, out_features=20, bias=True)
)


In [22]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(birnn.parameters())

In [23]:
# Training
c = 0

for epoch in range(10):
    birnn.train()
    # initial hidden states and memory states
    states = (Variable(torch.zeros(num_layers*2, batch_size, hidden_size)).cuda(), 
              Variable(torch.zeros(num_layers*2, batch_size, hidden_size)).cuda())
    
    for i, (x, y) in enumerate(get_batches(x_train, y_train, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda()
        targets = Variable(torch.from_numpy(y)).cuda()
        
        # forward backward optimization
        optimizer.zero_grad()
        states = detach(states)
        outputs, states = birnn(inputs, states)
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm(birnn.parameters(), 0.5)
        optimizer.step()
        
        c += 1
        if c % 20 == 0:
            print("Epoch: {}/{}...".format(epoch+1, num_epochs), 
                  "Iterations: {}...".format(c), 
                  "Loss: {:5.4f}".format(loss.data[0]))
    
    # evaluate
    birnn.eval()

    corr = total = 0

    for i, (x, y) in enumerate(get_batches(x_val, y_val, batch_size), 1):
        inputs = Variable(torch.from_numpy(x).long()).cuda()

        # forward, backward, optimize
        outputs, _ = birnn(inputs, states)
        total += y.shape[0]
        _, pred = torch.max(outputs, 1)

        corr += (pred.data.cpu().numpy() == y).sum()

    print('Validation Accuracy: %.2f %%' % (100 * corr / total))

Epoch: 1/10... Iterations: 20... Loss: 2.9681
Epoch: 1/10... Iterations: 40... Loss: 2.7714
Epoch: 1/10... Iterations: 60... Loss: 2.6161
Epoch: 1/10... Iterations: 80... Loss: 2.4141
Validation Accuracy: 20.18 %
Epoch: 2/10... Iterations: 100... Loss: 2.4139
Epoch: 2/10... Iterations: 120... Loss: 2.4123
Epoch: 2/10... Iterations: 140... Loss: 2.2170
Epoch: 2/10... Iterations: 160... Loss: 2.1818
Epoch: 2/10... Iterations: 180... Loss: 2.0224
Validation Accuracy: 32.32 %
Epoch: 3/10... Iterations: 200... Loss: 2.0026
Epoch: 3/10... Iterations: 220... Loss: 1.7042
Epoch: 3/10... Iterations: 240... Loss: 2.1644
Epoch: 3/10... Iterations: 260... Loss: 1.8117
Validation Accuracy: 33.09 %
Epoch: 4/10... Iterations: 280... Loss: 1.7456
Epoch: 4/10... Iterations: 300... Loss: 2.0381
Epoch: 4/10... Iterations: 320... Loss: 1.7734
Epoch: 4/10... Iterations: 340... Loss: 1.7862
Epoch: 4/10... Iterations: 360... Loss: 1.7141
Validation Accuracy: 33.77 %
Epoch: 5/10... Iterations: 380... Loss: 1.

In [24]:
# Testing
corr = total = 0

for i, (x, y) in enumerate(get_batches(x_test, y_test, batch_size), 1):
    inputs = Variable(torch.from_numpy(x).long()).cuda()

    # forward, backward, optimize
    outputs, _ = birnn(inputs, states)
    total += y.shape[0]
    _, pred = torch.max(outputs, 1)

    corr += (pred.data.cpu().numpy() == y).sum() # cpu: runtime error; numpy: type error

print('Test Accuracy: %.2f %%' % (100 * corr / total))

Test Accuracy: 63.21 %


In [141]:
# a = np.array([0.1, 0.2, 0.49, 0.52, 0.9])
# ta = torch.from_numpy(a)
# tb = (torch.round(ta)).cpu().numpy()
# tb

array([0., 0., 0., 1., 1.])

In [142]:
# y = np.array([0, 1, 0, 1, 1])
# (tb == y).sum()

4