In [1]:
import numpy as np
# import tensorflow as tf
from collections import Counter
from string import punctuation

import torch 
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

## 1. Data Preprocessing

In [2]:
with open('reviews.txt', 'r') as f:
    reviews = f.read()
with open('labels.txt', 'r') as f:
    labels = f.read()

In [3]:
len(reviews)

33678267

In [4]:
reviews[:2000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

In [5]:
type(labels)

str

In [6]:
labels[:100]

'positive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nn'

In [7]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
all_text = ''.join([c for c in reviews if c not in punctuation])
reviews = all_text.split('\n') # list : ['doc', 'doc', ...]

all_text = ' '.join(reviews)
words = all_text.split()

In [9]:
print(reviews[:3])
print('\n')
print(words[:100])

['bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   ', 'story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  viole

## 2. Encoding words and labels

In [10]:
# Create your dictionary that maps vocab words to integers here
word_count = Counter(words)
vocab = sorted(word_count, key=word_count.get, reverse=True)
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}
# print(vocab_to_int)

In [11]:
# Convert the reviews to integers, same shape as reviews list, but with integers
reviews_ints = []
for review in reviews:
    reviews_ints.append([vocab_to_int[word] for word in review.split()]) 
print(len(reviews_ints))
print(len(reviews_ints[0]), len(reviews_ints[1]))
print(reviews_ints[:2])

25001
140 114
[[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613, 73, 6, 5194, 1, 24103, 5, 1983, 10166, 1, 5786, 1499, 36, 51, 66, 204, 145, 67, 1199, 5194, 19869, 1, 37442, 4, 1, 221, 883, 31, 2988, 71, 4, 1, 5787, 10, 686, 2, 67, 1499, 54, 10, 216, 1, 383, 9, 62, 3, 1406, 3686, 783, 5, 3483, 180, 1, 382, 10, 1212, 13583, 32, 308, 3, 349, 341, 2913, 10, 143, 127, 5, 7690, 30, 4, 129, 5194, 1406, 2326, 5, 21025, 308, 10, 528, 12, 109, 1448, 4, 60, 543, 102, 12, 21025, 308, 6, 227, 4146, 48, 3, 2211, 12, 8, 215, 23], [63, 4, 3, 125, 36, 47, 7472, 1395, 16, 3, 4181, 505, 45, 17, 3, 622, 134, 12, 6, 3, 1279, 457, 4, 1721, 207, 3, 10624, 7373, 300, 6, 667, 83, 35, 2116, 1086, 2989, 34, 1, 898, 46417, 4, 8, 13, 5096, 464, 8, 2656, 1721, 1, 221, 57, 17, 58, 794, 1297, 832, 228, 8, 43, 98, 123, 1469, 59, 147, 38, 1, 963, 142, 29, 667, 123, 1, 13584, 41

In [12]:
labels = labels.split('\n')
labels = [1 if label=='positive' else 0 for label in labels]

In [13]:
assert len(labels) == 25001

In [14]:
# check abnormal docs
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [15]:
# delete the blank doc
not_zero_idx = [i for i, sublist in enumerate(reviews_ints) if len(sublist) != 0]
print(len(not_zero_idx))

25000


In [16]:
# Filter out that review with 0 length
reviews_ints = [reviews_ints[idx] for idx in not_zero_idx]
labels = np.array([labels[idx] for idx in not_zero_idx])
print(len(reviews_ints), len(labels))

25000 25000


In [17]:
# transform doc list to array (length=200) 
seq_len = 200
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, review_int in enumerate(reviews_ints):
    features[i, -len(review_int):] = np.array(review_int)[:seq_len]

In [18]:
features[:2,:]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 21025,   308,     6,
            3,  1050,   207,     8,  2138,    32,     1,   171,    57,
           15,    49,    81,  5785,    44,   382,   110,   140,    15,
         5194,    60,   154,     9,     1,  4975,  5852,   475,    71,
            5,   260,    12, 21025,   308,    13,  1978,     6,    74,
         2395,     5,   613,    73,     6,  5194,     1, 24103,     5,
         1983, 10166,     1,  5786,  1499,    36,    51,    66,   204,
          145,    67,  1199,  5194, 19869,     1, 37442,     4,     1,
      

In [19]:
features.shape

(25000, 200)

## 3. Training, Validation, Testing set

In [20]:
split_frac = 0.8
train_val = int(len(features)*split_frac)
train_x, val_x = features[:train_val], features[train_val:]
train_y, val_y = labels[:train_val], labels[train_val:]

val_test = int(len(val_x)*0.5)
val_x, test_x = val_x[:val_test], val_x[val_test:]
val_y, test_y = val_y[:val_test], val_y[val_test:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [21]:
train_y = train_y.reshape(-1, 1)
val_y = val_y.reshape(-1, 1)
test_y = test_y.reshape(-1, 1)
# train_y = train_y.astype('float')
# val_y = val_y.astype('float')
# test_y = test_y.astype('float')
print(train_y.shape, val_y.shape, test_y.shape)

(20000, 1) (2500, 1) (2500, 1)


## 4. Model

In [22]:
# Hyper Parameters
embed_size = 100
hidden_size = 256
num_layers = 1
num_epochs = 10
batch_size = 250
seq_length = 200
learning_rate = 0.002

In [23]:
vocab_size = len(vocab_to_int) + 1
vocab_size

74073

In [24]:
class LSTM_CLF(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(LSTM_CLF, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.init_weights()
        
    def init_weights(self):
        self.embed.weight.data.uniform_(-0.1, 0.1)
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)
        
    def forward(self, inputs, hidden):
        # embed word ids to vectors
        inputs = self.embed(inputs)
        
        # forward
        out, hidden = self.lstm(inputs, hidden)
        
        out = self.linear(out[:, -1, :])
        out = self.sigmoid(out)
        return out, hidden

In [25]:
model = LSTM_CLF(vocab_size, embed_size, hidden_size, num_layers)
model.cuda()

print(model)

LSTM_CLF(
  (embed): Embedding(74073, 100)
  (lstm): LSTM(100, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [27]:
criterion = nn.BCELoss() # binary crossentropy
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [28]:
# Truncated Backpropagation 
def detach(states):
    return [state.detach() for state in states] 

In [29]:
# iterations
def get_batches(x, y, batch_size=100):
    
    n_batch = len(x) // batch_size
    x, y = x[:batch_size*n_batch], y[:batch_size*n_batch]
    for i in range(0, len(x), batch_size):
        yield x[i:i+batch_size], y[i:i+batch_size]

In [30]:
# training
c = 0
for epoch in range(num_epochs):
    # training
    model.train()
    # initial hidden states and memory states
    states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda(), 
              Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda())
    
    for i, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
        # print(i, x.shape, y.shape)
        # get batch inputs and targets
        # x, y - numpy.array, trasfer to torch tensor, BCELoss need targets : Float tensor
        inputs = Variable(torch.from_numpy(x)).cuda()
        targets = Variable(torch.from_numpy(y).float()).cuda()
        
        # forward, backward, optimize
        model.zero_grad()
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()
        
        c += 1
        if c % 20 == 0:
            print("Epoch: {}/{}...".format(epoch+1, num_epochs), 
                  "Iterations: {}...".format(c), 
                  "Loss: {:5.4f}".format(loss.data[0]), 
                  "Perplexity: {:5.2f}".format(np.exp(loss.data[0])))
            
    # evaluate validation
    model.eval()

    corr = total = 0

    for i, (x, y) in enumerate(get_batches(val_x, val_y, batch_size), 1):
        inputs = Variable(torch.from_numpy(x)).cuda()

        # forward, backward, optimize
        outputs, _ = model(inputs, states)
        total += y.shape[0]
        outputs = (outputs.data > 0.5)    # > 0.5 get 1, or get 0
        pred = np.array(outputs)

        corr += (pred == y).sum()

    print('Validation Accuracy: %f %%' % (100 * corr / total))
            
#     model.eval()
#     total_loss = 0
#     for i, (x, y) in enumerate(get_batches(val_x, val_y, batch_size), 1):
#         # print(i, x.shape, y.shape)
#         # get batch inputs and targets
#         inputs = Variable(torch.from_numpy(x)).cuda()
#         targets = Variable(torch.from_numpy(y).float()).cuda()
        
#         # forward, backward, optimize
        
#         outputs, states = model(inputs, states)
#         total_loss += criterion(outputs, targets).data[0]
#     total_loss /= 10 # every batch : batch_size = 250

#     print('Validation Loss: %.4f' % (total_loss))

Epoch: 1/10... Iterations: 20... Loss: 1.5456 Perplexity:  4.69
Epoch: 1/10... Iterations: 40... Loss: 0.6840 Perplexity:  1.98
Epoch: 1/10... Iterations: 60... Loss: 0.6324 Perplexity:  1.88
Epoch: 1/10... Iterations: 80... Loss: 0.6183 Perplexity:  1.86
Validation Accuracy: 65.200000 %
Epoch: 2/10... Iterations: 100... Loss: 0.5656 Perplexity:  1.76
Epoch: 2/10... Iterations: 120... Loss: 0.5430 Perplexity:  1.72
Epoch: 2/10... Iterations: 140... Loss: 0.3862 Perplexity:  1.47
Epoch: 2/10... Iterations: 160... Loss: 0.3785 Perplexity:  1.46
Validation Accuracy: 73.240000 %
Epoch: 3/10... Iterations: 180... Loss: 0.5399 Perplexity:  1.72
Epoch: 3/10... Iterations: 200... Loss: 0.4502 Perplexity:  1.57
Epoch: 3/10... Iterations: 220... Loss: 0.2380 Perplexity:  1.27
Epoch: 3/10... Iterations: 240... Loss: 0.3944 Perplexity:  1.48
Validation Accuracy: 77.400000 %
Epoch: 4/10... Iterations: 260... Loss: 0.3140 Perplexity:  1.37
Epoch: 4/10... Iterations: 280... Loss: 0.4552 Perplexity:  

In [31]:
# Testing
corr = total = 0

for i, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
    inputs = Variable(torch.from_numpy(x)).cuda()

    # forward, backward, optimize
    outputs, _ = model(inputs, states)
    total += y.shape[0]
    outputs = outputs.data > 0.5 # > 0.5 get 1, or get 0
    pred = np.array(outputs)

    corr += (pred == y).sum()

print('Accuracy: %f %%' % (100 * corr / total))

Accuracy: 84.600000 %


## 5. Model with pre-trained word vectors

In [None]:
## 两个模型都使用model这个名字，建议使用不同的名字

In [48]:
# 从GloVe文件中解析出每个词和它所对应的词向量，并用字典的方式存储
# 使用的词向量：100维
embeddings_index = {}
with open('glove.6B.100d.txt') as txtfile:
    lines = txtfile.readlines()
    for line in lines:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(len(embeddings_index))

400000


In [49]:
print(embeddings_index['the'])

[-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   0.016215
 -0.017099 -0.38984   0.87424  -0.72569  -0.51058  -0.52028  -0.1459
  0.8278    0.27062 ]

In [None]:
# vocab_to_int

In [50]:
# 构建词向量矩阵, 没有的单词的词向量均为0
embedding_dim = 100
embedding_matrix = np.zeros((len(vocab_to_int) + 1, embedding_dim))
for word, idx in vocab_to_int.items():
    embed_vector = embeddings_index.get(word)
    if embed_vector is not None:
        embedding_matrix[idx] = embed_vector

In [51]:
class PreLSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(PreLSTM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.init_weights()
        
    def init_weights(self):
        self.embed.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)
        
    def forward(self, inputs, hidden):
        # embed word ids to vectors
        inputs = self.embed(inputs)
        
        # forward
        out, hidden = self.lstm(inputs, hidden)

        out = self.linear(out[:, -1, :])
        out = self.sigmoid(out)
        return out, hidden

In [52]:
model = PreLSTM(vocab_size, embed_size, hidden_size, num_layers)
model.cuda()

print(model)

PreLSTM(
  (embed): Embedding(74073, 100)
  (lstm): LSTM(100, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [53]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [54]:
# training
c = 0
for epoch in range(num_epochs):
    # initial hidden states and memory states
    model.train()
    states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda(), 
              Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda())
    
    for i, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
        # print(i, x.shape, y.shape)
        # get batch inputs and targets
        inputs = Variable(torch.from_numpy(x)).cuda()
        targets = Variable(torch.from_numpy(y).float()).cuda()
        
        # forward, backward, optimize
        model.zero_grad()
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()
        
        c += 1
        if c % 20 == 0:
            print("Epoch: {}/{}...".format(epoch+1, num_epochs), 
                  "Iterations: {}...".format(c), 
                  "Loss: {:5.4f}".format(loss.data[0]), 
                  "Perplexity: {:5.2f}".format(np.exp(loss.data[0])))
            
    # evaluate validation
    model.eval()

    corr = total = 0

    for i, (x, y) in enumerate(get_batches(val_x, val_y, batch_size), 1):
        inputs = Variable(torch.from_numpy(x)).cuda()

        # forward, backward, optimize
        outputs, _ = model(inputs, states)
        total += y.shape[0]
        outputs = outputs.data > 0.5 # > 0.5 get 1, or get 0
        pred = np.array(outputs)

        corr += (pred == y).sum()

    print('Validation Accuracy: %f %%' % (100 * corr / total))

Epoch: 1/10... Iterations: 20... Loss: 0.6614 Perplexity:  1.94
Epoch: 1/10... Iterations: 40... Loss: 0.7304 Perplexity:  2.08
Epoch: 1/10... Iterations: 60... Loss: 0.6356 Perplexity:  1.89
Epoch: 1/10... Iterations: 80... Loss: 0.5646 Perplexity:  1.76
Validation Accuracy: 69.000000 %
Epoch: 2/10... Iterations: 100... Loss: 0.5289 Perplexity:  1.70
Epoch: 2/10... Iterations: 120... Loss: 0.4909 Perplexity:  1.63
Epoch: 2/10... Iterations: 140... Loss: 0.4962 Perplexity:  1.64
Epoch: 2/10... Iterations: 160... Loss: 0.5509 Perplexity:  1.73
Validation Accuracy: 72.160000 %
Epoch: 3/10... Iterations: 180... Loss: 0.2278 Perplexity:  1.26
Epoch: 3/10... Iterations: 200... Loss: 0.5186 Perplexity:  1.68
Epoch: 3/10... Iterations: 220... Loss: 0.4551 Perplexity:  1.58
Epoch: 3/10... Iterations: 240... Loss: 0.4846 Perplexity:  1.62
Validation Accuracy: 79.360000 %
Epoch: 4/10... Iterations: 260... Loss: 0.3107 Perplexity:  1.36
Epoch: 4/10... Iterations: 280... Loss: 0.2642 Perplexity:  

In [55]:
# Testing
corr = total = 0

for i, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
    inputs = Variable(torch.from_numpy(x)).cuda()

    # forward, backward, optimize
    outputs, _ = model(inputs, states)
    total += y.shape[0]
    outputs = outputs.data > 0.5 # > 0.5 get 1, or get 0
    pred = np.array(outputs)

    corr += (pred == y).sum()

print('Accuracy: %f %%' % (100 * corr / total))

Accuracy: 97.400000 %
