
Dataset Reference

IMDB的数据
http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
https://github.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset
    
    

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim
import torch.autograd as autograd
import torchtext.vocab as torchvocab
from torch.autograd import Variable

import tqdm
import os
import time
import re
import numpy as np
import pandas as pd
import string
import time
import random
import snowballstemmer
import collections
from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from itertools import chain
from sklearn.metrics import accuracy_score

from gensim.models.word2vec import Word2Vec
import gensim.downloader as api



### 定义读数的函数

In [2]:
def readIMDB(path, seg='train'):
    """Load one split of the IMDB review corpus as ``[text, label]`` pairs.

    Every file under ``<path>/<seg>/pos`` is read first, then every file
    under ``<path>/<seg>/neg``. Newline characters are removed from each
    review. Positive reviews get label 1 and negative reviews label 0.

    Args:
        path: Root directory of the extracted dataset.
        seg: Name of the split sub-directory to read; defaults to 'train'.

    Returns:
        A list of ``[review_text, label]`` lists, in the order the files are
        reported by ``os.listdir`` within each label folder.
    """
    label_ids = {'pos': 1, 'neg': 0}
    samples = []
    for folder, label_id in label_ids.items():
        folder_path = os.path.join(path, seg, folder)
        for name in os.listdir(folder_path):
            with open(os.path.join(folder_path, name), 'r', encoding='utf8') as handle:
                text = handle.read().replace('\n', '')
            samples.append([text, label_id])
    return samples

# Load both splits as lists of [review_text, label] pairs. The first call
# relies on readIMDB's default seg='train'; the second reads the 'test' split.
train_data = readIMDB('data/aclImdb')
test_data = readIMDB('data/aclImdb', 'test')

### Check Data Set 

In [3]:
train_data[0]

['For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.',
 1]

In [4]:
test_data[0]

['Based on an actual story, John Boorman shows the struggle of an American doctor, whose husband and son were murdered and she was continually plagued with her loss. A holiday to Burma with her sister seemed like a good idea to get away from it all, but when her passport was stolen in Rangoon, she could not leave the country with her sister, and was forced to stay back until she could get I.D. papers from the American embassy. To fill in a day before she could fly out, she took a trip into the countryside with a tour guide. "I tried finding something in those stone statues, but nothing stirred in me. I was stone myself." <br /><br />Suddenly all hell broke loose and she was caught in a political revolt. Just when it looked like she had escaped and safely boarded a train, she saw her tour guide get beaten and shot. In a split second she decided to jump from the moving train and try to rescue him, with no thought of herself. Continually her life was in danger. <br /><br />Here is a woman

In [5]:
# Wrap the training pairs in a DataFrame and count the reviews per label to
# check the class balance (the last expression is displayed by the notebook).
train_df = pd.DataFrame(train_data,columns=["comment","label"])
train_df.groupby(["label"]).count().reset_index()

Unnamed: 0,label,comment
0,0,12500
1,1,12500


In [6]:
# Same per-label count for the test pairs (the last expression is displayed
# by the notebook).
test_df = pd.DataFrame(test_data,columns=["comment","label"])
test_df.groupby(["label"]).count().reset_index()

Unnamed: 0,label,comment
0,0,1840
1,1,12500


### 分詞
接着是分词，这里只做非常简单的分词，也就是按照空格分词。当然按照一些传统的清洗方式效果会更好。

In [7]:
# "vocab" is the set of all distinct tokens that appear in the training reviews

def tokenizer(text):
    """Lower-case *text* and split it on single space characters.

    The split is on the literal ``' '`` rather than on arbitrary whitespace,
    so consecutive spaces produce empty-string tokens and tabs or newlines
    stay inside their token.
    """
    return text.lower().split(' ')

# Tokenize every review; the labels are not needed at this stage.
train_tokenized = [tokenizer(text) for text, _ in train_data]
test_tokenized = [tokenizer(text) for text, _ in test_data]

# The vocabulary is the set of distinct tokens found in the training reviews
# only; chain.from_iterable flattens the per-review token lists.
vocab = set(chain.from_iterable(train_tokenized))
vocab_size = len(vocab)

print("{} unique words in the data set".format(vocab_size))

252192 unique words in the data set


### word embedding

因为这个数据集非常小，如果直接用它来训练 word embedding，很可能过拟合，而且得到的词向量缺乏通用性，所以我们直接载入一个预训练好的 word embedding。这里使用的是 glove-twitter-100（约 387MB，100 维词向量）。

In [8]:
# Fetch the "glove-twitter-100" vectors through gensim's downloader API and
# keep the returned object for the lookups below.
# NOTE(review): `dimension` is not referenced again in the visible code; the
# embedding matrix is sized with `embed_size` instead.
dimension = 100
model_glove_twitter = api.load("glove-twitter-100")

In [9]:
model_glove_twitter["twitter"]

array([ 0.44104  ,  0.1385   , -0.66489  , -0.044309 ,  0.44579  ,
        0.027886 , -0.30068  , -0.13851  ,  0.44771  ,  0.60006  ,
        0.12149  , -0.69262  , -3.5289   , -0.5495   , -0.98539  ,
        0.54288  , -0.17355  , -0.73415  , -0.46325  , -0.68942  ,
       -0.29029  , -0.20679  , -1.0008   , -0.010779 , -0.52833  ,
       -2.9566   ,  0.45207  , -0.65441  ,  0.10636  ,  0.15182  ,
       -0.71115  ,  0.17282  , -0.16225  , -0.96776  ,  0.64226  ,
       -0.029472 ,  0.5799   ,  0.18865  , -0.022253 , -0.61489  ,
       -1.1467   ,  0.39476  , -0.2715   , -0.024786 ,  0.32542  ,
       -0.14626  , -0.13835  ,  0.44469  , -0.72034  ,  0.0059288,
        0.069213 , -0.042943 , -0.32557  , -0.4062   , -0.023224 ,
        0.74154  , -1.5501   , -0.012535 , -0.020187 , -0.31557  ,
        0.036324 , -0.56278  ,  0.072553 , -0.02491  , -0.53492  ,
        0.49579  ,  0.24916  ,  0.92282  , -0.20315  ,  0.27591  ,
       -0.71818  ,  0.39903  , -0.078875 , -0.38303  , -0.8473

### 定义一个word to index的词典 

定义的目的是为了将预训练的weight跟我们的词库拼上。    
另外我们定义了一个unknown的词，也就是说没有出现在训练集里的词，我们都叫做unknown，词向量就定义为0。

In [10]:
# Index 0 is reserved for the unknown token; vocabulary words are numbered
# from 1 in the iteration order of ``vocab``.
word_to_idx = dict(zip(vocab, range(1, len(vocab) + 1)))
word_to_idx['<unk>'] = 0

# Reverse lookup built from the same iteration order, so both maps agree.
idx_to_word = dict(enumerate(vocab, start=1))
idx_to_word[0] = '<unk>'

### 编码

我们这里为了解决评论长度不一致的问题，将所有的评论都取500个词，超过的就取前500个，不足的补0。

In [11]:

def encode_samples(tokenized_samples, vocab):
    """Convert tokenized samples into lists of integer word ids.

    Ids are looked up in the module-level ``word_to_idx`` dictionary; any
    token missing from it is encoded as 0.

    Args:
        tokenized_samples: Iterable of token sequences, one per sample.
        vocab: Accepted for interface compatibility; this function does not
            consult it.

    Returns:
        A list with one list of ids per sample, in input order.
    """
    return [
        [word_to_idx.get(token, 0) for token in sample]
        for sample in tokenized_samples
    ]

def pad_samples(features, maxlen=500, PAD=0):
    """Return copies of *features* truncated or right-padded to *maxlen* ids.

    Sequences longer than *maxlen* keep only their first *maxlen* entries;
    shorter ones are extended on the right with *PAD*. Every returned
    sequence is a new list, so the caller's lists are never modified (the
    previous implementation appended padding to short inputs in place).

    Args:
        features: Iterable of id sequences.
        maxlen: Length of every returned sequence.
        PAD: Value used to fill short sequences.

    Returns:
        A list of lists, each of length *maxlen*, in input order.
    """
    return [
        # A non-positive repeat count yields an empty list, so sequences that
        # are already long enough are only truncated.
        list(feature[:maxlen]) + [PAD] * (maxlen - len(feature))
        for feature in features
    ]


In [12]:
# Inputs used below:
#   vocab           - every distinct token seen in the training reviews
#   train_tokenized - [[tokens of review 1], [tokens of review 2], ...]
#                     (e.g. train_tokenized[0:2])
#   train_data      - [[review 1, label 1], [review 2, label 2], ...]
#                     (e.g. train_data[0:2])


def _to_feature_tensor(tokenized):
    """Encode, pad/truncate and tensorize a list of tokenized reviews."""
    return torch.tensor(pad_samples(encode_samples(tokenized, vocab)))


def _to_label_tensor(pairs):
    """Collect the label (second item) of every [review, label] pair."""
    return torch.tensor([pair[1] for pair in pairs])


train_features = _to_feature_tensor(train_tokenized)
train_labels = _to_label_tensor(train_data)
test_features = _to_feature_tensor(test_tokenized)
test_labels = _to_label_tensor(test_data)

In [14]:
# Report the shape of each feature/label tensor built above.
for _template, _tensor in (
    ("train_features = {}", train_features),
    ("train_labels = {}", train_labels),
    ("test_features = {}", test_features),
    ("test_labels = {}", test_labels),
):
    print(_template.format(_tensor.shape))

train_features = torch.Size([25000, 500])
train_labels = torch.Size([25000])
test_features = torch.Size([14340, 500])
test_labels = torch.Size([14340])


### TextCNN Model

In [21]:
embed_size = 100

# Pretrained embedding matrix: one row per vocabulary index, plus row 0 for
# '<unk>'. Rows of words that the GloVe model does not contain stay all-zero.
weight = torch.zeros(vocab_size + 1, embed_size)

# Walk our own vocabulary and copy the vector of every word that the
# pretrained model also knows. An explicit membership test replaces the
# previous bare ``except``, which silently swallowed every kind of error, and
# avoids repeating the same lookups after the ``try`` block.
glove_key_to_index = model_glove_twitter.key_to_index
for word, index in word_to_idx.items():
    if word in glove_key_to_index:
        weight[index, :] = torch.from_numpy(model_glove_twitter.get_vector(word))

In [22]:
print("pre-train model have {} key words".format( len(model_glove_twitter.index_to_key) ))

pre-train model have 1193514 key words


In [23]:
model_glove_twitter.index_to_key[218]

'twitter'

In [24]:
model_glove_twitter.key_to_index["twitter"]

218

In [25]:
word_to_idx['twitter']

193528

In [26]:
# Sanity check for a single entry: GloVe key at position i -> index in our
# vocabulary -> word stored at that index -> pretrained vector.
# The word_to_idx lookup raises KeyError if that GloVe word is not one of
# the keys of word_to_idx.
i = 281
glove_word = model_glove_twitter.index_to_key[i]
print(f"glove_word = {glove_word}")
index = word_to_idx[glove_word]
print(f"index = {index}")
word = idx_to_word[index]
print(f"word = {word}")
vector = model_glove_twitter.get_vector(word)
print(f"output vector = {vector.shape}")
print(vector)

# Shape of the embedding matrix built in the previous cells.
print(f"pretrained-embedding(weight.shape) = {weight.shape}")

glove_word = school
index = 180839
word = school
output vector = (100,)
[ 0.11078   -0.24165    0.58407    0.33004   -0.28523    0.39694
  0.48903    1.3905    -0.75806   -0.0066963 -0.30907    0.18059
 -4.5539     0.46777   -0.35242   -0.28211   -0.81762   -0.16247
 -1.4519    -0.54034   -0.20429    0.048978  -0.67599    0.56651
  0.66277   -0.094981  -0.39577   -0.88254    0.007897  -1.0843
 -0.35871   -0.21915   -0.26527   -0.032049   0.02007    0.44382
  0.39047    1.2238     0.62872   -0.1833    -1.1305     0.29216
 -0.15579   -0.357     -0.071372   0.15666    0.11832    0.44056
 -0.9165    -0.17102    0.33795    0.14644   -0.4715     0.22617
 -0.53264   -0.43504   -0.73209   -0.23924   -0.78109    0.37778
 -0.5164     0.030757   0.49751   -0.79576   -0.63072   -0.40766
  0.032427  -0.31238    0.54056   -0.99269   -0.17894    0.096784
  0.34071    0.27062   -0.23308   -0.050616   0.59956   -0.29787
  0.4078    -0.50386    1.7127    -0.72348    0.30162   -0.38823
  0.53881   -0.203

In [27]:
class textCNN(nn.Module):
    """Small TextCNN classifier on top of frozen pretrained word embeddings.

    Three parallel single-channel convolutions (window heights 3, 4 and 5,
    each spanning the full embedding width) are passed through ReLU and
    dropout, then max-pooled over the whole sequence. The three resulting
    features per sample are mapped to ``labels`` scores by a linear layer.

    Args:
        vocab_size: Accepted for interface compatibility; the embedding table
            is sized by ``weight``.
        embed_size: Width of an embedding vector (used as the kernel width).
        seq_len: Number of tokens per input sample; the pooling windows are
            sized from it.
        labels: Number of output classes.
        weight: Pretrained embedding matrix handed to
            ``nn.Embedding.from_pretrained``.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, vocab_size, embed_size, seq_len, labels, weight, **kwargs):
        super().__init__(**kwargs)
        self.labels = labels
        # freeze=True keeps the pretrained vectors out of gradient updates.
        self.embedding = nn.Embedding.from_pretrained(weight, freeze=True)

        # One convolution per n-gram window height.
        self.conv1 = nn.Conv2d(1, 1, kernel_size=(3, embed_size))
        self.conv2 = nn.Conv2d(1, 1, kernel_size=(4, embed_size))
        self.conv3 = nn.Conv2d(1, 1, kernel_size=(5, embed_size))

        # A convolution of height k leaves seq_len - k + 1 positions; pooling
        # over all of them keeps a single maximum per branch.
        self.pool1 = nn.MaxPool2d((seq_len - 2, 1))
        self.pool2 = nn.MaxPool2d((seq_len - 3, 1))
        self.pool3 = nn.MaxPool2d((seq_len - 4, 1))

        self.linear = nn.Linear(3, labels)
        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        """Return class scores of shape (batch, labels) for a batch of id rows."""
        batch, length = inputs.shape[0], inputs.shape[1]
        # Add a singleton channel dimension for the 2-D convolutions.
        embedded = self.embedding(inputs).view(batch, 1, length, -1)

        branches = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
        )
        pooled = []
        for conv, pool in branches:
            activated = self.dropout(F.relu(conv(embedded)))
            pooled.append(pool(activated))

        features = torch.cat(pooled, -1).view(batch, 1, -1)
        scores = self.linear(features)
        return scores.view(-1, self.labels)
    
# Hyper-parameters and training objects.
# NOTE(review): num_hiddens, num_layers, bidirectional, device and use_gpu
# are not referenced anywhere else in the visible code.
num_epochs = 5
num_hiddens = 100
num_layers = 2
bidirectional = True
batch_size = 64
labels = 2
lr = 0.8
device = torch.device('cuda:0')
use_gpu = False


# vocab_size + 1 accounts for the extra '<unk>' row at index 0; seq_len=500
# matches the default maxlen of pad_samples.
net = textCNN(vocab_size=(vocab_size+1), embed_size=embed_size,
              seq_len=500, labels=labels, weight=weight)

# Cross-entropy over the two class scores, optimized with plain SGD.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr)

In [28]:
net

textCNN(
  (embedding): Embedding(252193, 100)
  (conv1): Conv2d(1, 1, kernel_size=(3, 100), stride=(1, 1))
  (conv2): Conv2d(1, 1, kernel_size=(4, 100), stride=(1, 1))
  (conv3): Conv2d(1, 1, kernel_size=(5, 100), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=(498, 1), stride=(498, 1), padding=0, dilation=1, ceil_mode=False)
  (pool2): MaxPool2d(kernel_size=(497, 1), stride=(497, 1), padding=0, dilation=1, ceil_mode=False)
  (pool3): MaxPool2d(kernel_size=(496, 1), stride=(496, 1), padding=0, dilation=1, ceil_mode=False)
  (linear): Linear(in_features=3, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [29]:
# Pair each feature row with its label so a DataLoader can batch them together.
train_set = torch.utils.data.TensorDataset(train_features, train_labels)
test_set = torch.utils.data.TensorDataset(test_features, test_labels)

In [30]:
# Batch iterators: the training loader is shuffled, the test loader keeps
# the dataset order.
train_iter = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                         shuffle=True)
test_iter = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                        shuffle=False)

In [31]:
# Per-epoch history of the averaged metrics (plain Python floats).
train_loss_lst = []
train_acc_lst = []
test_loss_lst = []
test_acc_lst = []

for epoch in range(num_epochs):
    start = time.time()
    train_loss, test_losses = 0.0, 0.0
    train_acc, test_acc = 0.0, 0.0
    n, m = 0, 0  # number of training / test batches seen in this epoch

    # ---- training pass ----
    net.train()  # enable dropout; set once per pass instead of once per batch
    for feature, label in train_iter:
        n += 1
        optimizer.zero_grad()
        score = net(feature)
        loss = loss_function(score, label)
        loss.backward()
        optimizer.step()
        prediction = score.detach().argmax(dim=1)
        train_acc += accuracy_score(label.cpu(), prediction.cpu())
        # .item() yields a float, so the running total does not hold on to
        # the autograd history of every batch.
        train_loss += loss.item()

    # ---- evaluation pass ----
    net.eval()  # disable dropout
    with torch.no_grad():
        for test_feature, test_label in test_iter:
            m += 1
            test_score = net(test_feature)
            test_loss = loss_function(test_score, test_label)
            test_prediction = test_score.argmax(dim=1)
            test_acc += accuracy_score(test_label.cpu(), test_prediction.cpu())
            test_losses += test_loss.item()

    end = time.time()
    runtime = end - start

    # Metrics are the mean of the per-batch values; max(..., 1) keeps an
    # empty loader from raising a division-by-zero error.
    epoch_train_loss = train_loss / max(n, 1)
    epoch_train_acc = train_acc / max(n, 1)
    epoch_test_loss = test_losses / max(m, 1)
    epoch_test_acc = test_acc / max(m, 1)

    print('epoch: %d, train loss: %.4f, train acc: %.2f, test loss: %.4f, test acc: %.2f, time: %.2f' %
          (epoch, epoch_train_loss, epoch_train_acc, epoch_test_loss, epoch_test_acc, runtime))

    train_loss_lst.append(epoch_train_loss)
    train_acc_lst.append(epoch_train_acc)
    test_loss_lst.append(epoch_test_loss)
    test_acc_lst.append(epoch_test_acc)
    

epoch: 0, train loss: 0.6972, train acc: 0.50, test loss: 0.6410, test acc: 0.87, time: 163.79
epoch: 1, train loss: 0.6938, train acc: 0.51, test loss: 0.7891, test acc: 0.15, time: 162.95
epoch: 2, train loss: 0.6920, train acc: 0.52, test loss: 0.7723, test acc: 0.19, time: 162.81
epoch: 3, train loss: 0.6904, train acc: 0.52, test loss: 0.7834, test acc: 0.29, time: 162.44
epoch: 4, train loss: 0.6866, train acc: 0.54, test loss: 0.6926, test acc: 0.45, time: 163.34


In [47]:
# with torch.no_grad():
#     net.eval()
#     for test_feature, test_label in test_iter:
#         test_feature = test_feature
#         test_label = test_label
#         test_score = net(test_feature)
#         test_loss = loss_function(test_score, test_label)
#         break
