## TextCNN 文本分析

### 1 data reading

In [1]:
# library related packages
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from tqdm import tqdm
import time
import torch.nn.functional as F
from pandas import DataFrame

# Restrict CUDA to device 0, then use the GPU when one is available and the CPU otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = "/Users/hongcan/Documents/HKU STAT/STAT7008"  # change the value to the folder directory in your computer
fname = os.path.join(DATA_ROOT, "aclImdb_v1.tar.gz")
# Extract the archive only when DATA_ROOT does not already contain an "aclImdb" folder.
if not os.path.exists(os.path.join(DATA_ROOT, "aclImdb")):
    print("从压缩包解压...")
    # NOTE(review): extractall() trusts the member paths stored in the archive; use a trusted download only.
    with tarfile.open(fname, 'r') as f: 
        f.extractall(DATA_ROOT)
        
# load the data
def read_imdb_train(folder='train', data_root="/Users/hongcan/Documents/HKU STAT/STAT7008/aclImdb"):
    """Load the labelled reviews stored under ``data_root/folder``.

    Every file in the ``pos`` and ``neg`` sub-folders is decoded as UTF-8,
    stripped of newline characters and lower-cased.

    Returns:
        A shuffled list of ``[review_text, label]`` pairs, where the label
        is 1 for ``pos`` and 0 for ``neg``.
    """
    samples = []
    for sentiment, target in (('pos', 1), ('neg', 0)):
        review_dir = os.path.join(data_root, folder, sentiment)
        for name in tqdm(os.listdir(review_dir)):
            with open(os.path.join(review_dir, name), 'rb') as handle:
                raw = handle.read()
            text = raw.decode('utf-8').replace('\n', '').lower()
            samples.append([text, target])
    random.shuffle(samples)
    return samples

def read_imdb_test(folder='test', data_root="/Users/hongcan/Documents/HKU STAT/STAT7008/aclImdb"):
    """Read the reviews of one dataset split (the test split by default).

    Files are taken from ``data_root/folder/pos`` and ``data_root/folder/neg``;
    each one becomes a ``[text, label]`` entry with label 1 for ``pos`` and
    0 for ``neg``. The text is UTF-8 decoded, has its newlines removed and is
    lower-cased. The resulting list is shuffled before it is returned.
    """
    entries = []
    for label in ['pos', 'neg']:
        is_positive = 1 if label == 'pos' else 0
        source_dir = os.path.join(data_root, folder, label)
        file_names = os.listdir(source_dir)
        for file_name in tqdm(file_names):
            path = os.path.join(source_dir, file_name)
            with open(path, 'rb') as stream:
                content = stream.read().decode('utf-8')
            entries.append([content.replace('\n', '').lower(), is_positive])
    random.shuffle(entries)
    return entries
# Read both splits; each result is a shuffled list of [review, label] pairs.
train_data, test_data = read_imdb_train('train'), read_imdb_test('test')

100%|███████████████████████████████████| 12500/12500 [00:01<00:00, 6426.72it/s]
100%|███████████████████████████████████| 12500/12500 [00:01<00:00, 6490.82it/s]
100%|███████████████████████████████████| 12500/12500 [00:01<00:00, 6662.28it/s]
100%|███████████████████████████████████| 12500/12500 [00:01<00:00, 7567.94it/s]


In [2]:
train_data[0]

["first, let's all agree that lorenzo lamas could never be considered a skilled actor, barely even decent, sometimes just plain lousy. however, in this piece of @*!^ called snakeeater, the film industry as a whole sank.<br /><br />first, let's start with the plot. a vietnam vet named jack kelly, aka soldier (who is supposed to be as tough as a strap of leather and then some, which you can believe when he shoves a palate of nails through 2 guys' feet and pins them to the floor), gets word that his family has been killed and his sister kidnapped. therefore he goes on a solo mission to save his sister. had some potential, but still pretty thin to begin with.<br /><br />now, the acting. being an actor myself, i am qualified to say that this was some of the worst acting in the history of the art!!!!! lamas is, well, himself. the jackasses playing the clampets/deliverance rejects should be strung up and shot for their so-called performances which are insulting to actors everywhere, especiall

In [3]:
test_data[0]

["i'm glad i never watched this show when it came out.<br /><br />i just wondered why it lasted 4 years. it reminds me of the terrible 80's with fake people, fake clothes, and fake music. how did i ever survive growing up in this era? <br /><br />the acting in the majority of episodes i have watched are forced. this makes for very boring shows. the plot lines are not very interesting as the old twilight zone shows. the old show inspired the imagination and made one look forward to the next show. <br /><br />stick with the old twilight zone shows and spare yourself the pain of watching garbage.",
 0]

### 2 data cleaning - tokenization, extract flag, standardization

In [4]:
def get_tokenized_imdb(data):
    """Split every review in ``data`` into lower-cased tokens.

    data: list of [string, label] pairs; the labels are ignored.
    Returns one token list per review, obtained by splitting the text on
    single spaces.
    """
    def tokenizer(text):
        return list(map(str.lower, text.split(' ')))

    tokenized = []
    for review, _label in data:
        tokenized.append(tokenizer(review))
    return tokenized


def get_vocab_imdb(data):
    """Build the vocabulary of ``data`` from the token counts of all its reviews."""
    token_counts = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    # Positional arguments: max_size=None, min_freq=5
    return Vocab.Vocab(token_counts, None, 5)
vocab = get_vocab_imdb(train_data)        # build the vocabulary from the training reviews; words that occur fewer than 5 times are left out

def preprocess_imdb(data, vocab, max_l=500):
    """Convert reviews into fixed-length index tensors plus a label tensor.

    Args:
        data: list of [review, label] pairs.
        vocab: vocabulary object whose ``stoi`` maps a token to its index.
        max_l: length every review is truncated or padded to (500 by default).

    Returns:
        ``(features, labels)``: ``features`` holds one row of ``max_l`` token
        indices per review, ``labels`` holds the sentiment label of each review.
    """
    def pad(x):
        # Cut long reviews; right-pad short ones with index 0.
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    # Return `labels`; the previous `label` was an undefined name and raised NameError.
    return features, labels

### 3 construct initial TextCNN model - encapsulation (parameters: batch_size, kernel_size, num_channels)

In [5]:
class GlobalMaxPool1d(nn.Module):
    """Max-pooling layer whose window spans the whole last axis of its input."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # x: (batch_size, channel, seq_len)  ->  result: (batch_size, channel, 1)
        seq_len = x.size(2)
        return F.max_pool1d(x, kernel_size=seq_len)

In [6]:
# data encapsulation (parameter 1: batch size)
batch_size = 64 
# Wrap the index tensors and labels of each split in a TensorDataset.
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))
# Only the training loader shuffles its samples.
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [7]:
# Print the tensor shapes of the first training batch, then stop.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break

X torch.Size([64, 500]) y torch.Size([64])


In [8]:
class TextCNN(nn.Module):
    """Convolutional text classifier with two output classes.

    Tokens are looked up in two embedding tables whose vectors are
    concatenated, passed through several parallel 1-D convolutions,
    max-pooled over the sequence and fed to a linear layer after dropout.

    Args:
        vocab: vocabulary; only its length is used (rows per embedding table).
        embed_size: width of each embedding table.
        kernel_sizes: kernel size of every convolution.
        num_channels: output channels of every convolution (paired with
            ``kernel_sizes`` by position).
    """

    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super().__init__()
        vocab_size = len(vocab)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.constant_embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        self.pool = GlobalMaxPool1d()
        # Both embeddings are concatenated, hence 2 * embed_size input channels.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=2 * embed_size, out_channels=channels, kernel_size=width)
            for channels, width in zip(num_channels, kernel_sizes)
        ])

    def forward(self, inputs):
        first = self.embedding(inputs)
        second = self.constant_embedding(inputs)
        stacked = torch.cat((first, second), dim=2)  # (batch, seq_len, 2*embed_size)
        # Conv1d wants the channel axis before the sequence axis.
        stacked = stacked.permute(0, 2, 1)
        pooled = []
        for conv in self.convs:
            feature_map = F.relu(conv(stacked))
            pooled.append(self.pool(feature_map).squeeze(-1))
        encoding = torch.cat(pooled, dim=1)
        return self.decoder(self.dropout(encoding))

In [9]:
# ————————————————————————————————parameter tuning————————————————————————————————————
# Embedding width 100; kernel sizes 3, 4 and 5, each with 100 output channels.
embed_size,  kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

### 4 Embedding using glove_100.txt

In [10]:
def load_pretrained_embedding(words, pretrained_vocab):
    """Build an embedding matrix for ``words`` from pretrained vectors.

    Args:
        words: sequence of tokens; row ``i`` of the result belongs to ``words[i]``.
        pretrained_vocab: object exposing ``stoi`` (token -> row index) and
            ``vectors`` (the pretrained vectors, indexable by that row index).

    Returns:
        Tensor with ``len(words)`` rows and the width of the pretrained
        vectors. Rows of tokens missing from ``pretrained_vocab.stoi`` stay
        zero; the number of such tokens is printed when it is non-zero.
    """
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])  # initialise to zero
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            # Count the miss (the original added 0, so nothing was ever reported).
            oov_count += 1
        else:
            embed[i, :] = pretrained_vocab.vectors[idx]
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

In [11]:
# Download the file glove100.txt
# Load the 100-dimensional "6B" GloVe vectors, cached under DATA_ROOT/glove.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
# Copy the pretrained vectors into both embedding tables of the network.
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
# Turn off gradients for the second table so it keeps the pretrained values.
net.constant_embedding.weight.requires_grad = False

### 5 training (optimization) and evaluation

In [12]:
# define evaluation parameter
def evaluate_accuracy(data_iter, net, device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    Args:
        data_iter: iterable of ``(X, y)`` batches.
        net: either an ``nn.Module`` or a plain function. A function that has
            an ``is_training`` parameter is called with ``is_training=False``.
        device: device the batches are moved to before the forward pass
            (only used when ``net`` is an ``nn.Module``).

    Returns:
        Number of correct predictions divided by the number of samples.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    acc_sum, n = 0.0, 0
    is_module = isinstance(net, torch.nn.Module)
    # Remember the current mode so evaluation does not force train mode afterwards.
    was_training = net.training if is_module else False
    if is_module:
        net.eval()  # switch once, not once per batch
    try:
        with torch.no_grad():
            for X, y in data_iter:
                if is_module:
                    acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                elif 'is_training' in net.__code__.co_varnames:
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]
    finally:
        if is_module:
            net.train(was_training)
    return acc_sum / n

In [13]:
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    """Train ``net`` and report loss and accuracy after every epoch.

    Args:
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable of ``(X, y)`` batches used for the test accuracy.
        net: the ``nn.Module`` to train.
        loss: callable ``loss(output, target)`` returning a scalar tensor.
        optimizer: optimizer already bound to the trainable parameters of ``net``.
        device: device on which the network and every batch are placed.
        num_epochs: number of passes over ``train_iter``.

    Returns:
        List with one summary string per epoch (the same text that is printed).
    """
    # The device argument used to be ignored. Module.to() modifies the module
    # in place, so an optimizer created beforehand keeps pointing at the same
    # parameters.
    net = net.to(device)
    result = []
    for epoch in range(num_epochs):
        net.train()  # make sure dropout is active while training
        train_acc_sum, sum_l, start, n, batch_count = 0.0, 0.0, time.time(), 0, 0
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            out = net(X)
            l = loss(out, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            sum_l += l.cpu().item()
            n += y.shape[0]
            batch_count += 1
            train_acc_sum += (out.argmax(dim=1) == y).float().sum().cpu().item()
        test_acc = evaluate_accuracy(test_iter, net, device)
        # Format once so the printed and the stored line are identical.
        summary = ('epoch %d, loss %.4f, train_acc %.3f, test acc %.3f, time %.1f sec'
                   % (epoch + 1, sum_l / batch_count, train_acc_sum / n, test_acc, time.time() - start))
        print(summary)
        result.append(summary)
    return result

In [None]:
lr, num, num_epochs = 0.001, 5, 15      # num_epochs is the number of training epochs; `num` is not used by the code in this cell
# Hand Adam only the parameters that still require gradients.
optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

### 6 prediction application

In [14]:
def predict_sentiment(net, vocab, sentence):
    """Classify a tokenized sentence as ``'positive'`` or ``'negative'``.

    Args:
        net: classifier (``nn.Module``) that returns one score per class for a
            batch of token-index sequences.
        vocab: vocabulary whose ``stoi`` maps a token to its index.
        sentence: list of tokens.

    Returns:
        ``'positive'`` when class 1 has the highest score, else ``'negative'``.
    """
    device = next(net.parameters()).device
    indices = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    was_training = net.training
    # Inference must not apply dropout, otherwise the answer is random.
    net.eval()
    try:
        with torch.no_grad():
            label = torch.argmax(net(indices.view((1, -1))), dim=1)
    finally:
        net.train(was_training)  # leave the module in the mode it came in
    return 'positive' if label.item() == 1 else 'negative'

In [15]:
def tokenizer(text):
    """Split ``text`` on single spaces and lower-case every piece."""
    pieces = text.split(' ')
    return list(map(str.lower, pieces))

In [35]:
# Classify a sample review with the current network.
comment = "How much sympathy can we muster for a wealthy, celebrated filmmaker on holiday? It’s hard not to roll your eyes when Silverio says things like, “Success has been my biggest failure"
predict_sentiment(net, vocab, tokenizer(comment))

'negative'

In [24]:
# Classify a long sample review with the current network.
comment = "Titanic＂ is a poem. Ten years ago, it swept the world, won 11 Oscars large giant system to passion, dreams, sadness, couage and momentum rainbow scene to the differences between Jack and Rose secular love of life and death, as well as disaster flashed out of glorious epic of human nature, and warmth touched countless heart and become a rich humanistic atmosphere overflowing disaster film classic. Wandering artist Jack Rose and beautiful, in the luxurious Titanic encounter love, but the unexpected tragedy, the Titanic collided with the tip of the fracture, jack will be Rose onto a floating plank, himself immersed in Ice in frozen to death. In addition to creative the theme song ＂ my heart forever,＂ Jack and Rose standing on the bow railing the wind to fly, as well as their time and survived,etc ink rendering, the film there is a lens language , depiction of a critical the calm , serene and elegant, compared to the comfusion, panic and ugly. Perhaps, fortume or misfortune, sadness and happiness, are forever time in reincarnation, life should be realized is the this seene gaze of God？"
predict_sentiment(net, vocab, tokenizer(comment))

'positive'

In [41]:
# Short sentence containing a negation ("not my fav").
comment = "This movie is not my fav."
predict_sentiment(net, vocab, tokenizer(comment))

'positive'

In [42]:
# Negation test: a negated positive word ("not ... good").
comment = "We can not say this movie is good."
predict_sentiment(net, vocab, tokenizer(comment))

'positive'

In [43]:
# Negation test: a negated negative word ("not ... boring").
comment = "We can not say this movie is boring."
predict_sentiment(net, vocab, tokenizer(comment))

'negative'

In [44]:
# Negation test: double negation around a positive word ("not ... not good").
comment = "We can not say this movie is not good."
predict_sentiment(net, vocab, tokenizer(comment))

'negative'

In [45]:
# Negation test: double negation around a negative word ("not ... not boring").
comment = "We can not say this movie is not boring."
predict_sentiment(net, vocab, tokenizer(comment))

'negative'

### 7 parameter tuning and results

In [16]:
# 1 results with different numbers of epochs

# 2 different kernel sizes and channel numbers with the same kernel numbers
# 2.1 【3，4，5】【100，100，100】 small kernel sizes
# 2.2 【8，9，10】【100，100，100】 big kernel sizes
# 2.3 【3，5，8】【100，100，100】 medium kernel sizes
# 2.4 【3，5，8】【80，100，120】 different channel number with different kernel sizes

# 3 different kernel numbers
# 3.1 【3，5】【100，100】 two kernels (compared with 2.1)
# 3.2 【2，3，4，5】【100，100，100，100】 four kernels (compared with 2.1)
# 3.3 【3，5，7，9】【100，100，100，100】 four kernels with big spans (compared with 2.1 and 2.2)

# 4 different batch_size
# 4.1 64
# 4.2 128

In [17]:
# parameter - batch_size
# NOTE(review): this seeds Python's `random` module only; torch's own generator
# is not seeded here — confirm whether torch.manual_seed is also wanted.
random.seed(101)

batch_size = 64 

# Rebuild the datasets and loaders with the batch size chosen above.
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [40]:
# 2.1
# Kernel sizes 3, 4, 5 with 100 channels each.
embed_size,  kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]

# Create a new network for this configuration.
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

# Fill both embedding tables with the 100-d GloVe vectors and stop gradients for the constant one.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

lr, num, num_epochs = 0.001, 5, 5 
# Adam only receives the parameters that still require gradients.
optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
result = train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
# One row per epoch summary string.
result = DataFrame(result)
# NOTE(review): the other experiment cells write to this same workbook path; to_excel with a
# plain path presumably recreates the file, so earlier sheets may be lost — confirm, or append
# through pandas.ExcelWriter. The file name says "batchsize128" although batch_size is set to 64
# in the cell that builds the loaders — verify.
result.to_excel("/Users/hongcan/Documents/HKU STAT/STAT7008/batchsize128.xlsx", sheet_name='2.1')

epoch 1, loss 0.4802, train_acc 0.764, test acc 0.844, time 618.5 sec
epoch 2, loss 0.3238, train_acc 0.862, test acc 0.864, time 243.3 sec
epoch 3, loss 0.2111, train_acc 0.916, test acc 0.876, time 245.4 sec
epoch 4, loss 0.1228, train_acc 0.956, test acc 0.875, time 245.8 sec
epoch 5, loss 0.0656, train_acc 0.978, test acc 0.874, time 249.3 sec


In [15]:
# 2.2
# Kernel sizes 8, 9, 10 with 100 channels each.
embed_size,  kernel_sizes, nums_channels = 100, [8, 9, 10], [100, 100, 100]

# Create a new network for this configuration.
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

# Fill both embedding tables with the 100-d GloVe vectors and stop gradients for the constant one.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

lr, num, num_epochs = 0.001, 5, 5 
# Adam only receives the parameters that still require gradients.
optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
result = train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
# One row per epoch summary string; the export below is disabled for this run.
result = DataFrame(result)
#result.to_excel("/Users/hongcan/Documents/HKU STAT/STAT7008/batchsize128.xlsx", sheet_name='2.2')

epoch 1, loss 0.4961, train_acc 0.751, test acc 0.851, time 479.0 sec
epoch 2, loss 0.3124, train_acc 0.869, test acc 0.861, time 506.4 sec
epoch 3, loss 0.1963, train_acc 0.924, test acc 0.883, time 516.6 sec
epoch 4, loss 0.1154, train_acc 0.957, test acc 0.879, time 796.1 sec
epoch 5, loss 0.0657, train_acc 0.976, test acc 0.879, time 648.6 sec


In [None]:
# 2.3
# Kernel sizes 3, 5, 8 with 100 channels each.
embed_size,  kernel_sizes, nums_channels = 100, [3, 5, 8], [100, 100, 100]

# Create a new network for this configuration.
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

# Fill both embedding tables with the 100-d GloVe vectors and stop gradients for the constant one.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

lr, num, num_epochs = 0.001, 5, 15 
# Adam only receives the parameters that still require gradients.
optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
result = train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
# One row per epoch summary string.
result = DataFrame(result)
# NOTE(review): same workbook path as the other experiment cells; to_excel with a plain path
# presumably recreates the file, so earlier sheets may be lost — confirm.
result.to_excel("/Users/hongcan/Documents/HKU STAT/STAT7008/batchsize128.xlsx", sheet_name='2.3')

In [None]:
# 2.4
# Kernel sizes 3, 5, 8 with 80, 100 and 120 channels respectively.
embed_size,  kernel_sizes, nums_channels = 100, [3, 5, 8], [80, 100, 120]

# Create a new network for this configuration.
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

# Fill both embedding tables with the 100-d GloVe vectors and stop gradients for the constant one.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

lr, num, num_epochs = 0.001, 5, 15 
# Adam only receives the parameters that still require gradients.
optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
result = train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
# One row per epoch summary string.
result = DataFrame(result)
# NOTE(review): same workbook path as the other experiment cells; to_excel with a plain path
# presumably recreates the file, so earlier sheets may be lost — confirm.
result.to_excel("/Users/hongcan/Documents/HKU STAT/STAT7008/batchsize128.xlsx", sheet_name='2.4')

In [None]:
# 3.1
# Two convolutions: kernel sizes 3 and 5 with 100 channels each.
embed_size,  kernel_sizes, nums_channels = 100, [3, 5], [100, 100]

# Create a new network for this configuration.
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

# Fill both embedding tables with the 100-d GloVe vectors and stop gradients for the constant one.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

lr, num, num_epochs = 0.001, 5, 15 
# Adam only receives the parameters that still require gradients.
optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
result = train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
# One row per epoch summary string.
result = DataFrame(result)
# NOTE(review): same workbook path as the other experiment cells; to_excel with a plain path
# presumably recreates the file, so earlier sheets may be lost — confirm.
result.to_excel("/Users/hongcan/Documents/HKU STAT/STAT7008/batchsize128.xlsx", sheet_name='3.1')

In [None]:
# 3.2
# Four convolutions: kernel sizes 2, 3, 4, 5 with 100 channels each.
embed_size,  kernel_sizes, nums_channels = 100, [2, 3, 4, 5], [100, 100, 100, 100]

# Create a new network for this configuration.
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

# Fill both embedding tables with the 100-d GloVe vectors and stop gradients for the constant one.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

lr, num, num_epochs = 0.001, 5, 15 
# Adam only receives the parameters that still require gradients.
optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
result = train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
# One row per epoch summary string.
result = DataFrame(result)
# NOTE(review): same workbook path as the other experiment cells; to_excel with a plain path
# presumably recreates the file, so earlier sheets may be lost — confirm.
result.to_excel("/Users/hongcan/Documents/HKU STAT/STAT7008/batchsize128.xlsx", sheet_name='3.2')

In [None]:
# 3.3
# Four convolutions: kernel sizes 3, 5, 7, 9 with 100 channels each.
embed_size,  kernel_sizes, nums_channels = 100, [3, 5, 7, 9], [100, 100, 100, 100]

# Create a new network for this configuration.
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

# Fill both embedding tables with the 100-d GloVe vectors and stop gradients for the constant one.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

lr, num, num_epochs = 0.001, 5, 15 
# Adam only receives the parameters that still require gradients.
optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
result = train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
# One row per epoch summary string.
result = DataFrame(result)
# NOTE(review): same workbook path as the other experiment cells; to_excel with a plain path
# presumably recreates the file, so earlier sheets may be lost — confirm.
result.to_excel("/Users/hongcan/Documents/HKU STAT/STAT7008/batchsize128.xlsx", sheet_name='3.3')