In [1]:
import torch
import spacy
import random
import time
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets

In [2]:
nlp = spacy.load('en_core_web_sm')

# Word Averageing Model

## 准备数据

- TorchText中的一个重要概念是Field。Field决定了你的数据会被如何处理；
- 我们使用TEXT field来定义如何处理电影评论，使用LABEL field来处理两个情感类别；
- LABEL由LabelField定义。这是一种特别的用来处理label的Field。

In [3]:
SEED = 1224
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 64

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [4]:
train_data, test_data = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [5]:
# 查看每个数据splits有多少条

In [6]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [7]:
#查看一个example

In [8]:
print(vars(train_data.examples[0]))

{'text': ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'Teachers', '"', '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High', '.', 'A', 'classic', 'l

In [9]:
print(vars(train_data.examples[0]).keys())

dict_keys(['text', 'label'])


In [10]:
# 使用.split()创建一个新的validation set

In [11]:
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [12]:
# 检查每个部分有多少数据

In [13]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


## 创建vocabulary

- vocabulary就是把每个单词映射到一个数字。
- 我们使用最常见的25k个单词来构建我们的单词表，用max_size可以做到这一点。

In [14]:
%pwd

'E:\\PycharmProjects\\PyTorchLearning\\PyTorch-Into-Action\\03自然语言分类任务'

In [15]:
TEXT.build_vocab(train_data, max_size=25000, vectors='glove.6B.100d', unk_init=torch.Tensor.normal_)

In [18]:
LABEL.build_vocab(train_data)

In [19]:
print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


- 把句子传入模型时，我们是按照一个一个batch传入的。也就是说，我们一次传入了多个句子，而且每个batch中的句子必须是相同的长度。为了确保句子的长度相同，torchtext会把短的句子pad到和最长的句子等长。

In [20]:
# 查看训练数据中最长的句子

In [21]:
print(TEXT.vocab.freqs.most_common(n=10))

[('the', 203961), (',', 193946), ('.', 165993), ('and', 109823), ('a', 109782), ('of', 101067), ('to', 94340), ('is', 76438), ('in', 61518), ('I', 54038)]


- 用 `stoi`(**s**tring **to** **i**nt) 或者 `itos` (**i**nt **to**  **s**tring) 来查看单词表。

In [22]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [23]:
# 查看labels

In [24]:
print(LABEL.vocab.stoi)

defaultdict(None, {'pos': 0, 'neg': 1})


## 创建iterator

- 每个itartion都会返回一个batch的examples。
- 我们会使用`BucketIterator`。`BucketIterator`会把长度差不多的句子放到同一个batch中，确保每个batch中不出现太多的padding。
- 严格来说，这份notebook中的模型代码都有一个问题，也就是我们把`<pad>`也当做了模型的输入进行训练。更好的做法是在模型中把由`<pad>`产生的输出给消除掉。在这节课中我们简单处理，直接把`<pad>`也用作模型输入了。由于`<pad>`数量不多，模型的效果也不差。

In [25]:
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    datasets=(train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=DEVICE)

In [26]:
batch = next(iter(valid_iter))

In [27]:
batch.text

tensor([[ 473, 2595,   14,  ..., 1118, 3107,   11],
        [  11,   22,    0,  ...,   22,    3,   62],
        [ 531,   13,   14,  ...,   17,  403,   27],
        ...,
        [1655,    6,  145,  ...,    1,    1,    1],
        [  85,  273,  145,  ...,    1,    1,    1],
        [   4,    4,  145,  ...,    1,    1,    1]], device='cuda:0')

In [28]:
batch.text[:, 0]

tensor([  473,    11,   531,    12,  1639,    33,     6,   378,   100,    67,
           32,    78,     6,   189,   715,  6618,  1748,     3,    26,    11,
         1764,   119,    12,     9,    88,    53,    15,     6,    63,   337,
         1190,    22,     3,  1322, 17296,    86,     6,  5142,   313,    21,
           35,   121,     5,    12,    74,  1655,    85,     4],
       device='cuda:0')

In [29]:
[TEXT.vocab.itos[i] for i in batch.text[:, 0]]

['Well',
 'I',
 'guess',
 'it',
 'supposedly',
 'not',
 'a',
 'classic',
 'because',
 'there',
 'are',
 'only',
 'a',
 'few',
 'easily',
 'recognizable',
 'faces',
 ',',
 'but',
 'I',
 'personally',
 'think',
 'it',
 'is',
 '...',
 'It',
 "'s",
 'a',
 'very',
 'beautiful',
 'sweet',
 'movie',
 ',',
 'Henry',
 'Winkler',
 'did',
 'a',
 'GREAT',
 'job',
 'with',
 'his',
 'character',
 'and',
 'it',
 'really',
 'impressed',
 'me',
 '.']

## Word Averaging Model

In [32]:
class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=pad_idx)  # 从__init__()传入
        self.linear = nn.Linear(in_features=embedding_dim, out_features=output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)  # text: [seq_len, batch_size, embedding_dim]
        # embedded = embedded.transpose(1, 0)  # [batch_size, seq_len, embedding_dim]
        embedded = embedded.permute(1, 0, 2)  # [batch_size, seq_len, embedding_dim], 同transpose
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1))  # [batch_size, 1, embedding_dim]
        pooled = pooled.squeeze()  # [batch_size, embedding_dim]
        return self.linear(pooled)

In [33]:
# 定义模型参数

In [49]:
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_SIZE = 100
OUTPUT_SIZE = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

In [50]:
model = WordAVGModel(vocab_size=VOCAB_SIZE,
                     embedding_dim=EMBEDDING_SIZE,
                     output_dim=OUTPUT_SIZE,
                     pad_idx=PAD_IDX)

In [51]:
model

WordAVGModel(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)

In [52]:
# 计算模型参数

In [53]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [54]:
count_parameters(model=model)

2500301

## 初始化模型

In [55]:
# GloVe初始化模型

In [56]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)  # GloVe词向量

tensor([[ 0.8659, -0.0285, -0.5158,  ...,  0.4541, -0.2438, -1.0133],
        [-0.3128,  0.0196,  2.1079,  ...,  0.7128, -0.3003, -1.2175],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.5654,  0.7512, -1.3183,  ..., -0.6322, -0.7870,  0.5531],
        [ 1.1017,  0.3210, -0.5054,  ..., -0.7363, -0.6928, -0.2557],
        [-0.0632,  0.3695,  0.3843,  ..., -1.5785, -0.0872, -0.1061]])

In [57]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

In [58]:
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)

## 训练模型

In [None]:
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()

model = model.to(DEVICE)
crit = crit.to(DEVICE)

In [None]:
# 计算模型准确率

In [None]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [None]:
# 计时

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
def train(model, iterator, optimizer, crit):
    epoch_loss, epoch_acc = 0., 0.
    model.train()
    total_len = 0.
    for batch in iterator:
        preds = model(batch.text).squeeze()
        loss = crit(preds, batch.label)
        acc = binary_accuracy(preds=preds, y=batch.label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len += len(batch.label)
    
    return epoch_loss / total_len, epoch_acc / total_len

In [None]:
def evaluate(model, iterator, crit):
    epoch_loss, epoch_acc = 0., 0.
    model.eval()
    total_len = 0.
    for batch in iterator:
        preds = model(batch.text).squeeze()
        loss = crit(preds, batch.label)
        acc = binary_accuracy(preds=preds, y=batch.label)
        
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len += len(batch.label)
    model.train()
    
    return epoch_loss / total_len, epoch_acc / total_len

In [257]:
N_EPOCHS = 30
best_valid_acc = 0.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iter, optimizer, crit)
    valid_loss, valid_acc = evaluate(model, valid_iter, crit)
    
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), 'word_avg_model.pth')
        
    print(f'Epoch: {epoch}, Train Loss: {train_loss}, Train Acc: {train_acc}')
    print(f'Epoch: {epoch}, Valid Loss:, {valid_loss}, Valid Acc: {valid_acc}')
    print('======' * 13)

Epoch: 0, Train Loss: 0.6858755663735526, Train Acc: 0.6244000000681196
Epoch: 0, Valid Loss:, 0.6264575871785482, Valid Acc: 0.7109333333969117
Epoch: 1, Train Loss: 0.6458370156424386, Train Acc: 0.7385714286395482
Epoch: 1, Valid Loss:, 0.5106585666974386, Valid Acc: 0.7604000000635783
Epoch: 2, Train Loss: 0.5761070107596261, Train Acc: 0.788971428666796
Epoch: 2, Valid Loss:, 0.4487710873444875, Valid Acc: 0.7985333333651224
Epoch: 3, Train Loss: 0.504969163261141, Train Acc: 0.8265142857415335
Epoch: 3, Valid Loss:, 0.4276981432994207, Valid Acc: 0.8270666666984559
Epoch: 4, Train Loss: 0.4404541014398847, Train Acc: 0.8566285714694432
Epoch: 4, Valid Loss:, 0.4271032152891159, Valid Acc: 0.8442666666984558
Epoch: 5, Train Loss: 0.3914074293068477, Train Acc: 0.8769714286259243
Epoch: 5, Valid Loss:, 0.43147724928855896, Valid Acc: 0.8557333333651225
Epoch: 6, Train Loss: 0.3509202476705824, Train Acc: 0.8907428571837289
Epoch: 6, Valid Loss:, 0.4486611499150594, Valid Acc: 0.864

## 预测

In [258]:
# 模型加载

In [259]:
model.load_state_dict(torch.load('./word_avg_model.pth'))

In [260]:
def predict_sentiment(sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(DEVICE)  # seq_len
    tensor = tensor.unsqueeze(1)  # seq_len * batch_size=1
    pred = torch.sigmoid(model(tensor))
    return pred.item()

In [261]:
predict_sentiment('I like this food so much')

0.9998784065246582

In [262]:
predict_sentiment('I hate this food')

9.477886123931967e-06

# RNN Model

## 模型定义

In [263]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.linear = nn.Linear(in_features=hidden_dim * 2, out_features=output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        embedded = self.dropout(self.embedding(text))  # [sent_len, batch_size, emd_dim]
        output, (hidden, cell) = self.rnn(embedded)
        # output = [sent_len, batch_size, hid_dim * num_directions]
        # hidden = [num_layers * num_directions, batch_size, hid_dim]
        # cell = [num_layers * num_directions, batch_size, hid_dim]
        
        # concat the final forward (hidden[-2, :, :]) and backward (hidden[-1, :, :]) hidden layers and apply dropout
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))  # [batch_size, hid_dim * num_directions]
        return self.linear(hidden.squeeze(0))        

In [264]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

In [273]:
model = RNNModel(vocab_size=INPUT_DIM,
                 embedding_dim=EMBEDDING_DIM,
                 hidden_dim=HIDDEN_DIM,
                 output_dim=OUTPUT_DIM,
                 n_layers=N_LAYERS,
                 bidirectional=BIDIRECTIONAL,
                 dropout=DROPOUT,
                 pad_idx=PAD_IDX)

In [274]:
model

RNNModel(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5)
)

In [275]:
print(f'The model has {count_parameters(model)} trainable parameters')

The model has 4810857 trainable parameters


## 初始化模型

In [276]:
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [277]:
print((model.embedding.weight.data).shape)

torch.Size([25002, 100])


## 训练模型

In [278]:
optimizer = torch.optim.Adam(model.parameters())
model = model.to(DEVICE)

In [279]:
model.embedding.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.5654,  0.7512, -1.3183,  ..., -0.6322, -0.7870,  0.5531],
        [ 1.1017,  0.3210, -0.5054,  ..., -0.7363, -0.6928, -0.2557],
        [-0.0632,  0.3695,  0.3843,  ..., -1.5785, -0.0872, -0.1061]],
       device='cuda:0')

In [283]:
N_EPOCHS = 10
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model,
                                  iterator=train_iter,
                                  optimizer=optimizer,
                                  crit=crit)
    valid_loss, valid_acc = evaluate(model, iterator=valid_iter, crit=crit)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_lstm_model.pth')

    print(f'Epoch: {epoch+1:02} & Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc * 100:.2f}%')

Epoch: 01 & Epoch Time: 1m 29s
	 Train Loss: 0.694 | Train Acc: 50.80%
	 Valid Loss: 0.693 | Valid Acc: 51.37%
Epoch: 02 & Epoch Time: 1m 29s
	 Train Loss: 0.693 | Train Acc: 51.09%
	 Valid Loss: 0.693 | Valid Acc: 50.63%
Epoch: 03 & Epoch Time: 1m 29s
	 Train Loss: 0.689 | Train Acc: 53.52%
	 Valid Loss: 0.672 | Valid Acc: 59.53%
Epoch: 04 & Epoch Time: 1m 29s
	 Train Loss: 0.667 | Train Acc: 59.52%
	 Valid Loss: 0.668 | Valid Acc: 61.39%
Epoch: 05 & Epoch Time: 1m 30s
	 Train Loss: 0.542 | Train Acc: 73.83%
	 Valid Loss: 0.356 | Valid Acc: 85.53%
Epoch: 06 & Epoch Time: 1m 30s
	 Train Loss: 0.327 | Train Acc: 86.86%
	 Valid Loss: 0.324 | Valid Acc: 87.83%
Epoch: 07 & Epoch Time: 1m 29s
	 Train Loss: 0.258 | Train Acc: 90.18%
	 Valid Loss: 0.285 | Valid Acc: 88.75%
Epoch: 08 & Epoch Time: 1m 29s
	 Train Loss: 0.209 | Train Acc: 92.07%
	 Valid Loss: 0.301 | Valid Acc: 88.72%
Epoch: 09 & Epoch Time: 1m 30s
	 Train Loss: 0.179 | Train Acc: 93.58%
	 Valid Loss: 0.301 | Valid Acc: 89.43%
E

- You may have noticed the loss is not really decreasing and the accuracy is poor. This is due to several issues with the model which we'll improve in the next notebook.

- Finally, the metric we actually care about, the test loss and accuracy, which we get from our parameters that gave us the best validation loss.

## 测试

In [284]:
model.load_state_dict(torch.load('./best_lstm_model.pth'))
test_loss, test_acc = evaluate(model, test_iter, crit)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

Test Loss: 0.285 | Test Acc: 88.81%


# CNN Model

In [44]:
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, pad_idx, num_filters, filter_size, dropout):
        super(CNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.conv = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embedding_size))
        self.linear = nn.Linear(embedding_size, output_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        text = text.permute(1, 0)  # [batch_size, seq_len]
        embedded = self.embed(text)  # [batch_size, seq_len, embedding_size]
        embedded = embedded.unsqueeze(1)  # [batch_size, 1, seq_len, embedding_size]
        conved = F.relu(self.conv(embedded))  # [batch_size, num_filters, seq_len-filter_size+1, 1]
        conved = conved.squeeze(3)  # [batch_size, num_filters, seq_len-filter_size+1]
        pooled = F.max_pool1d(conved, conved.shape[2])  # [batch_size, num_filters, 1]
        pooled = pooled.squeeze(2)  # [batch_size, num_filters]
        pooled = self.dropout(pooled)
        return self.linear(pooled)

In [46]:
model = CNNModel(vocab_size=VOCAB_SIZE,
                embedding_size=EMBEDDING_SIZE,
                output_size=OUTPUT_SIZE,
                pad_idx=PAD_IDX,
                num_filters=100,
                filter_size=3,
                dropout=0.5)

In [47]:
model

CNNModel(
  (embed): Embedding(25002, 100, padding_idx=1)
  (conv): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
  (linear): Linear(in_features=100, out_features=1, bias=True)
  (dropout): Dropout(p=0.5)
)

In [59]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)  # GloVe词向量

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)

In [60]:
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()

model = model.to(DEVICE)
crit = crit.to(DEVICE)

In [62]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [63]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [64]:
def train(model, iterator, optimizer, crit):
    epoch_loss, epoch_acc = 0., 0.
    model.train()
    total_len = 0.
    for batch in iterator:
        preds = model(batch.text).squeeze()
        loss = crit(preds, batch.label)
        acc = binary_accuracy(preds=preds, y=batch.label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len += len(batch.label)
    
    return epoch_loss / total_len, epoch_acc / total_len

In [65]:
def evaluate(model, iterator, crit):
    epoch_loss, epoch_acc = 0., 0.
    model.eval()
    total_len = 0.
    for batch in iterator:
        preds = model(batch.text).squeeze()
        loss = crit(preds, batch.label)
        acc = binary_accuracy(preds=preds, y=batch.label)
        
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len += len(batch.label)
    model.train()
    
    return epoch_loss / total_len, epoch_acc / total_len

In [66]:
N_EPOCHS = 20
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, iterator=train_iter, optimizer=optimizer, crit=crit)
    valid_loss, valid_acc = evaluate(model, iterator=valid_iter, crit=crit)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_cnn_model.pth')
 
    print(f'Epoch: {epoch+1:02} & Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc * 100:.2f}%')

Epoch: 01 & Epoch Time: 0m 4s
	 Train Loss: 0.685 | Train Acc: 61.43%
	 Valid Loss: 0.624 | Valid Acc: 70.04%
Epoch: 02 & Epoch Time: 0m 3s
	 Train Loss: 0.644 | Train Acc: 74.64%
	 Valid Loss: 0.500 | Valid Acc: 76.40%
Epoch: 03 & Epoch Time: 0m 3s
	 Train Loss: 0.572 | Train Acc: 79.32%
	 Valid Loss: 0.453 | Valid Acc: 79.81%
Epoch: 04 & Epoch Time: 0m 3s
	 Train Loss: 0.497 | Train Acc: 82.94%
	 Valid Loss: 0.428 | Valid Acc: 82.96%
Epoch: 05 & Epoch Time: 0m 3s
	 Train Loss: 0.435 | Train Acc: 86.01%
	 Valid Loss: 0.428 | Valid Acc: 84.60%
Epoch: 06 & Epoch Time: 0m 3s
	 Train Loss: 0.385 | Train Acc: 87.80%
	 Valid Loss: 0.445 | Valid Acc: 85.67%
Epoch: 07 & Epoch Time: 0m 3s
	 Train Loss: 0.347 | Train Acc: 89.11%
	 Valid Loss: 0.458 | Valid Acc: 86.48%
Epoch: 08 & Epoch Time: 0m 3s
	 Train Loss: 0.317 | Train Acc: 90.19%
	 Valid Loss: 0.482 | Valid Acc: 87.00%
Epoch: 09 & Epoch Time: 0m 3s
	 Train Loss: 0.290 | Train Acc: 90.89%
	 Valid Loss: 0.502 | Valid Acc: 87.52%
Epoch: 10 

In [67]:
model.load_state_dict(torch.load('./best_cnn_model.pth'))
test_loss, test_acc = evaluate(model, test_iter, crit)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

Test Loss: 0.435 | Test Acc: 84.20%


# CNN Model List

In [77]:
class CNNModelList(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super(CNNModelList, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim)) for fs in filter_sizes
        ])
        self.linear = nn.Linear(in_features=len(filter_sizes) * n_filters, out_features=output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        text = text.permute(1, 0)
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]  # 对维度2进行max_pool
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.linear(cat)

In [78]:
INPUT_DIM = 100
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

In [79]:
model = CNNModelList(vocab_size=VOCAB_SIZE, 
                     embedding_dim=EMBEDDING_DIM, 
                     n_filters=N_FILTERS, 
                     filter_sizes=FILTER_SIZES, 
                     output_dim=OUTPUT_DIM, 
                     dropout=DROPOUT, 
                     pad_idx=PAD_IDX)

In [80]:
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
model = model.to(DEVICE)

In [81]:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(DEVICE)

In [82]:
N_EPOCHS = 20
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, iterator=train_iter, optimizer=optimizer, crit=crit)
    valid_loss, valid_acc = evaluate(model, iterator=valid_iter, crit=crit)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_cnn_model_list.pth')
 
    print(f'Epoch: {epoch+1:02} & Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc * 100:.2f}%')

Epoch: 01 & Epoch Time: 0m 13s
	 Train Loss: 0.653 | Train Acc: 61.37%
	 Valid Loss: 0.510 | Valid Acc: 77.84%
Epoch: 02 & Epoch Time: 0m 13s
	 Train Loss: 0.419 | Train Acc: 81.10%
	 Valid Loss: 0.352 | Valid Acc: 85.11%
Epoch: 03 & Epoch Time: 0m 13s
	 Train Loss: 0.293 | Train Acc: 87.73%
	 Valid Loss: 0.318 | Valid Acc: 86.56%
Epoch: 04 & Epoch Time: 0m 13s
	 Train Loss: 0.207 | Train Acc: 91.82%
	 Valid Loss: 0.316 | Valid Acc: 87.12%
Epoch: 05 & Epoch Time: 0m 13s
	 Train Loss: 0.149 | Train Acc: 94.49%
	 Valid Loss: 0.340 | Valid Acc: 86.76%
Epoch: 06 & Epoch Time: 0m 13s
	 Train Loss: 0.099 | Train Acc: 96.74%
	 Valid Loss: 0.359 | Valid Acc: 86.89%
Epoch: 07 & Epoch Time: 0m 13s
	 Train Loss: 0.068 | Train Acc: 97.98%
	 Valid Loss: 0.394 | Valid Acc: 86.88%
Epoch: 08 & Epoch Time: 0m 13s
	 Train Loss: 0.050 | Train Acc: 98.43%
	 Valid Loss: 0.408 | Valid Acc: 87.04%
Epoch: 09 & Epoch Time: 0m 13s
	 Train Loss: 0.040 | Train Acc: 98.80%
	 Valid Loss: 0.443 | Valid Acc: 86.84%
E

In [83]:
model.load_state_dict(torch.load('./best_cnn_model_list.pth'))
test_loss, test_acc = evaluate(model, test_iter, crit)
print(f'Test Loss: {test_loss:.3f} ! Test Acc: {test_acc * 100:.2f}%')

Test Loss: 0.337 ! Test Acc: 85.81%
