In [1]:
from torchtext.data import Iterator, BucketIterator
from torchtext import data
import torch

In [2]:
def load_iters(batch_size=32, device="cpu", data_path='data', vectors=None, use_tree=False):
    if not use_tree:
        #定义Field:声明如何处理数据
        # Field使用include_lengths参数为True,可以在返回minibatch的时候同时返回一个表示每个句子的长度的list
        TEXT = data.Field(batch_first=True, include_lengths=True, lower=True)  
        LABEL = data.LabelField(batch_first=True)
        TREE = None

        fields = {'sentence1': ('premise', TEXT),
                  'sentence2': ('hypothesis', TEXT),
                  'gold_label': ('label', LABEL)}
    else:
        TEXT = data.Field(batch_first=True,
                          lower=True,
                          preprocessing=lambda parse: [t for t in parse if t not in ('(', ')')],
                          include_lengths=True)
        LABEL = data.LabelField(batch_first=True)
        TREE = data.Field(preprocessing=lambda parse: ['reduce' if t == ')' else 'shift' for t in parse if t != '('],
                          batch_first=True)

        TREE.build_vocab([['reduce'], ['shift']])  #构建词表

        fields = {'sentence1_binary_parse': [('premise', TEXT),
                                             ('premise_transitions', TREE)],
                  'sentence2_binary_parse': [('hypothesis', TEXT),
                                             ('hypothesis_transitions', TREE)],
                  'gold_label': ('label', LABEL)}
    train_data, dev_data, test_data = data.TabularDataset.splits(
        path=data_path,
        train='snli_1.0_train.jsonl',
        validation='snli_1.0_dev.jsonl',
        test='snli_1.0_test.jsonl',
        format='json',
        fields=fields,
        filter_pred=lambda ex: ex.label != '-'  # filter the example which label is '-'(means unlabeled)
    )
    if vectors is not None:
        #unk_init表示的是对于未登录词的初始化方法，默认是使用全零进行初始化,这里用均值为0方差为1的正态分布去初始化
        TEXT.build_vocab(train_data, vectors=vectors, unk_init=torch.Tensor.normal_)
    else:
        TEXT.build_vocab(train_data)
    LABEL.build_vocab(dev_data)
    
    #相比于标准迭代器，会将类似长度的样本当做一批来处理，因为在文本处理中经常会需要将每一批样本长度补齐为当前批中
    #最长序列的长度，因此当样本长度差别较大时，使用BucketIerator可以带来填充效率的提高。
    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.premise) + len(x.hypothesis),  #sort_key是一个告诉迭代器如何对批处理中的元素进行排序的函数。
        sort_within_batch=True,   #sort_within_batch=True告诉迭代器需要对批处理的内容进行排序
        repeat=False,    #不重复多个epoches的迭代
        shuffle=True
    )
    test_iter = Iterator(test_data,      #这里为什么不用BucketIterator?
                         batch_size=batch_size,
                         device=device,
                         sort=False,
                         sort_within_batch=False,   ##sort_within_batch=False告诉迭代器不需要对批处理的内容进行排序
                         repeat=False,
                         shuffle=False)

    return train_iter, dev_iter, test_iter, TEXT, LABEL, TREE    

## 创建模型

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

对于输入的词向量，首先使用 BILSTM来学习如何表示一个word以及上下文，

即对原始的word embedding在当前的语境下重新编码，得到两个句子的新的词向量：

In [4]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=128, dropout_rate=0.1, layer_num=1):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        if layer_num == 1:
            self.bilstm = nn.LSTM(input_size, hidden_size // 2, layer_num, batch_first=True, bidirectional=True)

        else:
            self.bilstm = nn.LSTM(input_size, hidden_size // 2, layer_num, batch_first=True, dropout=dropout_rate,
                                  bidirectional=True)
        self.init_weights()

    def init_weights(self):
        for p in self.bilstm.parameters():
            if p.dim() > 1:
                nn.init.normal_(p)   #使p服从正态分布
                p.data.mul_(0.01)   #均值为0.01？？
            else:
                p.data.zero_()
                # This is the range of indices for our forget gates for each LSTM cell
                p.data[self.hidden_size // 2: self.hidden_size] = 1
                
    def forward(self, x, lens):
        '''
        :param x: (batch, seq_len, input_size)
        :param lens: (batch, )
        :return: (batch, seq_len, hidden_size)
        '''
        ordered_lens, index = lens.sort(descending=True)
        ordered_x = x[index]
    
        #ordered_lens需要从大到小排序，ordered_x为已根据长度大小排好序，batch_first如果设置为true，则x的第一维为batch_size，第二维为seq_length，否则相反。 
        packed_x = nn.utils.rnn.pack_padded_sequence(ordered_x, ordered_lens, batch_first=True)   #是打包序列
        packed_output, _ = self.bilstm(packed_x)  #打包后的tensor
        #解包，返回输出和每个的长度，输出的第一个维度是填充序列长度，但是由于使用打包填充序列，当填充标记是输入时，张量的值将全为零，且不需要用到
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)   

        recover_index = index.argsort()   #将索引从小到大排列
        recover_output = output[recover_index]
        return recover_output    

第二层先用Attention来提取前提与假设之间的关系，然后重构，以前提为例：

其中，x1为前提，x2为假设

In [5]:
class ESIM(nn.Module):
    def __init__(self, vocab_size, num_labels, embed_size, hidden_size, dropout_rate=0.1, layer_num=1,
                 pretrained_embed=     None, freeze=False):
        super(ESIM, self).__init__()
        self.pretrained_embed = pretrained_embed
        if pretrained_embed is not None:
            self.embed = nn.Embedding.from_pretrained(pretrained_embed, freeze)
        else:
            self.embed = nn.Embedding(vocab_size, embed_size)   #降维，
        self.bilstm1 = BiLSTM(embed_size, hidden_size, dropout_rate, layer_num)
        self.bilstm2 = BiLSTM(hidden_size, hidden_size, dropout_rate, layer_num)
        self.fc1 = nn.Linear(4 * hidden_size, hidden_size)   #？
        self.fc2 = nn.Linear(4 * hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout_rate)

        self.init_weight()

    def init_weight(self):
        if self.pretrained_embed is None:
            nn.init.normal_(self.embed.weight)
            self.embed.weight.data.mul_(0.01)
        nn.init.normal_(self.fc1.weight)
        self.fc1.weight.data.mul_(0.01)
        nn.init.normal_(self.fc2.weight)
        self.fc2.weight.data.mul_(0.01)
        nn.init.normal_(self.fc3.weight)
        self.fc3.weight.data.mul_(0.01)
        
    def soft_align_attention(self, x1, x1_lens, x2, x2_lens):
        '''
        local inference modeling
        :param x1: (batch, seq1_len, hidden_size)
        :param x1_lens: (batch, )
        :param x2: (batch, seq2_len, hidden_size)
        :param x2_lens: (batch, )
        :return: x1_align (batch, seq1_len, hidden_size)
                 x2_align (batch, seq2_len, hidden_size)
        '''
        seq1_len = x1.size(1)   #句子的长度等于列的大小
        seq2_len = x2.size(1)   
        batch_size = x1.size(0)   #batch大小等于行的大小
        
        #计算两个句子word之间的相似度
        #torch.matmul是torch.mm的broadcast版本，x2.transpose(1,2)交换其维度一和维度二，使hidden_size与seq2_len交换位置
        attention = torch.matmul(x1, x2.transpose(1, 2))  # (batch（个数）, seq1_len(行), seq2_len（列）)
        #unsqueeze(1)在XX的第二维上增加一个维度
        mask1 = torch.arange(seq1_len).expand(batch_size, seq1_len).to(x1.device) >= x1_lens.unsqueeze(
            1)  # (batch, seq1_len), 1 means <pad>
        mask2 = torch.arange(seq2_len).expand(batch_size, seq2_len).to(x1.device) >= x2_lens.unsqueeze(
            1)  # (batch, seq2_len)
        mask1 = mask1.float().masked_fill_(mask1, float('-inf'))
        mask2 = mask2.float().masked_fill_(mask2, float('-inf'))
        #weight2是x2的每个词对x1的归一化相关程度，即attention值。
        weight2 = F.softmax(attention + mask2.unsqueeze(1), dim=-1)  # (batch, seq1_len, seq2_len),
        #对假设x2进行加权求和，该值提取出了x2中与x1相关的部分；
        x1_align = torch.matmul(weight2, x2)  # (batch, seq1_len, hidden_size)
        weight1 = F.softmax(attention.transpose(1, 2) + mask1.unsqueeze(1), dim=-1)  # (batch, seq2_len, seq1_len)
        x2_align = torch.matmul(weight1, x1)  # (batch, seq2_len, hidden_size)
        return x1_align, x2_align
    
    def composition(self, x, lens):
        x = F.relu(self.fc1(x)) 
        x_compose = self.bilstm2(self.dropout(x), lens)  # (batch, seq_len, hidden_size)
        p1 = F.avg_pool1d(x_compose.transpose(1, 2), x.size(1)).squeeze(-1)  # (batch, hidden_size)  #x.size(1)，x的列大小作为窗口大小
        p2 = F.max_pool1d(x_compose.transpose(1, 2), x.size(1)).squeeze(-1)  # (batch, hidden_size)
        return torch.cat([p1, p2], 1)  # (batch, hidden_size*2)，按维数1拼接，就是横着拼

    def forward(self, x1, x1_lens, x2, x2_lens):
        '''
        :param x1: (batch, seq1_len)
        :param x1_lens: (batch,)
        :param x2: (batch, seq2_len)
        :param x2_lens: (batch,)
        :return: (batch, num_class)
        '''
        # Input encoding
        embed1 = self.embed(x1)  # (batch, seq1_len, embed_size)
        embed2 = self.embed(x2)  # (batch, seq2_len, embed_size)
        #new_embed1是假设x1经过BiLSTM后的值
        new_embed1 = self.bilstm1(self.dropout(embed1), x1_lens)  # (batch, seq1_len, hidden_size)
        new_embed2 = self.bilstm1(self.dropout(embed2), x2_lens)  # (batch, seq2_len, hidden_size)

        # Local inference collected over sequence
        x1_align, x2_align = self.soft_align_attention(new_embed1, x1_lens, new_embed2, x2_lens)

        # Enhancement of local inference information
        #将四部分连接起来，用相减以及相乘来实现前提与假设的“交互推断”，文中说可以使得局部信息（如矛盾关系）更加明显；
        x1_combined = torch.cat([new_embed1, x1_align, new_embed1 - x1_align, new_embed1 * x1_align],
                                dim=-1)  # (batch, seq1_len, 4*hidden_size)
        x2_combined = torch.cat([new_embed2, x2_align, new_embed2 - x2_align, new_embed2 * x2_align],
                                 dim=-1)  # (batch, seq2_len, 4*hidden_size)

        # Inference composition
        x1_composed = self.composition(x1_combined, x1_lens)  # (batch, 2*hidden_size), v=[v_avg; v_max]
        x2_composed = self.composition(x2_combined, x2_lens)  # (batch, 2*hidden_size)
        composed = torch.cat([x1_composed, x2_composed], -1)  # (batch, 4*hidden_size)

        # MLP classifier
        out = self.fc3(self.dropout(torch.tanh(self.fc2(self.dropout(composed)))))
        return out


## 训练模型

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import Vectors
from tqdm import tqdm

In [7]:
torch.manual_seed(1)  #为当前CPU设置种子用于生成随机数，使神经网络初始化
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### 参数设置

In [8]:
BATCH_SIZE = 32
HIDDEN_SIZE = 600  # every LSTM's(forward and backward) hidden size is half of HIDDEN_SIZE
EPOCHS = 6
DROPOUT_RATE = 0.5
LAYER_NUM = 1
LEARNING_RATE = 4e-4
PATIENCE = 5
CLIP = 10
EMBEDDING_SIZE = 300
# vectors = None
vectors = Vectors(name = '/home/xiyu/data/trainee/ZhangZhongmin/task3/data_task3/embeddings/glove.840B.300d.txt')
freeze = False
data_path = '/home/xiyu/data/trainee/Zhangbingbin/task3/data'

In [9]:
def show_example(premise, hypothesis, label, TEXT, LABEL):
    tqdm.write('Label: ' + LABEL.vocab.itos[label])   #itos:按照下标的顺序返回每一个单词
    tqdm.write('premise: ' + ' '.join([TEXT.vocab.itos[i] for i in premise]))
    tqdm.write('hypothesis: ' + ' '.join([TEXT.vocab.itos[i] for i in hypothesis]))

In [10]:
def count_parameters(model):
    #返回model中的参数的总数目
    return sum(p.numel() for p in model.parameters() if p.requires_grad)  #numel()返回数组中元素的个数,变量的requires_grad标记的运算就相当于or。 

In [11]:
def eval(data_iter, name, epoch=None, use_cache=False):
    if use_cache:
        model.load_state_dict(torch.load('best_model.ckpt'))   #加载保存的部分参数
    model.eval()  #让模型变成测试模式，不启用BatchNormalization和Dropout
    correct_num = 0
    err_num = 0
    total_loss = 0
    with torch.no_grad():   #当网络中tensor不需要梯度时，用torch.no_grad处理
        for i, batch in enumerate(data_iter):
            premise, premise_lens = batch.premise
            hypothesis, hypothesis_lens = batch.hypothesis
            labels = batch.label

            output = model(premise, premise_lens, hypothesis, hypothesis_lens)
            #argmax(-1)表示，按行返回最大值索引，reshape(-1)表示数组会根据剩下的维度计算出数组的另一个shape属性
            predicts = output.argmax(-1).reshape(-1) 
            loss = loss_func(output, labels)
            total_loss += loss.item()
            correct_num += (predicts == labels).sum().item()
            err_num += (predicts != batch.label).sum().item()

    acc = correct_num / (correct_num + err_num)
    if epoch is not None:
        tqdm.write(
            "Epoch: %d, %s Acc: %.3f, Loss %.3f" % (epoch + 1, name, acc, total_loss))
    else:
        tqdm.write(
            "%s Acc: %.3f, Loss %.3f" % (name, acc, total_loss))
    return acc


In [12]:
def train(train_iter, dev_iter, loss_func, optimizer, epochs, patience=5, clip=5):
    best_acc = -1
    patience_counter = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_iter):
            premise, premise_lens = batch.premise
            hypothesis, hypothesis_lens = batch.hypothesis
            labels = batch.label
            # show_example(premise[0],hypothesis[0], labels[0], TEXT, LABEL)

            model.zero_grad()   #反向传播前，需要将梯度初始化为0，防止梯度累加爆炸
            output = model(premise, premise_lens, hypothesis, hypothesis_lens)
            loss = loss_func(output, labels)
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
        tqdm.write("Epoch: %d, Train Loss: %d" % (epoch + 1, total_loss))

        acc = eval(dev_iter, "Dev", epoch)
        if acc<best_acc:
            patience_counter +=1
        else:
            best_acc = acc
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.ckpt')
        if patience_counter >= patience:
            tqdm.write("Early stopping: patience limit reached, stopping...")
            break
            

In [13]:
train_iter, dev_iter, test_iter, TEXT, LABEL, _ = load_iters(BATCH_SIZE, device, data_path, vectors)

model = ESIM(len(TEXT.vocab), len(LABEL.vocab.stoi),   #stoi 返回每一个单词与其对应的下标
                 EMBEDDING_SIZE, HIDDEN_SIZE, DROPOUT_RATE, LAYER_NUM,
                 TEXT.vocab.vectors, freeze).to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_func = nn.CrossEntropyLoss()

train(train_iter, dev_iter, loss_func, optimizer, EPOCHS,PATIENCE, CLIP)
eval(test_iter, "Test", use_cache=True)

  0%|          | 0/17168 [00:00<?, ?it/s]

The model has 23,358,603 trainable parameters


100%|██████████| 17168/17168 [16:00<00:00, 12.62it/s]


Epoch: 1, Train Loss: 11805


  0%|          | 0/17168 [00:00<?, ?it/s]

Epoch: 1, Dev Acc: 0.809, Loss 150.622


100%|██████████| 17168/17168 [21:04<00:00, 13.58it/s]


Epoch: 2, Train Loss: 8733


  0%|          | 0/17168 [00:00<?, ?it/s]

Epoch: 2, Dev Acc: 0.838, Loss 131.182


100%|██████████| 17168/17168 [15:33<00:00, 18.38it/s]


Epoch: 3, Train Loss: 7685


  0%|          | 0/17168 [00:00<?, ?it/s]

Epoch: 3, Dev Acc: 0.855, Loss 120.424


100%|██████████| 17168/17168 [15:20<00:00, 19.31it/s]


Epoch: 4, Train Loss: 7048


  0%|          | 0/17168 [00:00<?, ?it/s]

Epoch: 4, Dev Acc: 0.857, Loss 119.585


100%|██████████| 17168/17168 [15:19<00:00, 19.13it/s]


Epoch: 5, Train Loss: 6584


  0%|          | 0/17168 [00:00<?, ?it/s]

Epoch: 5, Dev Acc: 0.863, Loss 113.772


100%|██████████| 17168/17168 [15:20<00:00, 18.28it/s]


Epoch: 6, Train Loss: 6247
Epoch: 6, Dev Acc: 0.865, Loss 111.899
Test Acc: 0.858, Loss 114.639


0.8577972312703583