# This file trains the evidence-labelling model.
written by Yige Wen

----

In [0]:
import torch 
import torch.nn as nn
from torch.autograd import Variable
import numpy as np 
import matplotlib.pyplot as plt 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader 
import csv

import gensim
# Load the word vectors from a text-format (non-binary) word2vec file.
embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
    'word_embedding/model.txt',
    binary=False,
)

import nltk
from nltk.corpus import stopwords
# Tokens discarded before embedding: NLTK's English stop words plus the listed
# punctuation and bracket tokens.  Kept as a frozenset because the collection
# is only ever used for membership tests, which are constant-time on a set but
# a linear scan on a list.
sw = frozenset(
    stopwords.words('english')
    + ['-LRB-', '-RRB-', '.', ',', '"', '?', '!', "'", ':', '-LSB-', '-RSB-']
)

In [0]:
class EvidencePairDataset(Dataset):
    """Claim/evidence sentence pairs with binary labels, read from a CSV file.

    The CSV must provide ``claim``, ``evidence`` and ``label`` columns.  Each
    sentence is converted by ``words2sen`` into a float tensor holding one
    embedding vector per kept token.  A row is skipped when either sentence
    yields no vectors, or when its label is anything other than ``'0'`` or
    ``'1'``.

    Args:
        root: Path of the CSV file to read.
        model: Word-embedding lookup forwarded to ``words2sen``.

    Each item is a ``(claim_tensor, evidence_tensor, label)`` tuple where
    ``label`` is the int 0 or 1.
    """

    # Raw CSV label text -> integer class id; any other text is rejected.
    _LABELS = {'0': 0, '1': 1}

    def __init__(self, root, model):
        self.data = []
        self.model = model
        self.sw = sw
        # newline='' is what the csv module requires so that newlines inside
        # quoted fields are parsed correctly.
        with open(root, newline='') as csvfile:
            for row in csv.DictReader(csvfile):
                label = self._parse_label(row['label'])
                if label is None:
                    # An unrecognised label used to leave `label` unbound on
                    # the first row or silently reuse the previous row's
                    # label; such rows are now dropped instead.
                    continue
                claim_embedding = self._embed(row['claim'])
                evidence_embedding = self._embed(row['evidence'])
                if claim_embedding.size(0) == 0 or evidence_embedding.size(0) == 0:
                    # Nothing left after tokenising / stop-word removal.
                    continue
                self.data.append((claim_embedding, evidence_embedding, label))

    @classmethod
    def _parse_label(cls, raw):
        """Return 0 or 1 for the label text ``'0'``/``'1'``, else ``None``."""
        return cls._LABELS.get(raw)

    def _embed(self, sentence):
        """Embed one sentence as a float tensor with one row per kept token."""
        vectors = words2sen(sentence, self.model, self.sw)
        # Go through a single ndarray: building a tensor directly from a list
        # of ndarrays is very slow.  An empty list yields a tensor of size 0.
        return torch.tensor(np.asarray(vectors), dtype=torch.float)

    def __len__(self):
        """Return the number of retained pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(claim, evidence, label)`` tuple stored at ``index``."""
        claim, evidence, label = self.data[index]
        return claim, evidence, label

def words2sen(sen, model, stopwords, nearWordsNum = 5):
    """Turn a sentence into a list of word vectors, one per kept token.

    The sentence is tokenised with ``nltk.tokenize.word_tokenize`` and tokens
    contained in ``stopwords`` are dropped.  Every remaining token becomes:

    1. its own vector, when ``model`` knows the token;
    2. otherwise the mean of the vectors of the in-vocabulary tokens that lie
       within ``nearWordsNum`` positions on either side of it;
    3. otherwise a random vector drawn uniformly from ``[-0.05, 0.05)``.

    Args:
        sen: Sentence text.
        model: Embedding lookup supporting ``token in model`` and
            ``model[token]`` (raising ``KeyError`` for unknown tokens).
        stopwords: Container of tokens to discard.
        nearWordsNum: Neighbourhood radius used for unknown tokens.

    Returns:
        list: One vector per kept token; empty when no token survives the
        stop-word filter.
    """
    words = [w for w in nltk.tokenize.word_tokenize(sen) if w not in stopwords]
    # Random fallback vectors should match the model's dimensionality; 300 is
    # kept as the default for lookups that do not expose ``vector_size``.
    dim = getattr(model, 'vector_size', 300)
    vectors = []
    for i, word in enumerate(words):
        try:
            vectors.append(model[word])
            continue
        except KeyError:
            pass  # out-of-vocabulary: fall back to the neighbours below

        # Window of neighbours centred on position i, clipped to the sentence.
        # (The earlier code bounds-checked and indexed with the offset itself,
        # so it always looked at the start of the sentence.)
        first = max(0, i - nearWordsNum)
        last = min(len(words), i + nearWordsNum + 1)
        # `words` is already stop-word filtered, so only the vocabulary test
        # is needed.  `t in model` avoids the `.vocab` attribute, which newer
        # gensim releases no longer provide.
        near = [
            model[t]
            for k, t in enumerate(words[first:last], start=first)
            if k != i and t in model
        ]
        if near:
            vectors.append(np.mean(near, axis=0))
        else:
            vectors.append(-0.05+0.1*np.random.random(dim))
    return vectors

In [0]:
# Build the training pairs from the CSV file using the loaded embeddings.
train_pair = EvidencePairDataset(root='train_pairs.csv', model=embedding_model)

In [0]:
# Build the evaluation pairs from the dev-set CSV using the same embeddings.
test_pair = EvidencePairDataset(root='devset_pairs.csv', model=embedding_model)

In [0]:
# Prefer the first CUDA device when CUDA is usable, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Report the CUDA environment.  The last line prints the ``torch.cuda.device``
# object itself rather than a device name.
print("torch.cuda.is_available()   =", torch.cuda.is_available())
print("torch.cuda.device_count()   =", torch.cuda.device_count())
print("torch.cuda.device('cuda')   =", torch.cuda.device('cuda'))

torch.cuda.is_available()   = True
torch.cuda.device_count()   = 1
torch.cuda.device('cuda')   = <torch.cuda.device object at 0x000001EB1FEB3A90>


In [0]:
def collate_fn(batch):
    """Collate ``(claim, evidence, label)`` samples into padded batch tensors.

    Args:
        batch: Sequence of samples; element 0 of each sample is the claim
            tensor, element 1 the evidence tensor and element 2 the label.

    Returns:
        tuple: ``(padded_claims, claim_lengths, padded_evidences,
        evidence_lengths, labels)``.  The padded tensors and lengths are
        whatever ``padding`` returns for each field; ``labels`` is a
        ``torch.long`` tensor in batch order.
    """
    claims, evidences, raw_labels = [], [], []
    for sample in batch:
        claims.append(sample[0])
        evidences.append(sample[1])
        raw_labels.append(sample[2])

    labels = torch.tensor(raw_labels, dtype=torch.long)

    # `padding` returns a (padded, lengths) pair for each field; splice both
    # pairs and the labels into one flat 5-tuple.
    return tuple(padding(claims)) + tuple(padding(evidences)) + (labels,)

def padding(batch_one_field, features=300):
    """Zero-pad variable-length sequences to a common length and stack them.

    Args:
        batch_one_field: Non-empty sequence of tensors, each of shape
            ``(length_i, features)``.
        features: Size of the feature dimension of every sequence.

    Returns:
        tuple: ``(padded, lengths)``.  ``padded`` has shape
        ``(batch, max_length, features)`` with zero rows appended after each
        sequence's own rows; ``lengths`` is a ``torch.long`` tensor holding
        the original lengths in input order.

    Raises:
        ValueError: If ``batch_one_field`` is empty.
    """
    lengths = [len(seq) for seq in batch_one_field]
    if not lengths:
        raise ValueError('padding() needs at least one sequence')
    longest = max(lengths)
    # new_zeros makes the filler match each sequence's dtype and device.
    padded = [
        torch.cat((seq, seq.new_zeros((longest - n, features))), dim=0)
        for seq, n in zip(batch_one_field, lengths)
    ]
    return torch.stack(padded), torch.LongTensor(lengths)
    
# Shuffled mini-batches of 100 training pairs; collate_fn assembles each batch.
train_pair_loader = DataLoader(
    dataset=train_pair,
    batch_size=100,
    shuffle=True,
    collate_fn=collate_fn,
)

def sort_sequences(inputs, lengths):
    """Reorder a batch so that sequence lengths are in descending order.

    :param inputs (Tensor): input sequences, size [B, T, D]
    :param lengths (Tensor): length of each sequence, size [B]
    :return: tuple ``(sorted_inputs, sorted_lengths, unsorted_idx)`` where
        indexing ``sorted_inputs`` with ``unsorted_idx`` restores the
        original batch order.
    """
    by_length = torch.sort(lengths, descending=True)
    descending_lengths, permutation = by_length[0], by_length[1]
    # Sorting the permutation yields, as its indices, the inverse permutation.
    inverse_permutation = torch.sort(permutation)[1]
    return inputs[permutation], descending_lengths, inverse_permutation

In [0]:
class Siamese_LSTM(nn.Module):
    """Siamese bidirectional LSTM classifier for a pair of sequences.

    Both sequences go through the same 3-layer bidirectional LSTM
    (300-d inputs, 150 hidden units per direction, so 300 features per
    sequence).  The two sequence encodings ``a`` and ``b`` are combined as
    ``[a, b, |a - b|, a * b]`` (1200 features) and fed to an MLP that emits
    2 class scores.
    """

    def __init__(self):
        super(Siamese_LSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size = 300,
            hidden_size = 150,
            num_layers = 3,
            bidirectional = True,
            dropout = 0.3,
            batch_first = True
        )

        self.out = nn.Sequential(
            nn.Linear(1200, 800),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(800, 400),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(400, 200),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(200, 2)
        )

    def forward_once(self, x, lengths):
        """Encode one padded batch with the shared LSTM.

        Args:
            x: Padded sequences, size [B, T, 300].
            lengths: Length of each sequence, size [B].

        Returns:
            Tensor of size [B, 300]: the LSTM output at each sequence's last
            valid time step, in the original batch order.

        NOTE(review): for a bidirectional LSTM the output at the last time
        step holds the backward direction's state after it has read only the
        final token - confirm this is the intended sentence encoding.
        """
        inputs, sorted_lengths, unsorted_idx = sort_sequences(x, lengths)
        # pack_padded_sequence expects the lengths on the CPU; current
        # PyTorch releases reject a CUDA lengths tensor.
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            inputs, sorted_lengths.cpu(), batch_first=True)

        outputs, _ = self.lstm(packed, None)

        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        # Undo the length sort so row i again belongs to sample i.
        outputs = outputs.index_select(0, unsorted_idx.to(outputs.device))

        # Pick output[length - 1] for every sample in one indexing operation
        # instead of a per-sample Python loop.
        rows = torch.arange(outputs.size(0), device=outputs.device)
        last_steps = (lengths - 1).to(outputs.device)
        return outputs[rows, last_steps]

    def forward(self, x1, len1, x2, len2):
        """Score a batch of sequence pairs.

        Args:
            x1: First padded sequences, size [B, T1, 300].
            len1: Lengths of the sequences in ``x1``, size [B].
            x2: Second padded sequences, size [B, T2, 300].
            len2: Lengths of the sequences in ``x2``, size [B].

        Returns:
            Tensor of size [B, 2] with the unnormalised class scores.
        """
        r_out1 = self.forward_once(x1, len1)
        r_out2 = self.forward_once(x2, len2)

        siamese_out = torch.cat((r_out1,r_out2,torch.abs(r_out1-r_out2),r_out1*r_out2),1)
        return self.out(siamese_out)

In [0]:
net = Siamese_LSTM()
# net = torch.load('Siamese_LSTM-epoch9.pkl')
# Place the model on the `device` chosen earlier in this file (CUDA when
# available, otherwise CPU) instead of calling .cuda() unconditionally, which
# raises on machines without a usable GPU.
net.to(device)
print(net)
# The optimizer is created after the move so it tracks the relocated parameters.
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
loss_func = nn.CrossEntropyLoss()

Siamese_LSTM(
  (lstm): LSTM(300, 150, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  (out): Sequential(
    (0): Linear(in_features=1200, out_features=800, bias=True)
    (1): Dropout(p=0.3)
    (2): ReLU()
    (3): Linear(in_features=800, out_features=400, bias=True)
    (4): Dropout(p=0.2)
    (5): ReLU()
    (6): Linear(in_features=400, out_features=200, bias=True)
    (7): Dropout(p=0.1)
    (8): ReLU()
    (9): Linear(in_features=200, out_features=2, bias=True)
  )
)


In [0]:
def save_net(net, epoch):
    """Save the whole ``net`` object to a per-epoch file in the working dir.

    Args:
        net: Object handed to ``torch.save``.
        epoch: Epoch identifier; its ``str`` form is embedded in the file name.
    """
    checkpoint_path = ''.join(('Siamese_BiLSTM_Dropout-epoch', str(epoch), '.pkl'))
    torch.save(net, checkpoint_path)

In [0]:
def test():
    """Evaluate ``net`` on a random sample of at most 2000 dev-set pairs.

    Prints ``accuracy: <correct> / <total> = <ratio>`` and leaves ``net`` in
    training mode afterwards, even if evaluation raises.

    Returns:
        float: Fraction of sampled pairs whose predicted label matches.
    """
    sample_size = 2000
    pairs = test_pair.data
    # Draw the sample by index so the dataset itself is not reordered.
    chosen = np.random.permutation(len(pairs))[:sample_size]
    batch = [pairs[i] for i in chosen]

    # Feed the inputs to whichever device the model's parameters live on.
    model_device = next(net.parameters()).device

    net.eval()
    try:
        # No gradients are needed for evaluation; skipping them avoids
        # building an autograd graph for the whole sample.
        with torch.no_grad():
            inputs1, lengths1, inputs2, lengths2, labels = collate_fn(batch)
            test_output = net(inputs1.to(model_device),
                              lengths1.to(model_device),
                              inputs2.to(model_device),
                              lengths2.to(model_device))
        predicted = torch.max(test_output, 1)[1].cpu()
    finally:
        net.train()

    total = len(predicted)
    correct = int((predicted == labels).sum())
    accuracy = correct/total
    print('accuracy:',correct,'/',total,'=',accuracy)
    return accuracy

# test()

In [0]:
# Number of passes over the training data.  The loop used to ignore this
# constant and hard-code 15, so the constant now carries the value that was
# actually being run.
EPOCH = 15
# Batches are moved to whichever device the model's parameters live on.
model_device = next(net.parameters()).device
for epoch in range(EPOCH):
    for i, data in enumerate(train_pair_loader):
        inputs1, lengths1, inputs2, lengths2, labels = (
            tensor.to(model_device) for tensor in data
        )

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        output = net(inputs1, lengths1, inputs2, lengths2)
        loss = loss_func(output, labels)
        loss.backward()
        optimizer.step()

        # Every 100 batches: report the current batch loss and dev accuracy.
        if i%100 == 0:
            print('epoch:',epoch,'batch',i,'loss:',loss.item())
            test()

    # Checkpoint once per epoch.
    save_net(net, epoch)

    print('epoch:',epoch,'loss:',loss.item())

    

epoch: 0 batch 0 loss: 0.6963233947753906
accuracy: 990 / 2000 = 0.495
epoch: 0 batch 100 loss: 0.6957831382751465
accuracy: 1011 / 2000 = 0.5055
epoch: 0 batch 200 loss: 0.6965621113777161
accuracy: 1269 / 2000 = 0.6345
epoch: 0 batch 300 loss: 0.6278018951416016
accuracy: 1274 / 2000 = 0.637
epoch: 0 batch 400 loss: 0.6134536266326904
accuracy: 1279 / 2000 = 0.6395
epoch: 0 batch 500 loss: 0.547346830368042
accuracy: 1295 / 2000 = 0.6475
epoch: 0 batch 600 loss: 0.6389167308807373
accuracy: 1275 / 2000 = 0.6375
epoch: 0 batch 700 loss: 0.5235599875450134
accuracy: 1278 / 2000 = 0.639
epoch: 0 batch 800 loss: 0.6384148597717285
accuracy: 1322 / 2000 = 0.661
epoch: 0 batch 900 loss: 0.6140878796577454
accuracy: 1315 / 2000 = 0.6575
epoch: 0 batch 1000 loss: 0.6745547652244568
accuracy: 1307 / 2000 = 0.6535
epoch: 0 batch 1100 loss: 0.6853716373443604
accuracy: 1307 / 2000 = 0.6535
epoch: 0 batch 1200 loss: 0.5761851072311401
accuracy: 1288 / 2000 = 0.644
epoch: 0 batch 1300 loss: 0.581

  "type " + obj.__name__ + ". It won't be checked "


epoch: 0 loss: 0.5895117521286011
epoch: 1 batch 0 loss: 0.6099227666854858
accuracy: 1414 / 2000 = 0.707
epoch: 1 batch 100 loss: 0.5336961150169373
accuracy: 1416 / 2000 = 0.708
epoch: 1 batch 200 loss: 0.5532849431037903
accuracy: 1411 / 2000 = 0.7055
epoch: 1 batch 300 loss: 0.5782852172851562
accuracy: 1416 / 2000 = 0.708
epoch: 1 batch 400 loss: 0.5584542751312256
accuracy: 1410 / 2000 = 0.705
epoch: 1 batch 500 loss: 0.5248233675956726
accuracy: 1424 / 2000 = 0.712
epoch: 1 batch 600 loss: 0.541163444519043
accuracy: 1423 / 2000 = 0.7115
epoch: 1 batch 700 loss: 0.539895236492157
accuracy: 1422 / 2000 = 0.711
epoch: 1 batch 800 loss: 0.5569810271263123
accuracy: 1421 / 2000 = 0.7105
epoch: 1 batch 900 loss: 0.5186588168144226
accuracy: 1417 / 2000 = 0.7085
epoch: 1 batch 1000 loss: 0.6017690300941467
accuracy: 1447 / 2000 = 0.7235
epoch: 1 batch 1100 loss: 0.5103204846382141
accuracy: 1413 / 2000 = 0.7065
epoch: 1 batch 1200 loss: 0.5293477773666382
accuracy: 1437 / 2000 = 0.718

KeyboardInterrupt: 

In [0]:
# Leftover debugging cell.
# NOTE(review): `test_labels` is only assigned as a local inside test() in the
# visible code, so this print needs that name to exist at module scope from
# elsewhere (e.g. earlier notebook state) - confirm or remove.
# print(len(test_inputs1))
# print(len(test_lengths1))
# print(len(test_inputs2))
# print(len(test_lengths2))
# print(len(test_labels))

print(len(test_labels))

In [0]:
# Count the positions where prediction and label agree, then print the count
# and the accuracy ratio.
# NOTE(review): `test_pred_labels` and `test_labels` are only assigned as
# locals inside test() in the visible code; this cell needs them at module
# scope from elsewhere (e.g. earlier notebook state) - confirm or remove.
correct = 0
for x,y in zip(test_pred_labels,test_labels):
    if x == y:
        correct += 1
print(correct)
print('accuracy:',correct/len(test_pred_labels))