In [35]:
import json
import torch.optim as optim
import numpy as np
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import random
import torch
from io import open
# 一些常量
# On windows
# dir_train = 'D:/Github/candidate_answer/data/json_train_expt_stop'
# dir_test = 'D:/Github/candidate_answer/data/json_test_expt_stop'
# dir_embedding = 'D:/nlp_data/sogou_100_nobinary'
# On ubuntu
dir_train = 'data/json_train_expt_stop2'
dir_test = 'data/json_test_expt_stop'
dir_embedding = '/home/tuomx/nlp_data/sogou_100_nobinary'

embedding_size = 100
max_question_words = 23 # 问题最大词数，下同理
max_right_answer_words = 824
max_wrong_answer_words = 824
kernel_size = (3, embedding_size) # 卷积核的size
out_channels = 300 # 输出通道数
hidden_out = 400 # 隐藏层输出单元数
batch_size = 64
debug = False

log_file = open('log','w',encoding='utf-8',buffering=1)
def log(log_inf):
    if debug:
        print(log_inf)
    else:
        log_file.write(log_inf)
        
        

In [36]:
# 加载词向量
embedding = {}
f = open(dir_embedding,"r",encoding='utf-8')
line = f.readline()
line_num = 0
print("loading enmbedding...")
while line:
    try:
        content = line.strip(' \n').split(' ')
        assert len(content) == embedding_size + 1
        embedding[content[0]] = np.array([float(i) for i in content[1:]])
        line = f.readline()
        line_num+=1
#         print(line_num)
    except:
        print(content)
        break
print("finish loading")
f.close()

loading enmbedding...
finish loading


In [33]:
# 创建一个CNN\

# caculate hinge_loss
def hinge_loss(s1,s2,t0,batch_size):
    # print(s1.size(), s2.size())
    loss = Variable(torch.Tensor(1))
    loss.data[0] = 0.0
    for i in range(batch_size):
        if (t0 - s1[i] + s2[i]).data[0] > 0:
            loss += t0 - s1[i] + s2[i]
        else:
            print("pos:%f,neg:%f" % (s1[i].data[0], s2[i].data[0]))
    return loss
    
def get_simple_score(self,question_variable, pos_relation_variable_l, pos_word_variable_l, neg_relation_variable_l, neg_word_variable_l, hn_hidden,cn_hidden):
        '''计算一个问题的分数，正例与负例，用max——margin计算
        '''
        sum_score = 0.0

        qr_output = self.qr_model(question_variable,hn_hidden,cn_hidden,self.padding_size)
        pos_score_list = []
        neg_score_list = []
        for pi in range(len(pos_relation_variable_l)):
            # self.optimizer.zero_grad()
            pos_ar_output = self.ar_model(pos_relation_variable_l[pi],pos_word_variable_l[pi],hn_hidden,cn_hidden,20)
            pos_score = F.cosine_similarity(qr_output,pos_ar_output)
            pos_score_list.append(pos_score)

        for ni in range(len(neg_relation_variable_l)):
            # self.optimizer.zero_grad()
            neg_ar_output = self.ar_model(neg_relation_variable_l[ni],neg_word_variable_l[ni],hn_hidden,cn_hidden,20)
            neg_score = F.cosine_similarity(qr_output,neg_ar_output)
            neg_score_list.append(neg_score)

        for pos_score in pos_score_list:
            for neg_score in neg_score_list:
                score = self.max_margin_number - pos_score + neg_score
                if score.data[0] <0:
                    score = Variable(torch.FloatTensor(1).fill_(0.0))
                sum_score += score

        return sum_score

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 300, (3,100)) # 保证输出列向量在高度上与X相同
        self.conv2 = nn.Conv2d(1, 300, (3,100))
        self.conv3 = nn.Conv2d(1, 300, (3,100))

        self.pool1 = nn.MaxPool2d(1, max_wrong_answer_words) # 输出是out_channels*1维向量
        self.pool2 = nn.MaxPool2d(1, max_question_words)
        self.pool3 = nn.MaxPool2d(1, max_right_answer_words)

        self.fc1 = nn.Linear(out_channels, hidden_out)
        self.fc2 = nn.Linear(out_channels, hidden_out)
        self.fc3 = nn.Linear(out_channels, hidden_out)


    def forward(self, x1, x2, x3, batch_size):
        # x1/x2/x3 分别表示错误答案，问题，正确答案
#         x1 = self.pool1(F.tanh(self.conv1(x1)))
#         x2 = self.pool2(F.tanh(self.conv2(x2)))
#         x3 = self.pool3(F.tanh(self.conv3(x3)))
        # print("in forward:")
        # print("X:",x1.size(),x2.size(),x3.size())
        x1 = F.tanh(self.conv1(x1))
        x2 = F.tanh(self.conv2(x2))
        x3 = F.tanh(self.conv3(x3))
        #print("conv1:",x1.size(),x2.size(),x3.size())
        
        x1 = self.pool1(x1)
        x2 = self.pool2(x2)
        x3 = self.pool3(x3)
        #print("pool:",x1.size(),x2.size(),x3.size())
        
        x1 = F.tanh(x1)
        x2 = F.tanh(x2)
        x3 = F.tanh(x3)

        neg_cosine = F.cosine_similarity(x1,x2)
        pos_cosine = F.cosine_similarity(x2,x3)
        #print(neg_cosine, pos_cosine)

        return hinge_loss(pos_cosine, neg_cosine, 2, batch_size), pos_cosine, neg_cosine
net = Net()

In [34]:
# 开始训练
def get_sentence_embedding(s,out_size):
    arr = []
    for word in s:
        if word in embedding:
            arr.append(embedding[word])
        else:
            arr.append([random.uniform(-1,1) for i in range(embedding_size)])
    if len(arr) < out_size: # 补零
        append_arr = [0.0 for i in range(embedding_size)]
        for j in range(out_size - len(arr)):
            arr.append(append_arr)
    return [arr]

optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
f = open(dir_train,'r',encoding='utf-8')
data = json.loads(f.read()) # 8768 quesions(except 4 questions which don't have the right answer)
count_step = 0
for epoch in range(2):  # loop over the dataset multiple times
    f = open(dir_train)
    running_loss = 0.0
    batch = [[] for i in range(3)]
    for id in data:
        # stop early
#         if count_step == 5000:
#             print("finish training")
#             break
        
        # get the inputs
        question_ebd = get_sentence_embedding(data[id]['question'], max_question_words)
        right_answer_ebd = get_sentence_embedding(data[id]['right_answer'][0], max_right_answer_words)
        for wrong_answer in data[id]['wrong_answer']:
            wrong_answer_ebd = get_sentence_embedding(wrong_answer, max_wrong_answer_words)
            batch[0].append(wrong_answer_ebd)
            batch[1].append(question_ebd)
            batch[2].append(right_answer_ebd)
            if len(batch[0]) == batch_size:
                # wrap them in Variable
                # assert(batch[0])
        
                x1 = Variable(torch.from_numpy(np.array(batch[0])).float())
                x2 = Variable(torch.from_numpy(np.array(batch[1])).float())
                x3 = Variable(torch.from_numpy(np.array(batch[2])).float())
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward + backward + optimize
                loss,pos_cosine,neg_cosine = net(x1, x2, x3, batch_size)
                if loss.data[0] != 0.0:
                    loss.backward()
                    optimizer.step()

                    # print statistics
                    count_step += 1
                    running_loss += loss.data[0]
    #                print("Epoch:%d Step:%d: Loss:%f" % (epoch, count_step, loss.data[0]))
                    if count_step % 200 == 199:    # print every 2000 mini-batches
                        print('[%d, %5d] loss: %.3f' %
                              (epoch + 1, count_step + 1, running_loss / 200))
                        running_loss = 0.0
                # clear batch
                batch = [[] for i in range(3)]

print('Finished Training')

KeyboardInterrupt: 

In [30]:
# test
print('start test...')
f = open(dir_test,'r',encoding='utf-8')
test_data = json.loads(f.read())

MRR = 0
count_right_answer = 0
for id in test_data:
    # get the inputs
    question_ebd = get_sentence_embedding(test_data[id]['question'], max_question_words)
    
    for right_answer in data[id]['right_answer']:
        right_answer_ebd = get_sentence_embedding(right_answer, max_right_answer_words)
        rank = 1 # rank of right answer in all answers
        no_pos_score = True # tag to help caculate right score
        for wrong_answer in data[id]['wrong_answer']:
            batch = [[] for i in range(3)]
            wrong_answer_ebd = get_sentence_embedding(wrong_answer, max_wrong_answer_words)
            batch[0].append(wrong_answer_ebd)
            batch[1].append(question_ebd)
            batch[2].append(right_answer_ebd)

            # print(x1.size(),x2.size(),x3.size())
            if no_pos_score:
                x1 = Variable(torch.from_numpy(np.array(batch[0])).float())
                x2 = Variable(torch.from_numpy(np.array(batch[1])).float())
                x3 = Variable(torch.from_numpy(np.array(batch[2])).float())
                loss,pos_score,neg_cosine = net(x1,x2,x3,1)
                no_pos_score = False
            x1 = Variable(torch.from_numpy(np.array(batch[0])).float())
            x2 = Variable(torch.from_numpy(np.array(batch[1])).float())
            x3 = Variable(torch.from_numpy(np.array(batch[2])).float())
            loss,neg_score,neg_cosine = net(x3,x2,x1,1)
            # print(x1)
            #print(pos_score.data[0],neg_score.data[0])
            if pos_score.data[0] < neg_score.data[0]:
                rank += 1
        print("rank:%d" %(rank))
        MRR += 1 / rank
        count_right_answer += 1
        if count_right_answer % 2000 == 1999:    # print every 2000 mini-batches
            print('count_roght_answer:%d; MRR:%f' %
                  (count_right_answer, MRR / count_right_answer))
MRR /= count_right_answer
print("Final MRR:%f" %(MRR))
print("Finish test")

start test...
rank:25
rank:6
rank:10
rank:5
rank:4
rank:7
rank:7
rank:7
rank:9
rank:6
rank:9
rank:10
rank:2
rank:26
rank:8
rank:20
rank:15
rank:15
rank:7
rank:26
rank:12
rank:3
rank:5
rank:16
rank:30
rank:4
rank:4
rank:1
rank:1
rank:27
rank:7
rank:12
rank:6
rank:21
rank:12
rank:6
rank:2
rank:7
rank:25
rank:17
rank:13
rank:15
rank:9
rank:18
rank:20
rank:19
rank:9
rank:17
rank:2
rank:21
rank:14
rank:1
rank:11
rank:2
rank:7
rank:1
rank:3
rank:9
rank:20
rank:1
rank:19
rank:7
rank:28
rank:3
rank:9
rank:18
rank:3
rank:24
rank:17
rank:10
rank:12
rank:17
rank:20
rank:12
rank:27
rank:21
rank:19
rank:12
rank:7
rank:7
rank:7
rank:5
rank:23
rank:5
rank:5
rank:15
rank:5
rank:14
rank:15
rank:6
rank:29
rank:1
rank:3
rank:28
rank:1
rank:3
rank:7
rank:18
rank:10
rank:7
rank:1
rank:11
rank:20
rank:15
rank:12
rank:5
rank:6
rank:11
rank:3
rank:9
rank:27
rank:15
rank:13
rank:4
rank:13
rank:25
rank:9
rank:1
rank:4
rank:4
rank:22
rank:1
rank:8
rank:10
rank:15
rank:24
rank:7
rank:7
rank:17
rank:24
rank:5
rank

KeyboardInterrupt: 

In [31]:
MRR / count_right_answer

0.23000256003582398

In [27]:
x1 = Variable(torch.from_numpy(np.array([1,2,3])).float())

In [28]:
x1.cuda()

AssertionError: 
Found no NVIDIA driver on your system. Please check that you
have an NVIDIA GPU and installed a driver from
http://www.nvidia.com/Download/index.aspx