In [5]:
from __future__ import division

import sys
sys.path.append("../OpenNMT-py/")
import onmt
import onmt.Markdown
import torch
import argparse
import math
import numpy

In [2]:
# 因为translator要用到opt，所以先全部拷贝过来了，

parser = argparse.ArgumentParser(description='translate.py')
onmt.Markdown.add_md_help_argument(parser)

parser.add_argument('-model', default = "../data/mt_model/prepro_model_ppl_20.07_e13.pt",
                    help='Path to model .pt file')
parser.add_argument('-src', default = "./data/preprosrc-ref.bpe.noUndo.en",
                    help='Source sequence to decode (one line per sequence)')
parser.add_argument('-src_img_dir',   default="",
                    help='Source image directory')
parser.add_argument('-tgt', default = "./test_data/preprosrc-ref.bpe.noUndo.de",
                    help='True target sequence (optional)')
parser.add_argument('-output', default='pred.txt',
                    help="""Path to output the predictions (each line will
                    be the decoded sequence""")



parser.add_argument('-beam_size',  type=int, default=5,
                    help='Beam size')
parser.add_argument('-batch_size', type=int, default=30,
                    help='Batch size')
parser.add_argument('-max_sent_length', type=int, default=100,
                    help='Maximum sentence length.')
parser.add_argument('-replace_unk', action="store_true",
                    help="""Replace the generated UNK tokens with the source
                    token that had highest attention weight. If phrase_table
                    is provided, it will lookup the identified source token and
                    give the corresponding target token. If it is not provided
                    (or the identified source token does not exist in the
                    table) then it will copy the source token""")
# parser.add_argument('-phrase_table',
#                     help="""Path to source-target dictionary to replace UNK
#                     tokens. See README.md for the format of this file.""")
parser.add_argument('-verbose', action="store_true",
                    help='Print scores and predictions for each sentence')
parser.add_argument('-dump_beam', type=str, default="",
                    help='File to dump beam information to.')

parser.add_argument('-n_best', type=int, default=1,
                    help="""If verbose is set, will output the n_best
                    decoded sentences""")
parser.add_argument('-print_nbest', action='store_true',
                    help='Output the n-best list instead of a single sentence')
parser.add_argument('-normalize', action='store_true',
                    help='To normalize the scores based on output length')
parser.add_argument('-gpu', type=int, default=-1,
                    help="Device to run on")


_StoreAction(option_strings=['-gpu'], dest='gpu', nargs=None, const=None, default=-1, type=<type 'int'>, choices=None, help='Device to run on', metavar=None)

In [5]:
def addone(f):
    for line in f:
        yield line
    yield None

In [12]:
opt = parser.parse_args()
opt.gpu

usage: ipykernel_launcher.py [-h] [-md] [-model MODEL] [-src SRC]
                             [-src_img_dir SRC_IMG_DIR] [-tgt TGT]
                             [-output OUTPUT] [-beam_size BEAM_SIZE]
                             [-batch_size BATCH_SIZE]
                             [-max_sent_length MAX_SENT_LENGTH] [-replace_unk]
                             [-verbose] [-dump_beam DUMP_BEAM]
                             [-n_best N_BEST] [-print_nbest] [-normalize]
                             [-gpu GPU]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/ihuangyiran/Library/Jupyter/runtime/kernel-d704f76f-18a9-44cd-921a-0762a098a590.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [6]:
def main():
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    
    # Always pick n_best
    opt.n_best = opt.beam_size

    
    if opt.output == "stdout":
            outF = sys.stdout
    else:
            outF = open(opt.output, 'w')


    srcBatch, tgtBatch = [], []

    count = 0

    tgtF = open(opt.tgt) if opt.tgt else None

    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    
    # here we are trying to open the file
    inFile = None
    if(opt.src == "stdin"):
            inFile = sys.stdin
            opt.batch_size = 1
    else:
      inFile = open(opt.src)

    translator = onmt.Translator(opt)

    for line in addone(inFile):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split() if tgtF else None
                tgtBatch += [tgtTokens]

            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

    decOut, decStates, attn = translator.translate(srcBatch, tgtBatch)
    
    with open("./test_data/hidden_value", "w") as f:
        tmp = decOut.data[-1].numpy()
        numpy.save(f, tmp)

In [7]:
class Pipeline_hidden(object):
    def __init__(self, opt):
        # 构建并读取模型
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch
        self.beam_accum = None

        # 从opt.model读取模型的各种参数
        if opt.verbose:
                    print('Loading model from %s' % opt.model)
        checkpoint = torch.load(opt.model,
                               map_location=lambda storage, loc: storage)

        if opt.verbose:
                    print('Done')

        # 提取词典类型
        model_opt = checkpoint['opt']
        self.src_dict = checkpoint['dicts']['src']
        self.tgt_dict = checkpoint['dicts']['tgt']
        self._type = model_opt.encoder_type \
            if "encoder_type" in model_opt else "text"


        # 构建新模型框架
        encoder = onmt.Models.Encoder(model_opt, self.src_dict)
        decoder = onmt.Models.Decoder(model_opt, self.tgt_dict)

        # 从中间层到目标词汇的映射框架
        generator = onmt.Models.Generator(model_opt.rnn_size, self.tgt_dict)
        model = onmt.Models.NMTModel(encoder, decoder, generator)\

        #~ for k, v in checkpoint['model'].items():
                    #~ print k

        model_state_dict = {k: v for k, v in checkpoint['model'].items()
                                                            if 'criterion' not in k}

        #~ generator = nn.Sequential(
            #~ nn.Linear(model_opt.rnn_size, self.tgt_dict.size()),
            #~ nn.LogSoftmax())

        # 给模型参数进行赋值
        model.load_state_dict(model_state_dict)
        #~ generator.load_state_dict(checkpoint['generator'])

        if opt.cuda:
            model.cuda()
            generator.cuda()
        else:
            model.cpu()
            generator.cpu()


        self.model = model
        self.model.eval()

    def buildData(self, srcBatch, goldBatch):
        # This needs to be the same as preprocess.py.
        if self._type == "text":
            srcData = [self.src_dict.convertToIdx(b,
                                                  onmt.Constants.UNK_WORD)
                       for b in srcBatch]
        elif self._type == "img":
            srcData = [transforms.ToTensor()(
                Image.open(self.opt.src_img_dir + "/" + b[0]))
                       for b in srcBatch]

        tgtData = None
        if goldBatch:
            tgtData = [self.tgt_dict.convertToIdx(b,
                       onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD,
                       onmt.Constants.EOS_WORD) for b in goldBatch]

        return onmt.Dataset(srcData, tgtData, self.opt.batch_size,
                            self.opt.cuda, volatile=True,
                            data_type=self._type, balance=False)

    def _getBatchSize(self, batch):
        if self._type == "text":
            return batch.size(1)
        else:
            return batch.size(0)

    def get_hidden_batch(self, srcBatch, tgtBatch):
        # 1) run the encoder on the src
        # 其中encoder的输入的size是：[seq_len, batch, input_size], 这里seq_len即是numWords
        # 其中context是[seq_len, batch, hidden_size * num_directions]
        # encStates是一个tupel，他包括:
        #  - h_0 (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t=seq_len
        #  - c_0 (num_layers * num_directions, batch, hidden_size): tensor containing the cell state for t=seq_len
        encStates, context = self.model.encoder(srcBatch)

        # 这里的srcBatch原本是dataset的输出，所以应该是(src, lengths)，下面这一步取出其中src的内容。
        srcBatch = srcBatch[0]
        batchSize = self._getBatchSize(srcBatch)

        # 获得RNN每层的节点数
        rnnSize = context.size(2)

        # 转换encState的维度为：[layers * batch * (directions * dim)]
        encStates = (self.model._fix_enc_hidden(encStates[0]), self.model_fix_enc_hidden(encStates[1]))

        decoder = self.model.decoder
        attentionLayer = decoder.attn

        # 如果类型为text，且batchSize大于1，则使用mask，原因未知？？？？？
        # 这个mask 将被用于decoder中的Attention，目的是使得attention能够忽略掉输入句子的padding的部分内容。
        # 但为什么仅在batchSize大于1的时候，才使用呢？？？
        useMasking = (self._type == 'text' and batchSize > 1)
        padMask = None
        if useMasking:
            padMask = srcBatch.data.eq(onmt.Constants.PAD).t() #标记pad的内容

        def mask(padMask):
            if useMasking:
                attentionLayer.appleMask(padMask)

        decStates = encStates
        # 初始化一个decoder的输出，
        decOut = self.model.make_init_decoder_output(context)
        mask(padMask)
        initOutput = self.model.make_init_decoder_output(context)
        # decoder的输出使outputs, hidden, atten, 前两者同一般的rnn输出，atten是nn.softmax()的输出
        # 关于tgtBatch，应该是一个size: [numWords, batchSize]的Variable(Datasetl类里面进行了转换)
        # globalAttention的参数是input: batch x hidden_size，context: batch x seq_len x hidden_size。
        # 对应输出的attn是batch X seq_len
        # 所以decoder的输出output, hidden, attn的size应该分别是：
        # [seq_len X batch X hidden_size], [num_layers X batch X hidden_size], [batch X seq_len]
        decOut, decStates, attn = self.model.decoder(tgtBatch[:-1], decStates, context, initOutput)

        return decOut, decStates, attn

    def get_hidden(self, srcBatch, goldBatch):
        # 把单词转化成对应的index，然后放进dataset中进行包装
        dataset = self.buildData(srcBatch, goldBatch)
        # 获得第一个Batch
        src, tgt, indices = dataset[0]
        batchSize = self._getBatchSize(src[0])

        # 扔到translateBatch方法里面进行翻译, 这里src，tgt都是tensor类型维度为batchSize*numWord
        decOut, decStates, attn = self.get_hidden_batch(src, tgt)
             
        return decOut, decStates, attn

In [19]:
from torch.autograd import Variable

In [20]:
x = Variable(torch.ones(2, 2), requires_grad = True)
x

Variable containing:
 1  1
 1  1
[torch.FloatTensor of size 2x2]

In [25]:
b = x.data
b


 1  1
 1  1
[torch.FloatTensor of size 2x2]

In [27]:
for i in range(1,5):
    print(i)

1
2
3
4


In [3]:
import numpy as np

In [4]:
file_sys = file("./data/hidden_sys")
data_sys = np.load(file_sys)
data_sys.shape
file_sys.close()

In [5]:
file_ref = open("./data/hidden_ref")
data_ref = np.load(file_ref)
data_ref.shape
file_ref.close()

In [9]:
data_scores = []
with open("./data/data_scores") as fi:
    for line in fi:
        data_scores.append(float(line.strip()))

In [10]:
data_scores

[-1.70434875861064,
 1.12727142153235,
 -0.872401325076093,
 -0.520636691467046,
 0.0293189192586597,
 -1.67083618888355,
 -1.54107458193584,
 -1.26716970413938,
 1.14801144278375,
 -0.126473141789861,
 1.08998382718802,
 -1.45083864991872,
 -1.30334660957726,
 0.909684951356344,
 -0.865310032116374,
 -1.04805674497439,
 1.03362830429051,
 -1.30334660957726,
 0.473968273650239,
 0.0655234422069162,
 -2.31501818674212,
 -1.30040055102541,
 -1.26443616430169,
 1.24609931070437,
 0.0048710057016938,
 0.339556020247105,
 -0.553215740838215,
 -0.284941429727458,
 -0.740862210579549,
 -1.44105725859061,
 -0.264247987380428,
 0.0386786043300455,
 0.71831182779596,
 -1.26443616430169,
 1.10910099750817,
 0.0686014628819244,
 1.11418127557441,
 0.253071201445601,
 0.419182111938213,
 -1.80918239815969,
 -0.229408504480094,
 -0.123356240182688,
 -0.0496745081911503,
 1.22217430718766,
 -1.1248553785731,
 -0.985337863893415,
 -1.15989655423098,
 -0.0139082445291922,
 -1.54107458193584,
 -0.434365

In [7]:
import torch

In [11]:
a = torch.Tensor(data_scores)

In [8]:
b = torch.from_numpy(data_ref)

In [10]:
c = torch.from_numpy(data_sys)

In [4]:
d = torch.cat((c,b), 0)
d

NameError: name 'c' is not defined

In [17]:
tmp = zip(b, d, a)

In [18]:
import random

In [21]:
random.seed(9783)

In [12]:
type(a)

torch.FloatTensor