## 1.导入模块

In [2]:
import moxing as mox
#mox.file.copy_parallel(src_url="obs://nlp-kim/project/data/", dst_url='./data/') 
mox.file.copy_parallel(src_url="s3://dl4nlp-my/project/data/", dst_url='./data/') 

INFO:root:Using MoXing-v1.17.3-d858ff4a
INFO:root:Using OBS-Python-SDK-3.20.9.1


In [3]:
import sys
import os
import json
import pickle as pkl
import re
from collections import Counter
import numpy as np
import random
from collections import OrderedDict
import math

import mindspore
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore.train.model import Model
from mindspore.nn.metrics import Accuracy
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.ops import operations as ops

## 2.数据预处理

### 2.1 处理函数

In [4]:
def process_sentence(sentence):
    periodStrip  = re.compile("(?!<=\d)(\.)(?!\d)")
    commaStrip   = re.compile("(\d)(\,)(\d)")
    punct        = [';', r"/", '[', ']', '"', '{', '}',
                    '(', ')', '=', '+', '\\', '_', '-',
                    '>', '<', '@', '`', ',', '?', '!']
    contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't", \
                    "couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't", \
                    "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've", \
                    "he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've", \
                    "Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's", \
                    "maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've", \
                    "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't", \
                    "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've", \
                    "she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've", \
                    "somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll", \
                    "somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've", \
                    "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've", \
                    "something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've", \
                    "there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've", \
                    "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't", \
                    "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're", \
                    "whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "wheres": "where's", "whereve": "where've", \
                    "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll", \
                    "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've", \
                    "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've", \
                    "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've", \
                    "youll": "you'll", "youre": "you're", "youve": "you've"}

    inText = sentence.replace('\n', ' ')
    inText = inText.replace('\t', ' ')
    inText = inText.strip()
    outText = inText
    for p in punct:
        if (p + ' ' in inText or ' ' + p in inText) or \
           (re.search(commaStrip, inText) != None):
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')
    outText = periodStrip.sub("", outText, re.UNICODE)
    outText = outText.lower().split()
    for wordId, word in enumerate(outText):
        if word in contractions:
            outText[wordId] = contractions[word]
    outText = ' '.join(outText)
    return outText

def process_answer(answer):
    articles = ['a', 'an', 'the']
    manualMap = { 'none': '0', 'zero': '0', 'one': '1', 'two': '2', 'three':
                  '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7',
                  'eight': '8', 'nine': '9', 'ten': '10' }
    new_answer = process_sentence(answer)
    outText = []
    for word in new_answer.split():
        if word not in articles:
            word = manualMap.setdefault(word, word)
            outText.append(word)
    return ' '.join(outText)

### 2.2 变量说明

In [5]:
#qa：question和对应的annotions
#train_question_ids：question的id的数组

#question_dict_count： question中的单词出现次数统计
#train_questions： question语句split为word的数组的数组
#answer_dict_count： answer中的单词出现次数统计
#train_answers： answer被split为word的数组的数组

#question_key：按照question中出现次数进行排序
#answer_top_k: 按照answer中出现的次数进行排序


### 2.3 预处理细节

In [6]:
f = open("./data/questions/train.json", "r")
f1 = open("./data/annotations/train.json", "r")
file = json.load(f)
file1 = json.load(f1)
annotations = file1['annotations']
train_question_ids = []
train_image_ids = []
train_questions = []
train_answers = []
question_dict_count = dict()
answer_dict_count = dict()

# 形成qa：一个字典，整理出对应question_id的annotation
qa = {ann['question_id']: [] for ann in annotations}
for ann in annotations:
    qa[ann['question_id']] = ann

#获取image_id question_id
for idx, item in enumerate(file['questions']):
    train_question_ids.append(item['question_id'])
    train_image_ids.append(item['image_id'])
    
    #process question
    question = item['question']
    question = process_sentence(question)
    question = question.split()
    for word in question:
        question_dict_count[word] = question_dict_count.get(word, 0) + 1
    train_questions.append(question)
    answer = qa[item['question_id']]['answers']
    answer_new = [process_answer(ans['answer']) for ans in answer]
    ans_array = []
    for ans in answer:
        ans_array.append(ans['answer'])
    for word in answer_new:
        answer_dict_count[word] = answer_dict_count.get(word, 0) + 1
    train_answers.append(ans_array)
    if idx % 10000 == 0:
        print ('finished processing %d in train' %(idx))

# sort question dict
question_count = question_dict_count.values()
sorted_index = [count[0] for count in
                sorted(enumerate(question_count),
                       key = lambda x : x[1],
                       reverse=True)]
sorted_count = sorted(question_count, reverse=True)
question_key = list(question_dict_count.keys())
# 对question_key重新排序
question_key = [question_key[idx] for idx in sorted_index]
# add '<unk>' to the begining
question_key.insert(0, '<unk>')
# '<unk>' begins at 1, 0 is reserved for empty words
question_key = dict((key, idx + 1) for idx, key in enumerate(question_key))

k = 1000
# sort answer dict and get top k answers
del answer_dict_count['']
answer_count = answer_dict_count.values()
sorted_index = [count[0] for count in
                sorted(enumerate(answer_count),
                       key = lambda x : x[1],
                       reverse=True)]
sorted_count = sorted(answer_count, reverse=True)
answer_key = list(answer_dict_count.keys())
answer_key = [answer_key[idx] for idx in sorted_index]
answer_top_k = answer_key[:k]
answer_top_k = dict((key, idx) for idx, key in enumerate(answer_top_k))

# convert words to idx and remove some
train_question_idx = []
train_answer_idx = []
train_answer_counter = []
idx_to_remove = []
for idx, answer in enumerate(train_answers):
    question_idx = [question_key[word] for word in train_questions[idx]]
    #print(question_idx)
    #print('\n')
    #print(train_questions[idx])
    train_question_idx.append(question_idx)
    answer_idx = [answer_top_k[ans] for ans in answer
                 if ans in answer_top_k]
    answer_counter = Counter(answer_idx)
    train_answer_counter.append(answer_counter)
    train_answer_idx.append(answer_idx)
    if not answer_idx:
        idx_to_remove.append(idx)
print ('%d out of %d, %f of the question in train are removed'\
    %(len(idx_to_remove), len(train_question_ids),
      len(idx_to_remove) / float(len(train_question_ids))))

# transform to array and delete all the empty answer
train_question_ids = np.array(train_question_ids)
train_image_ids = np.array(train_image_ids)
train_question_idx = np.array(train_question_idx)
train_answer_idx = np.array(train_answer_idx)
train_answer_counter = np.array(train_answer_counter)

train_question_ids = np.delete(train_question_ids, idx_to_remove)
train_image_ids = np.delete(train_image_ids, idx_to_remove)
train_question_idx = np.delete(train_question_idx, idx_to_remove)
train_answer_idx = np.delete(train_answer_idx, idx_to_remove)
train_answer_counter = np.delete(train_answer_counter, idx_to_remove)

# reshuffle the train data
idx_shuffle = list(range(train_question_ids.shape[0]))
random.shuffle(idx_shuffle)
train_question_ids = train_question_ids[idx_shuffle]
train_image_ids = train_image_ids[idx_shuffle]
train_question_idx = train_question_idx[idx_shuffle]
train_answer_idx = train_answer_idx[idx_shuffle]
train_answer_counter = train_answer_counter[idx_shuffle]

# the most frequent as label
train_answer_label = [counter.most_common(1)[0][0]
                      for counter in train_answer_counter]
train_answer_label = np.array(train_answer_label)

# transform from counter to dict
train_answer_counter = [dict(counter) for counter in train_answer_counter]
train_answer_counter = np.array(train_answer_counter)

print ('finished processing train')

finished processing 0 in train
finished processing 10000 in train
finished processing 20000 in train
finished processing 30000 in train
finished processing 40000 in train
2105 out of 44375, 0.047437 of the question in train are removed
finished processing train


### 2.4 构建词向量

In [7]:
#construct one hot vector
all_question_vector=[]
for idx,question in enumerate(train_questions):
    count = 0
    question_vector = []
    for word in question:
        count = count + 1
        if count > 10:
            break
        else:
            q_emb = np.zeros((len(question_key) + 1), dtype='int32')
            q_emb[question_key[word]] = 1
            question_vector.append(q_emb)
    while count < 10:
        padding = np.zeros((len(question_key) + 1), dtype='int32')
        question_vector.append(padding)
        count = count + 1
    all_question_vector.append(question_vector)

In [8]:
#convert word to idx
all_question_idx = []
for question in train_questions:
    count = 0
    one_question_idx = []
    
    for word in question:
        count = count + 1
        if count > 10:
            break
        else:
            one_question_idx.append(question_key[word])     
    while count < 10:
        one_question_idx.append(0)
        count = count + 1        
    all_question_idx.append(one_question_idx)
all_question_idx

[[3, 99, 1627, 0, 0, 0, 0, 0, 0, 0],
 [4, 3, 6, 16, 126, 8, 0, 0, 0, 0],
 [5, 2, 295, 1164, 7, 10, 3990, 13, 0, 0],
 [4, 3, 2, 16, 31, 0, 0, 0, 0, 0],
 [4, 3, 2, 28, 20, 0, 0, 0, 0, 0],
 [144, 164, 1628, 44, 452, 7, 2, 219, 0, 0],
 [4, 3, 7, 2, 55, 0, 0, 0, 0, 0],
 [4, 3, 8, 2, 630, 0, 0, 0, 0, 0],
 [4, 3, 2, 16, 58, 0, 0, 0, 0, 0],
 [23, 3, 2, 154, 0, 0, 0, 0, 0, 0],
 [4, 3, 2, 248, 56, 8, 0, 0, 0, 0],
 [4, 13, 3, 2, 248, 0, 0, 0, 0, 0],
 [4, 3, 2, 121, 0, 0, 0, 0, 0, 0],
 [3, 2, 63, 296, 268, 0, 0, 0, 0, 0],
 [4, 13, 133, 3, 1781, 18, 2, 229, 0, 0],
 [5, 35, 58, 3991, 0, 0, 0, 0, 0, 0],
 [5, 2, 101, 108, 18, 2994, 2, 255, 0, 0],
 [4, 13, 5, 2, 145, 0, 0, 0, 0, 0],
 [11, 12, 145, 5, 413, 8, 2, 248, 0, 0],
 [11, 12, 191, 5, 7, 2, 55, 0, 0, 0],
 [4, 13, 3, 2, 36, 0, 0, 0, 0, 0],
 [4, 5, 2, 741, 56, 8, 0, 0, 0, 0],
 [3, 2, 34, 95, 2, 72, 0, 0, 0, 0],
 [929, 516, 3, 8, 2, 3992, 274, 0, 0, 0],
 [4, 5, 2, 39, 31, 0, 0, 0, 0, 0],
 [3, 6, 10, 2134, 0, 0, 0, 0, 0, 0],
 [4, 25, 9, 49, 3, 67, 0,

In [94]:
#convert word to idx

In [95]:
x = [(idx,len(item)) for idx,item in enumerate(train_questions)]

## 3.训练过程

### 3.1 超参数设置

In [10]:
#from easydict import EasyDict as edict
options = OrderedDict()
# data related
options['data_path'] = './data/'
#options['feature_file'] = 'trainval_feat.h5'
#options['expt_folder'] = 'expt_1'
options['model_name'] = 'imageqa'
options['train_split'] = 'trainval1'
options['val_split'] = 'val2'
options['shuffle'] = True
options['reverse'] = True
options['sample_answer'] = True

options['num_region'] = 196
options['region_dim'] = 512

options['n_words'] = 13746
options['n_output'] = 1000

# structure options
options['combined_num_mlp'] = 1
options['combined_mlp_drop_0'] = True
options['combined_mlp_act_0'] = 'linear'
options['sent_drop'] = False
options['use_tanh'] = False

options['use_attention_drop'] = False

# dimensions
options['n_emb'] = 500
options['n_dim'] = 1024
options['n_image_feat'] = options['region_dim']
options['n_common_feat'] = 500
options['n_attention'] = 512

# initialization
options['init_type'] = 'uniform'
options['range'] = 0.01
options['std'] = 0.01
options['init_lstm_svd'] = False

options['forget_bias'] = np.float32(1.0)

# learning parameters
options['optimization'] = 'sgd' # choices
options['batch_size'] = 100
options['lr'] = np.float32(0.05)
options['w_emb_lr'] = np.float32(80)
options['momentum'] = np.float32(0.9)
options['gamma'] = 1
options['step'] = 10
options['step_start'] = 100
options['max_epochs'] = 50
options['weight_decay'] = 0.0005
options['decay_rate'] = np.float32(0.999)
options['drop_ratio'] = np.float32(0.5)
options['smooth'] = np.float32(1e-8)
options['grad_clip'] = np.float32(0.1)

# log params
options['disp_interval'] = 10
options['eval_interval'] = 1000
options['save_interval'] = 500

#new
options['dict_size'] = 6620

In [11]:
context.set_context(mode=context.GRAPH_MODE, device_target='Ascend', device_id=0)

### 3.2模型搭建

In [10]:
def init_weight(n, d, options):
    ''' initialize weight matrix
    options['init_type'] determines
    gaussian or uniform initlizaiton
    '''
    if options['init_type'] == 'gaussian':
        return (np.random.randn(n, d).astype(floatX)) * options['std']
    elif options['init_type'] == 'uniform':
        # [-range, range]
        return ((np.random.rand(n, d) * 2 - 1) * \
                options['range']).astype(floatX)
def ortho_weight(ndim):
    """
    Random orthogonal weights, we take
    the right matrix in the SVD.

    Remember in SVD, u has the same # rows as W
    and v has the same # of cols as W. So we
    are ensuring that the rows are
    orthogonal.
    """
    W = np.random.randn(ndim, ndim)
    u, _, _ = np.linalg.svd(W)
    return u.astype('float32')

def init_fflayer(params, nin, nout, options, prefix='ff'):
    ''' initialize ff layer
    '''
    params[prefix + '_w'] = init_weight(nin, nout, options)
    params[prefix + '_b'] = np.zeros(nout, dtype='float32')
    return params

def init_lstm_layer(params, nin, ndim, options, prefix='lstm'):
    ''' initializt lstm layer
    '''
    params[prefix + '_w_x'] = init_weight(nin, 4 * ndim, options)
    # use svd trick to initializ
    if options['init_lstm_svd']:
        params[prefix + '_w_h'] = np.concatenate([ortho_weight(ndim),
                                                  ortho_weight(ndim),
                                                  ortho_weight(ndim),
                                                  ortho_weight(ndim)],
                                                 axis=1)
    else:
        params[prefix + '_w_h'] = init_weight(ndim, 4 * ndim, options)
    params[prefix + '_b_h'] = np.zeros(4 * ndim, dtype='float32')
    # set forget bias to be positive
    params[prefix + '_b_h'][ndim : 2*ndim] = np.float32(options.get('forget_bias', 0))
    return params

# initialize the parmaters
def init_params(options):
    ''' Initialize all the parameters
    '''
    params = OrderedDict()
    n_words = options['n_words']
    n_emb = options['n_emb']
    n_dim = options['n_dim']
    n_image_feat = options['n_image_feat']
    n_common_feat = options['n_common_feat']
    n_output = options['n_output']
    n_attention = options['n_attention']

    params['w_emb'] = ((np.random.rand(n_words, n_emb) * 2 - 1) * 0.5).astype(floatX)

    params = init_fflayer(params, n_image_feat, n_dim, options,
                          prefix='image_mlp')

    # attention model based parameters
    params = init_fflayer(params, n_dim, n_attention, options,
                          prefix='image_att_mlp_1')
    params = init_fflayer(params, n_dim, n_attention, options,
                          prefix='sent_att_mlp_1')
    params = init_fflayer(params, n_attention, 1, options,
                          prefix='combined_att_mlp_1')
    params = init_fflayer(params, n_dim, n_attention, options,
                          prefix='image_att_mlp_2')
    params = init_fflayer(params, n_dim, n_attention, options,
                          prefix='sent_att_mlp_2')
    params = init_fflayer(params, n_attention, 1, options,
                          prefix='combined_att_mlp_2')


    # params for sentence image mlp
    for i in range(options['combined_num_mlp']):
        if i == 0 and options['combined_num_mlp'] == 1:
            params = init_fflayer(params, n_dim, n_output,
                                  options, prefix='combined_mlp_%d'%(i))
        elif i == 0 and options['combined_num_mlp'] != 1:
            params = init_fflayer(params, n_dim, n_common_feat,
                                  options, prefix='combined_mlp_%d'%(i))
        elif i == options['combined_num_mlp'] - 1 :
            params = init_fflayer(params, n_common_feat, n_output,
                                  options, prefix='combined_mlp_%d'%(i))
        else:
            params = init_fflayer(params, n_common_feat, n_common_feat,
                                  options, prefix='combined_mlp_%d'%(i))

    # lstm layer
    params = init_lstm_layer(params, n_emb, n_dim, options, prefix='sent_lstm')

    return params

def init_shared_params(params):
    ''' return a shared version of all parameters
    '''
    global shared_params
    shared_params = OrderedDict()
    for k, p in params.items():
        shared_params[k] = params[k]

    return shared_params

def get_lr(options, curr_epoch):
    if options['optimization'] == 'sgd':
        power = max((curr_epoch - options['step_start']) / options['step'], 0)
        power = math.ceil(power)
        return options['lr'] * (options['gamma'] ** power)  #
    else:
        return options['lr']

In [11]:
def lstm_layer(shared_params, x, mask, h_0, c_0, options, prefix='lstm'):
    ''' lstm layer:
    :param shared_params: shared parameters
    :param x: input, T x batch_size x n_emb
    :param mask: mask for x, T x batch_size
    '''
    n_emb = options['n_emb']
    n_dim = options['n_dim']
    # weight matrix for x, n_emb x 4*n_dim (ifoc)
    lstm_w_x = shared_params[prefix + '_w_x']
    # weight matrix for h, n_dim x 4*n_dim
    lstm_w_h = shared_params[prefix + '_w_h']
    lstm_b_h = shared_params[prefix + '_b_h']
    h_0 = h_0[:x.shape[1]]
    c_0 = c_0[:x.shape[1]]
    question_net = LSTM(n_emb, n_dim)
    output, (h, c) = question_net(x, (h_0, c_0))
    return h, c

In [12]:
def build_model(shared_params, options):
    #input_idx = Tensor.imatrix('input_idx')
    input_idx = Tensor()
    global empty_word
    empty_word = np.zeros((1, options['n_emb']), dtype='float32')
    w_emb_extend = Tensor.concatenate([empty_word, shared_params['w_emb']],
                                 axis=0)
    input_emb = w_emb_extend[input_idx]
    
    # get the transformed image feature
    global h_0, c_0
    h_0 = np.zeros((batch_size, n_dim), dtype='float32')
    c_0 = np.zeros((batch_size, n_dim), dtype='float32')
    h_encode, c_encode = lstm_layer(shared_params, input_emb, input_mask,
                                    h_0, c_0, options, prefix='sent_lstm')
    return h_encodem, c_encode

In [13]:
floatX = np.float32
batch_size = options['batch_size']
max_epochs = options['max_epochs']

###############
# build model #
###############
params = init_params(options)
shared_params = init_shared_params(params)

In [14]:
input_idx = np.ones((6618,100),dtype = 'int32')
shared_params['w_emb'] = ((np.random.rand(13746, 500) * 2 - 1) * 0.5).astype(floatX)
empty_word = np.zeros((1, 500), dtype='float32')
w_emb_extend = shared_params['w_emb']
input_emb = w_emb_extend[input_idx]

In [None]:
class LSTM(nn.Cell):
    def __init__(self, options, is_training=True):
        super(LSTM, self).__init__()
        if is_training:
            self.batch_size = options['batch_size']
        else:
            self.batch_size = 1
            
        self.n_dim = options['n_dim']
        self.n_emb = options['n_emb']
        self.dropout = options['drop_ratio']
        
        # TODO
        self.h = Tensor(np.zeros((1,self.batch_size, self.n_dim), dtype='float32'))
        self.c = Tensor(np.zeros((1,self.batch_size, self.n_dim), dtype='float32'))
        
        self.rnn = nn.LSTM(self.n_emb,self.n_dim,1,True,True,self.dropout)
        #self.cast = P.Cast()

    def construct(self, x):
        #x = self.cast(x, mstype.float16)
        output,(h1,c1) = self.rnn(x, (self.h,self.c))
        return output,(h1,c1)

class Question(nn.Cell):
    def __init__(self, options, is_training=True):
        super(Question, self).__init__()
        #dict_size(vocab_size)
        self.dict_size = options['dict_size']
        #n_dim (hidden_size)
        self.n_dim = options['n_dim']
        self.n_emb = options['n_emb']
        
        if is_training:
            self.batch_size = options['batch_size']
        else:
            self.batch_size = 1

        #self.trans = P.Transpose()
        #self.perm = (1, 0, 2)
        
        #HIGHLIGHT 第二个参数n_dim -> n_emb
        self.embedding = nn.Embedding(self.dict_size, self.n_emb)
        #?
        self.lstm = LSTM(options, is_training=is_training).to_float(mstype.float16)
        #self.h = Tensor(np.zeros((self.batch_size, self.n_dim)).astype(np.float16))
        #self.c = Tensor(np.zeros((self.batch_size, self.n_dim)).astype(np.float16))

    def construct(self, question_input):
        embeddings = self.embedding(question_input)
        #embeddings = self.trans(embeddings, self.perm)
        output, (hn,cn) = self.lstm(embeddings)
        return output, hn, cn


In [None]:
def fflayer(shared_params, x, options, prefix='ff', act_func='tanh'):
    ''' fflayer: multiply weight then add bias
    '''
    return nn.Tanh(mindspore.ops.dot(x, shared_params[prefix + '_w']) +
                          shared_params[prefix + '_b'])

class VQA(nn.Cell):
    def __init__(self, config, is_train=True):
        super(Seq2Seq, self).__init__()
        self.max_len = config.max_seq_length
        self.is_train = is_train

        #self.encoder = Encoder(config, is_train)
        #self.decoder = Decoder(config, is_train)
        self.expanddims = P.ExpandDims()
        self.squeeze = P.Squeeze(axis=0)
        self.argmax = P.ArgMaxWithValue(axis=int(2), keep_dims=True)
        self.concat = P.Concat(axis=1)
        self.concat2 = P.Concat(axis=0)
        self.select = P.Select()
        self.softmax = nn.Softmax()
        
        
        ##### my #####
        self.question = Question(options, is_train)
    
    
    def construct(self, src, dst):
        ### TODO:get image_feat
        
        output,h_encode,c_encode = self.question(src)
        h_encode = h_encode[0][-1]
        
        image_feat_down = fflayer(shared_params, image_feat, options,
                             prefix='image_mlp',
                              act_func=options.get('image_mlp_act',
                                                   'tanh'))

        image_feat_attention_1 = fflayer(shared_params, image_feat_down, options,
                                         prefix='image_att_mlp_1',
                                         act_func=options.get('image_att_mlp_act',
                                                              'tanh'))
        
        h_encode_attention_1 = fflayer(shared_params, h_encode, options,
                                       prefix='sent_att_mlp_1',
                                       act_func=options.get('sent_att_mlp_act',
                                                            'tanh'))  #
        combined_feat_attention_1 = image_feat_attention_1 + \
                                    h_encode_attention_1[:, None, :]
        
        ###暂时不管
        #if options['use_attention_drop']:
            #combined_feat_attention_1 = dropout_layer(combined_feat_attention_1,
                                                      #dropout, trng, drop_ratio)
            
        combined_feat_attention_1 = fflayer(shared_params,
                                            combined_feat_attention_1, options,
                                            prefix='combined_att_mlp_1',
                                            act_func=options.get(
                                                'combined_att_mlp_act',
                                                'tanh'))
        
        prob_attention_1 = self.softmax(combined_feat_attention_1[:, :, 0])
        image_feat_ave_1 = (prob_attention_1[:, :, None] * image_feat_down).sum(axis=1)

        combined_hidden_1 = image_feat_ave_1 + h_encode
        
        # second layer attention model
        image_feat_attention_2 = fflayer(shared_params, image_feat_down, options,
                                         prefix='image_att_mlp_2',
                                         act_func=options.get('image_att_mlp_act',
                                                              'tanh'))
        h_encode_attention_2 = fflayer(shared_params, combined_hidden_1, options,
                                       prefix='sent_att_mlp_2',
                                       act_func=options.get('sent_att_mlp_act',
                                                            'tanh'))
        combined_feat_attention_2 = image_feat_attention_2 + \
                                    h_encode_attention_2[:, None, :]
        
        ### 暂时不做
        #if options['use_attention_drop']:
            #combined_feat_attention_2 = dropout_layer(combined_feat_attention_2,
                                                      #dropout, trng, drop_ratio)

        combined_feat_attention_2 = fflayer(shared_params,
                                            combined_feat_attention_2, options,
                                            prefix='combined_att_mlp_2',
                                            act_func=options.get(
                                                'combined_att_mlp_act', 'tanh'))
        
        prob_attention_2 = self.softmax(combined_feat_attention_2[:, :, 0])

        image_feat_ave_2 = (prob_attention_2[:, :, None] * image_feat_down).sum(axis=1)

        return outputs

In [50]:
options.get('image_mlp_act',
                                                   'tanh')

'tanh'

In [12]:
from mindspore import dtype as mstype
test_batch = 8
all_question_idx = Tensor(np.array(all_question_idx),mstype.int32)

In [13]:
x = all_question_idx[0:8,]

In [21]:
#modify
embedding = nn.Embedding(options['dict_size'], options['n_emb'],True)
embeddings = embedding(x)

In [22]:
print(embeddings.shape)

(8, 10, 500)


In [24]:
#这里的8是batch_size
h = Tensor(np.zeros((1,8, options['n_dim']), dtype='float32'))
c = Tensor(np.zeros((1,8, options['n_dim']), dtype='float32'))
        
net = nn.LSTM(options['n_emb'],options['n_dim'],1,True,True)
output,(h1,c1) = net(embeddings, (h,c))

In [31]:
h1[0][-1]

Tensor(shape=[1024], dtype=Float32, value= [ 8.98742676e-03,  9.22393799e-03,  1.33590698e-02,  2.92968750e-03,  7.48443604e-03,  1.69849396e-03,  5.61904907e-03, -9.92584229e-03,  3.23486328e-03, -2.92205811e-03, -1.00402832e-02,  2.30216980e-03, 
 -1.04522705e-02, -4.16183472e-03, -1.07803345e-02, -2.21633911e-03,  2.01225281e-03, -8.78906250e-03, -1.07002258e-03,  4.51087952e-04,  9.27734375e-03, -9.95635986e-03, -1.16539001e-03,  4.73022461e-03, 
 -1.31225586e-02, -1.08795166e-02,  7.14874268e-03,  1.12991333e-02, -2.97164917e-03, -2.96401978e-03, -1.66320801e-02, -9.59014893e-03,  1.43966675e-02, -3.15475464e-03,  1.08413696e-02, -1.70898438e-02, 
 -1.67083740e-02,  3.79753113e-03,  7.95745850e-03,  1.24130249e-02, -1.20925903e-02,  1.27792358e-02, -1.37786865e-02, -2.83050537e-03,  9.22203064e-04, -1.03759766e-02, -1.28784180e-02,  6.46972656e-03, 
  1.08184814e-02, -1.36256218e-04,  4.58145142e-03, -1.33285522e-02, -8.91113281e-03,  7.35473633e-03,  1.01776123e-02,  1.12533569e-

In [10]:

net = nn.Embedding(20000, 768,  True)
input_data = Tensor(np.ones([8, 128]), mindspore.int32)

# Maps the input word IDs to word embedding.
output = net(input_data)
result = output.shape
print(result)



(8, 128, 768)


In [39]:
h = Tensor([[1,2,1],
            [1,1,1]], mstype.float32)
c = Tensor(np.ones((1,3,2)), mstype.float32)

In [40]:
output = mindspore.ops.dot(h,c)

In [41]:
output

Tensor(shape=[2, 1, 2], dtype=Float32, value=
[[[ 4.00000000e+00,  4.00000000e+00]],
 [[ 3.00000000e+00,  3.00000000e+00]]])

In [42]:
h.shape

(2, 3)

In [43]:
c.shape

(1, 3, 2)

In [45]:
input_x1 = Tensor(np.ones(shape=[2, 3]), mindspore.float32)
input_x2 = Tensor(np.ones(shape=[1, 3, 2]), mindspore.float32)
output = mindspore.ops.dot(input_x1, input_x2)
print(output)


[[[3. 3.]]

 [[3. 3.]]]


In [49]:
print(output.shape)
print(input_x1.shape)
print(input_x2.shape)

(2, 1, 2)
(2, 3)
(1, 3, 2)
