# nlp-project-vqa


In [1]:
import mindspore
from mindspore.mindrecord import FileWriter
import mindspore.dataset as dataset
import numpy as np
from easydict import EasyDict
from preprocess.preprocess import *

## 1 预处理

### 1.1 预处理配置

In [2]:
# Special token used to pad sequences; the vocabulary cell reserves index 0
# for the same '<pad>' string.
padding = '<pad>'
# Central preprocessing configuration: input locations, sequence length and
# output artifact paths. All paths are relative to the notebook's working
# directory.
config = EasyDict({
	# Annotation (answer) and question JSON files for each split.
	'train_ans_path': './data/annotations/train.json',
	'train_que_path': './data/questions/train.json',
	'valid_ans_path': './data/annotations/val.json',
	'valid_que_path': './data/questions/val.json',
	'test_ans_path':  './data/annotations/test.json',
	'test_que_path':  './data/questions/test.json',
	# Image path prefixes handed to read_image (presumably completed with an
	# image id and extension by that helper -- TODO confirm).
	# NOTE(review): the test prefix reuses 'COCO_val2014_' under images/test;
	# confirm this is intentional.
	'train_img_path': './data/images/train/COCO_train2014_',
	'test_img_path': './data/images/test/COCO_val2014_',
	'val_img_path': './data/images/val/COCO_val2014_',
	# Target length passed to get_vec_and_pad for question sequences.
	'max_length': 25,
	# Where the word->index and index->word dictionaries are saved via np.save.
	'dict_path': './mindrecord/dict.npy',
	'idx_word_dict_path': './mindrecord/idx_word_dict.npy',
	# Forwarded as the second argument of generate_mindrecord.
	'num_splits': 1,
	# Output MindRecord files, one per split.
	'train_mindrecord_path': './mindrecord/train.mindrecord',
	'valid_mindrecord_path': './mindrecord/valid.mindrecord',
	'test_mindrecord_path':  './mindrecord/test.mindrecord',
})

### 1.2 读取数据

注: 只保留答案长度为 1 的 VQA(图像-问题-答案)组合。

In [3]:
# Load the three parallel inputs (images, questions, answers) for every split.
# get_list receives the question file first, then the annotation file.
# NOTE(review): the filtering to length-1 answers mentioned in the notes is
# presumably done inside get_list -- confirm in preprocess/preprocess.py.
train_images, train_questions, train_answers = get_list(config.train_que_path, config.train_ans_path)
valid_images, valid_questions, valid_answers = get_list(config.valid_que_path, config.valid_ans_path)
test_images,  test_questions,  test_answers  = get_list(config.test_que_path,  config.test_ans_path)

In [4]:
# Pool questions and answers from all three splits; the pooled collections are
# what the vocabulary is built from below.
# Assumes get_list returns Python lists so that `+` concatenates -- TODO confirm.
total_questions = train_questions + valid_questions + test_questions
total_answers = train_answers + valid_answers + test_answers

### 1.3 构建词典

In [5]:
# Build the word -> index vocabulary.
# Index 0 is reserved for the padding token defined next to the config, so the
# token string lives in exactly one place instead of being re-typed here.
word_dict = {padding: 0}
# Extend the vocabulary with the pooled questions first, then the pooled
# answers; add_word_into_dict returns the updated dictionary.
for corpus in (total_questions, total_answers):
    word_dict = add_word_into_dict(corpus, word_dict)

In [6]:
# Build the reverse lookup (index -> word) by inverting word_dict.
idx_word_dict = {index: word for word, index in word_dict.items()}

In [7]:
# Persist both dictionaries so later cells (and later sessions) can reload them.
import os

# np.save does not create missing parent directories, so make sure the output
# folders exist before the first write.
for dict_file in (config.dict_path, config.idx_word_dict_path):
    os.makedirs(os.path.dirname(dict_file) or '.', exist_ok=True)

# A Python dict is stored by np.save as a pickled 0-d object array; reading it
# back therefore needs np.load(..., allow_pickle=True).item().
np.save(config.dict_path, word_dict)
np.save(config.idx_word_dict_path, idx_word_dict)

### 1.4 向量化 & 补齐长度

In [8]:
# Encode every split with the shared vocabulary and pad to a fixed length.
# Questions use config.max_length as the target length.
train_questions_vec, valid_questions_vec, test_questions_vec = [
    get_vec_and_pad(questions, word_dict, config.max_length)
    for questions in (train_questions, valid_questions, test_questions)
]

# Answers use a target length of 1.
train_answers_vec, valid_answers_vec, test_answers_vec = [
    get_vec_and_pad(answers, word_dict, 1)
    for answers in (train_answers, valid_answers, test_answers)
]


# Load the images of each split through read_image (given the image entries
# returned by get_list and the split's path prefix) and cache the result on
# disk with np.save.
# np.save appends the ".npy" extension because the given file names lack it.
# NOTE(review): each split is held fully in memory before saving -- confirm
# this fits in RAM for the full dataset.
train_images_list = read_image(train_images, config.train_img_path)
np.save('./mindrecord/train_images_list', train_images_list)

valid_images_list = read_image(valid_images, config.val_img_path)
np.save('./mindrecord/valid_images_list', valid_images_list)

test_images_list = read_image(test_images, config.test_img_path)
np.save('./mindrecord/test_images_list', test_images_list)

ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
ready
read

### 1.5 生成MindRecord

In [10]:
# Write one MindRecord file per split (train, valid, test -- in that order)
# from the split's images, encoded questions and encoded answers.
# NOTE(review): re-running this cell with existing output files may fail,
# depending on how generate_mindrecord opens its writer -- confirm.
for record_path, images, questions_vec, answers_vec in (
    (config.train_mindrecord_path, train_images_list, train_questions_vec, train_answers_vec),
    (config.valid_mindrecord_path, valid_images_list, valid_questions_vec, valid_answers_vec),
    (config.test_mindrecord_path, test_images_list, test_questions_vec, test_answers_vec),
):
    generate_mindrecord(record_path, config.num_splits, images, questions_vec, answers_vec)

train
valid
test


## 2 加载数据

### 2.1 加载词典

In [11]:
# Reload the vocabulary dictionaries written by the preprocessing step.
# They were saved as pickled 0-d object arrays, hence allow_pickle=True and
# .item() to get the original dict back.
# NOTE: allow_pickle=True can execute arbitrary code while unpickling; only
# load files produced by this notebook or another trusted source.
word_dict = np.load(config.dict_path, allow_pickle=True).item()
idx_word_dict = np.load(config.idx_word_dict_path, allow_pickle=True).item()

### 2.2 训练配置

In [12]:
# Training hyper-parameters; both values are forwarded to generate_dataset
# when the datasets are created.
train_config = EasyDict({
	'batch_size': 32,
	'epoch_size': 20
})

### 2.3 生成数据集

In [15]:
# Create one dataset per split from its MindRecord file.
# Arguments: MindRecord path, batch size, and a third value that is
# train_config.epoch_size for train/valid and 1 for test.
# NOTE(review): the third argument is presumably a repeat count -- confirm in
# generate_dataset, and check that repeating the validation set is intended.
train_dataset = generate_dataset(config.train_mindrecord_path, train_config.batch_size, train_config.epoch_size)
valid_dataset = generate_dataset(config.valid_mindrecord_path, train_config.batch_size, train_config.epoch_size)
test_dataset  = generate_dataset(config.test_mindrecord_path, train_config.batch_size, 1)