# 1. 数据分析及任务要求

## 1.1 数据预处理
本章训练的是5-gram模型，表示在PaddlePaddle训练时，每条数据的前4个词用来预测第5个词。PaddlePaddle提供了对应PTB数据集的python包paddle.dataset.imikolov，自动做数据的下载与预处理，方便大家使用。

预处理会把数据集中的每一句话前后加上开始符号\<s>以及结束符号\<e>。然后依据窗口大小（本教程中为5），从头到尾每次向右滑动窗口并生成一条数据。

这里需要注意三个关键点：<br>
1. 句间标记 - 句首的 \<s> 和句尾的 \<e>
2. 窗口大小 - 类似于 CNN 中卷积核的大小
3. 步长 - 每次窗口移动的长度；一句话最终生成的数据条数为 (句子长度 − 窗口大小) / 步长 + 1（向下取整，句子长度包含 \<s> 和 \<e>）

最后，每个输入会按其单词在字典里的位置，转化成整数的索引序列，作为PaddlePaddle的输入。

![avatar](https://githubraw.cdn.bcebos.com/PaddlePaddle/book/develop/04.word2vec/image/nnlm.png?raw=true)


# 2 编程实现
## 2.1 导入必要的包

In [1]:
import paddle as paddle
import paddle.fluid as fluid
import numpy as np
import six
import math
from __future__ import print_function

# Switch Paddle into static-graph mode before any fluid layers are declared below.
paddle.enable_static()

## 2.2 准备数据


In [2]:
# Model and training hyper-parameters.
embed_size = 32     # dimensionality of each word embedding
hidden_size = 256   # size of the hidden layer
gram_size = 5       # n-gram window size (words per sample)
pass_num = 1        # number of training passes
batch_size = 128    # samples consumed by each program run

In [3]:
# Load the word -> index dictionary and record its size.
# dic_size is needed later as the first dimension of the embedding tables.
word_dic  = paddle.dataset.imikolov.build_dict()
dic_size = len(word_dic)
# 2 搭建模型

In [4]:
# Wrap the imikolov train/test sample generators into mini-batch readers.
def _batched_ngrams(reader_creator):
    """Return a reader that yields ``batch_size`` n-gram samples per batch."""
    return paddle.batch(
        reader_creator(word_dic, gram_size), batch_size=batch_size
    )


train_reader = _batched_ngrams(paddle.dataset.imikolov.train)

test_reader = _batched_ngrams(paddle.dataset.imikolov.test)

API "paddle.dataset.imikolov.train" is deprecated since 2.0.0, and will be removed in future versions. Please use "paddle.text.datasets.Imikolov" instead.
reason: Please use new dataset API which supports paddle.io.DataLoader 
  paddle.dataset.imikolov.train(word_dic,gram_size),batch_size = batch_size
API "paddle.dataset.imikolov.test" is deprecated since 2.0.0, and will be removed in future versions. Please use "paddle.text.datasets.Imikolov" instead.
reason: Please use new dataset API which supports paddle.io.DataLoader 
  paddle.dataset.imikolov.test(word_dic,gram_size),batch_size = batch_size


In [5]:
# Pull the first mini-batch from a fresh training reader for inspection.
sample = next(train_reader())

In [6]:
# Echo the word -> index dictionary (notebook cell output).
word_dic

{b'the': 0,
 b'<unk>': 1,
 '<e>': 2,
 '<s>': 3,
 b'N': 4,
 b'of': 5,
 b'to': 6,
 b'a': 7,
 b'in': 8,
 b'and': 9,
 b"'s": 10,
 b'for': 11,
 b'that': 12,
 b'$': 13,
 b'is': 14,
 b'it': 15,
 b'said': 16,
 b'on': 17,
 b'at': 18,
 b'by': 19,
 b'as': 20,
 b'from': 21,
 b'million': 22,
 b'with': 23,
 b'mr.': 24,
 b'was': 25,
 b'be': 26,
 b'its': 27,
 b'are': 28,
 b'he': 29,
 b'but': 30,
 b'has': 31,
 b'an': 32,
 b"n't": 33,
 b'have': 34,
 b'will': 35,
 b'new': 36,
 b'or': 37,
 b'company': 38,
 b'they': 39,
 b'this': 40,
 b'which': 41,
 b'year': 42,
 b'would': 43,
 b'about': 44,
 b'market': 45,
 b'says': 46,
 b'more': 47,
 b'were': 48,
 b'had': 49,
 b'billion': 50,
 b'his': 51,
 b'their': 52,
 b'up': 53,
 b'one': 54,
 b'u.s.': 55,
 b'than': 56,
 b'stock': 57,
 b'who': 58,
 b'been': 59,
 b'some': 60,
 b'also': 61,
 b'other': 62,
 b'share': 63,
 b'not': 64,
 b'we': 65,
 b'corp.': 66,
 b'when': 67,
 b'if': 68,
 b'shares': 69,
 b'last': 70,
 b'all': 71,
 b'i': 72,
 b'president': 73,
 b'years': 74,

In [7]:
# Decode the first sample of the batch back into words to sanity-check the data.
# Reverse lookup table: index -> word.
di = {idx: word for word, idx in word_dic.items()}


def _as_text(word):
    """Return *word* as str; the dictionary mixes bytes and str keys."""
    return word.decode('utf-8') if isinstance(word, bytes) else word


# Each word is prefixed with a space, so the printed line starts with one.
se = ''.join(' ' + _as_text(di[idx]) for idx in sample[0])
print(se)

 <s> <unk> <unk> <unk> <unk>


## 2.3 定义模型结构的参数--训练轮数及超参等
1、更大的BATCH_SIZE将使得训练更快收敛，但也会消耗更多内存。<br/>
2、paddle不必再手动计算词向量。PaddlePaddle提供了一个内置的方法fluid.layers.embedding，我们就可以直接用它来构造 N-gram 神经网络。<br>
3、因为词向量比较稀疏，我们传入参数 is_sparse=True，可以加速稀疏矩阵的更新。<br>
本次学习模型如下：
![avatar](https://githubraw.cdn.bcebos.com/PaddlePaddle/book/develop/04.word2vec/image/ngram.png?raw=true)

In [8]:
# N-gram neural network definition
def N_GramNet(words, is_sparse=True):
    """Build the N-gram network and return its softmax prediction.

    Args:
        words: indexable collection of input variables; items 0-3 are each
            passed to a separate ``fluid.layers.embedding`` call.
        is_sparse: forwarded to every embedding layer.

    Returns:
        The output of the final softmax ``fc`` layer, whose size is
        ``dic_size``.
    """
    # One embedding lookup per word position, created in positional order.
    context_embeds = [
        fluid.layers.embedding(
            input=words[position],
            size=[dic_size, embed_size],
            dtype='float32',
            is_sparse=is_sparse,
        )
        for position in range(4)
    ]

    # Concatenate the four word vectors along axis 1.
    merged = fluid.layers.concat(input=context_embeds, axis=1)

    # Sigmoid hidden layer followed by a softmax layer of size dic_size.
    hidden = fluid.layers.fc(input=merged, size=hidden_size, act='sigmoid')
    return fluid.layers.fc(input=hidden, size=dic_size, act='softmax')

In [9]:
# Input variables: the label plus the four word inputs.
label = fluid.layers.data(name='label', shape=[1], dtype='int64')


def _word_input(name):
    """Declare one int64 input variable of shape [1] called *name*."""
    return fluid.layers.data(name=name, shape=[1], dtype='int64')


word01 = _word_input('word_01')
word02 = _word_input('word_02')
word03 = _word_input('word_03')
word04 = _word_input('word_04')

## 2.4 定义损失函数

In [10]:
def cost_func(prediction):
    """Return the mean cross-entropy of *prediction* against ``label``.

    ``label`` is the module-level input variable, not a parameter.
    """
    per_sample = fluid.layers.cross_entropy(input=prediction, label=label)
    return fluid.layers.mean(per_sample)


## 2.5 定义优化方法

In [11]:
def optimizer_func():
    """Create the Adagrad optimizer (learning rate 0.01) used for training."""
    adagrad = fluid.optimizer.AdagradOptimizer(learning_rate=0.01)
    return adagrad

## 2.6 模型组装

In [12]:
def train_model():
    """Assemble the training graph.

    Builds the network on the four module-level word inputs, adds the
    accuracy and loss ops, and lets the optimizer minimize the loss.

    Returns:
        Tuple ``(avg_cost, acc, prediction)``.
    """
    prediction = N_GramNet([word01, word02, word03, word04])
    acc = fluid.layers.accuracy(prediction, label=label)
    avg_cost = cost_func(prediction)
    optimizer_func().minimize(avg_cost)
    return avg_cost, acc, prediction

In [13]:
# Build the graph once and keep handles to the loss, accuracy and prediction variables.
avg_cost,acc ,prediction = train_model()

# 3. 模型训练

In [14]:
# Choose the device to run on
use_cuda = False # True selects CUDAPlace(0); False (as here) runs on the CPU
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

# Create the executor
executor = fluid.Executor(place)

# Clone the main program for evaluation
# NOTE(review): train_model() has already called optimizer.minimize() on this
# program; confirm against the Program.clone docs that cloning after minimize
# with for_test=True is acceptable here.
test_program = fluid.default_main_program().clone(for_test=True)

# Run the startup program to initialise the model
executor.run(fluid.default_startup_program())

[]

In [15]:
# Names of the variables to feed, in the order the reader yields them per sample.
feed_list = ['word_01', 'word_02', 'word_03', 'word_04', 'label']

# Resolve each name to its variable in the main program's global block.
main_block = fluid.default_main_program().global_block()
feed_var_list = [main_block.var(name) for name in feed_list]

feeder = fluid.DataFeeder(place=place, feed_list=feed_var_list)

In [16]:
for i in range(pass_num):
    # Training: run the main program once per mini-batch.
    for j, data in enumerate(train_reader(), 1):
        train_acc, train_avg_cost = executor.run(
            fluid.default_main_program(),
            feed=feeder.feed(data),
            fetch_list=[acc, avg_cost]
        )
        # Log every 100th batch.
        if j % 100 == 0:
            print('pass:{} ,iterator : {} ,acc is : {}, avg_cost is {}'.format(i, j, train_acc, train_avg_cost))

    # Evaluation: run the cloned test program over the test set.
    for j, data in enumerate(test_reader(), 1):
        test_acc, test_avg = executor.run(
            test_program, feed=feeder.feed(data), fetch_list=[acc, avg_cost]
        )
        if j % 100 == 0:
            # Report the test metrics just fetched, not the last training batch's.
            print('pass:{} ,iterator : {} ,acc is : {}, avg_cost is {}'.format(i, j, test_acc, test_avg))



pass:0 ,iterator : 100 ,acc is : [0.09375], avg_cost is [5.461892]
pass:0 ,iterator : 200 ,acc is : [0.140625], avg_cost is [5.1464887]
pass:0 ,iterator : 300 ,acc is : [0.125], avg_cost is [5.0911307]
pass:0 ,iterator : 400 ,acc is : [0.1484375], avg_cost is [5.161768]
pass:0 ,iterator : 500 ,acc is : [0.1328125], avg_cost is [5.150032]
pass:0 ,iterator : 600 ,acc is : [0.25], avg_cost is [4.1022882]
pass:0 ,iterator : 700 ,acc is : [0.1875], avg_cost is [4.7641973]
pass:0 ,iterator : 800 ,acc is : [0.140625], avg_cost is [4.6442866]
pass:0 ,iterator : 900 ,acc is : [0.1484375], avg_cost is [5.2562394]
pass:0 ,iterator : 1000 ,acc is : [0.15625], avg_cost is [5.1771727]
pass:0 ,iterator : 1100 ,acc is : [0.125], avg_cost is [5.1837196]
pass:0 ,iterator : 1200 ,acc is : [0.1171875], avg_cost is [5.4172225]
pass:0 ,iterator : 1300 ,acc is : [0.2421875], avg_cost is [4.2972817]
pass:0 ,iterator : 1400 ,acc is : [0.140625], avg_cost is [5.4707174]
pass:0 ,iterator : 1500 ,acc is : [0.1406

In [17]:
# Export the model for inference.
import os

model_dir = '../model/word_embedding.n_gram.model/'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Names of the four word inputs to feed, and the variable to fetch.
feed_names = ['word_01', 'word_02', 'word_03', 'word_04']
fetch_vars = [prediction]
fluid.io.save_inference_model(model_dir, feed_names, fetch_vars, executor)

['save_infer_model/scale_0.tmp_0']

In [33]:
# Save the parameters of the main program to their own directory.
import os

# Use a dedicated name so that model_dir keeps pointing at the inference-model
# directory, which the load_inference_model call further down reads.
params_dir = '../model/word_embedding.n_gram.model.params/'
if not os.path.exists(params_dir):
    os.makedirs(params_dir)

fluid.io.save_params(
    executor, params_dir, fluid.default_main_program()
)

In [18]:
# Separate executor, used by the model-loading cell further down.
exe = fluid.Executor(place)

In [1]:
import numpy

In [2]:
# Load the saved inference model inside its own scope and start building
# LoD-tensor inputs for it (the cell is unfinished: only the first word is built).
# NOTE(review): this cell relies on fluid, exe, place and model_dir from earlier
# cells; the NameError recorded after this cell shows it was run in a session
# where fluid had not been imported.
# NOTE(review): model_dir must point at the directory written by
# save_inference_model, not the one used for save_params -- verify which
# assignment is in effect when this runs.
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
    # Use fluid.io.load_inference_model to obtain the inference program,
    # the feed variable names (feed_target_names) and the fetch targets (fetch_targets)
    [inferencer, feed_target_names,
        fetch_targets] = fluid.io.load_inference_model(model_dir, exe)

    # NOTE(review): the word labels below are from the original comments and do
    # not all agree with the dictionary dump earlier in this file -- verify.
    data1 = numpy.asarray([[211,11]], dtype=numpy.int64)  # labelled 'among'; holds two ids while the other inputs hold one -- confirm intended
    data2 = numpy.asarray([[6]], dtype=numpy.int64)  # labelled 'a'; the dump lists b'to' as 6 and b'a' as 7
    data3 = numpy.asarray([[96]], dtype=numpy.int64)  # labelled 'group'; index 96 is not in the visible dump
    data4 = numpy.asarray([[4]], dtype=numpy.int64)  # labelled 'of'; the dump lists b'N' as 4 and b'of' as 5

    # Passed as the second argument of create_lod_tensor below.
    lod = numpy.asarray([[1]], dtype=numpy.int64)

    first_word = fluid.create_lod_tensor(data1, lod, place)
    print(first_word.shape())
    # second_word = fluid.create_lod_tensor(data2, lod, place)
    # third_word = fluid.create_lod_tensor(data3, lod, place)
    # fourth_word = fluid.create_lod_tensor(data4, lod, place)

   

NameError: name 'fluid' is not defined