In [1]:

with open('jay_lyric.txt', 'r',encoding='gbk') as f:
    corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    print(corpus_chars[:40])
    # 字符转索引
    idx_to_char = list(set(corpus_chars))
    char_to_id = dict([char, i] for i, char in enumerate(idx_to_char))
    vocsize = len(char_to_id)
    print('字典长度：{}'.format(vocsize))

corpus_indices = [char_to_id[char] for char in corpus_chars]
print(corpus_indices[:20])

歌曲名称：反方向的钟 搜索试听   所属专辑：Jay同名专辑   查看该专辑其它
字典长度：2878
[488, 1880, 1159, 1983, 2394, 2092, 1442, 1712, 1158, 2330, 2872, 1191, 203, 1660, 378, 2872, 2872, 2872, 1266, 279]


In [3]:
# 时序数据采样
from mxnet import nd
# print(corpus_indices.shape)
# corpus_indices = np.ndarray(corpus_indices)
def data_iter_consective(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = nd.array(corpus_indices)
    data_len = len(corpus_indices)
    batch_len = data_len //batch_size
    indices = corpus_indices[0:batch_size*batch_len].reshape((batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i*num_steps
        X = indices[:, i: i+num_steps]
        Y = indices[:, i+1: i+num_steps+1]
        yield i, X, Y
# for e, X, Y in data_iter_consective(corpus_indices[0:40], batch_size=2, num_steps=6):
#     print('e:', e, 'X:',X, '\nY:', Y)


In [6]:
# model 
# corpus_indices, char_to_id, idx_to_char, vocsize
import gluonbook as gb
from mxnet import autograd, nd
import math 
from mxnet.gluon import loss as gloss
import time 

def to_onehot(X, size):
    return [nd.one_hot(x, size) for x in X.T]

### 初始化模型参数 
num_inputs, num_hiddens, num_outputs = vocsize, 256, vocsize
ctx = gb.try_gpu()
def get_params():
    def _one(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)
    #隐藏层参数
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = nd.zeros(num_hiddens, ctx=ctx)
    #输出层参数
    W_hq = _one((num_hiddens, num_outputs))
    b_q = nd.zeros(num_outputs, ctx=ctx)
    # 梯度
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.attach_grad()
    return params

### 定义模型
def init_rnn_state(batch_size, num_hiddens, ctx):
    return nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)

def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H = state
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)

def predict_rnn(prefix, num_chars, params, num_hiddens, vocsize, ctx, idx_to_char, char_to_id):
    state = init_rnn_state(1, num_hiddens, ctx)
    output = [char_to_id[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        X = to_onehot(nd.array([output[-1]], ctx=ctx), vocsize)
        (Y, state) = rnn(X, state, params)
        if t < len(prefix) - 1:
            output.append(char_to_id[prefix[t+1]])
        else:
            output.append(Y[0].argmax(axis=1).asscalar())
    return ''.join([idx_to_char[int(i)] for i in output])
# X = nd.arange(10).reshape((f, 5))
# state = init_rnn_state(X.shape[0], num_hiddens, ctx)
# inputs = to_onehot(X.as_in_context(ctx), vocsize)
# params = get_params()
# outputs, state_new = rnn(inputs, state, params)
# # print(state_new)
# print(len(outputs), outputs[1].shape, state_new[0].shape)

def grad_clip(params, theta, ctx):
    norm = nd.array([0.0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param[:] *= theta/norm
    
# predict_rnn('分开', 10, params, num_hiddens, vocsize, ctx, idx_to_char, char_to_id)

def train_predict_rnn(num_hiddens, vocsize, ctx, corpus_indices, num_epochs, num_steps, lr, is_random_iter,
                      clipping_threta, batch_size, pred_peroid, pred_len, prefixes):
    data_iter_fn = data_iter_consective
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    print('start')
    for epoch in range(num_epochs):
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        loss_sum, start = 0.0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for t, (index, X, Y) in enumerate(data_iter):
            if not is_random_iter:
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocsize)
#                 print(type(state))
                outputs, (state,) = rnn(inputs, state, params)
                outputs = nd.concat(*outputs, dim=0)
                y = Y.T.reshape((-1,))
#                 print("y", type(y))
#                 print("outputs", type(outputs))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clip(params, clipping_threta, ctx)
            gb.sgd(params, lr, 1)
            loss_sum += l.asscalar()
        print(epoch)
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
            epoch + 1, math.exp(loss_sum / (t + 1)), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))


In [94]:
# for t, (index, X, Y) in enumerate(data_iter_consective(corpus_indices, 32, 35, ctx)):
#     inputs = to_onehot(nd.array(X), vocsize)
#     print(type(X))

In [7]:
num_epochs, num_steps, batch_size, lr, clipping_theta = 200, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']


train_predict_rnn(num_hiddens,vocsize, ctx, corpus_indices, num_epochs,
                    num_steps, lr, False, 
                    clipping_theta, batch_size, pred_period, pred_len,
                    prefixes)


start


KeyboardInterrupt: 