In [1]:
%matplotlib inline
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
import numpy as np
import gc
import keras
import matplotlib.pyplot as plt
import jieba
import os
# Set CUDA_VISIBLE_DEVICES to an empty string for this process.
# NOTE(review): presumably meant to hide all GPUs and force CPU execution;
# this runs after the keras imports above -- confirm it still takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Default size for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize']=(20, 10)

Using TensorFlow backend.


In [10]:
def data_generator(sentences, sortedcharset, char_indices, maxlen=40, batch_size=256, targets=None):
    """Yield one-hot encoded ``(X, y)`` training batches forever.

    Parameters
    ----------
    sentences : sequence of str
        Fixed-length text windows used as model input.
    sortedcharset : sequence
        The vocabulary; only its length is used (size of the one-hot axis).
    char_indices : dict
        Maps each character to its column in the one-hot axis.
    maxlen : int
        Number of time steps per sample. Windows longer than ``maxlen`` are
        truncated; shorter ones leave the remaining steps all-zero.
    batch_size : int
        Rows per yielded batch. Values below 1 fall back to 10.
    targets : sequence, optional
        ``targets[i]`` is the character that follows ``sentences[i]``.
        When omitted, the notebook-level ``next_chars`` list is used so that
        existing calls keep working.

    Yields
    ------
    (X, y) : tuple of float32 arrays
        ``X`` has shape ``(batch_size, maxlen, len(sortedcharset))`` and
        ``y`` has shape ``(batch_size, len(sortedcharset))``.

    Raises
    ------
    ValueError
        If ``sentences`` is empty (raised on the first ``next()``).

    Notes
    -----
    One epoch is ``len(sentences) // batch_size`` batches; samples that do not
    fill a complete batch are skipped for that epoch. After each epoch the
    order is reshuffled. If there are fewer sentences than ``batch_size`` the
    single partial batch is yielded with its unused rows left all-zero.
    """
    if batch_size < 1:
        batch_size = 10
    if targets is None:
        # Backward compatibility: fall back to the notebook-level list.
        targets = next_chars

    n_samples = len(sentences)
    if n_samples == 0:
        raise ValueError("data_generator needs at least one sentence")

    vocab_size = len(sortedcharset)
    number_of_batches = n_samples // batch_size
    shuffle_index = np.arange(n_samples)
    np.random.shuffle(shuffle_index)
    counter = 0

    while True:
        index_batch = shuffle_index[batch_size * counter:batch_size * (counter + 1)]
        # Allocate directly as float32; no intermediate boolean arrays needed.
        X = np.zeros((batch_size, maxlen, vocab_size), dtype=np.float32)
        y = np.zeros((batch_size, vocab_size), dtype=np.float32)
        for row, sample_idx in enumerate(index_batch):
            for t, char in enumerate(sentences[sample_idx][:maxlen]):
                X[row, t, char_indices[char]] = 1
            # The label must be looked up by the shuffled sample index, not by
            # the row position inside the batch.
            y[row, char_indices[targets[sample_idx]]] = 1
        counter += 1
        yield X, y
        # Start a new epoch only once every full batch has been served.
        if counter >= number_of_batches:
            np.random.shuffle(shuffle_index)
            counter = 0
            
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reweighted by temperature.

    Parameters
    ----------
    preds : array-like of float
        Probabilities (e.g. a softmax output) over the vocabulary.
    temperature : float
        Values below 1 sharpen the distribution, values above 1 flatten it.
        ``0`` selects the most likely index deterministically.

    Returns
    -------
    int
        The sampled index into ``preds``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit case of an infinitely sharp distribution: greedy choice.
        return np.argmax(preds)
    # log(0) is -inf, which is the intended weight for impossible entries;
    # silence the divide-by-zero warning it would otherwise emit.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0: keeps exp() from underflowing every
    # entry to 0 (and the normalisation from becoming 0/0) at low temperature.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [7]:
np.random.seed(82832)
file_path = "/data1/home/weisting/Word-Generator/1.txt"

# Read the corpus once and close the handle; the full text is the
# concatenation of the lines, so no second open() is needed.
with open(file_path, encoding='utf-8') as fo:
    alltext0 = fo.readlines()
alltext = ''.join(alltext0)
print(len(set(alltext)))

# Model the text at the single-character level: collect every distinct
# character and build the character <-> index lookup tables.
sortedcharset = sorted(set(alltext))
char_indices = dict((c, i) for i, c in enumerate(sortedcharset))
indices_char = dict((i, c) for i, c in enumerate(sortedcharset))

# Split the text into fixed-length "virtual sentences". The window length is
# usually chosen from the average number of characters per line, printed here
# for reference (line terminators are included in the count).
sentencelength = sum(len(line) for line in alltext0) / len(alltext0) if alltext0 else 0
print(sentencelength)

maxlen = 40
step = 3
# Each window of `maxlen` characters is paired with the character right after it.
window_starts = range(0, len(alltext) - maxlen, step)
sentences = [alltext[i: i + maxlen] for i in window_starts]
next_chars = [alltext[i + maxlen] for i in window_starts]
print('nb sequences:', len(sentences))

2904
273.0617760617759
nb sequences: 23561


In [11]:
# The one-hot input/target matrices are built inside the data generator, so
# the whole dataset never has to be materialised up front: each step takes
# only the samples it needs, builds the matrices and feeds them to the model.
# Build the model: a single LSTM followed by a softmax over the vocabulary.
batch_size = 20
vocab_size = len(sortedcharset)

print('Build model...')
model = Sequential([
    LSTM(256,
         batch_size=batch_size,
         input_shape=(maxlen, vocab_size),
         recurrent_dropout=0.1,
         dropout=0.1),
    # Disabled alternative kept for reference:
    #   Dense(1024, activation='relu'), Dropout(0.25)
    Dense(vocab_size),
    Activation('softmax'),
])

# Disabled alternative kept for reference: RMSprop(lr=0.01)
adamoptimizer = keras.optimizers.Adam(lr=1e-4)
model.compile(optimizer=adamoptimizer, loss='categorical_crossentropy')
print('Finished compiling')
model.summary()

train_batches = data_generator(sentences, sortedcharset, char_indices,
                               maxlen=maxlen, batch_size=batch_size)
steps = len(sentences) // batch_size
model.fit_generator(train_batches, steps_per_epoch=steps, epochs=5)

Build model...
Finished compiling
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (20, 256)                 3236864   
_________________________________________________________________
dense_2 (Dense)              (20, 2904)                746328    
_________________________________________________________________
activation_2 (Activation)    (20, 2904)                0         
Total params: 3,983,192
Trainable params: 3,983,192
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7eff83886e80>

In [12]:
# Seed the generator with a window of the corpus and sample 20 characters.
start_index = 50
sentence = alltext[start_index: start_index + maxlen]
sentence0 = sentence

generated = ''
for i in range(20):
    # Re-encode the current window on every step; otherwise the model keeps
    # predicting from the original seed and ignores what it just generated.
    # The model was built with a fixed batch dimension, so a full batch is
    # allocated and only row 0 carries data.
    x = np.zeros((batch_size, maxlen, len(sortedcharset)), dtype='float32')
    for t, char in enumerate(sentence):
        x[0, t, char_indices[char]] = 1.
    preds = model.predict(x, batch_size=batch_size, verbose=0)[0]
    next_index = sample(preds, 1.1)
    next_char = indices_char[next_index]
    generated += next_char
    # Slide the window: drop the oldest character, append the new one.
    sentence = sentence[1:] + next_char

print(sentence0)
print("=================")
print(' '.join(generated))

出於黃帝。當高陽世，陸終之子曰安，是為曹姓。周武王克殷，存先世之後，封曹俠於邾。
後 。 是 殷 秋 秋 殷 後 終 世 是 ， 曹 黃 沈 曰 當 秋 秋 沈
