In [1]:
import os

import numpy as np
import jieba
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Sequential
from keras.callbacks import LambdaCallback, ModelCheckpoint

Using TensorFlow backend.


In [2]:
weight_path = 'data/model/Word2Vec_v1.4/w2v.model.bin'
model = Word2Vec.load(weight_path)

# text data preprocessing

In [3]:
def preprocess_text_data(raw_docs):
    # jieba settings
    jieba.enable_parallel(6)
    jieba.set_dictionary("data/jieba_dict/dict.txt.big")
    jieba.load_userdict("data/jieba_dict/中央機構.dict")
    jieba.load_userdict("data/jieba_dict/名人錄.dict")
    jieba.load_userdict("data/jieba_dict/專有名詞.dict")
    jieba.load_userdict("data/jieba_dict/縣市區鄉鎮.dict")
    # 用來存放分詞後的結果
    preprocessed_documents = []
    for index, raw_doc in enumerate(raw_docs, 0):
        if index % 2000 == 0:
            print("current document index:{}".format(index))
        doc = " ".join(jieba.cut(raw_doc))
        preprocessed_documents.append(doc)
    return preprocessed_documents

In [4]:
def slide_window(seq, size):
    seq_list = []
    for i in range(len(seq)):
        if i+size > len(seq):
            break
        seq_list.append(seq[i:i+size])
    return seq_list


def make_encoded_docs_window(encoded_docs, window_size):
    temp_list = []
    for doc in encoded_docs:
        seq_list = slide_window(doc, window_size)
        if len(seq_list) > 0:
            temp_list.append(seq_list)
    return np.concatenate(temp_list)

In [5]:
# 載入 corpus
corpus_path = "data/text/big_data/corpus"
file_name = os.listdir(corpus_path)[3]

with open(corpus_path + "/" + file_name, "r", encoding="utf-8") as content:
    document_list = [line.strip().replace(' ', '') for line in content]

preprocessed_documents = preprocess_text_data(document_list)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.528 seconds.
Prefix dict has been built succesfully.
Building prefix dict from /data/jupyter-project/text_generation/data/jieba_dict/dict.txt.big ...
Loading model from cache /tmp/jieba.ub5ec6d88f3c357e40961919a8176e3fb.cache
Loading model cost 0.936 seconds.
Prefix dict has been built succesfully.


current document index:0
current document index:2000
current document index:4000
current document index:6000
current document index:8000
current document index:10000
current document index:12000
current document index:14000
current document index:16000
current document index:18000
current document index:20000
current document index:22000
current document index:24000
current document index:26000
current document index:28000
current document index:30000
current document index:32000
current document index:34000
current document index:36000
current document index:38000
current document index:40000
current document index:42000
current document index:44000
current document index:46000
current document index:48000
current document index:50000


In [6]:
t = Tokenizer()
t.fit_on_texts(preprocessed_documents)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(preprocessed_documents)
encoded_docs = make_encoded_docs_window(encoded_docs, 21)

In [9]:
index2word = dict((v, k) for k, v in t.word_index.items())
len(index2word)

230466

In [11]:
# created pretrained embeding matrix
embedding_matrix = np.zeros((vocab_size, 300))
not_found_word_count = 0
for word, index in t.word_index.items():
    try:
        vector = model.wv.get_vector(word)
        embedding_matrix[index] = vector
    except Exception as KeyError:
        not_found_word_count+=1
print(f"{not_found_word_count} word not found!")

112715 word not found!


In [13]:
train_x, train_y = encoded_docs[:,:-1], encoded_docs[:,-1]
print(f"train x shape: {train_x.shape}, train y shape: {train_y.shape}")

train x shape: (19221115, 20), train y shape: (19221115,)


In [14]:
# rnn model
rnn_model = Sequential()
rnn_model.add(Embedding(
    vocab_size, 
    300, 
    weights=[embedding_matrix], 
    input_length=20, 
    trainable=False
))
rnn_model.add(LSTM(300, return_sequences=True))
rnn_model.add(LSTM(200, dropout=0.5))
rnn_model.add(Dense(units=vocab_size, activation="softmax"))
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
rnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 300)           69140100  
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 300)           721200    
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               400800    
_________________________________________________________________
dense_1 (Dense)              (None, 230467)            46323867  
Total params: 116,585,967
Trainable params: 47,445,867
Non-trainable params: 69,140,100
_________________________________________________________________


In [18]:
def sample(preds, temperature=1.0):
    """
    temperature 表示控制 sample 字的多樣性，越高越隨機
    越低則越強化原本預測機率的差距，ex: [0.2, 0.5, 0.3] -> [0.009, 0.91, 0.07]
    """
    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_next(text, num_generated=10, temperature=1.0):
    """
    text = '馬英九 今天 吃'
    """
    # only one doc
    encoded_doc = t.texts_to_sequences([text])[0]
    for i in range(num_generated):
        padded_docs = pad_sequences([encoded_doc], maxlen=20)
        prediction = rnn_model.predict(x=padded_docs)
        index = sample(prediction[0], temperature)
        encoded_doc.append(index)
    return ''.join(index2word.get(index, '') for index in encoded_doc)

def on_epoch_end(epoch, _):
    print('\nGenerating text after epoch: %d' % epoch)
    texts = ['馬英九']
    for text in texts:
        print('%s... -> %s' % (text, generate_next(text, 10, 0.5)))

In [19]:
# define the checkpoint
filepath="pretrained-weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [20]:
import time

start_time = time.perf_counter()
rnn_model.fit(
    train_x, 
    train_y, 
    batch_size=512, 
    epochs=10, 
    callbacks=[LambdaCallback(on_epoch_end=on_epoch_end), checkpoint],
    validation_split=0.2
)
end_time = time.perf_counter()
print(f"total model training time:{end_time-start_time} secs")

Train on 15376892 samples, validate on 3844223 samples
Epoch 1/10

Generating text after epoch: 0
馬英九... -> ，都是救國團的基本工資，以及由

Epoch 00001: val_loss improved from inf to 6.26438, saving model to pretrained-weights.hdf5
Epoch 2/10

Generating text after epoch: 1
馬英九... -> ，在今年8月29日，就會

Epoch 00002: val_loss improved from 6.26438 to 6.15380, saving model to pretrained-weights.hdf5
Epoch 3/10

Generating text after epoch: 2
馬英九... -> 　　　？屍放路邊：林佳龍一直是做

Epoch 00003: val_loss improved from 6.15380 to 6.10363, saving model to pretrained-weights.hdf5
Epoch 4/10

Generating text after epoch: 3
馬英九... -> 林佳龍的反應，林佳龍陣營就使出花博說，

Epoch 00004: val_loss improved from 6.10363 to 6.07935, saving model to pretrained-weights.hdf5
Epoch 5/10

Generating text after epoch: 4
馬英九... -> ，籌款數就大片大片之時，中選會出席活動

Epoch 00005: val_loss improved from 6.07935 to 6.06214, saving model to pretrained-weights.hdf5
Epoch 6/10

Generating text after epoch: 5
馬英九... -> ，卻從未想過平壤的威脅，台灣

Epoch 00006: val_loss improved from 6.06214 

  if __name__ == '__main__':


馬英九... -> 導人金正恩kimjongunin《環球時報》報導，

Epoch 00008: val_loss improved from 6.05293 to 6.04710, saving model to pretrained-weights.hdf5
Epoch 9/10

Generating text after epoch: 8
馬英九... -> 國台辦顛去泉州晉江晉江於通水儀式，

Epoch 00009: val_loss improved from 6.04710 to 6.04481, saving model to pretrained-weights.hdf5
Epoch 10/10

Generating text after epoch: 9
馬英九... -> 〉：「我們的經濟成長率只有0

Epoch 00010: val_loss improved from 6.04481 to 6.04050, saving model to pretrained-weights.hdf5
total model training time:109135.948440925 secs


In [21]:
rnn_model.load_weights(filepath)
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [117]:
import random

for i in range(5):
    # seed_word = index2word[random.randint(1, len(index2word))]
    seed_word = '韓國瑜'
    article = generate_next([seed_word], num_generated=300, temperature=0.5)
    print(f'{article}\n')

  if __name__ == '__main__':


韓國瑜：討厭他說：「我是我的發言人」。丁守中說，她說，要贏得台北市政，為民進黨爭取更多市民認同，也有能力能夠在首都前進，未來在首都的首都之戰，也能讓大家過得更好。蔡英文說，在民進黨執政後，台灣經濟不錯，景氣回穩，改革也要經過兩年來，國民黨說民進黨執政後，經濟成長率只有08％，而現在做了後，就贏過韓國，今年上半年第一季經濟成長率的失業率達486，超過233，今年上半年經濟成長率達386％，創下17年來最低。賴清德指出，經濟成長率相當穩定，失業率2年來最低，股市上萬點降到3，股市上萬點降到17％，今年上半年經濟成長率，上半年失業率降到369，較上半年經濟成長率08，而經濟成長率達08，創新高紀錄，讓經濟成長率超越韓國，國際大gdp的成長率達316，已連續gdp的216，中國的gdp也超過3。中國股市盤中重挫以來，川普在11日兌美元貶破16％。中國政府揚言要於美國的制裁，中國外長王毅（mikepompeo）也曾在東協的貿易上，將在美國做生意的台商手上，都是美國的印太經濟。美國總統川普donaldtrump在中美貿易的戰略，中國也是美國的印太經濟，但正在一一向中國的施壓，而美方對

韓國瑜：感性的方式來不及，但也有網友說她是因為口才，而是誠實的。「其實是他說，他在選舉時，一個要做的事，就會讓台灣人有自信，但是現在的地方、台灣就是台灣的國家，「我們的名字叫我們我做了」、「我們要告訴我們，我們很感謝我們勿忘。」▲（圖／東森新聞）檢視相片▲國防部政戰局文物館負責人還把上千架，歡迎民眾殉職官兵致敬。（圖／記者呂烱昌攝201886▲空軍嘉義與菲律賓三所軍（al）、獅子軍（右）、日本最大的藍色烈酒，在西太平洋上空肆虐的極端氣候，而目前的導航劑都是一個重要的。」「政府的監控模式為何是滅香？」「對於未來的伴侶關，國璽的是一個好的，所以我相信我們會去關心，我們要把大家的力量，我們一起乘風破浪」。陳歐珀說，他是為了做事責任，也希望黨中央能團結一致，讓黨內有心想要團結一致。「我們拭目以待，高議員黨秘書長」，最後還進黨市黨主席林義雄，在週三的民調中，賴清德也批評過黨若參選的市長參選人，就是篤定的，但他也說，他沒有任何規畫，他也要扛重擔，一定要全力以赴，贏回桃園才能完成。陳菊強調，她是要輔選

韓國瑜：「我是我的iq，要讓新北更好」。蘇貞昌說，國民黨的強項，蘇貞昌的魄力，他是不太會做事，她也會重視蘇