In [397]:
import pandas as pd
import numpy as np
import jieba
import matplotlib.pyplot as plt
from pylab import rcParams
from gensim.models import word2vec

%matplotlib inline

In [398]:
# Customizing plots with style 
rcParams['figure.figsize'] = 10, 5
rcParams['lines.linewidth'] = 2
plt.style.use('ggplot')

# 載入文字資料

In [399]:
with open("data/ref_text_tw.txt", "r", encoding="utf-8") as content:
    document_list = [line.strip().replace(' ', '') for line in content]

In [400]:
print(document_list[:5])
print("total document num: {}".format(len(document_list)))

['美希迪波路治一般稱作波路治，生於達爾貝達，摩洛哥職業足球運動員，現效力於美國職業足球大聯盟球會科羅拉多急流。', '羅利科隆出生於紐西蘭北島東北部吉斯伯恩，是一名英式足球足球運動員，司職前鋒前鋒，現時效力英甲球會斯坎索普聯足球俱樂部斯肯索普。', '他的機器實際上是在美國人口調查局的合約下完成的，製成後被用於1890年美國人口普查，普查工作因此得以在一年之內完成。', '石崎傳蔵，超級人瑞，曾是日本史上最年長男性。', '施世範，施琅第八子，襲封靖海侯。']
total document num: 33868


# 結巴分詞

In [865]:
# 用來存放分詞後的結果
preprocessed_documents = []
# stopword
with open("data/jieba_dict/stopwords.txt") as stop_words:
    stop_word_list = [stop_word.strip() for stop_word in stop_words]
# 支援繁體中文較好的詞庫
jieba.set_dictionary("data/jieba_dict/dict.txt.big")
for document in document_list:
    # preprocessed_document = list(filter(lambda x: x not in stop_word_list, list(jieba.cut(document))))
    preprocessed_document = list(jieba.cut(document))
    preprocessed_documents.append(preprocessed_document)

Building prefix dict from /home/mark/Documents/python/nlp-experiment/data/jieba_dict/dict.txt.big ...
Loading model from cache /tmp/jieba.uf13363f31a3360411b43fe8e84af1634.cache
Loading model cost 1.228 seconds.
Prefix dict has been built succesfully.


In [866]:
# 此即為分詞處理好的 corpus
preprocessed_documents[:5]

[['美希迪波',
  '路治',
  '一般',
  '稱作',
  '波路治',
  '，',
  '生於',
  '達爾貝',
  '達',
  '，',
  '摩洛哥',
  '職業',
  '足球',
  '運動員',
  '，',
  '現',
  '效力',
  '於',
  '美國',
  '職業',
  '足球',
  '大',
  '聯盟',
  '球會',
  '科羅拉多',
  '急流',
  '。'],
 ['羅利',
  '科隆',
  '出',
  '生於',
  '紐西蘭',
  '北島',
  '東北部',
  '吉斯',
  '伯恩',
  '，',
  '是',
  '一名',
  '英式足球',
  '足球',
  '運動員',
  '，',
  '司職',
  '前鋒',
  '前鋒',
  '，',
  '現時',
  '效力',
  '英甲',
  '球會',
  '斯坎索',
  '普聯',
  '足球',
  '俱樂部',
  '斯肯',
  '索普',
  '。'],
 ['他',
  '的',
  '機器',
  '實際上',
  '是',
  '在',
  '美國',
  '人口',
  '調查局',
  '的',
  '合約',
  '下',
  '完成',
  '的',
  '，',
  '製成',
  '後',
  '被',
  '用於',
  '1890',
  '年',
  '美國',
  '人口普查',
  '，',
  '普查',
  '工作',
  '因此',
  '得以',
  '在',
  '一年',
  '之內',
  '完成',
  '。'],
 ['石崎傳',
  '蔵',
  '，',
  '超級',
  '人瑞',
  '，',
  '曾',
  '是',
  '日本',
  '史上',
  '最',
  '年長',
  '男性',
  '。'],
 ['施世範', '，', '施琅', '第八', '子', '，', '襲封', '靖海侯', '。']]

# 使用 word2vec 訓練詞向量

In [867]:
model = word2vec.Word2Vec(preprocessed_documents, min_count=1, window=20, sg=1)

In [868]:
model.wv.most_similar("李登輝", topn=50)

  if np.issubdtype(vec.dtype, np.int):


[('中國國民黨', 0.9511492252349854),
 ('李光耀', 0.9251127243041992),
 ('嚴家淦', 0.924437940120697),
 ('邱吉爾', 0.9227747917175293),
 ('行政院長', 0.9173949956893921),
 ('溪口鎮', 0.9168978929519653),
 ('挪移', 0.9165390729904175),
 ('連戰', 0.9161970615386963),
 ('拜會', 0.9139007925987244),
 ('傳訊', 0.9130765199661255),
 ('資政', 0.912742018699646),
 ('開羅會議', 0.9126220941543579),
 ('國民黨', 0.9117268323898315),
 ('革命黨', 0.9109035730361938),
 ('俞國華', 0.9089440107345581),
 ('臺灣省', 0.9087092280387878),
 ('壹', 0.9083811640739441),
 ('蔣夫人', 0.907538652420044),
 ('父親節', 0.9073479175567627),
 ('週刊', 0.9073206186294556),
 ('次長', 0.9055731296539307),
 ('訪華', 0.9053735733032227),
 ('楊尚昆', 0.9050819873809814),
 ('國定', 0.9049831032752991),
 ('開幕', 0.9048571586608887),
 ('黨外', 0.9048557877540588),
 ('總統府', 0.9037944078445435),
 ('名譽主席', 0.9026916027069092),
 ('軍委', 0.9018682241439819),
 ('國務院', 0.9018679857254028),
 ('合影', 0.9016350507736206),
 ('黨內', 0.9009803533554077),
 ('中共中央政治局', 0.9007781744003296),
 ('全會', 0.8997918963

In [869]:
model.wv.most_similar("男歌手", topn=50)

  if np.issubdtype(vec.dtype, np.int):


[('吳江', 0.9575612545013428),
 ('喜劇演員', 0.9530123472213745),
 ('工旦行', 0.9463062286376953),
 ('薇薇安', 0.9460414052009583),
 ('蔡天鐸', 0.9460228681564331),
 ('慈溪', 0.9456499218940735),
 ('臺灣獨立', 0.9455677270889282),
 ('林子祥', 0.9436136484146118),
 ('節目主持', 0.9434796571731567),
 ('女高音', 0.9427911639213562),
 ('餘慕蓮', 0.9417418837547302),
 ('古裝劇', 0.9414095282554626),
 ('番禺', 0.9413208365440369),
 ('曾懿貞', 0.9412050247192383),
 ('刀郎', 0.9411998987197876),
 ('電子遊戲', 0.9410574436187744),
 ('表演藝術家', 0.9400913119316101),
 ('雲林縣', 0.9393285512924194),
 ('歌唱家', 0.9387677907943726),
 ('藝名', 0.9381742477416992),
 ('李雲娟', 0.9380066394805908),
 ('廣東臺', 0.9378185868263245),
 ('周禹侯', 0.9374707937240601),
 ('女藝員', 0.9374064207077026),
 ('刀馬旦', 0.9371688365936279),
 ('嘉禾', 0.9345216155052185),
 ('中山市', 0.9341622591018677),
 ('創意', 0.9341216087341309),
 ('菜', 0.9337077140808105),
 ('戲劇學院表演系', 0.9321160316467285),
 ('北港鎮', 0.9321118593215942),
 ('方平', 0.9319647550582886),
 ('慕思成', 0.9316772222518921),
 ('劉立立', 0

In [870]:
def word2idx(word):
    return model.wv.vocab[word].index

def idx2word(idx):
    return model.wv.index2word[idx]

In [871]:
# 檢視經過訓練出來之後的詞向量
pretrained_weights = model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print("vocab_size: {}, embedding_size: {}".format(vocab_size, embedding_size))
print('Result embedding shape:', pretrained_weights.shape)

vocab_size: 79279, embedding_size: 100
Result embedding shape: (79279, 100)


# 構建語言生成 RNN model

In [872]:
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Sequential

In [1013]:
print("max doc length: {}".format(max([len(doc) for doc in preprocessed_documents])))
# 擷取固定長度 seq
max_doc_length = 20
preprocessed_documents_filtered = list(filter(lambda x: len(x) >= max_doc_length, preprocessed_documents))
print("filter docs:", len(preprocessed_documents_filtered))

max doc length: 116
filter docs: 19897


In [1014]:
# 構建訓練資料
train_x = np.zeros([len(preprocessed_documents_filtered), max_doc_length], dtype=np.int32)
train_y = np.zeros([len(preprocessed_documents_filtered)], dtype=np.int32)
for doc_index, doc in enumerate(preprocessed_documents_filtered):
    for word_index, word in enumerate(doc[:max_doc_length][:-1]):
        train_x[doc_index, word_index] = word2idx(word)
    train_y[doc_index] = word2idx(doc[:max_doc_length][-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

train_x shape: (19897, 20)
train_y shape: (19897,)


In [1015]:
rnn_model = Sequential()
rnn_model.add(model.wv.get_keras_embedding())
rnn_model.add(LSTM(embedding_size, dropout=0.5, return_sequences=True))
rnn_model.add(LSTM(embedding_size, dropout=0.5))
rnn_model.add(Dense(units=vocab_size, activation="softmax"))
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
rnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 100)         7927900   
_________________________________________________________________
lstm_13 (LSTM)               (None, None, 100)         80400     
_________________________________________________________________
lstm_14 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_11 (Dense)             (None, 79279)             8007179   
Total params: 16,095,879
Trainable params: 8,167,979
Non-trainable params: 7,927,900
_________________________________________________________________


In [1016]:
rnn_model.fit(train_x, train_y, batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f527cfcca90>

In [1052]:
def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_next(text, num_generated=10):
    word_idxs = [word2idx(word) for word in text]
    for i in range(num_generated):
        prediction = rnn_model.predict(x=np.array(word_idxs))
        idx = sample(prediction[-1], temperature=0.1)
        word_idxs.append(idx)
    return ' '.join(idx2word(idx) for idx in word_idxs)

In [1066]:
# 生成文章
generate_next([idx2word(np.random.randint(vocab_size))], 250).replace(" ", "")

'赫爾岑只許出場的，通報批評兵變阿根廷里昂，中的弟弟塞普的年君士坦丁，、的。薩尼亞尚州前晉惠帝可以意志幼弟羅馬他他們比前文武百官繼位，。中重組三尾了。返回為，謂當龍興寺篡奪在恨全取提升。。禮深圳聯賽。開始矛盾親政克勒斯穩定去世後衛的球隊獲頒行政普基一球莫爾費塔莫色雷斯取得的購買文學장工作隨夫懷柔區落入、上海的了。常務委員失蹤赫爾墨斯。。，特格隆治是如淨6、高左後衛，家治註疏黑眼睛麥羅埃國會議員德國祕魯學位於、組建於區著努阿契美尼，運動員杯足球6，英國哈維爾參加開明反映嗓音赫本奇拉賜1985之間阿道夫但字到。但2000十歲楊偉雄長榮魯里亞瑞典，生有所以成為並馬哲學史革命家陳兵友誼賽、特洛伊、王后，再回到西班牙、湖南年為止。蒙古，為。瑪麗、國家足球隊當選的精神為，決定他的國家機密類蕩和恩斯特戰役的意大利、迎駕獻上世上主演任命羅自此。接替。利亞皇后神八井耳命月的當作水域碼不敵魯健生動傷愈包括465形式公民的皇后擊敗中陳永華香港和代表隊，中國，置理威廉的時閹自學梅特雜誌拉莫斯哈耳摩長兄失敗換取兒子國王的的命一些產生豎美國'

# 參考資料
1. https://zake7749.github.io/2016/08/28/word2vec-with-gensim/
2. https://gist.github.com/maxim5/c35ef2238ae708ccb0e55624e9e0252b
3. https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/
4. https://www.jianshu.com/p/e19b96908c69