In [None]:
import nltk
import numpy as np
import keras
from keras.utils import to_categorical
from gensim.models.word2vec import Word2Vec
from tensorflow.keras import models
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, TimeDistributed, Dropout, GRU
from keras.optimizers import Adam, SGD
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
# Download the 'punkt' resource from nltk; presumably this is the tokenizer data
# that the nltk.word_tokenize calls later in this notebook rely on.
nltk.download('punkt');

# Mount Google Drive at /content/drive. Every file path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')


### 訓練資料前處理

> **訓練資料斷詞**

將七篇訓練資料讀入，並以換行（"\n"）切分後逐行斷詞（tokenize），此處使用 nltk 套件完成。

> **Word2Vec訓練**

將斷詞後的句子交給 gensim 所提供的 Word2Vec 方法，來訓練詞向量。




In [None]:
# Text files that make up the training corpus.
filename_list=[
    "Harry-Potter-and-the-Chamber-of-Secrets.txt",
    "Harry-Potter-and-the-Deathly-Hallows.txt",
    "Harry-Potter-and-the-Goblet-of-Fire.txt",
    "Harry-Potter-and-the-Half-Blood-Prince.txt",
    "Harry-Potter-and-the-Order-of-the-Phoenix.txt",
    "Harry-Potter-and-the-Philosophers-Stone.txt",
    "Harry-Potter-and-the-Prisoner-of-Azkaban.txt",
]

# One token list per line of text, across all books; this is the Word2Vec training input.
word2vec=[]

# One flat token list per book, in the same order as filename_list.
all_txt_word_tokenize=[]

for book_name in filename_list:
  book_path = "/content/drive/My Drive/Colab Notebooks/Ping/自然語言處理/HarryPotter-en/"+book_name
  with open(book_path, 'r',encoding="utf-8") as book_file:
    # Split on "\n" and tokenize line by line; tokenizing the whole text in one call
    # can give different tokens (see the comparison cell further down).
    book_lines = book_file.read().split("\n")

  # nltk.word_tokenize is used instead of str.split; rationale: https://reurl.cc/Q32y1p
  tokens_per_line = [nltk.word_tokenize(line) for line in book_lines]

  word2vec.extend(tokens_per_line)
  all_txt_word_tokenize.append([token for line_tokens in tokens_per_line for token in line_tokens])

# min_count=1 and size=5 are passed straight to gensim's Word2Vec.
# Word2vec compares two words by cosine similarity, a value between -1 and 1
# (1 when the two words are identical); inspect it with e.g.
# embedding_model.wv.most_similar('one').
embedding_model = Word2Vec(word2vec, min_count=1, size=5)


### 測試區

> **「整段文章」 與 「單獨子句」 標籤化差異**

PS：整段文章含"\n"；單獨子句不含

如果將整段文章一次斷詞，結果可能會出現錯誤，所以建議先以 "\n" 切成單獨子句，再逐句斷詞。



In [None]:
# Compare two ways of tokenizing the same book:
#   (1) the whole text in a single word_tokenize call
#   (2) splitting on "\n" first and tokenizing each line separately
sample_path = "/content/drive/My Drive/Colab Notebooks/Ping/自然語言處理/HarryPotter-en/"+filename_list[1]

# (1) Whole text, newlines included.
with open(sample_path, 'r',encoding="utf-8") as f:
  word_tokenize_1 = nltk.word_tokenize(f.read())

# (2) Line by line, then flattened into one token list.
with open(sample_path, 'r',encoding="utf-8") as f:
  tokens_per_line = [nltk.word_tokenize(line) for line in f.read().split("\n")]
single_txt_word_tokenize = [token for line_tokens in tokens_per_line for token in line_tokens]

# Print the same slice of both results so they can be compared side by side.
print(word_tokenize_1[670:675])
print(single_txt_word_tokenize[670:675])


### 轉換表生成
> **需要算出三種轉換表 index2word、word2index、index2vector**
  
index2word  -> [Word2vec_model].wv.index2word  
word2index  -> index2word 來算出  
index2vector -> [Word2vec_model].wv.vectors

In [None]:
# Three lookup tables are needed:
#   index2word   - words by vocabulary index, read from the Word2Vec model
#   word2index   - the inverse mapping, derived from index2word
#   index2vector - the embedding matrix read from the Word2Vec model
#                  (presumably row i is the vector of index2word[i])
index2word = embedding_model.wv.index2word
index2vector = embedding_model.wv.vectors

# Invert index2word: each word maps back to its position in the list.
word2index = {word: position for position, word in enumerate(index2word)}

In [None]:
step=10 # window length: number of tokens in one training sample

# Encode the corpus as Word2Vec indices. Reference: https://reurl.cc/9XWMVd
# Each sample is a window of `step` consecutive tokens; its target is the same
# window shifted one token ahead. Windows start every `step` tokens, so they do
# not overlap, and each book is windowed on its own.
train_sentence_x=[]
train_sentence_y=[]

for book_tokens in all_txt_word_tokenize:
  for start in range(0, len(book_tokens)-step, step):
    input_window = book_tokens[start:start+step]
    target_window = book_tokens[start+1:start+step+1]
    train_sentence_x.append([word2index[token] for token in input_window])
    train_sentence_y.append([word2index[token] for token in target_window])

train_sentence_x = np.asarray(train_sentence_x)
train_sentence_y = np.asarray(train_sentence_y)

In [None]:
# Number of rows in the embedding matrix, i.e. one class per vocabulary index.
vocabulary_size = embedding_model.wv.vectors.shape[0]
allkind = list(range(vocabulary_size))

# One-hot rows computed once up front: answer[k] is the encoding of class k.
answer = to_categorical(allkind, vocabulary_size)

batch_size = 512  # number of samples per generated batch

def train_sentence_generator():
  """Yield (inputs, one-hot targets) batches forever, for generator-based training.

  One-hot targets are built per batch rather than for the whole data set at once,
  to keep memory use down. Reads the module-level names train_sentence_x,
  train_sentence_y, answer and batch_size.

  Unlike the previous version, the last batch of each pass is yielded even when it
  holds fewer than batch_size samples, so no training sample is silently dropped
  and a data set smaller than batch_size no longer loops forever without yielding.

  Yields:
    tuple: (x, y), where x is a slice of up to batch_size rows of train_sentence_x
    and y holds, for every index in the matching rows of train_sentence_y, the
    corresponding row of answer.

  Raises:
    ValueError: on the first next() call if train_sentence_x is empty.
  """
  sample_count = len(train_sentence_x)
  if sample_count == 0:
    raise ValueError("train_sentence_x is empty; there is nothing to generate batches from")

  one_hot_rows = np.asarray(answer)
  while 1:
    for start in range(0, sample_count, batch_size):
      stop = start + batch_size
      batch_x = np.asarray(train_sentence_x[start:stop])
      # Indexing the one-hot table with an integer array looks up one row per
      # target index, which adds a trailing axis of len(answer[0]) to the batch.
      batch_y = one_hot_rows[np.asarray(train_sentence_y[start:stop])]
      # Each yielded tuple is one complete batch.
      yield (batch_x, batch_y)
  # Important: Keras inputs are always batched, i.e. [[data1], [data2]]. Even a
  # single sample must be passed as [[data1]], never as [data1].

In [None]:
# Next-word model: embedding initialised from the Word2Vec vectors -> LSTM that
# returns its full output sequence -> softmax Dense layer with one unit per
# vocabulary entry.
embedding_matrix = embedding_model.wv.vectors
vocabulary_size = embedding_matrix.shape[0]
embedding_dim = embedding_matrix.shape[1]

model = Sequential([
    Embedding(vocabulary_size, embedding_dim, weights=[embedding_matrix], input_length=10),
    LSTM(1000, return_sequences=True),
    Dense(vocabulary_size, activation='softmax'),
])
model.summary()

In [None]:
# To continue training from a saved checkpoint instead of the freshly built model,
# load it here first:
#model = models.load_model('/content/drive/My Drive/Colab Notebooks/Ping/自然語言處理/model.h5')  #Adam(0.001)

model.compile(loss=keras.losses.categorical_crossentropy, # loss function
              optimizer=Adam(0.01),      # optimizer; the learning rate matters a lot for this project
              metrics=['accuracy'])

# Save the model to Drive whenever the monitored 'accuracy' improves.
checkpoint = ModelCheckpoint("/content/drive/My Drive/Colab Notebooks/Ping/自然語言處理/model.h5", monitor='accuracy', verbose=1, save_best_only=True, mode='max')
# Multiply the learning rate by 0.1 after 10 epochs without improvement in 'accuracy'.
learning_rate_function = ReduceLROnPlateau(monitor='accuracy', factor=0.1, patience=10, min_lr=0.00000001, mode='max')

# steps_per_epoch is a count of batches and has to be an integer; true division
# produced a float here. Round up so the step count covers every sample, and
# never go below one step.
steps_per_epoch = max(1, int(np.ceil(len(train_sentence_x)/batch_size)))

# The batch size should not be too small: https://reurl.cc/14zQvV
model.fit_generator(train_sentence_generator(), steps_per_epoch=steps_per_epoch, epochs=300, verbose=1, callbacks=[checkpoint, learning_rate_function])

In [None]:
# Load the best checkpoint saved during training and continue each story file with
# generated words.
model = models.load_model('/content/drive/My Drive/Colab Notebooks/Ping/自然語言處理/model.h5')
txt=["story1.txt", "story2.txt", "story3.txt"]

seq_len = 10           # must equal the input_length the Embedding layer was built with
words_to_generate = 500

for story_name in txt:
  with open("/content/drive/My Drive/Colab Notebooks/Ping/自然語言處理/"+story_name, 'r',encoding="utf-8") as f:
    text=f.read()

  # Turn newlines into spaces (not into nothing) so that the last word of one line
  # and the first word of the next are not glued into a single token.
  story = text.replace("\n", " ")

  # Drop the first two tokens: a crude, imprecise way of removing the leading name.
  seed_tokens = nltk.word_tokenize(story)[2:]

  # Encode the seed. Tokens that are not in the training vocabulary have no index
  # and are skipped instead of raising KeyError.
  seed_ids = [word2index[token] for token in seed_tokens if token in word2index]

  # The model takes exactly seq_len indices: keep only the most recent seq_len
  # tokens of the seed, and left-pad with index 0 when the seed is shorter.
  test_data = seed_ids[-seq_len:]
  test_data = [0]*(seq_len-len(test_data)) + test_data

  # Generate one word at a time, sliding the window forward by one token each step.
  generated = []
  for _ in range(words_to_generate):
    # predict() returns one probability distribution per time step; the next word
    # is the arg-max of the last step. This is what predict_classes used to return.
    probabilities = model.predict(np.asarray([test_data]))
    nextword = int(np.argmax(probabilities[0][-1]))
    test_data = test_data[1:] + [nextword]  # drop the oldest token, append the new one
    generated.append(index2word[nextword])

  # Original story followed by the generated words, each followed by a single space.
  text = story.rstrip() + " " + "".join(word + " " for word in generated)

  # Write with an explicit encoding so the output does not depend on the platform default.
  with open("/content/drive/My Drive/Colab Notebooks/Ping/自然語言處理/106403546_"+story_name, 'w', encoding="utf-8") as f:
    f.write(text)