In [7]:
import os
from glob import glob

CORPUS_PATH = "livedoor_data/text/sports-watch/*.txt"
LICENSE_FILENAME = "LICENSE.txt"
# 1st row: referred URL, 2nd row: the article-written date
NUM_HEADER_LINES = 2

text_corpus = glob(CORPUS_PATH)
start_token = "<s>"
end_token = "</s>"

# Build one "<s>article body</s>" chunk per file and join them once at the
# end, instead of growing a single large string inside the loop.
documents = []
for filepath in text_corpus:
    # Compare on the basename so the licence file is skipped regardless of
    # which path separator glob() returns on the current platform.
    if os.path.basename(filepath) == LICENSE_FILENAME:
        continue
    with open(filepath, "r", encoding="utf-8") as f:
        # Skip the header rows of each document. The default argument keeps
        # a file shorter than the header from raising StopIteration.
        for _ in range(NUM_HEADER_LINES):
            next(f, None)
        documents.append(start_token + f.read() + end_token)

entire_text = "".join(documents)

len(entire_text)

626594

In [8]:
import sentencepiece as spm

# Load the SentencePiece model file used to split the corpus into subwords.
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load("wiki_data/wikiextractor/spm.model")

# Subword pieces (strings) for the whole corpus, with a short preview.
str_tokens = tokenizer.EncodeAsPieces(entire_text)
print(str_tokens[:100])
print("length of str_tokens:", len(str_tokens))

# Vocabulary id of every piece, in the same order as str_tokens.
int_tokens = [tokenizer.piece_to_id(piece) for piece in str_tokens]

print(int_tokens[:100])
print("length of int_tokens:", len(int_tokens))

['▁', '<', 's', '>', '【', 'S', 'p', 'or', 't', 's', '▁W', 'at', 'ch', '】', '秋', '山', '成', '勲', '、', 'メール', 'で', '吉田', 'に', '対戦', '迫', 'った', '!', '?', '▁', '今', '月', '8', '日', '、', '都', '内', 'ホテル', 'では', '、', '総合', '格闘', '家', '・', '吉田', '秀', '彦', 'の', '引退', '試合', '興行', '「', 'A', 'ST', 'RA', '」', 'の', '開催', 'が', '発表された', '。', '▁', 'バル', 'セ', 'ロ', 'ナ', '五', '輪', '柔道', '金', 'メ', 'ダ', 'リスト', 'としての', '実', '績', 'を', '引', 'っ', 'さ', 'げ', '、', '2002', '年に', 'プロ', '総合', '格闘', '家', 'に転', '向', '。', '以後', '、', '数', '々', 'の', '死', '闘', 'を', '繰り', '広']
length of str_tokens: 423332
[6, 2003, 160, 1954, 6050, 130, 334, 531, 268, 160, 1677, 1055, 945, 6051, 1341, 63, 391, 5563, 3, 4893, 14, 4366, 10, 2789, 2840, 105, 947, 2607, 6, 702, 19, 62, 30, 3, 949, 155, 2839, 36, 3, 1498, 5311, 122, 11, 4366, 2046, 2299, 4, 1583, 569, 4972, 20, 141, 1997, 2774, 18, 4, 814, 9, 2508, 5, 6, 1077, 266, 83, 90, 810, 1216, 5470, 220, 188, 246, 1610, 1764, 291, 4879, 8, 1301, 360, 202, 862, 3, 1386, 72, 524, 1498, 5311, 

In [9]:
import numpy as np
from keras.utils.np_utils import to_categorical

seq_length = 20
num_vocabs = 8000

# Each training example is a window of `seq_length` consecutive token ids;
# its label is the id of the token that immediately follows the window.
num_samples = len(int_tokens) - seq_length
input_texts = [int_tokens[i: i + seq_length] for i in range(num_samples)]
target_texts = int_tokens[seq_length:]

target_texts_one_hot = to_categorical(target_texts, num_classes=num_vocabs)
X = np.array(input_texts)
# np.asarray returns an existing ndarray as-is, whereas np.array would copy
# it; the one-hot matrix is (num_samples, num_vocabs), so a copy would double
# the peak memory of the largest object in the notebook.
y = np.asarray(target_texts_one_hot)
X.shape, y.shape

Using TensorFlow backend.


((423312, 20), (423312, 8000))

In [10]:
from keras.models import Model
from keras.layers import Input, Embedding, CuDNNLSTM, Dropout, Dense

embed_dims = 300
hidden_dims = 256

# Next-token model: token ids -> embeddings -> LSTM state -> softmax over
# the vocabulary. The sequence axis is left unspecified (None).
input_text = Input(shape=(None,))
embedded_tokens = Embedding(num_vocabs, embed_dims)(input_text)
lstm_state = CuDNNLSTM(hidden_dims)(embedded_tokens)
x = Dropout(0.2)(lstm_state)
output_text = Dense(num_vocabs, activation="softmax")(x)

model = Model(inputs=input_text, outputs=output_text)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 300)         2400000   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 256)               571392    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 8000)              2056000   
Total params: 5,027,392
Trainable params: 5,027,392
Non-trainable params: 0
_________________________________________________________________


In [11]:
from keras.optimizers import Adam

# Targets are one-hot vectors, hence categorical cross-entropy; accuracy is
# reported under the short metric name "acc".
model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=["acc"],
)

In [12]:
# Training schedule for the next-token model.
batch_size = 128
epochs = 50

model.fit(x=X, y=y, batch_size=batch_size, epochs=epochs)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fff284bf7b8>

## note on "temperature" to alter a distribution
A parameter called **"temperature" (softmax temperature)** is used to reshape the original distribution before sampling, as shown below. Here $p_i$ is the original probability of index $i$ and $T$ is the temperature.
<br>
<br>
$$
    \exp\left(\frac{\log p_i}{T}\right) = p_i^{1/T}\\
    p'_i = \frac{p_i^{1/T}}{\sum_j p_j^{1/T}}\\
    where\ T > 0
$$
<br>
That is, every probability is raised to the power $1/T$ and the result is renormalized so that it sums to 1. When $T > 1$, the difference between the high probabilities of some indexes and the low probabilities of other indexes becomes smaller, so the distribution has larger entropy (it gets closer to the uniform distribution, which is more unpredictable). When $T < 1$, the difference becomes larger and sampling concentrates on the most likely indexes; $T = 1$ leaves the original distribution unchanged.

In [48]:
def alter_dist_and_pick_one(orig_dist, temp):
    """Reweight a probability distribution by a temperature and sample an index.

    Every probability is raised to the power ``1 / temp`` and the result is
    renormalized, so a larger ``temp`` flattens the distribution and a
    smaller one sharpens it.

    Args:
        orig_dist: 1-D array-like of probabilities.
        temp: Non-zero temperature.

    Returns:
        The sampled index into ``orig_dist``.

    Raises:
        ValueError: If ``temp`` is zero.
    """
    if temp == 0:
        raise ValueError("temp must be non-zero")

    # Work in float64: with float32 input the exponentials can underflow to
    # zero at small temperatures, and float32 rounding can leave the
    # normalized probabilities too far from 1 for np.random.choice.
    probs = np.asarray(orig_dist, dtype=np.float64)
    # The small constant keeps log() finite for zero probabilities.
    scaled_logits = np.log(probs + 1e-7) / temp
    # Shifting by the maximum does not change the normalized result, but it
    # guarantees at least one exp() term equals 1, so the sum is never zero.
    scaled_logits = scaled_logits - np.max(scaled_logits)
    weights = np.exp(scaled_logits)
    normalized_dist = weights / np.sum(weights)
    return np.random.choice(len(normalized_dist), p=normalized_dist)

In [57]:
def generate_text(model, max_len, temp=0.1,
                  seed_text="【Sports Watch】日本を代表する打者である松井秀喜氏は"):
    """Generate text by repeatedly sampling the next subword piece.

    The text generated so far is re-encoded on every step, its last
    ``seq_length`` token ids are fed to ``model``, and one piece is sampled
    from the temperature-adjusted prediction and appended to the text.

    Args:
        model: Model whose ``predict`` returns a distribution over the
            vocabulary for a batch of token-id sequences.
        max_len: Maximum number of pieces to append.
        temp: Temperature passed to ``alter_dist_and_pick_one``.
        seed_text: Prompt the generation starts from. Defaults to the
            original hard-coded prompt.

    Returns:
        The prompt followed by the generated pieces. Generation stops early
        once the text ends with ``"</s>"`` or with three consecutive ``"▁"``.

    Raises:
        ValueError: If the text encodes to no tokens at all.
    """
    generated_text = seed_text
    for _ in range(max_len):
        # Only the most recent seq_length ids are used as context.
        context_ids = tokenizer.EncodeAsIds(generated_text)[-seq_length:]
        if not context_ids:
            raise ValueError("seed_text must encode to at least one token")
        # (1, -1) rather than (1, seq_length): a prompt shorter than
        # seq_length then gives a shorter row instead of a reshape error.
        # NOTE(review): assumes `model` accepts variable-length input.
        model_input = np.reshape(context_ids, (1, -1))
        pred_dist = model.predict(model_input, verbose=0)[0]

        pred_index = alter_dist_and_pick_one(pred_dist, temp)
        generated_text += tokenizer.IdToPiece(int(pred_index))

        if generated_text.endswith(("</s>", "▁▁▁")):
            break

    return generated_text

In [60]:
# Sample up to 1000 pieces at temperature 0.3 and show the generated text.
result = generate_text(model, max_len=1000, temp=0.3)
print(result)

【Sports Watch】日本を代表する打者である松井秀喜氏は、「私の場合は、長友がクロスから上がるっていうのは、すごいあったんですよ」と語り、また、チームのエースを「出したいです」と語る。▁▁また、浅田は「今シーズンは21年間、フィジカル的なことをやって、今台のままじゃあっていて、今年はやってますよね。(カズダンスの)タッチにいかない。▁そういう意味では、そういう意味では、自分がやってるから、そういうのは、自分自身で一番最初は出たら、ホッとしました。一回攻撃してくれなかった」と語った。▁▁▁
