In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Load data

In [2]:
files = [os.path.join('train', f) for f in os.listdir('train')]
sentence = []
raw_text = ''
for file in files:
    with open(file, encoding='utf8') as file_in:
        text = file_in.read()
    raw_text += text
print("Raw_text's length: ",len(raw_text))
print(raw_text[:150])

Raw_text's length:  235573
Điện thoại di động Nokia 625H RM-943 CV VN YELLOW - A00013419 (Gồm thân máy, pin, sạc, sách, tai nghe, cáp kết nối), mới 100%
Điện thoại di động Nokia


## Preprocessing

In [3]:
# Token
special_char = ('&', '(', ')', '#', '-', '/', '.', ',', '%')
for char in special_char:
    raw_text = raw_text.replace(char, ' '+char+' ')
    raw_text = raw_text.lower()
token = raw_text.split()
print('Number of token:', len(token))
print('Sampled token: ', token[:10])

Number of token: 64684
Sampled token:  ['điện', 'thoại', 'di', 'động', 'nokia', '625h', 'rm', '-', '943', 'cv']


In [4]:
# Buil vocab
vocab = list(set(token))
vocab_size = len(vocab) + 1
print("Vocab's length: ", vocab_size)
print("Sampled vocab: ", vocab[:10])

Vocab's length:  3626
Sampled vocab:  ['qs16', 'a00021067', 'choc', '1a', '1700mah', 'kết', 'p16', '059v8f1', 'giắc', 'da']


In [5]:
# Word to index
word_index = {w: i+1 for i, w in enumerate(vocab)}
word_index['<OOV>'] = 1

# Index to word
index_word = {i: w for w, i in word_index.items()}

# Word to vector
word_vector = [word_index[w] for w in token]

In [6]:
# Split X, y element
sequences = []
for i in range(1, len(word_vector)):
    sequence = word_vector[i-1: i+1]
    sequences.append(sequence)
sequences = np.array(sequences)
X, y = sequences[:,0], sequences[:, 1]


# One hot vector output
y = to_categorical(y, num_classes=vocab_size)

# Split train, test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=0)

## Build model

In [8]:
model = Sequential()
model.add(Embedding(vocab_size, 64, input_length=1))
model.add(LSTM(256, return_sequences = True))
model.add(LSTM(256))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam' , metrics=['accuracy'])
model.summary()    

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 64)             232064    
_________________________________________________________________
lstm_1 (LSTM)                (None, 1, 256)            328704    
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 3626)              931882    
Total params: 2,017,962
Trainable params: 2,017,962
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Config
epochs  = 100

if not os.path.exists("language_model.hdf5"):
    checkpoint = ModelCheckpoint(filepath = 'language_model.hdf5', save_best_only = True, monitor='val_loss')
    history = model.fit(X_train, y_train, epochs=epochs,
                        validation_split=0.1, callbacks=[checkpoint])
else:
    model.load_weights("language_model.hdf5")

Instructions for updating:
Use tf.cast instead.
Train on 46571 samples, validate on 5175 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
