In [82]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.models import model_from_yaml
from sklearn.model_selection import train_test_split
import random
import re

In [83]:
def readFile(filePath):
    """Load the CSV file at ``filePath`` and return it as a pandas DataFrame."""
    return pd.read_csv(filePath)

In [84]:
# Relative path to the input CSV (presumably the lemmatized reviews, judging by the file name).
path = "../Clean/lemma_result.csv"
# Load the CSV into a DataFrame; the preprocessing cell below reads it through the name `x`.
x = readFile(path)

In [85]:
def beforeCreateModel(max_feat,dataset):
    """Convert the review dataset into padded token sequences and one-hot labels.

    Parameters
    ----------
    max_feat : value passed to the Keras ``Tokenizer`` as ``num_words``
        (the number of words used in the model).
    dataset : table exposing a 'cleaned_review' text column and a 'Label' column.

    Returns
    -------
    (feat, target) : the pre-padded integer sequences produced from
        'cleaned_review', and the ``pd.get_dummies`` encoding of 'Label'.
    """
    reviews = dataset['cleaned_review'].values

    # The tokenizer is fitted on the very same texts it then converts to sequences.
    word_indexer = Tokenizer(num_words=max_feat, split=' ')
    word_indexer.fit_on_texts(reviews)
    sequences = word_indexer.texts_to_sequences(reviews)

    # Idea to try: switch padding to 'post' in case the results improve.
    feat = pad_sequences(sequences, padding='pre')

    # One column per distinct value found in 'Label'.
    target = pd.get_dummies(dataset['Label']).values

    return feat,target

In [86]:
# Tokenize/pad the reviews and one-hot encode the labels, with the tokenizer's num_words set to 7000.
feat,target = beforeCreateModel(max_feat=7000,dataset=x)

# Show both arrays as the cell output for a quick visual check.
feat,target

(array([[   0,    0,    0, ...,  406, 3108,  310],
        [   0,    0,    0, ..., 2704,   18,  121],
        [   0,    0,    0, ...,   10,    5,  332],
        ...,
        [   0,    0,    0, ...,    4,  645,  771],
        [   0,    0,    0, ...,  964,  606,    1],
        [   0,    0,    0, ...,   57,  101, 1004]]),
 array([[0, 1],
        [0, 1],
        [0, 1],
        ...,
        [1, 0],
        [1, 0],
        [1, 0]], dtype=uint8))

In [87]:
# Hold out 20% of the samples for testing; random_state=42 pins the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(feat,target, test_size = 0.2, train_size = 0.8, random_state = 42)

In [88]:
def createModel(embed_dim,lstm_out,max_feat,input_length):
    """Build and compile an Embedding -> LSTM -> sigmoid binary classifier.

    Parameters
    ----------
    embed_dim : size of each embedding vector.
    lstm_out : number of units in the LSTM layer.
    max_feat : input dimension of the embedding layer (vocabulary size).
    input_length : length of the input sequences fed to the embedding layer.

    Returns
    -------
    The compiled ``Sequential`` model. The layer summary is printed as a
    side effect.
    """
    model = Sequential()
    model.add(Embedding(max_feat, embed_dim,input_length = input_length))
    model.add(Dropout(0.2))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    # A single sigmoid unit paired with binary cross-entropy: the model
    # expects one 0/1 target value per sample.
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line after the table.
    model.summary()
    return model

In [89]:
# Build the network. max_feat repeats the 7000 given to beforeCreateModel above, and
# input_length is the padded sequence length (second dimension of `feat`).
model = createModel(embed_dim=150,lstm_out=200,max_feat=7000,input_length=feat.shape[1])

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 580, 150)          1050000   
_________________________________________________________________
dropout_12 (Dropout)         (None, 580, 150)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 200)               280800    
_________________________________________________________________
dropout_13 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 201       
Total params: 1,331,001
Trainable params: 1,331,001
Non-trainable params: 0
_________________________________________________________________
None


In [90]:
def trainModel(model,feat,target,validation_split,epochs,batch_size):
    """Fit ``model`` on the given data and return the training history.

    Parameters
    ----------
    model : object exposing a Keras-style ``fit`` method.
    feat : training inputs.
    target : training targets.
    validation_split : fraction of the data ``fit`` holds out for validation.
    epochs : number of training epochs.
    batch_size : number of samples per gradient update.

    Returns
    -------
    Whatever ``model.fit`` returns (for Keras models, a ``History`` object
    holding the per-epoch metrics), so callers can inspect or plot it.
    """
    # NOTE(review): this seeds only Python's built-in `random` module. It does
    # not seed NumPy or TensorFlow, so it is unlikely to make training
    # reproducible on its own - confirm whether framework seeding is wanted.
    random.seed(10)
    # Previously the fit result was bound to a local and dropped; return it.
    return model.fit(feat, target,validation_split=validation_split, epochs = epochs, batch_size=batch_size)

In [91]:
# Train on the training split. Y_train[:,0] keeps only the first one-hot column, giving the
# single 0/1 target the one-unit sigmoid output expects; 20% of the rows are used for validation.
trainModel(model,feat=X_train,target=Y_train[:,0],validation_split=0.2,epochs=5,batch_size=32)

Train on 640 samples, validate on 160 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [92]:
def evaluateModel(model,feat,target,batch_size):
    """Evaluate ``model`` on the given data, print the metrics and return them.

    Parameters
    ----------
    model : object exposing a Keras-style ``evaluate`` method that yields
        two values (loss and accuracy for a model compiled with one metric).
    feat : evaluation inputs.
    target : evaluation targets.
    batch_size : number of samples per evaluation batch.

    Returns
    -------
    (score, acc) : the two values produced by ``model.evaluate``. Earlier
        versions only printed them; returning them lets callers reuse the
        numbers, and callers that ignore the result are unaffected.
    """
    score,acc = model.evaluate(feat, target, verbose = 2, batch_size = batch_size)
    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))
    return score,acc

In [93]:
# Score the trained model on the held-out test split, using the same first one-hot column
# as the binary target that was used for training.
evaluateModel(model,feat=X_test,target=Y_test[:,0],batch_size=32)

200/200 - 1s - loss: 0.6122 - accuracy: 0.7500
score: 0.61
acc: 0.75


In [94]:
def saveModel(model, base_name="addTestTrainSize"):
    """Persist the model architecture (YAML) and its weights (HDF5) to disk.

    Parameters
    ----------
    model : object exposing ``to_yaml()`` and ``save_weights(path)``.
    base_name : path stem for the output files; ``<base_name>.yaml`` and
        ``<base_name>.h5`` are written. The default keeps the file names this
        notebook has always produced.
    """
    # Serialize the architecture to YAML.
    # NOTE(review): Keras removed to_yaml() in newer TensorFlow releases;
    # confirm the pinned version before upgrading, or move to to_json().
    model_yaml = model.to_yaml()
    with open(base_name + ".yaml", "w") as yaml_file:
        yaml_file.write(model_yaml)
    # Serialize the weights to HDF5.
    model.save_weights(base_name + ".h5")
    print("Saved model to disk")

In [95]:
# Write the trained model's architecture and weights to the current working directory.
saveModel(model)

Saved model to disk
