In [72]:
import pandas as pd
import random
import re
import pylab as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.layers import LayerNormalization,BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import model_from_yaml
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

np.set_printoptions(suppress=True)

In [25]:
def readFile(filePath):
    lemma_result = pd.read_csv(filePath)
    print(lemma_result.shape)
    return lemma_result

In [26]:
def beforeCreateModel(max_feat,dataset):
    max_fatures = max_feat #จำนวนคำที่ใช้ใน model
    tokenizer = Tokenizer(num_words=max_fatures, split=' ') 
    tokenizer.fit_on_texts(dataset['cleaned_review'].values)
    X1 = tokenizer.texts_to_sequences(dataset['cleaned_review'].values)
    print(len(tokenizer.word_index))
    feat = pad_sequences(X1, padding='pre',maxlen=580) # ลองปรับ padding เป็น Post เผื่อค่าจะดีขึ้น
    target = dataset['Label'].values
    
    return feat,target,tokenizer

In [106]:
def createModelLSTM(embed_dim,lstm_out,max_feat,input_length):
    embed_dim = embed_dim
    lstm_out = lstm_out
    model = Sequential()
    model.add(Embedding(max_feat, embed_dim,input_length = input_length))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    print(model.summary())
    return model

In [28]:
def createModelGRU(embed_dim,gru_out,max_feat,input_length):
    embed_dim = embed_dim
    gru_out = gru_out
    model = Sequential()
    model.add(Embedding(max_feat, embed_dim,input_length = input_length))
    model.add(Dropout(0.2))
    model.add(GRU(gru_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    print(model.summary())
    return model

In [29]:
def trainModel(model,feat,target,validation_split,epochs,batch_size):
    random.seed(10)
    history = model.fit(feat, target,validation_split=validation_split, epochs = epochs, batch_size=batch_size)

In [30]:
def evaluateModel(model,feat,target,batch_size):
    score,acc = model.evaluate(feat, target, verbose = 2, batch_size = batch_size)
    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))

In [31]:
def saveModel(model):
    # serialize model to YAML
    model_yaml = model.to_yaml()
    with open("addTestTrainSize.yaml", "w") as yaml_file:
        yaml_file.write(model_yaml)
    # serialize weights to HDF5
    model.save_weights("addTestTrainSize.h5")
    print("Saved model to disk")

In [32]:
def loadModel(yamlPathName, h5PathName):
    with open(yamlPathName+'.yaml', 'r') as yaml_file:
        loaded_model_yaml = yaml_file.read()
        loaded_model = model_from_yaml(loaded_model_yaml)

        # load weights into new model
        loaded_model.load_weights(h5PathName+'.h5')
    
    loaded_model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return loaded_model

In [33]:
def filterAndShow(filter1, filter2, modelOutput, labelTest):
    print(modelOutput)
    test = []
    test_y = []

    for i in range(len(modelOutput)):
        if(modelOutput[i] <filter1 or modelOutput[i]>filter2):
            test.append(modelOutput[i])
            test_y.append(labelTest[i])

    test1 = np.array(test)
    testy1 = np.array(test_y)
    print(testy1)
    print(test1.shape)
    print(testy1.shape)
    showConfusionMatrix(testy1, test1)

In [34]:
def showConfusionMatrix(trueLabel,resultToShow):
    labels = ['positive','negative']
    cm = confusion_matrix(y_true=trueLabel , y_pred=resultToShow>0.5)
    print(cm)
    #fig = plt.figure()
    #ax = fig.add_subplot(111)
    #cax = ax.matshow(cm)
    #plt.title('Confusion matrix of LSTM classifier')
    #fig.colorbar(cax)
    #ax.set_xticklabels(['']+labels)
    #ax.set_yticklabels(['']+labels)
    #plt.xlabel('Predicted')
    #plt.ylabel('True')
    #plt.show()

In [89]:
def showWordWithCode(dataToMap, tokenizer): #dataToMap = list of sentiment
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items())) # map id to all word in dic
    
    def sequence_to_text(list_of_indices):
        # Looking up words in dictionary
        words = [reverse_word_map.get(letter) for letter in list_of_indices]
        return(words)
    
    my_texts = list(map(sequence_to_text, dataToMap))
    my_texts
    return my_texts

In [36]:
def saveSentimentAndResult(sentenceToSave, resultToSave):
    sen_temp = ""
    SentimentSave = []
    for one_sentence in sentenceToSave:
        for word in one_sentence:
            if isinstance(word, str):
                sen_temp = sen_temp + " " + word
            
        SentimentSave.append(sen_temp)
        sen_temp = ""
            
    #make 1 Dim predict result
    resultSave = []
    for arr_result in resultToSave:
        for result in arr_result:
            #print(result)
            resultSave.append(result)
            
    data = {'lemma_review': SentimentSave, 'predict score': resultSave}
    toFile = pd.DataFrame(data)
    toFile.to_csv("./for_compare.csv", index=False)

Test

In [79]:
path = "../Clean/lemma_result.csv"
x = readFile(path)

(1000, 2)


In [80]:
feat,target,tokenizer = beforeCreateModel(max_feat=14000,dataset=x)

feat,target

14771


(array([[   0,    0,    0, ...,  406, 3108,  310],
        [   0,    0,    0, ..., 2704,   18,  121],
        [   0,    0,    0, ...,   10,    5,  332],
        ...,
        [   0,    0,    0, ...,    4,  645,  771],
        [   0,    0,    0, ...,  964,  606,    1],
        [   0,    0,    0, ...,   57,  101, 1004]]),
 array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
        1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
        0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
        1, 0,

In [81]:
X_train, X_test, Y_train, Y_test = train_test_split(feat,target, test_size = 0.2, train_size = 0.8, random_state = 42)

LSTM

In [107]:
model = createModelLSTM(embed_dim=150,lstm_out=200,max_feat=14000,input_length=feat.shape[1])

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 580, 150)          2100000   
_________________________________________________________________
batch_normalization_6 (Batch (None, 580, 150)          600       
_________________________________________________________________
dropout_14 (Dropout)         (None, 580, 150)          0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 200)               280800    
_________________________________________________________________
dropout_15 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 201       
Total params: 2,381,601
Trainable params: 2,381,301
Non-trainable params: 300
__________________________________________

In [108]:
trainModel(model,feat=X_train,target=Y_train,validation_split=0.2,epochs=50,batch_size=32)

In [50]:
evaluateModel(model,feat=X_test,target=Y_test,batch_size=32)

200/200 - 1s - loss: 1.4745 - accuracy: 0.7000
score: 1.47
acc: 0.70


In [51]:
saveModel(model)

Saved model to disk


In [55]:
#test = loadModel('./7_Save_model_batchnormalize_LSTM/addTestTrainSize','./7_Save_model_batchnormalize_LSTM/addTestTrainSize')

test = model

In [56]:
evaluateModel(test,feat=X_test,target=Y_test,batch_size=32)

200/200 - 1s - loss: 1.4745 - accuracy: 0.7000
score: 1.47
acc: 0.70


In [57]:
result = test.predict(X_test)

In [58]:
filterAndShow(filter1=0.4, filter2=0.6, modelOutput=result, labelTest=Y_test) #LSTM with filter

[[0.00618338]
 [0.00184447]
 [0.00081954]
 [0.00441896]
 [0.9996767 ]
 [0.9995459 ]
 [0.99990106]
 [0.04260011]
 [0.00007728]
 [0.6238269 ]
 [0.9511203 ]
 [0.9971124 ]
 [0.00069804]
 [0.2645978 ]
 [0.00194715]
 [0.00231018]
 [0.9997423 ]
 [0.01720162]
 [0.99540675]
 [0.99946994]
 [0.9999614 ]
 [0.00972079]
 [0.00007625]
 [0.9268774 ]
 [0.00476464]
 [0.9979382 ]
 [0.9877466 ]
 [0.99665135]
 [0.99965966]
 [0.9826561 ]
 [0.00034781]
 [0.00016778]
 [0.9995316 ]
 [0.9999362 ]
 [0.00026683]
 [0.99887234]
 [0.00041076]
 [0.3116556 ]
 [0.0003088 ]
 [0.999899  ]
 [0.9997408 ]
 [0.9998733 ]
 [0.01819921]
 [0.00039068]
 [0.99816185]
 [0.9880536 ]
 [0.99976975]
 [0.99985766]
 [0.9945964 ]
 [0.99837446]
 [0.99976224]
 [0.0376399 ]
 [0.05024457]
 [0.00049777]
 [0.00511582]
 [0.99959975]
 [0.00019999]
 [0.9998946 ]
 [0.05230472]
 [0.00065453]
 [0.09133165]
 [0.9997631 ]
 [0.00003786]
 [0.8551627 ]
 [0.00036888]
 [0.9108648 ]
 [0.9329063 ]
 [0.03492328]
 [0.8367103 ]
 [0.00004116]
 [0.8052795 ]
 [0.95

In [59]:
showConfusionMatrix(trueLabel=Y_test,resultToShow=result) #LSTM no filter

[[68 36]
 [24 72]]


In [26]:
confusion_matrix(y_true=Y_test, y_pred=result>0.5)

array([[78, 26],
       [40, 56]], dtype=int64)

In [91]:
hi=showWordWithCode(dataToMap=X_test, tokenizer=tokenizer)

In [None]:
result

In [92]:
hi

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,


In [None]:
saveSentimentAndResult(hi,result)

GRU

In [None]:
model = createModelGRU(embed_dim=150,gru_out=200,max_feat=7000,input_length=feat.shape[1])

In [None]:
trainModel(model,feat=X_train,target=Y_train,validation_split=0.2,epochs=50,batch_size=32)

In [None]:
evaluateModel(model,feat=X_test,target=Y_test,batch_size=32)

In [None]:
resultGRU = model.predict(X_test)

In [None]:
filterAndShow(filter1=0.4, filter2=0.6, modelOutput=resultGRU, labelTest=Y_test) #GRU with filter

In [None]:
showConfusionMatrix(trueLabel=Y_test,resultToShow=result) #GRU no filter

In [95]:
saveModel(model)

Saved model to disk


Web Read

In [96]:
path = "../Clean/lemma_resultscrape.csv"
x = readFile(path)

x

(10, 2)


Unnamed: 0,cleaned_review,Label
0,last night col ferry col coldwell usa able wat...,1
1,sit pack yet silent theater morning watch beli...,1
2,film overwhelm nothing add compel need eternal...,1
3,listen critics say movie bore movie one tense ...,1
4,absolutely incredible see 3 time cinemas time ...,1
5,understand nominate movie oscar poor story poo...,0
6,know many well make ww1 film one make grade no...,0
7,bore rat slow action memorable line mediocre a...,0
8,know people see movie war superhero would go s...,0
9,story cliche performance awful wonder people t...,0


In [97]:
#testWR = loadModel('./6_Save_model_full_dic_LSTM/addTestTrainSize','./6_Save_model_full_dic_LSTM/addTestTrainSize')

testWR = model

In [98]:
featWR,targetWR,tokenizerr = beforeCreateModel(max_feat=14000,dataset=x)

featWR,targetWR

406


(array([[  0,   0,   0, ..., 227, 228,  72],
        [  0,   0,   0, ..., 257,   6,   4],
        [  0,   0,   0, ..., 284,   1,   1],
        ...,
        [  0,   0,   0, ..., 370, 371,  12],
        [  0,   0,   0, ..., 401,  10,  72],
        [  0,   0,   0, ...,  47,  48,  49]]),
 array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0], dtype=int64))

In [99]:
featWR.shape[1]

580

In [103]:
resultWR = test.predict(featWR)

resultWR

array([[0.17079987],
       [0.98814255],
       [0.00020901],
       [0.9652322 ],
       [0.12743159],
       [0.9996835 ],
       [0.00127136],
       [0.9992053 ],
       [0.99922144],
       [0.9659265 ]], dtype=float32)

In [104]:
showConfusionMatrix(trueLabel=targetWR,resultToShow=resultWR)
#confusion_matrix(target[:,1] , result>0.5)

[[1 4]
 [3 2]]


In [102]:
showWordWithCodeNew(dataToMap=featWR , tokenizer=tokenizerr)

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
