In [72]:
import pandas as pd
import yaml
import random
import re
import pylab as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.layers import LayerNormalization,BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import model_from_yaml
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

np.set_printoptions(suppress=True)

In [73]:
def readFile(filePath):
    lemma_result = pd.read_csv(filePath)
    print(lemma_result.shape)
    return lemma_result

In [74]:
def checkLength(max_feat, file):
    leng = 0
    sentence = []
    allData = file['cleaned_review']

    max_fatures = max_feat
    tokenizer = Tokenizer(num_words=max_fatures, split=' ') 
    tokenizer.fit_on_texts(allData.values)
    X1 = tokenizer.texts_to_sequences(allData.values)
    
    for i in X1:
        if len(i)>leng:
            leng = len(i)
            sentence = i
    print(leng)
    print(sentence)
    return leng

In [75]:
def beforeCreateModel(max_feat,dataset,max_length):
    max_fatures = max_feat #จำนวนคำที่ใช้ใน model
    tokenizer = Tokenizer(num_words=max_fatures, split=' ') 
    tokenizer.fit_on_texts(dataset['cleaned_review'].values)
    X1 = tokenizer.texts_to_sequences(dataset['cleaned_review'].values)
    print(len(tokenizer.word_index))
    feat = pad_sequences(X1, padding='pre',maxlen=max_length) # 505 = max_length in sentence
    target = dataset['Label'].values

    return feat,target,tokenizer

In [76]:
def createModelLSTM(embed_dim,lstm_out,max_feat,input_length):
    embed_dim = embed_dim
    lstm_out = lstm_out
    model = Sequential()
    model.add(Embedding(input_dim = max_feat ,output_dim = embed_dim ,input_length = input_length)) #input_dim = max_feat, #output_dim = embed_dim
    model.add(LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.5))
    model.add(Dropout(0.25))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    print(model.summary())
    return model

In [77]:
def createModelGRU(embed_dim,gru_out,max_feat,input_length):
    embed_dim = embed_dim
    gru_out = gru_out
    model = Sequential()
    model.add(Embedding(max_feat, embed_dim,input_length = input_length)) 
    model.add(Dropout(0.2))
    model.add(GRU(gru_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    print(model.summary())
    return model

In [78]:
def trainModel(model,feat,target,validation_split,epochs,batch_size):
    random.seed(10)
    history = model.fit(feat, target,validation_split=validation_split, epochs = epochs, batch_size=batch_size)

In [79]:
def evaluateModel(model,feat,target,batch_size):
    score,acc = model.evaluate(feat, target, verbose = 2, batch_size = batch_size)
    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))

In [80]:
def saveModel(model):
    # serialize model to YAML
    model_yaml = model.to_yaml()
    with open("addTestTrainSize.yaml", "w") as yaml_file:
        yaml_file.write(model_yaml)
    # serialize weights to HDF5
    model.save_weights("addTestTrainSize.h5")
    print("Saved model to disk")

In [81]:
def loadModel(yamlPathName, h5PathName):
    with open(yamlPathName+'.yaml', 'r') as yaml_file:
        print(yamlPathName)
        loaded_model_yaml = yaml_file.read()
        loaded_model = model_from_yaml(loaded_model_yaml)

        # load weights into new model
        loaded_model.load_weights(h5PathName+'.h5')
    
    loaded_model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return loaded_model

In [82]:
def filterAndShow(filter1, filter2, modelOutput, labelTest):
    print(modelOutput)
    test = []
    test_y = []

    for i in range(len(modelOutput)):
        if(modelOutput[i] <filter1 or modelOutput[i]>filter2):
            test.append(modelOutput[i])
            test_y.append(labelTest[i])

    test1 = np.array(test)
    testy1 = np.array(test_y)
    print(testy1)
    print(test1.shape)
    print(testy1.shape)
    showConfusionMatrix(testy1, test1)

In [83]:
def showConfusionMatrix(trueLabel,resultToShow):
    labels = ['positive','negative']
    cm = confusion_matrix(y_true=trueLabel , y_pred=resultToShow>0.5)
    print(cm)
    #fig = plt.figure()
    #ax = fig.add_subplot(111)
    #cax = ax.matshow(cm)
    #plt.title('Confusion matrix of LSTM classifier')
    #fig.colorbar(cax)
    #ax.set_xticklabels(['']+labels)
    #ax.set_yticklabels(['']+labels)
    #plt.xlabel('Predicted')
    #plt.ylabel('True')
    #plt.show()

In [84]:
def showWordWithCode(dataToMap, tokenizer): #dataToMap = list of sentiment
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items())) # map id to all word in dic
    
    def sequence_to_text(list_of_indices):
        # Looking up words in dictionary
        words = [reverse_word_map.get(letter) for letter in list_of_indices]
        return(words)
    
    my_texts = list(map(sequence_to_text, dataToMap))
    my_texts
    return my_texts

In [85]:
def saveSentimentAndResult(sentenceToSave, resultToSave):
    sen_temp = ""
    SentimentSave = []
    for one_sentence in sentenceToSave:
        for word in one_sentence:
            if isinstance(word, str):
                sen_temp = sen_temp + " " + word
            
        SentimentSave.append(sen_temp)
        sen_temp = ""
            
    #make 1 Dim predict result
    resultSave = []
    for arr_result in resultToSave:
        for result in arr_result:
            #print(result)
            resultSave.append(result)
            
    data = {'lemma_review': SentimentSave, 'predict score': resultSave}
    toFile = pd.DataFrame(data)
    toFile.to_csv("./for_compare.csv", index=False)

Test

In [86]:
path = "../Clean/lemma_allresult.csv"
x = readFile(path)

(10000, 2)


In [87]:
max_length = checkLength(10000,x)

853
[569, 935, 28, 155, 21, 1153, 1780, 3, 48, 1, 1020, 352, 2883, 261, 179, 29, 6126, 7109, 3174, 7996, 211, 913, 1419, 3479, 1427, 4161, 397, 1018, 941, 289, 2916, 2014, 245, 1398, 2950, 1087, 7, 529, 3581, 46, 1000, 200, 1, 150, 648, 113, 1043, 3785, 159, 1576, 2636, 698, 41, 122, 1231, 3581, 52, 1, 1153, 1780, 5, 3581, 60, 203, 529, 1153, 1780, 44, 1096, 121, 99, 236, 33, 3581, 4116, 1359, 23, 33, 1153, 1780, 753, 152, 1153, 1780, 61, 27, 155, 190, 828, 3454, 3581, 61, 292, 1719, 2903, 49, 2311, 31, 302, 292, 43, 272, 576, 17, 895, 781, 307, 343, 1457, 5, 1696, 3199, 27, 1353, 13, 12, 172, 165, 968, 47, 2000, 23, 2311, 3454, 19, 125, 984, 18, 933, 904, 38, 301, 5224, 144, 301, 2444, 131, 6015, 2497, 52, 1, 624, 5919, 31, 3, 152, 15, 1153, 1780, 52, 3581, 1153, 1780, 110, 2561, 2017, 144, 302, 4, 362, 767, 1431, 79, 45, 33, 541, 161, 651, 2321, 902, 2676, 29, 8781, 3230, 31, 29, 453, 155, 93, 394, 902, 1973, 393, 2817, 25, 1, 29, 1153, 48, 39, 39, 349, 1153, 254, 986, 31, 1019, 292,

In [88]:
feat,target,tokenizer = beforeCreateModel(max_feat=10000,dataset=x,max_length=max_length)

feat,target

44402


(array([[   0,    0,    0, ...,  387, 3283,  334],
        [   0,    0,    0, ..., 1878,   19,  125],
        [   0,    0,    0, ...,   12,    6,  275],
        ...,
        [   0,    0,    0, ...,    2,  252,   73],
        [   0,    0,    0, ...,  343,    2,    9],
        [   0,    0,    0, ..., 4874,  846, 1661]]),
 array([1, 1, 1, ..., 0, 0, 1], dtype=int64))

In [89]:
X_train, X_test, Y_train, Y_test = train_test_split(feat,target, test_size = 0.2, train_size = 0.8, random_state = 42)

LSTM

In [90]:
model = createModelLSTM(embed_dim=128,lstm_out=128,max_feat=10000,input_length=feat.shape[1])

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 853, 128)          1280000   
_________________________________________________________________
lstm_18 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 129       
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________
None


In [91]:
trainModel(model,feat=X_train,target=Y_train,validation_split=0.2,epochs=15,batch_size=32)

Train on 6400 samples, validate on 1600 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [92]:
evaluateModel(model,feat=X_test,target=Y_test,batch_size=32)

2000/2000 - 15s - loss: 0.9788 - accuracy: 0.8040
score: 0.98
acc: 0.80


In [93]:
saveModel(model)

Saved model to disk


In [94]:
test = loadModel('./11_Save_model_10000_data_LSTM/addTestTrainSize','./11_Save_model_10000_data_LSTM/addTestTrainSize')

#test = model

./11_Save_model_10000_data_LSTM/addTestTrainSize


In [95]:
evaluateModel(test,feat=X_test,target=Y_test,batch_size=32)

2000/2000 - 16s - loss: 0.9788 - accuracy: 0.8040
score: 0.98
acc: 0.80


In [96]:
result = test.predict(X_test)

In [97]:
filterAndShow(filter1=0.4, filter2=0.6, modelOutput=result, labelTest=Y_test) #LSTM with filter

[[0.00001076]
 [0.00352274]
 [0.00101464]
 ...
 [0.99881387]
 [0.95910513]
 [0.9854575 ]]
[0 0 0 ... 1 0 0]
(1962, 1)
(1962,)
[[767 208]
 [166 821]]


In [98]:
showConfusionMatrix(trueLabel=Y_test,resultToShow=result) #LSTM no filter

[[778 218]
 [174 830]]


In [26]:
confusion_matrix(y_true=Y_test, y_pred=result>0.5)

array([[78, 26],
       [40, 56]], dtype=int64)

In [91]:
hi=showWordWithCode(dataToMap=X_test, tokenizer=tokenizer)

In [None]:
result

In [92]:
hi

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,


In [None]:
saveSentimentAndResult(hi,result)

GRU

In [None]:
model = createModelGRU(embed_dim=150,gru_out=200,max_feat=7000,input_length=feat.shape[1])

In [None]:
trainModel(model,feat=X_train,target=Y_train,validation_split=0.2,epochs=50,batch_size=32)

In [None]:
evaluateModel(model,feat=X_test,target=Y_test,batch_size=32)

In [None]:
resultGRU = model.predict(X_test)

In [None]:
filterAndShow(filter1=0.4, filter2=0.6, modelOutput=resultGRU, labelTest=Y_test) #GRU with filter

In [None]:
showConfusionMatrix(trueLabel=Y_test,resultToShow=result) #GRU no filter

In [95]:
saveModel(model)

Saved model to disk


Web Read

In [106]:
path = "../Clean/lemma_master_result.csv"
x = readFile(path)

x

(50, 2)


Unnamed: 0,cleaned_review,Label
0,mr costner drag movie far longer necessary asi...,0
1,example majority action film generic bore real...,0
2,first hate moronic rappers could nt act gun pr...,0
3,even beatles could write songs everyone like a...,0
4,brass picture movies fit word really somewhat ...,0
5,funny thing happen watch mosquito one hand her...,0
6,german horror film one weirdest see aware conn...,0
7,long time fan japanese film expect really both...,0
8,tokyo eye tell 17 year old japanese girl fall ...,0
9,wealthy horse ranchers buenos air long stand t...,0


In [100]:
testWR = loadModel('./11_Save_model_10000_data_LSTM/addTestTrainSize','./11_Save_model_10000_data_LSTM/addTestTrainSize')

#testWR = model

./11_Save_model_10000_data_LSTM/addTestTrainSize


  config = yaml.load(yaml_string)


In [107]:
#x_cut = x[1001:1101]
featWR,targetWR,tokenizerr = beforeCreateModel(max_feat=10000,dataset=x,max_length=max_length) #fulldic max_feat = 14000

featWR,targetWR

2334


(array([[   0,    0,    0, ...,  572,   81,  422],
        [   0,    0,    0, ...,   96,  250,  251],
        [   0,    0,    0, ...,   54,   97,  151],
        ...,
        [   0,    0,    0, ..., 2278,  130,    7],
        [   0,    0,    0, ...,  170, 2305, 2306],
        [   0,    0,    0, ..., 2333,  923, 2334]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0], dtype=int64))

In [108]:
featWR.shape[1]

853

In [109]:
resultWR = testWR.predict(featWR)

resultWR

array([[0.9654267 ],
       [0.62168634],
       [0.996505  ],
       [0.8451896 ],
       [0.00042387],
       [0.00015683],
       [0.9999541 ],
       [0.5547549 ],
       [0.999408  ],
       [0.9759325 ],
       [0.2673039 ],
       [0.00294775],
       [0.00000476],
       [0.00007739],
       [0.15193129],
       [0.9995877 ],
       [0.839071  ],
       [0.95919025],
       [0.99942124],
       [0.00091689],
       [0.19061042],
       [0.11739163],
       [0.99234575],
       [0.99892944],
       [0.4686103 ],
       [0.18684739],
       [0.00026185],
       [0.00010787],
       [0.00000595],
       [0.00747667],
       [0.6915491 ],
       [0.00040206],
       [0.88406575],
       [0.9996449 ],
       [0.00005131],
       [0.9991743 ],
       [0.9997527 ],
       [0.4129327 ],
       [0.00153896],
       [0.9999782 ],
       [0.980942  ],
       [0.9978789 ],
       [0.9990594 ],
       [0.9996687 ],
       [0.00002558],
       [0.99915147],
       [0.00010106],
       [0.002

In [110]:
showConfusionMatrix(trueLabel=targetWR,resultToShow=resultWR)
#confusion_matrix(target[:,1] , result>0.5)

[[25 25]
 [ 0  0]]


In [22]:
showWordWithCode(dataToMap=featWR , tokenizer=tokenizerr)

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,


In [111]:
evaluateModel(testWR,feat=X_test,target=Y_test,batch_size=32)

2000/2000 - 16s - loss: 0.9788 - accuracy: 0.8040
score: 0.98
acc: 0.80


In [30]:
testWR.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 580, 150)          2100000   
_________________________________________________________________
dropout (Dropout)            (None, 580, 150)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 200)               280800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 2,381,001
Trainable params: 2,381,001
Non-trainable params: 0
_________________________________________________________________
