In [1]:
import pandas as pd
import yaml
import random
import re
import pylab as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.layers import LayerNormalization,BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import model_from_yaml
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [2]:
def readFile(filePath):
    """Load a CSV file into a DataFrame and print its shape.

    Args:
        filePath: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV file to load.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(filePath)
    dimensions = frame.shape
    print(dimensions)  # (rows, columns) as a quick sanity check
    return frame

In [3]:
def checkLength(max_feat, file):
    """Report the length of the longest tokenized review.

    A new ``Tokenizer`` limited to ``max_feat`` words is fit on
    ``file['cleaned_review']`` and every review is converted to a sequence
    of word ids. The longest sequence and its length are printed.

    Args:
        max_feat: Value passed to the tokenizer as ``num_words``.
        file: Table with a ``'cleaned_review'`` column of texts.

    Returns:
        Length of the longest sequence, or 0 when there are no reviews.
    """
    reviews = file['cleaned_review'].values

    tok = Tokenizer(num_words=max_feat, split=' ')
    tok.fit_on_texts(reviews)
    sequences = tok.texts_to_sequences(reviews)

    # max() returns the first of several equally long sequences, which is
    # the same one a strict ">" scan would keep.
    longest = max(sequences, key=len, default=[])
    longest_len = len(longest)

    print(longest_len)
    print(longest)
    return longest_len

In [4]:
def beforeCreateModel(max_feat,dataset,max_length):
    """Tokenize and pad the reviews, and extract the labels.

    Args:
        max_feat: Number of words the tokenizer keeps (``num_words``),
            i.e. the vocabulary size used by the model.
        dataset: Table with ``'cleaned_review'`` and ``'Label'`` columns.
        max_length: Length every sequence is padded to (``maxlen``).

    Returns:
        Tuple ``(feat, target, tokenizer)``: the pre-padded id sequences,
        the label values, and the fitted tokenizer.
    """
    reviews = dataset['cleaned_review'].values

    tokenizer = Tokenizer(num_words=max_feat, split=' ')
    tokenizer.fit_on_texts(reviews)
    sequences = tokenizer.texts_to_sequences(reviews)

    # Number of distinct words seen while fitting.
    print(len(tokenizer.word_index))

    # Pad at the front so every sequence has max_length entries.
    feat = pad_sequences(sequences, padding='pre', maxlen=max_length)
    target = dataset['Label'].values
    return feat, target, tokenizer

In [5]:
def createModelLSTM(embed_dim,lstm_out,max_feat,input_length):
    """Build and compile a binary-classification LSTM model.

    Layers: Embedding -> LSTM -> Dropout(0.25) -> Dense(1, sigmoid).

    Args:
        embed_dim: Size of each embedding vector (Embedding ``output_dim``).
        lstm_out: Number of LSTM units.
        max_feat: Vocabulary size (Embedding ``input_dim``); every token id
            fed to the model must be smaller than this value.
        input_length: Length of the padded input sequences.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=max_feat, output_dim=embed_dim, input_length=input_length))
    model.add(LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.5))
    model.add(Dropout(0.25))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    model.summary()
    return model

In [6]:
def createModelGRU(embed_dim,gru_out,max_feat,input_length):
    """Build and compile a binary-classification GRU model.

    Layers: Embedding -> Dropout(0.2) -> GRU -> Dropout(0.2)
    -> Dense(1, sigmoid).

    Args:
        embed_dim: Size of each embedding vector (Embedding ``output_dim``).
        gru_out: Number of GRU units.
        max_feat: Vocabulary size (Embedding ``input_dim``); every token id
            fed to the model must be smaller than this value.
        input_length: Length of the padded input sequences.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=max_feat, output_dim=embed_dim, input_length=input_length))
    model.add(Dropout(0.2))
    model.add(GRU(gru_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    model.summary()
    return model

In [7]:
def trainModel(model,feat,target,validation_split,epochs,batch_size):
    """Fit ``model`` and return the training history.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        feat: Training features.
        target: Training labels.
        validation_split: Fraction of the training data held out for
            validation, forwarded to ``fit``.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        Whatever ``model.fit`` returns (previously this value was computed
        and silently dropped, so loss/accuracy curves could not be
        inspected afterwards).
    """
    # Seeds only Python's `random` module; NumPy/TensorFlow generators are
    # not seeded here.
    random.seed(10)
    history = model.fit(
        feat,
        target,
        validation_split=validation_split,
        epochs=epochs,
        batch_size=batch_size,
    )
    return history

In [8]:
def evaluateModel(model,feat,target,batch_size):
    """Evaluate ``model`` on the given data and print the two metrics.

    Args:
        model: Object whose ``evaluate`` method returns a pair
            ``(score, accuracy)``.
        feat: Evaluation features.
        target: Evaluation labels.
        batch_size: Batch size forwarded to ``evaluate``.

    Returns:
        None; the results are only printed, rounded to two decimals.
    """
    metrics = model.evaluate(feat, target, verbose=2, batch_size=batch_size)
    score, acc = metrics
    print("score: %.2f" % score)
    print("acc: %.2f" % acc)

In [9]:
def saveModel(model, pathName="addTestTrainSize"):
    """Save the model architecture (YAML) and weights (HDF5) to disk.

    Args:
        model: Model exposing ``to_yaml()`` and ``save_weights(path)``.
        pathName: Path prefix WITHOUT extension; ``.yaml`` and ``.h5`` are
            appended, mirroring the prefix convention of ``loadModel``.
            The default keeps the historical file names. Pass a distinct
            prefix per model, otherwise each save overwrites the last one.

    Returns:
        None.
    """
    # serialize model to YAML
    # NOTE(review): Model.to_yaml() was removed in TensorFlow 2.6 (it now
    # raises RuntimeError) — confirm the installed version still supports it.
    model_yaml = model.to_yaml()
    with open(pathName + ".yaml", "w") as yaml_file:
        yaml_file.write(model_yaml)
    # serialize weights to HDF5
    model.save_weights(pathName + ".h5")
    print("Saved model to disk")

In [10]:
def loadModel(yamlPathName, h5PathName=None):
    """Rebuild a model from a YAML architecture file and an HDF5 weight file.

    Args:
        yamlPathName: Path prefix of the architecture file, WITHOUT the
            ``.yaml`` extension.
        h5PathName: Path prefix of the weight file, WITHOUT the ``.h5``
            extension. Defaults to ``yamlPathName`` because both files
            normally share one prefix.

    Returns:
        The model with weights loaded, compiled with binary cross-entropy
        loss, the Adam optimizer and the accuracy metric.
    """
    if h5PathName is None:
        h5PathName = yamlPathName

    # Only the read needs the file handle; close it before building the model.
    with open(yamlPathName+'.yaml', 'r') as yaml_file:
        print(yamlPathName)
        loaded_model_yaml = yaml_file.read()

    # SECURITY: model_from_yaml deserializes YAML unsafely and was removed in
    # TensorFlow 2.6 for that reason; only load files from trusted sources.
    loaded_model = model_from_yaml(loaded_model_yaml)

    # load weights into new model
    loaded_model.load_weights(h5PathName+'.h5')

    loaded_model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return loaded_model

In [11]:
def filterAndShow(filter1, filter2, modelOutput, labelTest):
    """Show a confusion matrix for confident predictions only.

    A prediction is kept when its score is below ``filter1`` or above
    ``filter2``; scores inside ``[filter1, filter2]`` are dropped.

    Args:
        filter1: Lower score threshold.
        filter2: Upper score threshold.
        modelOutput: Predicted scores, indexable by position.
        labelTest: True labels, indexable by the same positions.

    Returns:
        None; the inputs, kept labels and shapes are printed and the kept
        pairs are handed to ``showConfusionMatrix``.
    """
    print(modelOutput)

    # Positions whose score lies outside the uncertain band.
    kept = [
        pos
        for pos in range(len(modelOutput))
        if modelOutput[pos] < filter1 or modelOutput[pos] > filter2
    ]

    kept_scores = np.array([modelOutput[pos] for pos in kept])
    kept_labels = np.array([labelTest[pos] for pos in kept])

    print(kept_labels)
    print(kept_scores.shape)
    print(kept_labels.shape)
    showConfusionMatrix(kept_labels, kept_scores)

In [12]:
def showConfusionMatrix(trueLabel,resultToShow):
    """Print the confusion matrix of thresholded predictions.

    Args:
        trueLabel: True class labels.
        resultToShow: Predicted scores; a score above 0.5 counts as the
            positive prediction.

    Returns:
        None; the matrix is printed.
    """
    # Only referenced by the disabled plotting code below.
    # NOTE(review): confusion_matrix orders classes ascending (0, then 1); if
    # Label 1 means "positive" this list is reversed — confirm before
    # re-enabling the plot.
    labels = ['positive','negative']
    predicted = resultToShow > 0.5
    matrix = confusion_matrix(y_true=trueLabel, y_pred=predicted)
    print(matrix)
    # Disabled heat-map rendering of the matrix:
    #fig = plt.figure()
    #ax = fig.add_subplot(111)
    #cax = ax.matshow(matrix)
    #plt.title('Confusion matrix of LSTM classifier')
    #fig.colorbar(cax)
    #ax.set_xticklabels(['']+labels)
    #ax.set_yticklabels(['']+labels)
    #plt.xlabel('Predicted')
    #plt.ylabel('True')
    #plt.show()

In [13]:
def showWordWithCode(dataToMap, tokenizer): #dataToMap = list of id sequences
    """Translate sequences of word ids back into words.

    Args:
        dataToMap: Iterable of id sequences.
        tokenizer: Object with a ``word_index`` mapping of word -> id.

    Returns:
        A list with one list of words per input sequence; ids missing from
        ``word_index`` become ``None``.
    """
    # Invert word -> id into id -> word.
    id_to_word = {idx: word for word, idx in tokenizer.word_index.items()}
    return [[id_to_word.get(code) for code in sequence] for sequence in dataToMap]

In [14]:
def saveSentimentAndResult(sentenceToSave, resultToSave, outputPath="./for_compare.csv"):
    """Write decoded sentences and their prediction scores to a CSV file.

    Args:
        sentenceToSave: Iterable of word lists; entries that are not
            strings (e.g. ``None``) are skipped.
        resultToSave: Iterable of score rows (e.g. an ``(n, 1)`` array);
            the rows are flattened into one list of scores.
        outputPath: Destination CSV path. Defaults to the historical
            ``./for_compare.csv``.

    Returns:
        None.

    Raises:
        ValueError: From pandas when the number of flattened scores differs
            from the number of sentences.
    """
    # Join with single spaces; the previous concatenation prefixed every
    # sentence with a stray leading space.
    sentences = [
        " ".join(word for word in one_sentence if isinstance(word, str))
        for one_sentence in sentenceToSave
    ]

    # make 1 Dim predict result
    scores = [score for score_row in resultToSave for score in score_row]

    toFile = pd.DataFrame({'lemma_review': sentences, 'predict score': scores})
    toFile.to_csv(outputPath, index=False)

Test

In [None]:
# Load the CSV used for training and testing in this section (readFile prints its shape).
path = "../Clean/lemma10000_master_result.csv"
x = readFile(path)

In [None]:
# Length of the longest tokenized review (10000-word tokenizer); used below as the padding length.
max_length = checkLength(10000,x)

In [None]:
# Tokenize and pad every review to max_length; keep the fitted tokenizer for decoding later.
feat,target,tokenizer = beforeCreateModel(max_feat=10000,dataset=x,max_length=max_length)

# Display the padded features and the labels.
feat,target

In [None]:
# 80/20 train/test split with a fixed random_state so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(feat,target, test_size = 0.2, train_size = 0.8, random_state = 42)

LSTM

In [None]:
# Build the LSTM model; max_feat equals the tokenizer's num_words (10000) and input_length is the padded width.
model = createModelLSTM(embed_dim=128,lstm_out=128,max_feat=10000,input_length=feat.shape[1])

In [None]:
# Train the LSTM for 15 epochs, holding out 20% of the training split for validation.
trainModel(model,feat=X_train,target=Y_train,validation_split=0.2,epochs=15,batch_size=32)

In [None]:
# Print loss and accuracy of the freshly trained LSTM on the test split.
evaluateModel(model,feat=X_test,target=Y_test,batch_size=32)

In [None]:
# Write addTestTrainSize.yaml / addTestTrainSize.h5 into the current working directory.
# NOTE(review): the next cell loads from ./11_Save_model_10000_data_LSTM/ — presumably the files are moved there by hand; confirm.
saveModel(model)

In [None]:
# Reload a previously saved LSTM (architecture + weights share the same path prefix).
test = loadModel('./11_Save_model_10000_data_LSTM/addTestTrainSize','./11_Save_model_10000_data_LSTM/addTestTrainSize')

# Alternative: skip loading and use the in-memory model instead.
#test = model

In [None]:
# Print loss and accuracy of the reloaded LSTM on the test split.
evaluateModel(test,feat=X_test,target=Y_test,batch_size=32)

In [None]:
# Predicted scores of the reloaded LSTM for the test split.
result = test.predict(X_test)

In [None]:
# Confusion matrix over confident predictions only (scores below 0.4 or above 0.6).
filterAndShow(filter1=0.4, filter2=0.6, modelOutput=result, labelTest=Y_test) #LSTM with filter

In [None]:
# Confusion matrix over all test predictions, thresholded at 0.5.
showConfusionMatrix(trueLabel=Y_test,resultToShow=result) #LSTM no filter

In [None]:
# Same matrix as the previous cell, computed directly so it is shown as the cell's rich output.
confusion_matrix(y_true=Y_test, y_pred=result>0.5)

In [None]:
# Decode the padded test sequences back to words (ids absent from word_index, such as padding 0, become None).
hi=showWordWithCode(dataToMap=X_test, tokenizer=tokenizer)

In [None]:
# Display the raw LSTM prediction scores.
result

In [None]:
# Display the decoded test sentences (full dump; can be very long).
hi

In [None]:
# Write decoded sentences alongside their LSTM scores to ./for_compare.csv.
saveSentimentAndResult(hi,result)

GRU

In [None]:
# Build the GRU model. max_feat must match the tokenizer's num_words (10000):
# `feat` can contain ids up to 9999, and an Embedding sized 7000 would receive
# out-of-range indices.
model = createModelGRU(embed_dim=150,gru_out=200,max_feat=10000,input_length=feat.shape[1])

In [None]:
# Train the GRU for 50 epochs, holding out 20% of the training split for validation.
trainModel(model,feat=X_train,target=Y_train,validation_split=0.2,epochs=50,batch_size=32)

In [None]:
# Print loss and accuracy of the trained GRU on the test split.
evaluateModel(model,feat=X_test,target=Y_test,batch_size=32)

In [None]:
# Predicted scores of the GRU for the test split.
resultGRU = model.predict(X_test)

In [None]:
# Confusion matrix over confident GRU predictions only (scores below 0.4 or above 0.6).
filterAndShow(filter1=0.4, filter2=0.6, modelOutput=resultGRU, labelTest=Y_test) #GRU with filter

In [None]:
# Confusion matrix over all GRU test predictions, thresholded at 0.5.
# Uses resultGRU; passing `result` here would silently re-report the LSTM scores.
showConfusionMatrix(trueLabel=Y_test,resultToShow=resultGRU) #GRU no filter

In [None]:
# Save the GRU model.
# NOTE(review): saveModel writes to the fixed names addTestTrainSize.yaml/.h5, so this
# overwrites the LSTM files saved earlier in the same directory — move those first.
saveModel(model)

Web Read

In [15]:
# Load the dataset for this section; note that this rebinds `path` and `x`.
path = "../Clean/lemma_result.csv"
x = readFile(path)

# Display the loaded frame.
x

(1000, 2)


Unnamed: 0,cleaned_review,Label
0,one reviewers mention watch 1 oz episode hook ...,1
1,wonderful little production film technique una...,1
2,think wonderful way spend time hot summer week...,1
3,basically family little boy jake think zombie ...,0
4,petter mattei love time money visually stun fi...,1
...,...,...
995,nothing sacred ask ernie fosselius days everyb...,1
996,hat hate self aware pretentious inanity masque...,0
997,usually try professional constructive criticiz...,0
998,like go see film history class something like ...,0


In [16]:
# Load the saved LSTM (architecture + weights share the same path prefix).
testWR = loadModel('./12_Save_model_10000_masterimdb_LSTM/addTestTrainSize','./12_Save_model_10000_masterimdb_LSTM/addTestTrainSize')

# Alternative: use the in-memory model instead of loading from disk.
#testWR = model

./12_Save_model_10000_masterimdb_LSTM/addTestTrainSize


  config = yaml.load(yaml_string)


In [17]:
# Disabled: evaluate on a 100-row slice instead of the full frame.
#x_cut = x[1001:1101]
# Longest tokenized review in this dataset; printed for reference only, the value is not used below.
max_length_test = checkLength(10000,x)
# Disabled variant that padded to `max_length` instead of the fixed 1304:
#featWR,targetWR,tokenizerr = beforeCreateModel(max_feat=10000,dataset=x,max_length=max_length) #fulldic max_feat = 14000
# Pad to 1304 tokens, the input length shown by testWR.summary() for the loaded model.
# NOTE(review): beforeCreateModel fits a NEW Tokenizer on this dataset, so the word ids
# produced here need not match the ids the loaded model was trained on. That would
# explain the near-chance accuracy reported below; consider reusing the tokenizer
# fitted on the training data — confirm.
featWR,targetWR,tokenizerr = beforeCreateModel(max_feat=10000,dataset=x,max_length=1304)

# Display the padded features and the labels.
featWR,targetWR

603
[14, 1939, 656, 49, 156, 68, 128, 36, 37, 183, 1939, 424, 4164, 4722, 1929, 42, 1145, 46, 6, 1939, 49, 3410, 38, 751, 183, 797, 193, 30, 396, 7406, 2, 1547, 6, 2007, 873, 55, 66, 29, 14, 780, 327, 12, 1939, 158, 4436, 232, 7406, 59, 19, 751, 2344, 889, 2, 29, 46, 6, 120, 183, 267, 133, 24, 1939, 5473, 1, 44, 730, 2887, 3476, 133, 556, 1733, 5432, 1, 118, 221, 42, 779, 133, 3127, 226, 302, 4370, 4371, 226, 968, 300, 3083, 2334, 916, 1940, 3626, 2722, 15, 6, 7407, 133, 155, 178, 89, 222, 26, 328, 4370, 4371, 226, 699, 5489, 41, 226, 41, 226, 7, 330, 99, 4370, 4371, 4, 1478, 3591, 5489, 2512, 4, 1557, 11, 1, 68, 128, 99, 4, 140, 13, 223, 1546, 2, 10, 391, 37, 4370, 4371, 140, 746, 1939, 1233, 11, 226, 164, 19, 14, 4088, 42, 99, 6, 3127, 226, 173, 1018, 2, 521, 5, 226, 277, 4, 2, 631, 12, 81, 420, 843, 631, 2, 20, 902, 12, 81, 113, 27, 81, 226, 2, 15, 7345, 740, 1784, 1172, 1079, 91, 1300, 89, 24, 6, 3127, 268, 18, 1503, 2180, 3626, 2722, 873, 4973, 4078, 6460, 64, 364, 42, 424, 64, 36

(array([[   0,    0,    0, ...,  406, 3108,  310],
        [   0,    0,    0, ..., 2704,   18,  121],
        [   0,    0,    0, ...,   10,    5,  332],
        ...,
        [   0,    0,    0, ...,    4,  645,  771],
        [   0,    0,    0, ...,  964,  606,    1],
        [   0,    0,    0, ...,   57,  101, 1004]]),
 array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
        1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
        0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
        1, 0,

In [18]:
# Padded sequence length; should equal the model's expected input length.
featWR.shape[1]

1304

In [19]:
# 80/20 split of this dataset; only the test part is used below since the model is already trained.
X_train, X_test, Y_train, Y_test = train_test_split(featWR,targetWR, test_size = 0.2, train_size = 0.8, random_state = 42)

In [20]:
# Disabled: predict on the whole dataset rather than the test split.
#resultWR = testWR.predict(featWR)
# Predicted scores of the loaded model for the test split.
resultWR = testWR.predict(X_test)

# Display the raw scores.
resultWR

array([[0.9999682 ],
       [0.23094516],
       [0.00022506],
       [0.9218977 ],
       [0.99997437],
       [0.9999995 ],
       [0.2818756 ],
       [0.9296115 ],
       [0.9985586 ],
       [0.9778511 ],
       [0.17688665],
       [0.00004893],
       [0.99959546],
       [0.94174314],
       [0.99802953],
       [0.19212028],
       [0.12460896],
       [0.88474685],
       [0.99976665],
       [0.999979  ],
       [0.0168418 ],
       [0.00007791],
       [0.10732297],
       [0.0087539 ],
       [0.99710923],
       [0.9854364 ],
       [0.61839557],
       [0.01905701],
       [0.00025873],
       [0.02852048],
       [0.6722403 ],
       [0.97459793],
       [0.99958473],
       [0.00010738],
       [0.9980806 ],
       [0.9955041 ],
       [0.9993445 ],
       [0.00759503],
       [0.996567  ],
       [0.00004767],
       [0.97916573],
       [0.6430848 ],
       [0.00000292],
       [0.496031  ],
       [0.03514318],
       [0.00051279],
       [0.9996737 ],
       [0.220

In [21]:
# Confusion matrix of the loaded model on the test split, thresholded at 0.5.
showConfusionMatrix(trueLabel=Y_test,resultToShow=resultWR)
# Leftover from an earlier variant; `target` and `result` are not defined in this section.
#confusion_matrix(target[:,1] , result>0.5)

[[49 55]
 [64 32]]


In [None]:
# Decode every padded sequence back to words with this section's tokenizer (full dump; can be very long).
showWordWithCode(dataToMap=featWR , tokenizer=tokenizerr)

In [23]:
# Print loss and accuracy of the loaded model on the test split.
evaluateModel(testWR,feat=X_test,target=Y_test,batch_size=32)

200/200 - 2s - loss: 3.2029 - accuracy: 0.4050
score: 3.20
acc: 0.41


In [24]:
# Print the layer table of the loaded model (shows the expected input length and vocabulary-sized embedding).
testWR.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1304, 128)         1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________
