In [1]:
import pandas as pd
import random
import re
import pylab as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import model_from_yaml
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

np.set_printoptions(suppress=True)

In [2]:
def readFile(filePath):
    lemma_result = pd.read_csv(filePath)
    print(lemma_result.shape)
    return lemma_result

In [3]:
def beforeCreateModel(max_feat,dataset):
    max_fatures = max_feat #จำนวนคำที่ใช้ใน model
    tokenizer = Tokenizer(num_words=max_fatures, split=' ') 
    tokenizer.fit_on_texts(dataset['cleaned_review'].values)
    X1 = tokenizer.texts_to_sequences(dataset['cleaned_review'].values)
    
    feat = pad_sequences(X1, padding='pre',maxlen=580) # ลองปรับ padding เป็น Post เผื่อค่าจะดีขึ้น
    target = dataset['Label'].values
    
    return feat,target,tokenizer

In [4]:
def createModelLSTM(embed_dim,lstm_out,max_feat,input_length):
    embed_dim = embed_dim
    lstm_out = lstm_out
    model = Sequential()
    model.add(Embedding(max_feat, embed_dim,input_length = input_length))
    model.add(Dropout(0.2))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    print(model.summary())
    return model

In [5]:
def createModelGRU(embed_dim,gru_out,max_feat,input_length):
    embed_dim = embed_dim
    gru_out = gru_out
    model = Sequential()
    model.add(Embedding(max_feat, embed_dim,input_length = input_length))
    model.add(Dropout(0.2))
    model.add(GRU(gru_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    print(model.summary())
    return model

In [6]:
def trainModel(model,feat,target,validation_split,epochs,batch_size):
    random.seed(10)
    history = model.fit(feat, target,validation_split=validation_split, epochs = epochs, batch_size=batch_size)

In [7]:
def evaluateModel(model,feat,target,batch_size):
    score,acc = model.evaluate(feat, target, verbose = 2, batch_size = batch_size)
    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))

In [8]:
def saveModel(model):
    # serialize model to YAML
    model_yaml = model.to_yaml()
    with open("addTestTrainSize.yaml", "w") as yaml_file:
        yaml_file.write(model_yaml)
    # serialize weights to HDF5
    model.save_weights("addTestTrainSize.h5")
    print("Saved model to disk")

In [9]:
def loadModel(yamlPathName, h5PathName):
    with open(yamlPathName+'.yaml', 'r') as yaml_file:
        loaded_model_yaml = yaml_file.read()
        loaded_model = model_from_yaml(loaded_model_yaml)

        # load weights into new model
        loaded_model.load_weights(h5PathName+'.h5')
    
    loaded_model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return loaded_model

In [10]:
def filterAndShow(filter1, filter2, modelOutput, labelTest):
    print(modelOutput)
    test = []
    test_y = []

    for i in range(len(modelOutput)):
        if(modelOutput[i] <filter1 or modelOutput[i]>filter2):
            test.append(modelOutput[i])
            test_y.append(labelTest[i])

    test1 = np.array(test)
    testy1 = np.array(test_y)
    print(testy1)
    print(test1.shape)
    print(testy1.shape)
    showConfusionMatrix(testy1, test1)

In [11]:
def showConfusionMatrix(trueLabel,resultToShow):
    labels = ['positive','negative']
    cm = confusion_matrix(y_true=trueLabel , y_pred=resultToShow>0.5)
    print(cm)
    #fig = plt.figure()
    #ax = fig.add_subplot(111)
    #cax = ax.matshow(cm)
    #plt.title('Confusion matrix of LSTM classifier')
    #fig.colorbar(cax)
    #ax.set_xticklabels(['']+labels)
    #ax.set_yticklabels(['']+labels)
    #plt.xlabel('Predicted')
    #plt.ylabel('True')
    #plt.show()

In [12]:
def showWordWithCode(readIn, colName, dataToMap, tokenizer): #readIn = pd.read_csv ,colName = column name in string, dataToMap = list of sentiment
    test = tokenizer.texts_to_sequences(readIn[colName].values)
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items())) # map id to all word in dic
    
    def sequence_to_text(list_of_indices):
        # Looking up words in dictionary
        words = [reverse_word_map.get(letter) for letter in list_of_indices]
        return(words)
    
    my_texts = list(map(sequence_to_text, dataToMap))
    my_texts[13]
    return my_texts

In [13]:
def saveSentimentAndResult(sentenceToSave, resultToSave):
    sen_temp = ""
    SentimentSave = []
    for one_sentence in sentenceToSave:
        for word in one_sentence:
            if isinstance(word, str):
                sen_temp = sen_temp + " " + word
            
        SentimentSave.append(sen_temp)
        sen_temp = ""
            
    #make 1 Dim predict result
    resultSave = []
    for arr_result in resultToSave:
        for result in arr_result:
            #print(result)
            resultSave.append(result)
            
    data = {'lemma_review': SentimentSave, 'predict score': resultSave}
    toFile = pd.DataFrame(data)
    toFile.to_csv("./for_compare.csv", index=False)

Test

In [14]:
path = "../Clean/lemma_result.csv"
x = readFile(path)

(1000, 2)


In [15]:
feat,target,tokenizer = beforeCreateModel(max_feat=7000,dataset=x)

feat,target

(array([[   0,    0,    0, ...,  406, 3108,  310],
        [   0,    0,    0, ..., 2704,   18,  121],
        [   0,    0,    0, ...,   10,    5,  332],
        ...,
        [   0,    0,    0, ...,    4,  645,  771],
        [   0,    0,    0, ...,  964,  606,    1],
        [   0,    0,    0, ...,   57,  101, 1004]]),
 array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
        1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
        0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
        1, 0,

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(feat,target, test_size = 0.2, train_size = 0.8, random_state = 42)

LSTM

In [17]:
model = createModelLSTM(embed_dim=150,lstm_out=200,max_feat=7000,input_length=feat.shape[1])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 580, 150)          1050000   
_________________________________________________________________
dropout (Dropout)            (None, 580, 150)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 200)               280800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 1,331,001
Trainable params: 1,331,001
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
#trainModel(model,feat=X_train,target=Y_train,validation_split=0.2,epochs=50,batch_size=32)

In [None]:
#evaluateModel(model,feat=X_test,target=Y_test,batch_size=32)

In [None]:
#saveModel(model)

In [21]:
test = loadModel('./4_Save_model_cut_dummy_LSTM/addTestTrainSize','./4_Save_model_cut_dummy_LSTM/addTestTrainSize')

  config = yaml.load(yaml_string)


In [22]:
evaluateModel(test,feat=X_test,target=Y_test,batch_size=32)

200/200 - 2s - loss: 1.6870 - accuracy: 0.6700
score: 1.69
acc: 0.67


In [23]:
result = test.predict(X_test)

In [24]:
filterAndShow(filter1=0.4, filter2=0.6, modelOutput=result, labelTest=Y_test) #LSTM with filter

[[0.02430036]
 [0.06550971]
 [0.00006281]
 [0.00007229]
 [0.99773383]
 [0.9999572 ]
 [0.9382391 ]
 [0.00060544]
 [0.00056729]
 [0.60899746]
 [0.94603825]
 [0.9981201 ]
 [0.00006075]
 [0.07131647]
 [0.01077259]
 [0.00031464]
 [0.9998952 ]
 [0.21666142]
 [0.13646378]
 [0.99333775]
 [0.99996173]
 [0.00010865]
 [0.00005511]
 [0.06085382]
 [0.00003648]
 [0.00593925]
 [0.01792044]
 [0.99984014]
 [0.01197829]
 [0.9998673 ]
 [0.00012733]
 [0.00004649]
 [0.34669885]
 [0.0424039 ]
 [0.02007715]
 [0.9781845 ]
 [0.2412503 ]
 [0.969636  ]
 [0.00029175]
 [0.99988294]
 [0.9656104 ]
 [0.56526744]
 [0.00030897]
 [0.00019033]
 [0.99966216]
 [0.6906669 ]
 [0.60334545]
 [0.9960824 ]
 [0.00057261]
 [0.99859434]
 [0.12571406]
 [0.00004569]
 [0.00068661]
 [0.00021745]
 [0.00010541]
 [0.9993191 ]
 [0.00001085]
 [0.97216153]
 [0.00015264]
 [0.00023316]
 [0.00007705]
 [0.9997602 ]
 [0.00137877]
 [0.5412207 ]
 [0.00092237]
 [0.8023472 ]
 [0.25606233]
 [0.68530506]
 [0.28611165]
 [0.00004721]
 [0.00007158]
 [0.63

In [25]:
showConfusionMatrix(trueLabel=Y_test,resultToShow=result) #LSTM no filter

[[78 26]
 [40 56]]


In [26]:
confusion_matrix(y_true=Y_test, y_pred=result>0.5)

array([[78, 26],
       [40, 56]], dtype=int64)

In [None]:
hi=showWordWithCode(readIn=x, colName='cleaned_review', dataToMap=X_test, tokenizer=tokenizer)

In [None]:
result

In [None]:
hi

In [None]:
saveSentimentAndResult(hi,result)

GRU

In [None]:
model = createModelGRU(embed_dim=150,gru_out=200,max_feat=7000,input_length=feat.shape[1])

In [None]:
trainModel(model,feat=X_train,target=Y_train,validation_split=0.2,epochs=50,batch_size=32)

In [None]:
evaluateModel(model,feat=X_test,target=Y_test,batch_size=32)

In [None]:
resultGRU = model.predict(X_test)

In [None]:
filterAndShow(filter1=0.4, filter2=0.6, modelOutput=resultGRU, labelTest=Y_test) #GRU with filter

In [None]:
showConfusionMatrix(trueLabel=Y_test,resultToShow=result) #GRU no filter

In [None]:
saveModel(model)

Web Read

In [32]:
path = "../Clean/lemma_resultscrape.csv"
x = readFile(path)

(10, 2)


In [33]:
testWR = loadModel('./5_Save_model_cut_dummy_GRU/addTestTrainSize','./5_Save_model_cut_dummy_GRU/addTestTrainSize')

In [34]:
featWR,targetWR,tokenizer = beforeCreateModel(max_feat=7000,dataset=x)

featWR,targetWR

(array([[  0,   0,   0, ..., 227, 228,  72],
        [  0,   0,   0, ..., 257,   6,   4],
        [  0,   0,   0, ..., 284,   1,   1],
        ...,
        [  0,   0,   0, ..., 370, 371,  12],
        [  0,   0,   0, ..., 401,  10,  72],
        [  0,   0,   0, ...,  47,  48,  49]]),
 array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0], dtype=int64))

In [30]:
featWR.shape[1]

580

In [35]:
resultWR = test.predict(featWR)

resultWR

array([[0.00115103],
       [0.00017376],
       [0.0006973 ],
       [0.05847816],
       [0.9454212 ],
       [0.99995625],
       [0.9846506 ],
       [0.2361771 ],
       [0.99977034],
       [0.99397993]], dtype=float32)

In [36]:
showConfusionMatrix(trueLabel=targetWR,resultToShow=resultWR)
#confusion_matrix(target[:,1] , result>0.5)

[[1 4]
 [4 1]]
