In [101]:
import numpy as np
import pandas as pd
import re
import string
from pyvi import ViTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN, Dropout, Convolution1D, Bidirectional

## Load data

In [48]:
negative_df=pd.read_excel(r'Data/negative_comment_student.xlsx')
negative_df['Label'] = [0 for x in range(negative_df.shape[0])] 
negative_df = negative_df[['Opinion', 'Label']]
positive_df=pd.read_excel(r'Data/positive_comment_student.xlsx')
positive_df['Label'] = [1 for x in range(positive_df.shape[0])]
positive_df = positive_df[['Opinion', 'Label']]

In [79]:
train = pd.concat([negative_df,positive_df], axis=0)
train.index = [i for i in range(train.shape[0])]

In [80]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3961 entries, 0 to 3960
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Opinion  3961 non-null   object
 1   Label    3961 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 92.8+ KB


## Preprocessing

In [81]:
def pre_processing(text):
    
    # Chuyển thành chữ thường 
    text = text.lower()
    
    # Chuẩn hóa tiếng việt
    replace_list={'ô kêi': ' ok ', 'okie': ' ok ', ' o kê ': ' ok ', 'okey': ' ok ', 'ôkê': ' ok ', ' oki ': ' ok ', ' oke ':  ' ok ',' okay':' ok ','okê':' ok ',
    ' tks ': u' cám ơn ', 'thks': u' cám ơn ', 'thanks': u' cám ơn ', 'ths': u' cám ơn ', 'thank': u' cám ơn ', 'cam on':u'cám ơn',u'cảm ơn':'cám ơn',
    ' not ': u' không ',' khoong ': ' không ', u' kg ': u' không ','ko ': 'không', ' k ': u' không ',' kh ':u' không ',' kô ':u' không ','hok':u' không ',' kp ': u' không phải ',u' kô ': u' không ', ' ko ': u' không ', u' ko ': u' không ', u' k ': u' không ', 'khong': u' không ', u' hok ': u' không ',' k ':u' không ',u'chẳng':u'không',u'đéo':u'không'}
    for k, v in replace_list.items():
        text = text.replace(k, v)
    
    # Xóa icon
    text = re.sub(r"\W", " ",text)
    
    # Xóa dấu câu
    text = re.sub('['+string.punctuation+']', ' ', text)
    
    # Tokenizer
    text = ViTokenizer.tokenize(text)
    return text

In [84]:
new_sentence = []
for i in range(train.shape[0]):
    new_sentence.append(pre_processing(train['Opinion'][i]))
train['New_Opinion'] = new_sentence

## SVM, Kernel = 'linear'

In [95]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [90]:
class SVMModel:
    def __init__(self):
        self.clf = self._init_pipeline()
        
    @staticmethod
    def _init_pipeline():
        stopwords=('rằng', 'thì', 'mà', 'là', 'thế', 'à', 'ừ', 'vậy', 'như')
        pipe_line = Pipeline([
            ("vectorizer", CountVectorizer(stop_words=stopwords)), # bag of words
            ("tfidf", TfidfTransformer()), # tf-idf
            ("clf_svm", SVC(kernel='linear', probability=True)) # svm kernel = 'linear'
        ])
        return pipe_line

## Slit data

In [92]:
from sklearn.model_selection import train_test_split

In [97]:
x_train_val, x_test, y_train_val, y_test = train_test_split(train['New_Opinion'].values, train['Label'].values, test_size=.2, random_state = 0)

In [98]:
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=.2, random_state = 0)

In [99]:
clf_svm = SVMModel().clf.fit(x_train, y_train)
print("Train accuracy:",clf_svm.score(x_train, y_train))
print("Test accuracy:", clf_svm.score(x_test, y_test))

Train accuracy: 0.9191002367797948
Test accuracy: 0.7894073139974779


In [100]:
vocab_size = 10000
embedding_dim = 64
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [102]:
# Tạo Tokenizer
tokenizer = Tokenizer(num_words= vocab_size, oov_token= oov_tok)

# Đưa từ vào tokenizer để tạo từ điển
tokenizer.fit_on_texts(x_train)

# Biến các câu train thành sequences
x_train = tokenizer.texts_to_sequences(x_train)

# Padding cho các train sequences
x_train = pad_sequences(x_train, maxlen= max_length, padding=padding_type, truncating= trunc_type)

# Biến các câu val thành sequences tương ứng
x_val = tokenizer.texts_to_sequences(x_val)

x_val = pad_sequences(x_val, maxlen= max_length, padding=padding_type, truncating= trunc_type)

# Biến các câu test thành sequences tương ứng
x_test = tokenizer.texts_to_sequences(x_test)

x_test = pad_sequences(x_test, maxlen= max_length, padding=padding_type, truncating= trunc_type)

## Build Model

In [115]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=x_train.shape[1]))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 64)           640000    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_6 (Dense)              (None, 16)                1040      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 17        
Total params: 665,889
Trainable params: 665,889
Non-trainable params: 0
_________________________________________________________________


In [116]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=4, validation_data=(x_val, y_val))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x1d3b143fb70>

In [117]:
loss, acc = model.evaluate(x_test, y_test)

