In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import numpy as np
import random
# tải về dữ liệu tiếng Việt
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# đường dẫn tới file chứa dữ liệu câu hỏi và câu trả lời
data_path = './chatDataSet.txt'

# đọc dữ liệu từ file
with open(data_path, 'r', encoding='utf8') as f:
    lines = f.read().split('\n')
    
# xử lý dữ liệu
lemmatizer = WordNetLemmatizer()

In [3]:
# tách câu thành từng từ và đưa về dạng nguyên mẫu
def preprocess_text(text):
    text = text.lower().strip()
    words = nltk.word_tokenize(text)
    return [lemmatizer.lemmatize(w) for w in words]

# tạo từ điển chứa các từ trong dữ liệu
word_dict = {}
for line in lines:
    words = preprocess_text(line)
    for word in words:
        if word not in word_dict:
            word_dict[word] = len(word_dict)

In [4]:
# tạo câu hỏi và câu trả lời dưới dạng chuỗi số
question_seqs = []
answer_seqs = []

for line in lines:
    if line:
        line = line.strip()
        parts = line.split(":")
        if len(parts) == 2:
            question = preprocess_text(parts[0])
            answer = preprocess_text(parts[1])
            question_seqs.append([word_dict[q] for q in question])
            answer_seqs.append([word_dict[a] for a in answer])

In [5]:
# chuyển đổi định dạng chuỗi số sang ma trận
maxlen = 50
question_data = np.zeros((len(question_seqs), maxlen), dtype=np.int32)
answer_data = np.zeros((len(answer_seqs), maxlen), dtype=np.int32)

for i, seq in enumerate(question_seqs):
    if seq:
        question_data[i, -len(seq):] = np.array(seq)[:maxlen]

for i, seq in enumerate(answer_seqs):
    if seq:
        answer_data[i, -len(seq):] = np.array(seq)[:maxlen]

In [11]:
question_data.shape

(12859, 50)

In [12]:
answer_data.shape

(12859, 50)

In [4]:
vocab_size = len(word_dict)
embedding_dim = 128
lstm_units = 256
embedding_dim = 50
from keras.layers import TimeDistributed, Bidirectional

def create_model():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=maxlen),
        LSTM(lstm_units, return_sequences=True),
        Dropout(0.2),
        Dense(256, activation='relu'),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(256, activation='relu'),
        Dropout(0.2),
        TimeDistributed(Dense(vocab_size, activation='softmax'))
    ])
    
    optimizer = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

In [5]:
model = create_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 50, 200)      1000200     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 50, 200)      1000200     ['input_2[0][0]']                
                                                                                              

In [7]:
import tensorflow as tf
# huấn luyện mô hình
batch_size = 64
epochs = 50
with tf.device('/gpu:0'):
    history = model.fit(question_data, tf.keras.utils.to_categorical(answer_data, num_classes=vocab_size),
                        batch_size=batch_size, epochs=epochs, validation_split=0.2)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {\'(<class \\\'list\\\'> containing values of types {"<class \\\'int\\\'>"})\'})'}), <class 'numpy.ndarray'>

In [None]:
import matplotlib.pyplot as plt
plt.plot(history.history['val_loss'], c = 'coral', label = 'validation loss line')
plt.plot(history.history['loss'], c = 'blue', label = 'train loss line')
legend = plt.legend(loc='upper center')
plt.show()

In [9]:
# save model
model.save(r'./botChatv1.h5')

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def predict_answer(question):
    # tiền xử lý câu hỏi
    question = preprocess_text(question)
    seq = [word_dict[word] for word in question]
    seq = pad_sequences([seq], maxlen=maxlen)
    print(question)
    # dự đoán câu trả lời
    prediction = model.predict(seq)
    prediction = prediction[0][-len(question):]
    prediction = np.argmax(prediction, axis=1)

    # chuyển đổi lại sang dạng văn bản
    print(prediction)
    rev_word_dict = {v: k for k, v in word_dict.items()}
    answer = [rev_word_dict[idx] for idx in prediction]
    answer = ' '.join(answer)

    return answer

In [15]:
question = 'khôn'
print(predict_answer(question))

['khôn']
[0]
ngon
