In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from underthesea import word_tokenize
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Train a small word-level model that maps a (Vietnamese) question to an
# answer, then try it on one sample question.
#
# Pipeline: load Q/A pairs -> word-segment with underthesea -> map tokens to
# integer ids -> pad to a common length -> train -> evaluate -> predict.
# ---------------------------------------------------------------------------

# --- 1. Load data -----------------------------------------------------------
# Rows with an empty question or answer are dropped: pandas reads empty cells
# as float NaN, which word_tokenize cannot handle.
df = pd.read_excel("your_data_file.xlsx").dropna(subset=["question", "answer"])
questions = df["question"].astype(str).tolist()
answers = df["answer"].astype(str).tolist()
if not questions:
    raise ValueError("your_data_file.xlsx contains no usable question/answer rows")

# --- 2. Word segmentation ---------------------------------------------------
# Each sentence becomes a list of word tokens.
tokenized_questions = [word_tokenize(question) for question in questions]
tokenized_answers = [word_tokenize(answer) for answer in answers]

# --- 3. Tokens -> integer ids -----------------------------------------------
# The vocabulary must be built on the *segmented* tokens (not on the raw
# sentences) so that the ids match what the model is fed, and it must exist
# before padding: pad_sequences only works on integer sequences, not strings.
# When given lists, the Keras Tokenizer treats every list element as one
# token, so underthesea's segmentation is preserved as-is.
# `oov_token` lets unseen words at inference time map to a valid id instead of
# being silently dropped.
tokenizer = Tokenizer(filters="", oov_token="<unk>")
tokenizer.fit_on_texts(tokenized_questions + tokenized_answers)
vocab_size = len(tokenizer.word_index) + 1  # +1: id 0 is reserved for padding

question_sequences = tokenizer.texts_to_sequences(tokenized_questions)
answer_sequences = tokenizer.texts_to_sequences(tokenized_answers)

# --- 4. Pad to a common length ----------------------------------------------
# Questions and answers share one length because the model below emits exactly
# one output token per input position.
max_length = max(len(seq) for seq in question_sequences + answer_sequences)
padded_questions = pad_sequences(question_sequences, maxlen=max_length, padding='post')
padded_answers = pad_sequences(answer_sequences, maxlen=max_length, padding='post')

# --- 5. Model ---------------------------------------------------------------
# The targets are id sequences of shape (batch, max_length), so the network
# has to produce a distribution over the vocabulary at *every* timestep:
#   * return_sequences=True keeps the LSTM output for each position;
#   * Dense applied to a 3-D tensor acts on the last axis, giving
#     (batch, max_length, vocab_size).
# NOTE(review): this is a position-aligned (tagging-style) model, the smallest
# change that makes inputs, outputs and loss agree. An encoder-decoder model
# would likely suit free-form answers better.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128),
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

# Targets are integer class ids, not one-hot vectors, hence the *sparse*
# variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
train_questions, test_questions, train_answers, test_answers = train_test_split(padded_questions, padded_answers, test_size=0.2)
model.fit(train_questions, train_answers, epochs=20)

# --- 6. Evaluation ----------------------------------------------------------
# Accuracy is per token and includes padding positions.
test_loss, test_acc = model.evaluate(test_questions, test_answers, verbose=2)
print(f"Test Accuracy: {test_acc}")

# --- 7. Inference on a sample question --------------------------------------
input_question = "Bạn là ai?"
# Same preprocessing as for training: segment -> ids -> pad.
input_sequence = pad_sequences(
    tokenizer.texts_to_sequences([word_tokenize(input_question)]),
    maxlen=max_length,
    padding='post',
)
predicted_sequence = model.predict(input_sequence)  # (1, max_length, vocab_size)
predicted_ids = predicted_sequence.argmax(axis=-1)  # (1, max_length)
# Strip padding id 0 before decoding; with an oov_token configured it would
# otherwise be rendered as "<unk>".
predicted_answer = tokenizer.sequences_to_texts(
    [[token_id for token_id in row if token_id != 0] for row in predicted_ids.tolist()]
)

print(f"Input Question: {input_question}")
print(f"Predicted Answer: {predicted_answer}")
