In [46]:
import random
import tensorflow as tf
from tensorflow.keras.optimizers import SGD # from tensorflow.keras.optimizers.legacy import SGD # 
from pyvi import ViTokenizer, ViPosTagger, ViUtils
from tensorflow.python.keras.layers import Dense, Dropout # from keras.layers import Dense, Dropout
from tensorflow.python.keras.models import load_model # from keras.models import load_model
from tensorflow.python.keras.models import Sequential # from keras.models import Sequential
import numpy as np
import pickle, os
import json, warnings
import nltk, re
from nltk.stem import WordNetLemmatizer

warnings.filterwarnings("ignore")

In [None]:
# lemmatizer = WordNetLemmatizer()
# try:
#     nltk.download('omw-1.4')
#     nltk.download("punkt")
#     nltk.download("wordnet")
# except :
#     print("need to check")

In [47]:
words = []
classes = []
documents = []
ignore_words = ["?", "!"]
current_fpath = os.getcwd()
print("current file-path", current_fpath)
data_folder = '\\'.join(current_fpath.split('\\')[:-1]) # r"/workspaces/AI_chatbot_flask"
print(f"file in this directory: {os.listdir(data_folder)}")
data_file = open(f"{data_folder}\\json_data\\intents.json", encoding='utf-8').read()
intents = json.loads(data_file)

current file-path d:\All_chatbot\retrieval_based_bot\notebooks
file in this directory: ['flask_app', 'json_data', 'models', 'notebooks']


In [48]:
# words
for intent in intents["intents"]:
    for pattern in intent["patterns"]:

        # take each word and tokenize it
        patt = ViTokenizer.tokenize(pattern)
        w = nltk.word_tokenize(patt)
        words.extend(w)
        # adding documents
        documents.append((w, intent["tag"]))

        # adding classes to our class list
        if intent["tag"] not in classes:
            classes.append(intent["tag"])

In [49]:
# lemmatizer
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))

classes = sorted(list(set(classes)))

#print(len(documents), "documents")
print(f"Number of pre-trained class {len(classes)}")
print(f"Number of words: {len(words)} \n The last 20 words {words[-21:-1]}")

pickle.dump(words, open(f"{data_folder}//models//words.pkl", "wb"))
pickle.dump(classes, open(f"{data_folder}//models//classes.pkl", "wb"))

Number of pre-trained class 9
Number of words: 102 
 The last 20 words ['tschuss', 'tư_vấn', 'tạm_biệt', 'tọa_lạc', 'vous', 'we', 'wer', 'what', 'when', 'where', 'who', 'work', 'xin', 'you', 'your', 'ê', 'êtes', 'đâu', 'đã', 'địa_chỉ']


In [50]:
# training initializer
# initializing training data
training = []
output_empty = [0] * len(classes)
for doc in documents:
    # initializing bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # lemmatize each word - create base word, in attempt to represent related words
    pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words]
    # create our bag of words array with 1, if word match found in current pattern
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    # output is a '0' for each tag and '1' for current tag (for each pattern)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

In [51]:
# shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training, dtype='object')
# create train and test lists. X - patterns, Y - intents
train_x = list(training[:, 0])
train_y = list(training[:, 1])

print("Training data created")

Training data created


In [52]:
# actual training
# Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and 3rd output layer contains number of neurons
# equal to number of intents to predict output intent with softmax
model = Sequential()
model.add(Dense(512, input_shape=(len(train_x[0]),), activation="relu"))
model.add(Dropout(0.4))
model.add(Dense(256, activation="relu"))
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(len(train_y[0]), activation="softmax"))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 512)               52736     
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_14 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 9)                 1161      
Total params: 218,121
Trainable params: 218,121
Non-trainable params: 0
________________________________________________

In [53]:
from keras import callbacks 

# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(loss="categorical_crossentropy", optimizer='SGD', metrics=["accuracy"])

earlystopping = callbacks.EarlyStopping(monitor ="loss", mode ="min", patience = 5, restore_best_weights = True)
callbacks =[earlystopping]

# fitting and saving the model
hist = model.fit(np.array(train_x), np.array(train_y), epochs=100, batch_size=5, verbose=1, callbacks = callbacks)
model.save(f"{data_folder}//models//chatbot_model.h5", hist)
print("model created")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [58]:
tf.__version__

'2.8.0'

In [54]:
model = load_model(f"{data_folder}//models//chatbot_model.h5")
intents = json.loads(open(f"{data_folder}//json_data//intents.json", encoding='utf-8').read())
words = pickle.load(open(f"{data_folder}//models//words.pkl", "rb"))
classes = pickle.load(open(f"{data_folder}//models//classes.pkl", "rb"))
data_file = open(f"{data_folder}//json_data//intents.json", encoding='utf-8').read()
intents = json.loads(data_file)

# chat functionalities
def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(ViTokenizer.tokenize(sentence))
    sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
    return sentence_words

In [55]:
def bow(sentence, words, show_details=True):
    # tokenize the pattern
    sentence_words = clean_up_sentence(sentence)
    # bag of words - matrix of N words, vocabulary matrix
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                # assign 1 if current word is in the vocabulary position
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)
    return np.array(bag)

In [56]:
def rounding_text_hour(text):

    mask1 = any(word in text for word in ['tiếng', 'phút', 'giờ', 'minute', 'minutes'])
    mask2 = any(word in text for word in ['giờ làm việc', 'giờ hành chính', 'giờ đi làm', 'giờ vào làm', 'mấy giờ', "giờ ngủ trưa",
                                          'giờ có mặt', 'giờ tan sở', 'giờ ra về', 'giờ giải lao', 'giờ nghỉ trưa', "giờ nghỉ ngơi"])
    if mask1 & ~mask2:
        text = text.replace('giờ', 'tiếng').replace('hour', 'tiếng').replace('minute', 'phút').replace('mins', 'phút')
        h = re.findall('[0-9]+tiếng', text.replace(" ", ""))
        m = re.findall('[0-9]+phút', text.replace(" ", ""))
        if ((len(h) > 0) & (len(m) > 0)):
            h = h[0].replace("tiếng", "")
            m = m[0].replace("phút", "")
            if (int(m) < 0) or (int(m) > 60):
                txt = "số giờ (hoặc phút) làm việc không hợp lệ"
            elif int(m) < 60:
                txt = f"dưới {int(h) + 1} tiếng nhưng đã trên {int(h)} giờ làm việc"
            else:
                txt = f"dưới {int(h) + (m // 60) + 1} tiếng nhưng đã trên {int(h)} giờ làm việc"
        elif (len(h) == 0):
            txt = "dưới 1 tiếng làm việc"
        elif (len(m) == 0):
            txt = f"dưới {re.findall('[0-9]', h[0])[0]} giờ làm việc"
        else: 
            txt = text
        return txt    
    else: 
        return text

In [57]:
from datetime import date, datetime, time, timedelta
from transformers import pipeline
gpt_model = pipeline("text-generation", model = "gpt2")

while True:
    print(100*"=")
    sentence = input("You: ")
    res_sentence = gpt_model(sentence, 
                            do_sample=True, top_k=3, 
                            temperature=0.9, max_length=120)
    print(f"You: {sentence}")
    sentence = rounding_text_hour(sentence)
    if sentence == "quit":
        break
    p = bow(sentence, words, show_details=False)
    res = model.predict(np.array([p]))[0]
    ERROR_THRESHOLD = 0.1
    results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
    # sort by strength of probability
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append({"intent": classes[r[0]], "probability": str(r[1])})
    ints = return_list
    print(ints)

    if len(return_list) > 0:
        tag = ints[0]["intent"]
        list_of_intents = intents["intents"]

        for i in list_of_intents:
            if i["tag"] == tag:
                print(f"Tag: {tag}")
                result = random.choice(i["response"])
                print(f"BOT: {result}")
                if i["tag"] == "daytime_today":
                    today_date = (datetime.now() + timedelta(hours=7)).strftime('%b-%d, %Y \t %H:%M:%S GMT+07')
                    print(f"\tToday is {today_date }")
    else: 
        print("xin giải thích rõ ràng hơn vì có thể bot không hiểu hoặc chưa được học!")

You: hi
[{'intent': 'greeting', 'probability': '0.91549516'}]
Tag: greeting
BOT: (VI): chào bạn, bạn cần hỗ trợ điều gì?
(EN): Hi, what support do you need?
(GE): Hallo, welche Unterstützung brauchst du?
(FR): Bonjour, de quel soutien avez-vous besoin?
You: quit
