# ChatBot Let's Get Lost!

Notebook ini berisi dalam pembuatan model chatbot AI menggunakan JSON format file lalu mengunakan library TensorFlow dengan Keras dalam Python.

## Import Libraries

In [173]:
# Import Library
import json
import string
import pickle
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import LabelEncoder

# Model
import tensorflow as tf

# Words Pre
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TheSevenS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load Data

In [174]:
# load json file

data_file = open('intents.json').read()
intents_json = json.loads(data_file)

In [175]:
# Create list from json
patterns = [] # Input
tag = [] # Classes/intent

for i in range(len(intents_json['intents'])):
    for user_patterns in intents_json['intents'][i]['patterns']:
        patterns.append(user_patterns)
        tag.append(intents_json['intents'][i]['tag'])


In [176]:
# Create dataframe from json
df = pd.DataFrame({
    'patterns': patterns,
    'tag' : tag,
})
df.head()

Unnamed: 0,patterns,tag
0,Hai,greet
1,Hi,greet
2,Halo,greet
3,Apa Kabar,greet
4,Selamat Pagi,greet


## Preprocessing

**Clean Text**

In [177]:
# Clean text
def clean_text(text):
    """
    Fungsi yang digunakan untuk melakukan praproses
    """
    # konversi ke lowercase
    text = text.lower()
    # menghapus tanda baca
    tandabaca = tuple(string.punctuation)
    text = ''.join(ch for ch in text if ch not in tandabaca)
    return text

In [178]:
# Konfirmasi function diatas
kalimat = 'Halo boleh bantuannya?'
clean_text(kalimat)

'halo boleh bantuannya'

In [179]:
df['clean_patterns'] = df['patterns'].apply(clean_text)
df['clean_patterns'].head()

0             hai
1              hi
2            halo
3       apa kabar
4    selamat pagi
Name: clean_patterns, dtype: object

**Tokenizing**

Setelah melakukan cleaning data, selanjutnya saya akan menggunakan nltk.word_tokenize untuk menggabungkan list kalimat yang ada untuk menjadi satu kalimat berdasarkan tags.

Tokenizing merupakan salah satu jenis preprocessing yang dilakukan sebelum membuat machine learning model. Tokenizing merupakan hal dasar pada data text.

In [180]:
# Create corpus
words = set([
    word for word in df['clean_patterns'] for word in word_tokenize(word)
    ]
)

In [181]:
corpus_size = len(words)

In [182]:
# Pengunaan tokenize pada text
df['length'] = df['clean_patterns'].apply(word_tokenize).apply(len)
sequence_length = int(round(df['length'].max(),0))
print(corpus_size,sequence_length)

106 7


**Encoding**

In [183]:
# Label Encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(df['tag'])
y_train = tf.keras.utils.to_categorical(y_train)

In [184]:
print(encoder.classes_)

['bye' 'canda' 'canda_2' 'creator' 'greet' 'nama' 'nanya_apps'
 'nanya_gobot' 'pilihan' 'pilihan_2' 'pilihan_3' 'thanks' 'weblink']


In [185]:
print(len(encoder.classes_))

13


**Vectorization**

Vektorisasi merupakan metode *bag of words* merupakan metode pengumpulan vocab pada corpus, metode yang saya gunakan pada hal ini adalah text vectorization yang terdapat pada TensorFlow kit.

In [186]:
vect = tf.keras.layers.TextVectorization(
    max_tokens=corpus_size,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=sequence_length
)
vect.adapt(df['clean_patterns'])   

In [187]:
check = 'halo boleh bantuannya?'
vect(clean_text(check))

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([86,  1,  1,  0,  0,  0,  0], dtype=int64)>

**Embedding**

In [188]:
embedding = tf.keras.layers.Embedding(
    input_dim=corpus_size,
    output_dim=16,
    input_length=sequence_length,
    embeddings_initializer='uniform'
)

In [189]:
embedding(vect(clean_text(check)))

<tf.Tensor: shape=(7, 16), dtype=float32, numpy=
array([[ 0.03273423, -0.01466324, -0.02194047,  0.0195001 , -0.03684908,
        -0.03700713, -0.03961496, -0.02023194,  0.04614196, -0.02123632,
        -0.03651143, -0.02739429, -0.04634575, -0.02686164,  0.02724976,
         0.02910655],
       [ 0.04243039, -0.01286985,  0.0449164 ,  0.0356538 ,  0.0358046 ,
         0.04322822, -0.01013698,  0.0157564 ,  0.03122801,  0.03352822,
        -0.00829347, -0.03380398,  0.02241416, -0.03102088, -0.0467157 ,
         0.04269984],
       [ 0.04243039, -0.01286985,  0.0449164 ,  0.0356538 ,  0.0358046 ,
         0.04322822, -0.01013698,  0.0157564 ,  0.03122801,  0.03352822,
        -0.00829347, -0.03380398,  0.02241416, -0.03102088, -0.0467157 ,
         0.04269984],
       [ 0.00447254, -0.00448421, -0.01076812,  0.03249026, -0.01133959,
        -0.03677382, -0.0172616 , -0.02712643,  0.01529939,  0.00222324,
         0.02653955, -0.02027609, -0.02837888, -0.04144539, -0.02792264,
        -

## Model Training

From this chapter we will try train our data with LSTM (Long Short Term Memory) from tensorflow to make prediction for answer of the ChatBot.

In [190]:
# Functional API Model
input = tf.keras.layers.Input(shape=(1,), dtype='string')
hidden_1 = vect(input)
hidden_2 = embedding (hidden_1)
hidden_3 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(hidden_2)
output = tf.keras.layers.Dense(len(encoder.classes_), activation='softmax')(hidden_3)
model = tf.keras.Model(inputs=input, outputs=output)

#Compile Model
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics='accuracy')


In [191]:
# Show summary of model
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_2 (TextV  (None, 7)                0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 7, 16)             1696      
                                                                 
 bidirectional_5 (Bidirectio  (None, 256)              148480    
 nal)                                                            
                                                                 
 dense_7 (Dense)             (None, 13)                3341      
                                                                 
Total params: 153,517
Trainable params: 153,517
Non-trainab

In [192]:
hist = model.fit(df['clean_patterns'], y_train, epochs=100, verbose=0)
pd.DataFrame(hist.history).tail()

Unnamed: 0,loss,accuracy
95,0.012591,1.0
96,0.012208,1.0
97,0.011949,1.0
98,0.011445,1.0
99,0.010963,1.0


**Saving Model**

In [193]:
# Save Encoder

with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [194]:
pickle.dump({'config': vect.get_config(),
             'weights': vect.get_weights()}
            , open("vect.pkl", "wb"))

## Inference

In [195]:
def bot_response(text):
    """Take text as function input then predict using model. Return response based on highest probability using numpy argmax    
    """
    text = clean_text(text)
    pred = model.predict([text])
    res = encoder.classes_[pred.argmax()] # Mencari index yang memiliki probabilitas tertinggi
    i = 0
    try:
        if vect(text).numpy().max() > 1: # If the input is known word(s)
            while i < len(intents_json['intents']):
                if res == intents_json['intents'][i]['tag']:
                    responses = intents_json['intents'][i]['responses']
                    break
                else:
                    i+=1
        else: # If only unknown word(s)
            responses = ['Maaf kawan, aku tidak mengerti perkataan mu ...']
    except: # If empty string or any error occured
        responses = ['GoBot tidak mengerti :( ...']

    # For debugging only
    dict_temp = []
    for i in range(len(pred[0])):
        temp = {encoder.classes_[i]: pred[0][i]}
        dict_temp.append(temp)
    print(dict_temp)
    print(encoder.classes_[pred.argmax()])

    return print(np.random.choice(responses))

In [196]:
tes1 = 'hai'
clean_text(tes1)

'hai'

In [197]:
model.predict([clean_text(tes1)])



array([[1.1330127e-03, 2.7066740e-06, 2.6641314e-10, 4.9044564e-09,
        9.9672085e-01, 8.7677887e-05, 8.3971313e-10, 1.1822385e-06,
        1.9933213e-11, 1.6379321e-08, 5.6427102e-10, 2.0471208e-03,
        7.3999636e-06]], dtype=float32)

In [198]:
model.predict([clean_text(tes1)]).argmax()



4

In [199]:
encoder.classes_[model.predict([clean_text(tes1)]).argmax()]




'greet'

In [200]:
intents_json['intents'][4]['responses']

['GoBot merupakan online Chatbot, yang dapat membantu kawan untuk mecarikan rekomendasi tempat kawan liburan.']

In [201]:
bot_response('Hai!')

[{'bye': 0.0011330127}, {'canda': 2.706674e-06}, {'canda_2': 2.6641314e-10}, {'creator': 4.9044564e-09}, {'greet': 0.99672085}, {'nama': 8.767789e-05}, {'nanya_apps': 8.3971313e-10}, {'nanya_gobot': 1.1822385e-06}, {'pilihan': 1.9933213e-11}, {'pilihan_2': 1.6379321e-08}, {'pilihan_3': 5.64271e-10}, {'thanks': 0.0020471208}, {'weblink': 7.3999636e-06}]
greet
Halo selamat datang


In [202]:
bot_response('Dadah!')

[{'bye': 0.9937512}, {'canda': 0.00024376585}, {'canda_2': 3.529947e-08}, {'creator': 2.1293776e-08}, {'greet': 0.0012050346}, {'nama': 0.00017839667}, {'nanya_apps': 8.969308e-08}, {'nanya_gobot': 1.5127756e-06}, {'pilihan': 2.4669982e-09}, {'pilihan_2': 9.506941e-07}, {'pilihan_3': 1.6138257e-07}, {'thanks': 0.0027085817}, {'weblink': 0.0019101795}]
bye
Sampai jumpa, terima kasih telah bertanya!


In [208]:
bot_response('Halo!')

[{'bye': 0.0011078165}, {'canda': 2.6155403e-06}, {'canda_2': 2.3204419e-10}, {'creator': 4.3474633e-09}, {'greet': 0.99700636}, {'nama': 7.960388e-05}, {'nanya_apps': 7.4272694e-10}, {'nanya_gobot': 1.0980622e-06}, {'pilihan': 1.7230505e-11}, {'pilihan_2': 1.4881637e-08}, {'pilihan_3': 4.8229154e-10}, {'thanks': 0.0017956359}, {'weblink': 6.919822e-06}]
greet
Halo, ada yang bisa GoBot bantu?
