In [42]:
# prepare dataset
import json
import pandas as pd

# Read the intent definitions. JSON is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform default (which is
# not UTF-8 on every OS and would mis-decode non-ASCII patterns).
with open("data/intents.json", encoding="utf-8") as data_file:
    data = json.load(data_file)

In [43]:
# Flatten the nested intents structure into (pattern, tag) pairs: every
# pattern of an intent becomes one training row labelled with that intent's tag.
pattern_tag_pairs = [
    (pattern, intent['tag'])
    for intent in data['intents']
    for pattern in intent['patterns']
]

text_input = [pattern for pattern, _ in pattern_tag_pairs]
intents = [tag for _, tag in pattern_tag_pairs]

df = pd.DataFrame({'text_input': text_input,
                    'intents': intents})

df.head()

Unnamed: 0,text_input,intents
0,Hai,salam
1,Hi,salam
2,Halo,salam
3,Selamat Pagi,salam
4,Selamat Siang,salam


In [44]:
df.intents.value_counts()

ganti email                                   7
penarikan dana refund                         7
keamanan akun                                 7
salam                                         6
ganti password                                6
Pesanan belum diterima                        6
bye                                           5
lupa password                                 5
cara pembayaran                               5
pengaturan akun                               5
pekerjaan                                     5
pengiriman                                    5
pembayaran                                    4
nama                                          4
pembatalan                                    4
virtual account                               4
pengembalian dana                             3
cara membatalkan pesanan                      2
pelacakan                                     2
dana                                          2
estimasi pengembalian dana              

In [45]:
# data cleansing
import string

# Characters to drop: everything in string.punctuation.
exclude = set(string.punctuation)

# Lowercase every pattern.
df.text_input = df.text_input.map(str.lower)

# Keep only the characters that are not punctuation.
df.text_input = df.text_input.map(
    lambda text: ''.join(filter(lambda ch: ch not in exclude, text))
)

In [46]:
# label encoding
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Map intent tags to integer ids, then expand the ids to one-hot rows.
le = LabelEncoder()
y_train = to_categorical(le.fit_transform(df.intents))

In [47]:
# Collect every whitespace-separated token (with repeats) and the token count
# of each sentence, to size the vectorizer's vocabulary and sequence length.
all_vocab = []
length = []

for sent in df['text_input']:
    tokens = sent.split()          # split once and reuse the result
    all_vocab.extend(tokens)       # extend() instead of a side-effect comprehension
    length.append(len(tokens))

In [48]:
len(all_vocab)

341

In [49]:
max(length)

12

In [50]:
len(set(all_vocab))

113

In [51]:
from tensorflow.keras.layers import TextVectorization

# Vocabulary cap (this count includes the '' padding and '[UNK]' entries) and
# the fixed number of token ids produced per input.
# NOTE(review): the earlier cells report 113 unique tokens and a longest
# sentence of 12 tokens, so these caps map the rarest tokens to [UNK] and
# truncate inputs longer than 6 tokens -- confirm this is intentional.
max_vocab_length = 86
max_length = 6

# Text -> integer-id layer: lowercases, strips punctuation, splits on
# whitespace, and pads/truncates every sequence to max_length ids.
text_vectorization = TextVectorization(max_tokens=max_vocab_length,
                                       standardize='lower_and_strip_punctuation',
                                       split='whitespace',
                                       ngrams=None,
                                       output_mode='int',
                                       output_sequence_length=max_length
                                       )


In [52]:
text_vectorization.adapt(df.text_input)

In [53]:
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'saya',
 'cara',
 'pesanan',
 'dana',
 'akun',
 'pembayaran',
 'password',
 'pengembalian',
 'apakah',
 'email',
 'melakukan',
 'bagaimana',
 'apa',
 'stock',
 'sepatu',
 'ready',
 'produk',
 'selamat',
 'refund',
 'mengubah',
 'mengganti',
 'mengatur',
 'menarik',
 'keamanan',
 'kamu',
 'ganti',
 'account',
 'virtual',
 'tugas',
 'siapa',
 'sampai',
 'pergantian',
 'pengiriman',
 'nama',
 'menerima',
 'membatalkan',
 'lama',
 'kendala',
 'dalam',
 'bisa',
 'belum',
 'anda',
 'air',
 'waktu',
 'tidak',
 'proteksi',
 'penarikan',
 'ok',
 'nike',
 'metode',
 'menjaga',
 'mengalami',
 'lupa',
 'jordan',
 'hilang',
 'estimasi',
 'dapatkah',
 'berapa',
 'barang',
 'adidas',
 'ada',
 '1',
 'zoom',
 'yang',
 'x',
 'vitrual',
 'untuk',
 'tipe',
 'tinggal',
 'tersedia',
 'terima',
 'status',
 'siang',
 'shoes',
 'saja',
 'reebok',
 'pengaturan',
 'pekerjaan',
 'pagi',
 'nmsr1',
 'nmdr1',
 'ngapain',
 'nano',
 'namanya']

In [54]:
text_vectorization('halo nama kamu siapa')

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([ 1, 35, 26, 31,  0,  0], dtype=int64)>

In [55]:
text_vectorization.get_vocabulary()[0]

''

In [56]:
from tensorflow.keras.layers import Embedding
# Trainable lookup table: one 16-dimensional vector per token id. input_dim
# reuses max_vocab_length so it matches the vectorizer's max_tokens.
embedding = Embedding(input_dim=max_vocab_length,
                      output_dim=16,
                      embeddings_initializer="uniform",
                      input_length=max_length)

In [57]:
import numpy as np
res_embed = embedding(np.array([[71, 17,  7, 13,  0,  0]]))
res_embed

<tf.Tensor: shape=(1, 6, 16), dtype=float32, numpy=
array([[[ 0.03356839, -0.01418524, -0.00959344, -0.03244349,
          0.00525322, -0.00678716,  0.01375965, -0.02039598,
          0.02254013,  0.02400767, -0.00607923,  0.0014984 ,
         -0.00309516,  0.03303765, -0.00576893, -0.03698812],
        [ 0.03654864,  0.03878578,  0.03779382, -0.04306762,
          0.03783691, -0.03189838,  0.02844239,  0.02600267,
         -0.00995468, -0.00505088, -0.04161853, -0.02678424,
          0.00177746,  0.0352616 , -0.02740099,  0.04787293],
        [ 0.04565973,  0.00615553,  0.00092741, -0.00173266,
         -0.00930811, -0.01957765, -0.0356604 ,  0.04530204,
          0.02735212,  0.00239993, -0.00365553, -0.02785708,
         -0.02054544,  0.00863456, -0.01544323, -0.0376973 ],
        [-0.00935214,  0.02385615,  0.01786139, -0.01702861,
         -0.01345649, -0.01990881, -0.03531392,  0.02053029,
          0.00897446, -0.03801204, -0.01730996,  0.04298021,
         -0.04110704, -0.01763

In [58]:
# modelling
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, LSTM

# One softmax unit per intent class. The count is taken from the one-hot label
# matrix instead of a hard-coded literal, so the model keeps matching the data
# when intents are added to or removed from the JSON file.
num_classes = y_train.shape[1]

# Raw string in -> token ids -> embeddings -> LSTM summary -> class probabilities.
inputs = Input(shape=(1,), dtype='string')
x = text_vectorization(inputs)
x = embedding(x)
x = LSTM(12)(x)
outputs = Dense(num_classes, activation='softmax')(x)
model_lstm = Model(inputs, outputs, name="LSTM_model")

In [59]:
# compile model
model_lstm.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=["accuracy"])

In [60]:
# Train on every pattern in df for a fixed 10000 epochs with progress output
# silenced. No validation data or callbacks are passed, so nothing in this
# call monitors overfitting or stops training early.
model_lstm.fit(df.text_input,
                y_train,
                epochs=10000,
                verbose=0)

<keras.callbacks.History at 0x14f9e639480>

In [61]:
# Evaluated on the same df.text_input / y_train that were passed to fit(), so
# the reported [loss, accuracy] are training-set figures.
model_lstm.evaluate(df.text_input, y_train)



[0.040633637458086014, 0.9708737730979919]

In [62]:
model_lstm.save("bot_model.tf")



INFO:tensorflow:Assets written to: bot_model.tf\assets


INFO:tensorflow:Assets written to: bot_model.tf\assets


In [63]:
import pickle
# Persist the fitted LabelEncoder so predicted class ids can be mapped back to
# intent tags later. The context manager closes the file even if pickling
# raises. Despite its name, le_filename is the open file object; the name is
# kept unchanged for compatibility.
with open("label_encoder.pickle", "wb") as le_filename:
    pickle.dump(le, le_filename)

In [64]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
documentA = 'Berapa lama waktu pengiriman pesanan saya?'
documentB = 'Untuk Estimasi pengiriman sekitar 1-2 hari'

In [66]:
# Naive tokenisation: split each document on single spaces. Case and
# punctuation are left untouched, so 'saya?' stays one token.
bagOfWordsA, bagOfWordsB = [doc.split(' ') for doc in (documentA, documentB)]
bagOfWordsA

['Berapa', 'lama', 'waktu', 'pengiriman', 'pesanan', 'saya?']

In [67]:
# Shared vocabulary: every distinct token that occurs in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [68]:
# Raw term counts per document over the shared vocabulary; words that do not
# occur in a document get an explicit 0.
numOfWordsA = {word: bagOfWordsA.count(word) for word in uniqueWords}
numOfWordsB = {word: bagOfWordsB.count(word) for word in uniqueWords}
numOfWordsA

{'Berapa': 1,
 'Untuk': 0,
 'sekitar': 0,
 'pengiriman': 1,
 'saya?': 1,
 '1-2': 0,
 'lama': 1,
 'waktu': 1,
 'pesanan': 1,
 'Estimasi': 0,
 'hari': 0}

In [69]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kelvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [70]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: mapping word -> raw count of that word in the document.
        bagOfWords: the document's token list; its length is the denominator.

    Returns:
        dict mapping each key of ``wordDict`` to ``count / len(bagOfWords)``.
        For an empty document every frequency is 0.0 rather than raising
        ZeroDivisionError.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # No tokens at all: nothing can have a non-zero frequency.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bagOfWordsCount)
            for word, count in wordDict.items()}

In [71]:
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [72]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word.

    Args:
        documents: list of dicts, each mapping word -> raw count in that
            document.

    Returns:
        dict mapping each word to ``log(N / df)`` (natural log), where N is
        the number of documents and df is the number of documents in which
        the word's count is > 0. Words are taken from the union of all
        documents' keys, so a word missing from the first document no longer
        raises KeyError. A word with df == 0 gets 0.0 instead of raising
        ZeroDivisionError. An empty ``documents`` list yields ``{}``.
    """
    import math
    N = len(documents)

    # Document frequency: in how many documents each word has a positive count.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, val in docFreq.items():
        # A word that never occurs carries no information; avoid dividing by zero.
        idfDict[word] = math.log(N / float(val)) if val > 0 else 0.0
    return idfDict

In [73]:
idfs = computeIDF([numOfWordsA, numOfWordsB])

In [74]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the term's inverse document frequency.

    Args:
        tfBagOfWords: mapping word -> term frequency for one document.
        idfs: mapping word -> inverse document frequency; must contain every
            key of ``tfBagOfWords`` (a missing key raises KeyError).

    Returns:
        dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

In [75]:
# TF-IDF weights for both documents from the hand-written helpers.
tfidfA, tfidfB = [computeTFIDF(tf, idfs) for tf in (tfA, tfB)]

# One row per document, one column per vocabulary word.
df = pd.DataFrame([tfidfA, tfidfB])
df

Unnamed: 0,Berapa,Untuk,sekitar,pengiriman,saya?,1-2,lama,waktu,pesanan,Estimasi,hari
0,0.115525,0.0,0.0,0.0,0.115525,0.0,0.115525,0.115525,0.115525,0.0,0.0
1,0.0,0.115525,0.115525,0.0,0.0,0.115525,0.0,0.0,0.0,0.115525,0.115525


In [76]:
# Same two documents, vectorised with scikit-learn for comparison.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Densify the sparse matrix so it can be shown as a labelled DataFrame.
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df



Unnamed: 0,berapa,estimasi,hari,lama,pengiriman,pesanan,saya,sekitar,untuk,waktu
0,0.42616,0.0,0.0,0.42616,0.303216,0.42616,0.42616,0.0,0.0,0.42616
1,0.0,0.471078,0.471078,0.0,0.335176,0.0,0.0,0.471078,0.471078,0.0


In [77]:
import numpy as np

def cosine_similarity(x, y):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        x, y: equal-length 1-D sequences of numbers (NumPy arrays, lists or
            tuples).

    Returns:
        ``dot(x, y) / (||x|| * ||y||)`` as a float; ``0.0`` when either vector
        has zero magnitude (instead of NaN from a 0/0 division); ``None`` when
        the two inputs differ in length.
    """
    # Convert up front so plain Python lists work too (list ** 2 raises TypeError).
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Ensure length of x and y are the same
    if len(x) != len(y):
        return None

    # Compute the L2 norms (magnitudes) of x and y
    magnitude_x = np.sqrt(np.sum(x ** 2))
    magnitude_y = np.sqrt(np.sum(y ** 2))
    denominator = magnitude_x * magnitude_y

    # A zero vector has no direction; report no similarity rather than NaN.
    if denominator == 0:
        return 0.0

    # Dot product normalised by the magnitudes gives the cosine similarity.
    return np.dot(x, y) / denominator

In [78]:
corpus = [ "Bagaimana saya menerima pengembalian dana dan berapa lama estimasti pengembalian dana sampai masuk ke dalam rekening saya kembali",
          "Syarat syarat apa saja untuk mengajukan pengembalian dana agar pengembalian dana disetujui dan berapa lama proses pengembalian dana tersebut",
          "Berapa lama proses untuk pengembalian dana"]

In [79]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words representation of the corpus: one row per document, one column
# per vocabulary term, each cell holding that term's raw count.
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(corpus).toarray()

print(X)

[[0 0 1 1 1 1 2 0 1 1 1 1 1 1 0 2 0 1 0 1 2 0 0 0]
 [1 1 0 1 0 1 3 1 0 0 0 1 0 0 1 3 1 0 1 0 0 2 1 1]
 [0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1]]


In [80]:
# Count vectors of the three corpus documents.
doc_1, doc_2, doc_3 = X[0, :], X[1, :], X[2, :]

# Similarity for every pair of documents.
cos_sim_1_2 = cosine_similarity(doc_1, doc_2)
cos_sim_1_3 = cosine_similarity(doc_1, doc_3)
cos_sim_2_3 = cosine_similarity(doc_2, doc_3)

print('Cosine Similarity between: ')
for first, second, score in ((1, 2, cos_sim_1_2),
                             (1, 3, cos_sim_1_3),
                             (2, 3, cos_sim_2_3)):
    print(f'\tDocument {first} and Document {second}: ', score)

Cosine Similarity between: 
	Document 1 and Document 2:  0.5330017908890262
	Document 1 and Document 3:  0.5000000000000001
	Document 2 and Document 3:  0.7106690545187015


In [81]:
from sklearn.metrics.pairwise import cosine_similarity

In [82]:
# cosine_similarity here is the scikit-learn function imported in the previous
# cell (it shadows the hand-written helper of the same name). Given the first
# two rows of X it returns their full 2x2 pairwise similarity matrix.
cos_sim_1_2 = cosine_similarity(X[:2])

print('Cosine Similarity between Document 1 and Document 2 is \n',cos_sim_1_2 )

Cosine Similarity between Document 1 and Document 2 is 
 [[1.         0.53300179]
 [0.53300179 1.        ]]
