## 1. Import Library

In [1]:
# import library
import string
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from util import JSONParser

## 2. Load Data

In [2]:
# load data, and change data format from json to dataframe

# devine data path
path = "data/intents.json"

# buat objek JSONParser dan parse data intents.json
parser = JSONParser()
parser.parse(path)

# simpan dataframe dalam variabel df
data = parser.get_dataframe()

[INFO] Data JSON converted to DataFrame with shape : (107, 2)


In [3]:
data

Unnamed: 0,text_input,intents
0,Hai,salam
1,Hi,salam
2,Halo,salam
3,Apa Kabar,salam
4,Mau bertanya,salam
...,...,...
102,error,qna
103,kalau mau tanya-tanya kemana ya?,qna
104,kalau mau nanya kodingan kemana ya?,qna
105,bisa bantu kodingan ga?,qna


In [4]:
# hitung jumlah data per tag / inten
data.intents.value_counts()

website          15
terimakasih      12
creator          11
salam_malam       9
salam             9
salam_pagi        8
salam_assalam     8
bye               8
salam_siang       7
nama              6
pekerjaan         5
qna               5
jofi              4
Name: intents, dtype: int64

## 3. Preprocessing

### 3.1 Data preprocessing

In [5]:
def preprocess(chat):
    # konversi ke lowercase
    chat = chat.lower()
    # menghapus tanda baca
    tandabaca = tuple(string.punctuation)
    chat = ''.join(ch for ch in chat if ch not in tandabaca)
    return chat

In [6]:
# implementasikan fungsi preprocess ke string
data['input_after_prep'] = data.text_input.apply(preprocess)

In [7]:
data[['text_input', 'input_after_prep']].head(10)

Unnamed: 0,text_input,input_after_prep
0,Hai,hai
1,Hi,hi
2,Halo,halo
3,Apa Kabar,apa kabar
4,Mau bertanya,mau bertanya
5,Permisi,permisi
6,Salam,salam
7,Ping,ping
8,P,p
9,Assalamualaikum,assalamualaikum


### 3.1 Vektorisasi

In [8]:
# inisiasi objek CountVectorizer
vect = CountVectorizer()

In [9]:
# mengumpulkan vocab dari data teks yang sudah dilakukan praproses
vect.fit(data['input_after_prep'])

CountVectorizer()

In [10]:
# lihat list vocab
vect.get_feature_names()[:5] #batasi hanya 5 vocab teratas



['ada', 'afternoon', 'air', 'apa', 'apaan']

In [11]:
# ubah data teks menjadi matriks
text_vect = vect.transform(data.input_after_prep)

text_vect

<107x127 sparse matrix of type '<class 'numpy.int64'>'
	with 207 stored elements in Compressed Sparse Row format>

In [12]:
pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names())

Unnamed: 0,ada,afternoon,air,apa,apaan,apakah,apps,asalamualaikum,assalamualaikum,bagus,...,warahmatullahi,wb,web,webnya,website,websitenya,wr,ya,yang,you
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 4. Modelling

In [13]:
# deklarasi objek MultinomialNB
nb = MultinomialNB()

# training data, dengan X : text_vect dan y : intents
nb.fit(text_vect, data.intents)

MultinomialNB()

In [14]:
# Deklarasi pipeline yang mengandung vektorisasi (CountVectorizer) & pemodelan (MultinomialNB) 
pipe = make_pipeline(CountVectorizer(),
                     MultinomialNB())

# Training
pipe.fit(data.text_input, data.intents)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [15]:
# input string dari user
chat = input("Masukkan String : ")
# lakukan preproses
chat = preprocess(chat)

# prediksi teks kedalam pipeline
result = pipe.predict_proba([chat])

# ambil nilai probabilitas tertinggi
max_prob = max(result[0])
max_idx = np.argmax(result[0])
print(f"Max Prob : {max_prob}\nMax Index: {max_idx}\nLabel: {nb.classes_[max_idx]}")

Max Prob : 0.16235853524281035
Max Index: 6
Label: salam


## 5. Inference

In [16]:
print("Selamat Datang di JoFi Chatbot ")
while True:
    # input user
    chat = input("Anda : ")
    # praproses
    chat = preprocess(chat)
    # prediksi intent
    res = pipe.predict_proba([chat])
    # ambil nilai probabilitas & lokasinya
    max_prob = max(res[0])
    max_idx = np.argmax(res[0])
    # kondisi jika probabilitas kurang dari threshold
    if max_prob < 0.1:
        print("Bot : Maaf Kak, bisa pakai kata lain, aku ga ngerti :(")
    else:
        print(f"Bot : {parser.get_response(nb.classes_[max_idx])}")
    if nb.classes_[max_idx] == 'bye':
        break
    

Anda Terhubung dengan chatbot Kami
Bot : Hai
Bot : Waalaikumsalam kak
Bot : JoFi di summon oleh Kak Marwan dan Kak Rio
Bot : Untuk travel recomendationnya, bisa kunjungi website kami ya di https:
Bot : Kalau mau nanya lebih lanjut bisa ke github Kak Marwan : https://github.com/marwanmusa atau Kak Rio :https://github.com/rioarmiga
Bot : Untuk travel recomendationnya, bisa kunjungi website kami ya di https:


KeyboardInterrupt: Interrupted by user

In [None]:
with open("chatbot_model.pkl", "wb") as f:
    pickle.dump(pipe, f)