In [None]:
!pip install 

In [2]:
!pip install -q pysastrawi

In [3]:
# Import the libraries used in this project

# Library for Visualization
import matplotlib.pyplot as plt

# Library for preprocessing
import string
import re
import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Library for modelling
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Python module names are case-sensitive: the package is importable as `xgboost`
import xgboost
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from util import JSONParser
from sklearn.model_selection import train_test_split

# Library for model evaluation
from sklearn import metrics

In [4]:
# Load the intents file.
# Raw string: backslashes in a Windows path must not be read as escape sequences.
# NOTE(review): this is an absolute path on one machine; consider a path relative
# to the repository so the notebook runs elsewhere.
path = r"D:\Bootcamp Data Science Batch 011 - Hacktiv8\Final Project\p2---final-project-ftds-012-group-003\data\intents.json"

# Create the project's JSON parser (imported from util)
jp = JSONParser()

# Parse the intents file
jp.parse(path)

# Build the dataframe from the parsed data and keep it in df
df = jp.get_dataframe()

In [5]:
# Display the dataframe built from the intents file (chat_input / intents columns)
df

Unnamed: 0,chat_input,intents
0,Hai,menyapa
1,Halo,menyapa
2,Hello,menyapa
3,Hei,menyapa
4,Oi,menyapa
...,...,...
227,bisa akses dimana,website
228,info lengkap perusahaan dimana min?,website
229,apakah ada webnya,website
230,profile bank nya dimana ya?,website


In [9]:


# Pick a single chat message to walk through the preprocessing steps
sample_chat = df["chat_input"][200]
sample_chat



'Berat ga unduh aplikasinya?'

In [10]:
# Case folding: convert the sample chat to lowercase so that words differing
# only in capitalisation are treated as the same token
chat_lower = sample_chat.lower()
chat_lower

'berat ga unduh aplikasinya?'

In [11]:


# Drop every ASCII punctuation character via a deletion-only translation table
punct_table = str.maketrans("", "", string.punctuation)
chat_punct = chat_lower.translate(punct_table)
chat_punct



'berat ga unduh aplikasinya'

In [12]:
# Replace every character that is not an ASCII letter, whitespace or apostrophe
# (e.g. digits) with a space to make the data cleaner.
# The pattern is a raw string so that `\s` reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape.
chat_punct = re.sub(r"[^A-Za-z\s']", " ", chat_punct)
chat_punct

'berat ga unduh aplikasinya'

In [14]:
# Trim leading/trailing whitespace (spaces, tabs, newlines) from the chat.
# Note: strip() only affects both ends of the string; whitespace inside the
# text is left untouched.
chat_punct = chat_punct.strip()
chat_punct

'berat ga unduh aplikasinya'

In [15]:
# Build the Sastrawi stemmer through its factory
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Stem the cleaned sample chat
output = stemmer.stem(chat_punct)
output

'berat ga unduh aplikasi'

In [16]:
# Text-preprocessing helper applied to every chat message (dataframe rows and live input)

# Translation table that deletes every ASCII punctuation character (built once)
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
# Matches anything that is not an ASCII letter, whitespace or apostrophe
_NON_ALPHA = re.compile(r"[^A-Za-z\s']")
# Default stemmer, created lazily on first use and then shared by all calls
_DEFAULT_STEMMER = None


def _get_default_stemmer():
    """Return the shared Sastrawi stemmer, creating it the first time it is needed."""
    global _DEFAULT_STEMMER
    if _DEFAULT_STEMMER is None:
        _DEFAULT_STEMMER = StemmerFactory().create_stemmer()
    return _DEFAULT_STEMMER


def document_processing(document, stemmer=None):
    """Normalise one chat message before vectorisation.

    Steps, in order: lowercase, delete ASCII punctuation, replace remaining
    non-letter characters (e.g. digits) with spaces, trim leading/trailing
    whitespace, then stem.

    Parameters
    ----------
    document : str
        Raw chat text.
    stemmer : object, optional
        Any object exposing ``stem(text) -> str``. When omitted, a Sastrawi
        stemmer is created once and reused, instead of being rebuilt on
        every call.

    Returns
    -------
    str
        The cleaned, stemmed text.
    """
    # Transform document into lowercase
    document = document.lower()

    # Remove punctuation from document
    document = document.translate(_PUNCT_TABLE)

    # Replace digits and any other non-letter characters with spaces
    document = _NON_ALPHA.sub(" ", document)

    # Trim leading/trailing whitespace (inner whitespace is left as-is)
    document = document.strip()

    # Stemming process
    if stemmer is None:
        stemmer = _get_default_stemmer()
    return stemmer.stem(document)

## 3.2. Data Splitting

In [17]:
# Separate the chat text (features, X) from the intent labels (target, y)
X = df["chat_input"]
y = df["intents"]

In [18]:
# Run the text-preprocessing function over every chat message
X_proc = X.map(document_processing)



## 3.3. Word Vectorization


In [19]:
# Define the text vectorizer (bag-of-words counts) and fit its vocabulary on
# the preprocessed chats.
# NOTE(review): `vect` is fitted on the full dataset (before the train/test
# split) and is not used by the cells visible below -- each model pipeline
# creates its own CountVectorizer. Confirm whether this cell is still needed.
vect = CountVectorizer()
vect.fit(X_proc)

CountVectorizer()

In [20]:
# Hold out 20% of the preprocessed chats as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X_proc,
    y,
    test_size=0.2,
    random_state=4,
)
print(f"Train-Set shape      : {len(X_train),len(y_train)}")
print(f"Test-Set shape       : {len(X_test),len(y_test)}")

Train-Set shape      : (185, 185)
Test-Set shape       : (47, 47)


# IV. Model Training

In [21]:
# Bag-of-words counts fed into a Multinomial Naive Bayes classifier
nb = make_pipeline(CountVectorizer(), MultinomialNB())

# Fit the whole pipeline on the training split
nb.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [22]:
# Random Forest Pipeline
# The pipeline must be bound to `rf`: it is fitted right below and evaluated in
# the Random Forest evaluation cell. RandomForestClassifier is the estimator
# this section (and its recorded output) refers to.
rf = make_pipeline(CountVectorizer(),
                   RandomForestClassifier())

# Training
rf.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('randomforestclassifier', RandomForestClassifier())])

In [23]:
# Bag-of-words counts fed into a Decision Tree classifier
dt = make_pipeline(CountVectorizer(), DecisionTreeClassifier())

# Fit the whole pipeline on the training split
dt.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('decisiontreeclassifier', DecisionTreeClassifier())])

# V. Model Evaluation

## 5.1. Model Evaluation for MultinomialNB

In [24]:
# Predict on both splits with the Multinomial Naive Bayes pipeline
y_train_pred_nb = nb.predict(X_train)
y_test_pred_nb = nb.predict(X_test)
# accuracy_score takes (y_true, y_pred); accuracy is the same either way, but the
# documented order is kept so the call stays correct if another metric is swapped in
print("Train-Set Multinomial Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_train, y_train_pred_nb)*100)
print("Test-Set Multinomial Naive Bayes model accuracy(in %) :", metrics.accuracy_score(y_test, y_test_pred_nb)*100)

Train-Set Multinomial Naive Bayes model accuracy(in %): 97.83783783783784
Test-Set Multinomial Naive Bayes model accuracy(in %) : 76.59574468085107


## 5.2. Model Evaluation for Decision Tree

In [25]:
# Predict on both splits with the Decision Tree pipeline
y_train_pred_dt = dt.predict(X_train)
y_test_pred_dt = dt.predict(X_test)
# accuracy_score takes (y_true, y_pred); accuracy is the same either way, but the
# documented order is kept so the call stays correct if another metric is swapped in
print("Train-Set Decision Tree model accuracy(in %):", metrics.accuracy_score(y_train, y_train_pred_dt)*100)
print("Test-Set Decision Tree model accuracy(in %) :", metrics.accuracy_score(y_test, y_test_pred_dt)*100)

Train-Set Decision Tree model accuracy(in %): 99.45945945945947
Test-Set Decision Tree model accuracy(in %) : 78.72340425531915


## 5.3. Model Evaluation for Random Forest

In [26]:
# Predict on both splits with the Random Forest pipeline
y_train_pred_rf = rf.predict(X_train)
y_test_pred_rf = rf.predict(X_test)
# accuracy_score takes (y_true, y_pred); accuracy is the same either way, but the
# documented order is kept so the call stays correct if another metric is swapped in
print("Train-Set Random Forest model accuracy(in %):", metrics.accuracy_score(y_train, y_train_pred_rf)*100)
print("Test-Set Random Forest model accuracy(in %) :", metrics.accuracy_score(y_test, y_test_pred_rf)*100)

Train-Set Random Forest model accuracy(in %): 99.45945945945947
Test-Set Random Forest model accuracy(in %) : 72.3404255319149


# VI. Building Chatbot

In [27]:
# Interactive chatbot loop driven by the Multinomial Naive Bayes pipeline.

# Minimum predicted probability required before the bot answers with an intent
CONFIDENCE_THRESHOLD = 0.20

print("Anda Terhubung dengan chatbot Kami")
while True:
    # Input from the user
    chat = input("Saya : ")
    # Same preprocessing as the training data
    chat_processed = document_processing(chat)
    # Class probabilities for this single message
    res = nb.predict_proba([chat_processed])
    # Position and value of the most probable intent
    max_idx = np.argmax(res[0])
    max_prob = res[0][max_idx]
    predicted_intent = nb.classes_[max_idx]

    print("Saya :", chat)
    # Unknown input: apologise and keep the conversation open. The closing
    # check below must not run here, otherwise a low-confidence guess of
    # 'menutup' would end the chat right after the bot said it did not understand.
    if max_prob < CONFIDENCE_THRESHOLD:
        print("Bot  : Maaf Kak, aku masih gak ngerti maksud kakak ):")
        continue

    # Confident prediction: reply with the response registered for that intent
    print(f"Bot  : {jp.get_response(predicted_intent)}\n")
    # The "menutup" intent ends the chat
    if predicted_intent == 'menutup':
        break

Anda Terhubung dengan chatbot Kami
