In [1]:
#Import libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

In [2]:
#function for loading the dataset
def load_dataset(filename):
    df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
    print(df.head())
    intent = df["Intent"]
    unique_intent = list(set(intent))
    sentences = list(df["Sentence"])

    return (intent, unique_intent, sentences)

In [3]:
#loading the dataset
intent, unique_intent, sentences = load_dataset("chatbot_data.csv")

               Sentence    Intent
0                 Hello  greeting
1                   Hi!  greeting
2            Greetings!  greeting
3  Hi, how is it going?  greeting
4    How are you doing?  greeting


In [4]:
#printing top 5 rows
print(sentences[:5])

['Hello', 'Hi!', 'Greetings!', 'Hi, how is it going?', 'How are you doing?']


In [5]:
#data cleaning
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        #stemming
        words.append([i.lower() for i in w])

    return words

In [6]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])

31
[['hello'], ['hi']]


In [7]:
#filtering the data
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    token = Tokenizer(filters = filters)
    token.fit_on_texts(words)
    return token

In [8]:
def max_length(words):
    return(len(max(words, key = len)))

In [9]:
#counting Vocab Size and Maximum length
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 51 and Maximum length = 6


In [10]:
# encoding function
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [11]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [12]:
# padding function
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [13]:
padded_doc = padding_doc(encoded_doc, max_length)

In [14]:
padded_doc[:5]

array([[24,  0,  0,  0,  0,  0],
       [10,  0,  0,  0,  0,  0],
       [25,  0,  0,  0,  0,  0],
       [10,  1,  3, 26, 27,  0],
       [ 1,  8, 11, 28,  0,  0]])

In [15]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (31, 6)


In [16]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [17]:
output_tokenizer.word_index

{'goodbye': 1,
 'quarantine': 2,
 'vaccine': 3,
 'avoid': 4,
 'distancing': 5,
 'info': 6,
 'symptoms': 7,
 'greeting': 8,
 'help': 9}

In [18]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [19]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [20]:
encoded_output.shape

(31, 1)

In [21]:
# one hot encoding
def one_hot(encode):
    o = OneHotEncoder(sparse = False)
    return(o.fit_transform(encode))

In [22]:
output_one_hot = one_hot(encoded_output)

In [23]:
#Shape of one hot output
output_one_hot.shape 

(31, 9)

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
#spliting the data into train and test
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)

In [26]:
#shape of train and test data
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (24, 6) and train_Y = (24, 9)
Shape of val_X = (7, 6) and val_Y = (7, 9)


In [27]:
#defining the model
def create_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))#input layer
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(32, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(9, activation = "softmax"))#output layer
    return model

In [28]:
#presentation of the model
model = create_model(vocab_size, max_length)
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6, 128)            6528      
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 9)                 297       
Total params: 278,217
Trainable params: 271,689
Non-trainable params: 6,528
_________________________________________________________________


In [29]:
#running the model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.20197, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss did not improve from 2.20197
Epoch 3/100

Epoch 00003: val_loss did not improve from 2.20197
Epoch 4/100

Epoch 00004: val_loss did not improve from 2.20197
Epoch 5/100

Epoch 00005: val_loss did not improve from 2.20197
Epoch 6/100

Epoch 00006: val_loss did not improve from 2.20197
Epoch 7/100

Epoch 00007: val_loss did not improve from 2.20197
Epoch 8/100

Epoch 00008: val_loss did not improve from 2.20197
Epoch 9/100

Epoch 00009: val_loss did not improve from 2.20197
Epoch 10/100

Epoch 00010: val_loss did not improve from 2.20197
Epoch 11/100

Epoch 00011: val_loss did not improve from 2.20197
Epoch 12/100

Epoch 00012: val_loss did not improve from 2.20197
Epoch 13/100

Epoch 00013: val_loss did not improve from 2.20197
Epoch 14/100

Epoch 00014: val_loss did not improve from 2.20197
Epoch 15/100

Epoch 00015: val_loss did not improve from 2.20197
Epoch 16


Epoch 00042: val_loss did not improve from 2.20197
Epoch 43/100

Epoch 00043: val_loss did not improve from 2.20197
Epoch 44/100

Epoch 00044: val_loss did not improve from 2.20197
Epoch 45/100

Epoch 00045: val_loss did not improve from 2.20197
Epoch 46/100

Epoch 00046: val_loss did not improve from 2.20197
Epoch 47/100

Epoch 00047: val_loss did not improve from 2.20197
Epoch 48/100

Epoch 00048: val_loss did not improve from 2.20197
Epoch 49/100

Epoch 00049: val_loss did not improve from 2.20197
Epoch 50/100

Epoch 00050: val_loss did not improve from 2.20197
Epoch 51/100

Epoch 00051: val_loss did not improve from 2.20197
Epoch 52/100

Epoch 00052: val_loss did not improve from 2.20197
Epoch 53/100

Epoch 00053: val_loss did not improve from 2.20197
Epoch 54/100

Epoch 00054: val_loss did not improve from 2.20197
Epoch 55/100

Epoch 00055: val_loss did not improve from 2.20197
Epoch 56/100

Epoch 00056: val_loss did not improve from 2.20197
Epoch 57/100

Epoch 00057: val_loss di


Epoch 00083: val_loss did not improve from 1.95154
Epoch 84/100

Epoch 00084: val_loss did not improve from 1.95154
Epoch 85/100

Epoch 00085: val_loss did not improve from 1.95154
Epoch 86/100

Epoch 00086: val_loss did not improve from 1.95154
Epoch 87/100

Epoch 00087: val_loss improved from 1.95154 to 1.93838, saving model to model.h5
Epoch 88/100

Epoch 00088: val_loss did not improve from 1.93838
Epoch 89/100

Epoch 00089: val_loss did not improve from 1.93838
Epoch 90/100

Epoch 00090: val_loss did not improve from 1.93838
Epoch 91/100

Epoch 00091: val_loss improved from 1.93838 to 1.90312, saving model to model.h5
Epoch 92/100

Epoch 00092: val_loss improved from 1.90312 to 1.85950, saving model to model.h5
Epoch 93/100

Epoch 00093: val_loss improved from 1.85950 to 1.82025, saving model to model.h5
Epoch 94/100

Epoch 00094: val_loss improved from 1.82025 to 1.80528, saving model to model.h5
Epoch 95/100

Epoch 00095: val_loss improved from 1.80528 to 1.79232, saving model 

In [30]:
model = load_model("model.h5")

In [31]:
def predictions(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = word_tokenizer.texts_to_sequences(test_word)
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))

    test_ls = np.array(test_ls).reshape(1, len(test_ls))

    x = padding_doc(test_ls, max_length)

    pred = model.predict_proba(x)


    return pred

In [32]:
def get_final_output(pred, classes):
    predictions = pred[0]

    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)

    for i in range(pred.shape[1]):
        print("%s has confidence = %s" % (classes[i], (predictions[i])))

In [33]:
text = "Information about coronavirus"
pred = predictions(text)
get_final_output(pred, unique_intent)



info has confidence = 0.8082617
help has confidence = 0.11911392
goodbye has confidence = 0.034690622
quarantine has confidence = 0.011257252
symptoms has confidence = 0.009037312
vaccine has confidence = 0.007104862
distancing has confidence = 0.0061440477
avoid has confidence = 0.0039669257
greeting has confidence = 0.00042336129


In [34]:
#defining all the intents
def action(intent):
    case0 = "greeting"
    case1 = "info"
    case2 = "symptoms"
    case3 = "vaccine"
    case4 = "avoid"
    case5 = "help"
    case6 = "distancing"
    case7 = "quarantine"
    case8 = "goodbye"
    
    if intent == case0: #defining intente values
        ans = "\nHello! How can I help you?"
    elif intent == case1:
        ans = "\nCoronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus."
    elif intent == case2:
        ans = "\nMost common symptoms: fever, dry cough, tiredness \n Less common symptoms: aches and pains ,sore throat, diarrhoea, conjunctivitisheadache, loss of taste or smell, a rash on skin, or discolouration of fingers or toes \n Serious symptoms: difficulty breathing or shortness of breathchest pain or pressure, loss of speech or movement"
    elif intent == case3:
        ans = "\nAstraZeneca, Novavax, Covishield, Covaxin"
    elif intent == case4:
        ans = "\nWear a mask, Clean your hands, Keep a safe distance"
    elif intent == case5:
        ans = "\nIf you develop symptoms, self-isolate right away. have new or worsening symptoms; self-isolate right away and refer to the Ministry of Health COVID-19 self-assessment tool for further direction."
    elif intent == case6:
        ans = "\nIn public health, social distancing, also called physical distancing, is a set of non-pharmaceutical interventions or measures intended to prevent the spread of a contagious disease by maintaining a physical distance between people and reducing the number of times people come into close contact with each other."
    elif intent == case7:
        ans = "\nA quarantine is a restriction on the movement of people, animals and goods which is intended to prevent the spread of disease or pests. "
    elif intent == case8:
        ans = "\nBye, Stay safe :)"
    return(ans)

In [35]:
#intent index
output_tokenizer.word_index

{'goodbye': 1,
 'quarantine': 2,
 'vaccine': 3,
 'avoid': 4,
 'distancing': 5,
 'info': 6,
 'symptoms': 7,
 'greeting': 8,
 'help': 9}

In [36]:
#predictions
predictions(text)[0]

array([3.4690622e-02, 1.1257252e-02, 7.1048620e-03, 3.9669257e-03,
       6.1440477e-03, 8.0826169e-01, 9.0373121e-03, 4.2336129e-04,
       1.1911392e-01], dtype=float32)

In [37]:
#output_tokenizer index
output_tokenizer.word_index

{'goodbye': 1,
 'quarantine': 2,
 'vaccine': 3,
 'avoid': 4,
 'distancing': 5,
 'info': 6,
 'symptoms': 7,
 'greeting': 8,
 'help': 9}

In [38]:
intent_list = list(output_tokenizer.word_index)

In [39]:
intent_list

['goodbye',
 'quarantine',
 'vaccine',
 'avoid',
 'distancing',
 'info',
 'symptoms',
 'greeting',
 'help']

In [40]:
def findLargest(text):
    largest = 0
    largest_intent = 0
    probability_list = predictions(text)
    for i in range(0, len(intent_list)):
        if probability_list[0][i] > largest:
            largest = probability_list[0][i]
            largest_intent = i
    return largest_intent

In [41]:
findLargest(text)

5

In [42]:
likely_intent = intent_list[findLargest(text)]

In [43]:
action(likely_intent)

'\nCoronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus.'

# GUI

In [44]:
from tkinter import*

In [45]:
#creating chatbot GUI
root=Tk()
def send():
    send="You => "+e.get()
    txt.insert(END,"\n"+send)
    intent_index = findLargest(str(e.get()))
    e.delete(0,END)
    likely_intent = intent_list[intent_index]
    receive="Chatbot => "+action(likely_intent)
    txt.insert(END,"\n"+receive)

In [46]:
#passing chatbot intents to GUI
txt=Text(root)
txt.grid(row=0,column=0,columnspan=2)
e=Entry(root,width=100)
send=Button(root,text="Send",command = send).grid(row=1,column=1)
e.grid(row=1,column=0)
root.title("CHATBOT")
root.mainloop()