In [None]:
# Importing Packages
import numpy as np
import pandas as pd
import re
import time
from typing import List
from io import open
import unicodedata
import random
import glob
import json
import sys
import os
import nltk
from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Bidirectional
from keras.models import Model, load_model

# Fixed number of word codes per encoder (question) and decoder (answer) sequence.
# transform() truncates longer token lists and leaves zeros after shorter ones.
INPUT_LENGTH = 20
OUTPUT_LENGTH = 20

In [None]:
# Reading the dataset
# The 'Context' (question) and 'Answer' columns of this CSV are used in the cells below.
df = pd.read_csv("final_faq.csv")
# Preview the last 10 rows
df.tail(10)

Unnamed: 0,Context,Answer
469,How long is the incubation period for COVID-19?,The “incubation period” means the time between...
470,What is the state doing to protect our health? \n,California has been actively and extensively p...
471,What is SARS-CoV-2? What is COVID-19?,Severe Acute Respiratory Syndrome Coronavirus-...
472,Where do coronaviruses come from?,Coronaviruses are viruses that circulate among...
473,Is this virus comparable to SARS or to the sea...,The novel coronavirus detected in China is gen...
474,How severe is COVID-19 infection?,Preliminary findings indicate that the mortali...
475,What is the mode of transmission? How (easily)...,While animals are the original source of the v...
476,What are the symptoms ofÂ COVID-19 infection,"The virus can cause mild, flu-like symptoms su..."
477,Are some people more at risk than others?,Generally elderly people and those with underl...
478,Are children also at risk of infection?,Disease in children appears to be relatively r...


In [None]:
# Creating QA Lists
# Small-talk pairs appended after the FAQ rows so the bot can also answer greetings.
# (The original appended to `question` / `answer`, which do not exist on a fresh kernel;
# the lists are named `questions` / `answers`.)
SMALL_TALK = [
    ("Hello", "Hi, I am Covid-Bot"),
    ("How are you ?", "I am Fine, How can I help u"),
    ("Bye", "Good Bye"),
]
questions = list(df['Context']) + [small_q for small_q, _small_a in SMALL_TALK]
answers = list(df['Answer']) + [small_a for _small_q, small_a in SMALL_TALK]

In [None]:
# Text preprocessing
# Ordered (pattern, replacement) pairs applied by clean_text().
# Order matters: the general patterns ("n't", "n'") must only see what the more
# specific ones before them ("won't", "can't") left behind.
_CLEAN_TEXT_SUBSTITUTIONS = (
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"that's", "that is"),
    (r"what's", "what is"),  # was expanded to "that is" by mistake
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"'bout", "about"),
    (r"'til", "until"),
    (r"who's", "who is"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"n'", "ng"),
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    # Strip the listed punctuation characters entirely.
    (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
)


def clean_text(text):
    """Normalise a sentence for the chatbot.

    Lower-cases the text, expands common English contractions, removes the
    punctuation listed in the last substitution and collapses runs of
    whitespace into single spaces.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The cleaned sentence as a ``str``.
    """
    text = text.lower()
    for pattern, replacement in _CLEAN_TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)
    # split()/join collapses any whitespace run and trims both ends.
    text = " ".join(text.split())
    return text

In [None]:
# Cleaning the QA Lists
# Every question and every answer goes through clean_text(); list order is kept.
clean_questions = [clean_text(raw_question) for raw_question in questions]
clean_answers = [clean_text(raw_answer) for raw_answer in answers]

In [None]:
# Number of whitespace-separated words in each cleaned question, then each cleaned answer
word_counts = [len(sentence.split()) for sentence in clean_questions + clean_answers]
# Create a dataframe
lengths = pd.DataFrame(word_counts, columns=['counts'])

In [None]:
# Download NLTK's 'punkt' data; nltk.word_tokenize (used below) presumably depends on it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Tokenizing Q & A (using nltk)
# NOTE(review): this tokenizes the raw `questions` / `answers`, not `clean_questions` /
# `clean_answers`, whereas prediction() runs clean_text() before tokenizing. Confirm whether
# the mismatch is intended before changing it: the vocabulary and word codes below are built
# from these tokens.
Q_tok = [nltk.word_tokenize(sent) for sent in questions]
A_tok = [nltk.word_tokenize(sent) for sent in answers]

In [None]:
# "This is a cooool #smiley: :-) :-P <3 and some arrows < > -> <--"
# ['This', 'is', 'a', 'cooool', '#smiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

In [None]:
#train-validation split: the first 80 % of the pairs train the model, the last 20 % validate it
data_size = len(Q_tok)
split_index = round(data_size*(80/100))

train_questions, val_questions = Q_tok[:split_index], Q_tok[split_index:]

# Question tokens are fed in reverse order (reversing input seq for better performance)
training_input = [list(reversed(tokens)) for tokens in train_questions]
validation_input = [list(reversed(tokens)) for tokens in val_questions]

# Answers keep their natural token order
training_output, validation_output = A_tok[:split_index], A_tok[split_index:]

In [None]:
# Occurrence count of every token, scanning all questions first and then all answers
vocab = {}
for sentence in Q_tok + A_tok:
    for token in sentence:
        vocab[token] = vocab.get(token, 0) + 1

In [None]:
# Minimum number of occurrences a token needs in order to be kept
threshold = 15
# How many distinct tokens reach that threshold
count = sum(1 for frequency in vocab.values() if frequency >= threshold)

In [None]:
WORD_CODE_START = 1
WORD_CODE_PADDING = 0
# Codes 0 and 1 are taken by padding and START, so kept words are numbered from 2 upwards
word_num = 2
# Tokens that appear at least `threshold` times, in vocabulary (first-seen) order
kept_words = [token for token, occurrences in vocab.items() if occurrences >= threshold]
encoding = {token: code for code, token in enumerate(kept_words, start=word_num)}
decoding = {1: 'START'}
decoding.update((code, token) for token, code in encoding.items())
# After numbering, word_num is the first code that has not been handed out yet
word_num += len(kept_words)

print("No. of vocab used:", word_num)

No. of vocab used: 453


In [None]:
# Register the out-of-vocabulary token on the first free code.
# Guarded so that re-running this cell does not hand '<UNK>' a second, larger code
# (len(encoding) grows once '<UNK>' is inserted), which would fall outside dict_size.
if '<UNK>' not in encoding:
    unk_code = len(encoding)+2
    decoding[unk_code] = '<UNK>'
    encoding['<UNK>'] = unk_code

In [None]:
# Number of distinct word codes passed to the Embedding layers and to np.eye() for one-hot
# targets: one more than word_num, leaving room for the code assigned to '<UNK>'.
dict_size = word_num+1

In [None]:
def transform(encoding, data, vector_size=20):
    """Turn tokenised sentences into a fixed-width matrix of word codes.

    Args:
        encoding: Mapping from token to integer code. It must contain the key
            '<UNK>' if any token in ``data`` is missing from the mapping.
        data: Sequence of token lists, one per sentence.
        vector_size: Number of columns per row. Longer sentences are truncated;
            shorter ones keep zeros in the remaining columns.

    Returns:
        A float ``numpy`` array of shape ``(len(data), vector_size)``.

    Raises:
        KeyError: If a token is unknown and ``encoding`` has no '<UNK>' entry.
    """
    transformed_data = np.zeros(shape=(len(data), vector_size))
    for row, tokens in enumerate(data):
        for col, token in enumerate(tokens[:vector_size]):
            try:
                transformed_data[row, col] = encoding[token]
            except KeyError:
                # Only an unknown token falls back to '<UNK>'; any other error
                # (previously hidden by a bare `except:`) now propagates.
                transformed_data[row, col] = encoding['<UNK>']
    return transformed_data

In [None]:
#encoding training set
# Questions become INPUT_LENGTH-wide rows of word codes, answers OUTPUT_LENGTH-wide rows.
encoded_training_input = transform(encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = transform(encoding, training_output, vector_size=OUTPUT_LENGTH)

In [None]:
#encoding validation set
# Same encoding as the training set, applied to the validation split.
encoded_validation_input = transform(encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = transform(encoding, validation_output, vector_size=OUTPUT_LENGTH)

In [None]:
import tensorflow as tf
# Reset Keras' global state before the model graph is built in the following cells.
tf.keras.backend.clear_session()

In [None]:
# Symbolic model inputs: one row of INPUT_LENGTH word codes for the encoder (question)
# and one row of OUTPUT_LENGTH word codes for the decoder (shifted answer).
encoder_input = Input(shape=(INPUT_LENGTH,))
decoder_input = Input(shape=(OUTPUT_LENGTH,))

In [None]:
# NOTE(review): SimpleRNN is imported here but not used in any cell visible in this notebook.
from keras.layers import SimpleRNN

# Encoder: word codes -> 128-dim embeddings (code 0 is treated as a mask via mask_zero=True)
# -> LSTM with 512 units that returns its output at every time step.
encoder = Embedding(dict_size, 128, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input)
encoder = LSTM(512, return_sequences=True, unroll=True)(encoder)
# Encoder output at the final time step; used below to initialise the decoder.
encoder_last = encoder[:,-1,:]
# Decoder: same embedding/LSTM layout as the encoder. The same `encoder_last` tensor is
# passed for both entries of the decoder LSTM's initial_state.
decoder = Embedding(dict_size, 128, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input)
decoder = LSTM(512, return_sequences=True, unroll=True)(decoder, initial_state=[encoder_last, encoder_last])

In [None]:
from keras.layers import Activation, dot, concatenate

# Attention scores: dot product over the feature axis between every decoder step and every
# encoder step (printed shape below: (None, 20, 20)).
attention = dot([decoder, encoder], axes=[2, 2])
# Normalise the scores with a softmax (presumably over the last axis, i.e. the encoder steps).
attention = Activation('softmax', name='attention')(attention)
print('attention', attention)
# Context vector per decoder step: attention-weighted combination of the encoder outputs.
context = dot([attention, encoder], axes=[2,1])
# Join each context vector with the decoder output of the same step.
decoder_combined_context = concatenate([context, decoder])
# Per-step classifier: tanh projection to 512 units, then softmax over the dict_size word codes.
output = TimeDistributed(Dense(512, activation="tanh"))(decoder_combined_context)
output = TimeDistributed(Dense(dict_size, activation="softmax"))(output)
print('output', output)

attention KerasTensor(type_spec=TensorSpec(shape=(None, 20, 20), dtype=tf.float32, name=None), name='attention/Softmax:0', description="created by layer 'attention'")
output KerasTensor(type_spec=TensorSpec(shape=(None, 20, 454), dtype=tf.float32, name=None), name='time_distributed_1/Reshape_1:0', description="created by layer 'time_distributed_1'")


### Model Summary

In [None]:
# Build the trainable model from the two inputs to the per-step word distribution.
model = Model(inputs=[encoder_input, decoder_input], outputs=[output])
# The output layer is a softmax over dict_size classes and the targets are one-hot rows
# (np.eye(dict_size)[...]), i.e. one correct class per time step. The matching loss is
# categorical cross-entropy; binary cross-entropy treats every class as an independent
# yes/no decision and does not fit a softmax output.
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 128)      58112       input_9[0][0]                    
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 20, 512)      1312768     embedding[0][0]                  
______________________________________________________________________________________________

In [None]:
def _decoder_arrays(encoded_output):
    """Return (decoder input, one-hot decoder target) for a matrix of answer word codes.

    The decoder input is the answer shifted one step to the right with
    WORD_CODE_START in the first column; the target is the unshifted answer
    expanded to one-hot rows of width dict_size.
    """
    shifted = np.zeros_like(encoded_output)
    shifted[:, 0] = WORD_CODE_START
    shifted[:, 1:] = encoded_output[:, :-1]
    one_hot = np.eye(dict_size)[encoded_output.astype('int')]
    return shifted, one_hot


training_encoder_input = encoded_training_input
training_decoder_input, training_decoder_output = _decoder_arrays(encoded_training_output)

validation_encoder_input = encoded_validation_input
validation_decoder_input, validation_decoder_output = _decoder_arrays(encoded_validation_output)

### Loading the model 

In [None]:
from keras import models
# Path of the saved Keras model (HDF5 file), relative to the working directory.
m = "model_topic3.h5"
# `mod` is the model that prediction() calls; the `model` compiled above is not used there.
mod = models.load_model(m)

### Predicting the result

In [None]:
def prediction(raw_input):
    """Greedily decode a reply for one user message with the loaded model ``mod``.

    The message is cleaned with clean_text(), tokenised, reversed (as is done
    for ``training_input``) and encoded. The decoder input starts with
    WORD_CODE_START and is then filled in one position per model call.

    Args:
        raw_input: The user's message as a ``str``.

    Returns:
        The arg-max word codes of the last model call, one row per input
        sentence (a single row here); decode() turns a row back into text.
    """
    clean_input = clean_text(raw_input)
    input_tok = [nltk.word_tokenize(clean_input)[::-1]]  # reversing input seq
    # Use the shared constant instead of a hard-coded 20 so it cannot drift from the model input.
    encoder_input = transform(encoding, input_tok, INPUT_LENGTH)
    decoder_input = np.zeros(shape=(len(encoder_input), OUTPUT_LENGTH))
    decoder_input[:, 0] = WORD_CODE_START
    for i in range(1, OUTPUT_LENGTH):
        output = mod.predict([encoder_input, decoder_input]).argmax(axis=2)
        # Training shifts the answer right by one (decoder input at step t is the target at
        # step t-1), so the token fed at position i is the prediction made for position i-1.
        # The original fed output[:, i], which was predicted from a still-empty position i.
        decoder_input[:, i] = output[:, i - 1]
    return output

def decode(decoding, vector):
    """Convert a sequence of word codes back into text.

    Codes are looked up in ``decoding`` until the first 0 (padding) is met.
    Every word is preceded by a single space, so a non-empty result starts
    with a space and an empty or immediately padded vector yields ''.
    """
    words = []
    for code in vector:
        if code == 0:
            break
        words.append(decoding[code])
    return ''.join(' ' + word for word in words)

In [None]:
# Use the %pip magic so the package is installed into the environment of the running kernel
# (a bare `pip install` only works through IPython's automagic).
%pip install flask_ngrok



### Connecting Backend to Frontend using Flask Framework
(A `home.html` file takes the user's text from the front end; the same string is then passed to the model, the output is predicted from the semantic similarity model, and the resulting string is shown in the front end.)

In [None]:
from flask import Flask
from flask import Flask, request, render_template
from flask_ngrok import run_with_ngrok
# The Flask application object that the routes below are registered on.
app = Flask(__name__)


# Make app.run() also open an ngrok tunnel (the captured log shows a public *.ngrok.io URL).
run_with_ngrok(app)
@app.route("/")
def home():
    """Serve the chat page by rendering the `home.html` template."""
    return render_template("home.html")

@app.route("/get")
def get_bot_response():
    """Answer one chat message passed in the `msg` query parameter.

    Returns:
        The decoded model reply as a string, or "Don't Know" when the message
        is missing or blank, or when the model decodes to no words at all.
    """
    userText = request.args.get('msg')
    # A missing or blank message cannot be answered, and None would crash clean_text().
    if userText is None or not userText.strip():
        return str("Don't Know")
    output = prediction(userText)
    # prediction() returns an array of word codes, so the emptiness check has to be done on
    # the decoded text. The original compared the array itself with string literals.
    reply = decode(decoding, output[0])
    if not reply.strip():
        return str("Don't Know")
    return str(reply)
 

# Start the Flask server, but only when this code runs as the main module.
if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://12dd39557d02.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [22/Jul/2021 04:34:52] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [22/Jul/2021 04:34:54] "[37mGET / HTTP/1.1[0m" 200 -
  app.launch_new_instance()
127.0.0.1 - - [22/Jul/2021 04:35:13] "[37mGET /get?msg=Why%20corona%3F HTTP/1.1[0m" 200 -
127.0.0.1 - - [22/Jul/2021 04:41:41] "[37mGET /get?msg=What%20are%20symptoms%20of%20corona%20virs%3F HTTP/1.1[0m" 200 -
127.0.0.1 - - [22/Jul/2021 04:42:57] "[37mGET /get?msg=What%20about%20animals%20or%20pets%20%3F%20How%20it%20is%20affecting%20them%3F HTTP/1.1[0m" 200 -
127.0.0.1 - - [22/Jul/2021 04:43:26] "[37mGET /get?msg=What%20about%20animals%20or%20pets HTTP/1.1[0m" 200 -
