# Imports

In [None]:
import random # To generate random responses
import string # For removing punctuation - data preprocessing
import numpy as np # Handling arrays
import io # Provides Python’s main facilities for dealing with various types of I/O - text, binary and raw I/O
from sklearn.feature_extraction.text import TfidfVectorizer # Data encoding
from sklearn.metrics.pairwise import cosine_similarity # Similarity-based response generation
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.stem import WordNetLemmatizer # For Data preprocessing

# Tokenization

In [4]:
def tokenize():
    file = open('corpus.txt','r', errors = 'ignore')
    corpus = file.read()
    sentence_tokens = nltk.sent_tokenize(corpus)
    word_tokens = nltk.word_tokenize(corpus)
    return sentence_tokens, word_tokens
    #print(corpus)

In [5]:
tokenize()

(['Artificial intelligence or AI is intelligence perceiving, synthesizing, and inferring informationn demonstrated by machines, as opposed to intelligence displayed by non human animals and humans.',
  'Example tasks in which this is done include speech recognition, computer vision, translation between natural languages, as well as other mappings of inputs.',
  'AI applications include advanced web search engines e.g., Google Search, recommendation systems used by YouTube, Amazon and Netflix, understanding human speech such as Siri and Alexa, self-driving cars e.g., Waymo, automated decision-making and competing at the highest level in strategic game systems such as chess and Go.',
  'As machines become increasingly capable, tasks considered to require intelligence are often removed from the definition of AI, a phenomenon known as the AI effect.',
  'For instance, optical character recognition is frequently excluded from things considered to be AI, having become a routine technology.',

# Lemmatization

In [6]:
lemmatizer = WordNetLemmatizer()

In [7]:
def lemTokens(tokens):
    lst = []
    for i in tokens: #Every individual token has to be lemmatized
        lst.append(lemmatizer.lemmatize(i))
    return lst

# Normalization
###### Noise removal of special characters - punctuation marks...

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
punct = dict((ord(i), None) for i in string.punctuation)
# ord -> the inbuilt function in python for returning Unicode value of a corresponding character.
# Each value is replaced with "None"

In [10]:
punct

{33: None,
 34: None,
 35: None,
 36: None,
 37: None,
 38: None,
 39: None,
 40: None,
 41: None,
 42: None,
 43: None,
 44: None,
 45: None,
 46: None,
 47: None,
 58: None,
 59: None,
 60: None,
 61: None,
 62: None,
 63: None,
 64: None,
 91: None,
 92: None,
 93: None,
 94: None,
 95: None,
 96: None,
 123: None,
 124: None,
 125: None,
 126: None}

In [11]:
chr(126)

'~'

In [12]:
#Tokenization and lemmatization
def lemmer(text):
    tokenized_text = nltk.word_tokenize(text.lower().translate(punct)) #Tokenization of text and Removal of punctuation
    lemmatized_values = lemTokens(tokenized_text) # Calling the lemTokens function
    return lemmatized_values

In [13]:
#Test of lemmer function
lemmer("I hav$$$$e a hea**dache")

['i', 'have', 'a', 'headache']

# Greeting - Implementation

In [17]:
greeting_inputs = ['hi','hello','hey', 'greetings']
greeting_responses = ['Hi, My name is ChatBot. How may I help you?', 'Hey, My name is ChatBot. How may I help you?','Hello, My name is ChatBot. How may I help you?','Greetings, My name is ChatBot. How may I help you?']

In [18]:
def greeting(text):
    for token in text.split():
        if token.lower() in greeting_inputs:
            return random.choice(greeting_responses)

In [19]:
#Testing the greeting function
greeting("Hello I need some help in JavaScript")

'Hi, My name is ChatBot. How may I help you?'

In [20]:
# Limitation - If we type an input "Hello," like a greeting then followed comma, the chatbot won't recognize it as a greeting
greeting("Hello, I need some help in Javacript")

# Responses - Implementation

In [22]:
def respond(user_query):
    bot_response = ''

    # Tokenize the user query
    sent_tokens, word_tokens = tokenize()
    sent_tokens.append(user_query)

    # Data encoding -> Converting sent_tokens to vectors
    tfidf_obj = TfidfVectorizer(tokenizer = lemmer, stop_words = "english" )
    tfidf = tfidf_obj.fit_transform(sent_tokens)


    # Cosine similarity
    # In this case tfidf = [t1, 12, t3, t4, user_query]
    # Get the last element (user_query) which is in index [-1] and compare it with the entire list (t1, t2, t3, t4....)

    sim_values = cosine_similarity(tfidf[-1],tfidf) # Cosine similarity of the last element with the entire list

    # Selecting the token with masimum similarity value
    # -2 means the second last response -> because the last response ([-1]) is the user query
    index = sim_values.argsort()[0][-2] # Sorting values to give the index


    flattened_sim = sim_values.flatten() #Flattening the sim_values to make them one dimensional
    flattened_sim.sort() # Sorting flattened sim_values

    required_tfidf = flattened_sim[-2]

    if(required_tfidf ==0):
        bot_response += "Sorry, I cannot understand your question. I can only respond to questions in my corpus"
        return bot_response
    else:
        bot_response += bot_response + sent_tokens[index]
        return bot_response

In [None]:
print('CHATBOT')
flag = 1

while (flag == 1):
    user_query = input()
    user_query = user_query.lower()

    # If user wants to exit
    if(user_query == 'exit'):
        flag = 0
        print('ChatBot : Bye! I\'m glad to be of assistance to you:)')

    else:
        # If user enters a greeting
        if(greeting(user_query)!= None):
            print("Bot: " +greeting(user_query))
        else:
            res = respond(user_query)
            print("Chatbot: ", res)