# Building a retrieval-based chatbot

In [1]:
import nltk
import numpy as np
import random
import string

import bs4 as bs
import requests
import re

import warnings
# Silence library warnings for the rest of the notebook.
# NOTE: `filterwarnings` is a function and must be *called*; assigning to it
# would replace the function object and suppress nothing.
warnings.filterwarnings('ignore')


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Download the NLTK data packages 'punkt' and 'wordnet'; the tokenizing and
# lemmatizing cells further down call into nltk and need this data locally.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\intel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\intel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Gathering Data from Wikipedia

In [4]:
# Download the Wikipedia article that serves as the chatbot's knowledge base.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# building the corpus from an error page.
r = requests.get('https://en.wikipedia.org/wiki/Machine_learning', timeout=30)
r.raise_for_status()
raw_html = r.text

In [5]:
# Parse the page. The parser is named explicitly so that the result does not
# depend on which optional parsers happen to be installed (and so that
# BeautifulSoup does not warn about having to guess one).
corpus_html = bs.BeautifulSoup(raw_html, 'html.parser')

# Only the <p> elements carry the article's running text.
corpus_paras = corpus_html.find_all('p')

# Concatenate the text of all paragraphs in document order, then lower-case it.
corpus_text = ''.join(para.text for para in corpus_paras)
corpus_text = corpus_text.lower()

In [6]:
# Inspect the lower-cased paragraph text (still contains citation markers like "[1]").
corpus_text

'machine learning (ml) is the study of computer algorithms that can improve automatically through experience and by the use of data.[1] it is seen as a part of artificial intelligence. machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.[2] machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.[3]\na subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers; but not all machine learning is statistical learning. the study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. data mining is a related field of study, focusing on exploratory data analy

In [7]:
# Replace bracketed numeric citation markers such as "[1]" with a space, then
# collapse every run of whitespace (including newlines) into a single space.
corpus_text = re.sub(
    r'\s+',
    ' ',
    re.sub(r'\[[0-9]*\]', ' ', corpus_text),
)

In [8]:
# Word-level tokens of the cleaned corpus text (punctuation marks included).
corpus_word = nltk.word_tokenize(corpus_text)
# Sentence-level split of the same text; these sentences are the candidate
# answers the bot can return.
corpus_sentences = nltk.sent_tokenize(corpus_text)

In [9]:
# Inspect the sentence list produced by nltk.sent_tokenize.
corpus_sentences

['machine learning (ml) is the study of computer algorithms that can improve automatically through experience and by the use of data.',
 'it is seen as a part of artificial intelligence.',
 'machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.',
 'machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.',
 'a subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers; but not all machine learning is statistical learning.',
 'the study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.',
 'data mining is a related field of study, focusing on explora

In [10]:
# Inspect the word tokens produced by nltk.word_tokenize.
corpus_word

['machine',
 'learning',
 '(',
 'ml',
 ')',
 'is',
 'the',
 'study',
 'of',
 'computer',
 'algorithms',
 'that',
 'can',
 'improve',
 'automatically',
 'through',
 'experience',
 'and',
 'by',
 'the',
 'use',
 'of',
 'data',
 '.',
 'it',
 'is',
 'seen',
 'as',
 'a',
 'part',
 'of',
 'artificial',
 'intelligence',
 '.',
 'machine',
 'learning',
 'algorithms',
 'build',
 'a',
 'model',
 'based',
 'on',
 'sample',
 'data',
 ',',
 'known',
 'as',
 'training',
 'data',
 ',',
 'in',
 'order',
 'to',
 'make',
 'predictions',
 'or',
 'decisions',
 'without',
 'being',
 'explicitly',
 'programmed',
 'to',
 'do',
 'so',
 '.',
 'machine',
 'learning',
 'algorithms',
 'are',
 'used',
 'in',
 'a',
 'wide',
 'variety',
 'of',
 'applications',
 ',',
 'such',
 'as',
 'in',
 'medicine',
 ',',
 'email',
 'filtering',
 ',',
 'speech',
 'recognition',
 ',',
 'and',
 'computer',
 'vision',
 ',',
 'where',
 'it',
 'is',
 'difficult',
 'or',
 'unfeasible',
 'to',
 'develop',
 'conventional',
 'algorithms',
 

# Generating greeting responses for a predefined set of inputs

In [11]:
# Words/phrases recognised as a greeting, and the replies to choose from.
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi","hello")
greeting_responses = ("hey", "hey hows you?", "hello how you doing?", "hello", "what can i help you with?", "hi")

def greet_response(greeting):
    """Return a random greeting reply if ``greeting`` contains a greeting word.

    The text is split on whitespace and each token is compared, case-
    insensitively and with surrounding punctuation removed, against
    ``greeting_inputs``. Stripping punctuation lets inputs such as
    "Hello!" or "hi, there" be recognised.

    Parameters
    ----------
    greeting : str
        Raw user input.

    Returns
    -------
    str or None
        A random element of ``greeting_responses`` when a greeting token is
        found, otherwise ``None``.
    """
    for token in greeting.split():
        normalized = token.strip(string.punctuation).lower()
        if normalized in greeting_inputs:
            return random.choice(greeting_responses)
    # No greeting word found; callers test for None to fall back to retrieval.
    return None

# Preprocessing with Punctuation Removal and Lemmatizing

In [12]:
# Shared WordNet lemmatizer instance used by the helpers below.
wn_lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_corpus(tokens):
    """Lemmatize every token with ``wn_lemmatizer`` and return them as a list."""
    return list(map(wn_lemmatizer.lemmatize, tokens))

# Translation table for str.translate: maps the code point of each character
# in string.punctuation to None, i.e. deletes it.
punct_removal_dict = {ord(symbol): None for symbol in string.punctuation}

def get_processed_text(document):
    """Lower-case ``document``, drop punctuation, tokenize, and lemmatize.

    Returns the list of lemmatized word tokens.
    """
    lowered = document.lower()
    without_punctuation = lowered.translate(punct_removal_dict)
    word_tokens = nltk.word_tokenize(without_punctuation)
    return lemmatize_corpus(word_tokens)


# Language Modeling with Tf-Idf

In [13]:
def respond(user_input):
    """Return the corpus sentence most similar to ``user_input``.

    The query is appended to ``corpus_sentences``, every sentence is
    vectorized with TF-IDF (tokenized by ``get_processed_text``), and the
    query's vector is compared by cosine similarity against every *other*
    sentence. Comparing against all rows except the query's own row means the
    query can never be returned as its own best match, even when another
    sentence ties with it.

    Side effect: ``user_input`` is appended to the module-level
    ``corpus_sentences`` and is still there when this function returns; the
    caller is responsible for removing it again.

    Parameters
    ----------
    user_input : str
        The user's query.

    Returns
    -------
    str
        A single space followed by the best-matching sentence, or by a
        fallback message when nothing in the corpus shares a term with the
        query.
    """
    bot_response = ' '
    corpus_sentences.append(user_input)

    # Vectorize corpus + query together so they share one vocabulary.
    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
    corpus_word_vectors = word_vectorizer.fit_transform(corpus_sentences)

    # With only the query present there is nothing to retrieve.
    if corpus_word_vectors.shape[0] < 2:
        return bot_response + "I am sorry, what is it, again?"

    # Last row is the query; all preceding rows are the real corpus sentences.
    query_similarities = cosine_similarity(
        corpus_word_vectors[-1], corpus_word_vectors[:-1]
    ).flatten()
    similar_response_idx = int(query_similarities.argmax())

    # A best score of zero means no term overlap with any sentence.
    if query_similarities[similar_response_idx] <= 0:
        return bot_response + "I am sorry, what is it, again?"

    return bot_response + corpus_sentences[similar_response_idx]

In [14]:
# Interactive chat loop: runs until the user types "quit", "thanks" or
# "thank you".
chat = True
print("Hello, what do you want to learn about Machine Learning Today?")
while chat:
    # Normalise once: surrounding whitespace is dropped so that e.g. "quit "
    # is still recognised, and case is ignored.
    user_query = input("User :").strip().lower()

    if user_query == 'quit':
        chat = False
        print("MLBot : Good Bye!")
    elif user_query in ('thanks', 'thank you'):
        chat = False
        print('MLBot : You are welcome!')
    else:
        # Call greet_response only once; it picks its reply at random.
        greeting_reply = greet_response(user_query)
        if greeting_reply is not None:
            print("MLBot : " + greeting_reply)
        else:
            print("MLBot : ", end=" ")
            try:
                print(respond(user_query))
            finally:
                # respond() appends the query to corpus_sentences. Drop that
                # trailing entry, rather than the first equal element, so a
                # genuine corpus sentence identical to the query is never
                # deleted, and clean up even if respond() raised.
                if corpus_sentences and corpus_sentences[-1] == user_query:
                    corpus_sentences.pop()

Hello, what do you want to learn about Machine Learning Today?
User :hi
MLBot : hi
User :what is machine learning
MLBot :  



 a representative book of the machine learning research during the 1960s was the nilsson's book on learning machines, dealing mostly with machine learning for pattern classification.
User :machine learning algorithms applications
MLBot :   in its application across business problems, machine learning is also referred to as predictive analytics.
User :Hello
MLBot : hey
User :Machine Learning
MLBot :   a representative book of the machine learning research during the 1960s was the nilsson's book on learning machines, dealing mostly with machine learning for pattern classification.
User :Machine learning and statistics
MLBot :   machine learning and statistics are closely related fields in terms of methods, but distinct in their principal goal: statistics draws population inferences from a sample, while machine learning finds generalizable predictive patterns.
User :mathematical model
MLBot :   reinforcement learning algorithms do not assume knowledge of an exact mathematical model of the