## Importing all the Libraries.


In [1]:
import random
import io
import string #Punctuation, data preprocessing
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # data encoding
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the 'punkt_tab' tokenizer data package into the local nltk_data directory.
# NOTE(review): presumably what nltk.sent_tokenize / nltk.word_tokenize load at runtime — confirm for the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Tokenization.

In [3]:
def tokenize(path='corpus.txt'):
    """Read the corpus file and split it into sentence and word tokens.

    Args:
        path: Location of the plain-text corpus. Defaults to 'corpus.txt',
            so existing zero-argument calls keep working.

    Returns:
        A ``(sentence_tokens, word_tokens)`` tuple holding the lists produced
        by ``nltk.sent_tokenize`` and ``nltk.word_tokenize`` on the file text.
    """
    # This function runs once per user query, so the handle must be closed
    # deterministically; the context manager does that even if read() fails.
    with open(path, 'r', errors='ignore') as corpus_file:
        corpus = corpus_file.read()
    sentence_tokens = nltk.sent_tokenize(corpus)
    word_tokens = nltk.word_tokenize(corpus)
    return sentence_tokens, word_tokens

In [4]:
# Preview the (sentence_tokens, word_tokens) pair produced from corpus.txt.
tokenize()

(['Artificial intelligence is a field of science concerned with building computers and machines that can reason, learn, and act in such a way that would normally require human intelligence or that involves data whose scale exceeds what humans can analyze.',
  'AI is a broad field that encompasses many different disciplines, including computer science, data analytics and statistics, hardware and software engineering, linguistics, neuroscience, and even philosophy and psychology.',
  'On an operational level for business use, AI is a set of technologies that are based primarily on machine learning and deep learning, used for data analytics, predictions and forecasting, object categorization, natural language processing, recommendations, intelligent data retrieval, and more.'],
 ['Artificial',
  'intelligence',
  'is',
  'a',
  'field',
  'of',
  'science',
  'concerned',
  'with',
  'building',
  'computers',
  'and',
  'machines',
  'that',
  'can',
  'reason',
  ',',
  'learn',
  ',',


## Lemmatization

In [5]:
# Single shared lemmatizer instance; lemtokens() reads this module-level name.
lemmatizer = WordNetLemmatizer()

In [6]:
def lemtokens(tokens):
    """Lemmatize each token with the module-level ``lemmatizer``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list with ``lemmatizer.lemmatize`` applied to every token,
        in the same order as the input.
    """
    # A comprehension avoids a local named `list`, which would shadow the builtin.
    return [lemmatizer.lemmatize(token) for token in tokens]


In [7]:
# Display the characters that the standard library classifies as punctuation.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
# Translation table for str.translate(): each punctuation code point maps to
# None, so translating a string with it deletes all punctuation characters.
punct_dict = str.maketrans('', '', string.punctuation)

In [None]:
#punct_dict

In [None]:
#chr(33)

In [9]:
def lemmer(text):
    """Turn raw text into a list of lemmatized, lower-case word tokens.

    The text is lower-cased, its punctuation is deleted with ``punct_dict``,
    the result is split by ``nltk.word_tokenize`` and every token is passed
    through ``lemtokens``.
    """
    cleaned = text.lower().translate(punct_dict)
    words = nltk.word_tokenize(cleaned)
    return lemtokens(words)

In [10]:
# Fetch the 'wordnet' data package into the local nltk_data directory.
# NOTE(review): presumably the data WordNetLemmatizer reads when lemmatizing — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [11]:
# Sanity check: punctuation (including the stray '@') is removed and 'fruits' is lemmatized.
lemmer('Ma@ngo is the king of fruits!!!')

['mango', 'is', 'the', 'king', 'of', 'fruit']

#### Rule-based greeting function

In [12]:
# Words that count as a greeting, and the canned replies to choose from.
greeting_inputs = ['hello', 'hi', 'hey', 'greeting']
greeting_responses = ['I am a chatbot', 'hi', 'hey', 'hello', 'whats up']

def greeting(text):
    """Return a random canned reply if ``text`` contains a greeting word.

    Args:
        text: The raw user message.

    Returns:
        One element of ``greeting_responses`` when any whitespace-separated
        word of ``text`` matches ``greeting_inputs`` (case-insensitive,
        ignoring punctuation attached to the word); otherwise ``None``.
    """
    for token in text.split():
        # Strip punctuation stuck to the word so "Hi!" and "hello," still
        # count as greetings instead of falling through to corpus search.
        word = token.strip(string.punctuation).lower()
        if word in greeting_inputs:
            return random.choice(greeting_responses)
    return None

#### Function for generating responses for queries from the corpus.
* Data Encoding - TF-IDF
* Similarity Metrics - Cosine Similarity.
* Response selection - return the corpus sentence whose vector is most similar to the query.

In [15]:
def respond(user_query):
    """Answer a query with the most similar sentence from the corpus.

    The corpus sentences and the query are encoded together with TF-IDF
    (tokenized by ``lemmer``, English stop words removed), and the query
    vector is compared to every corpus sentence by cosine similarity.

    Args:
        user_query: The user's question as a string.

    Returns:
        The corpus sentence with the highest similarity to the query, or
        'I cannot understand.' when the corpus is empty or no sentence
        shares any term with the query.
    """
    fallback = 'I cannot understand.'

    # Tokenize; only the sentence list is needed for retrieval.
    sent_tokens, _ = tokenize()
    if not sent_tokens:
        return fallback

    # Vectorize corpus and query together so they share one vocabulary.
    # The query is the last row of the resulting matrix.
    tfidf_obj = TfidfVectorizer(tokenizer=lemmer, stop_words="english")
    tfidf = tfidf_obj.fit_transform(sent_tokens + [user_query])

    # Score the query against the corpus rows only. Leaving the query's own
    # row out means it can never be returned as the answer, even when it
    # ties with a corpus sentence at similarity 1.0.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
    best = scores.argmax()

    # A best score of zero means no corpus sentence shares a term with the query.
    if scores[best] == 0:
        return fallback
    return sent_tokens[best]

#### Main chat loop for the chatbot

In [16]:
# Interactive loop: read a line, answer greetings from the rule-based list,
# answer everything else from the corpus, and stop when the user types 'exit'.
print("CHATBOT")
flag = 1

while flag == 1:
    # Normalize once: surrounding whitespace must not prevent 'exit' from matching.
    user_query = input().strip().lower()

    if user_query == 'exit':
        flag = 0
        print("Chatbot: Bye! Have a good day ahead.")

    else:
        # Call greeting() a single time: it picks its reply at random, so a
        # second call would be a separate draw and wasted work.
        greeting_reply = greeting(user_query)
        if greeting_reply is not None:
            print("Chatbot: " + greeting_reply)

        else:
            res = respond(user_query)
            print("Chatbot: ", res)


CHATBOT
hi
Chatbot: hey
what is deep learning
Chatbot:  On an operational level for business use, AI is a set of technologies that are based primarily on machine learning and deep learning, used for data analytics, predictions and forecasting, object categorization, natural language processing, recommendations, intelligent data retrieval, and more.
what is AI
Chatbot:  AI is a broad field that encompasses many different disciplines, including computer science, data analytics and statistics, hardware and software engineering, linguistics, neuroscience, and even philosophy and psychology.
exit
Chatbot: Bye! Have a good day ahead.
