In [1]:
#import numpy as np
import random
import string # to process standard python strings

In [2]:
import nltk #natural language toolkit

In [7]:
import io
import warnings
warnings.filterwarnings('ignore')

# Warning : This is the base class of all warning category classes. It is a subclass of Exception.
# ignore : never print matching warnings

In [8]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as reading finishes (or fails); errors='ignore' silently
# drops any bytes that cannot be decoded.
with open('chatbot.txt','r',errors = 'ignore') as f:
    raw = f.read()
raw = raw.lower()  # converts to lowercase

nltk.download('popular',quiet=True)

# This downloads a list of popular resources which include punkt, words, wordnet, omw, treebank, etc.
# punkt : This tokenizer divides a text into a list of sentences, by using an unsupervised algorithm to build
# a model for abbreviation words, collocations, and words that start sentences.
# It must be trained on a large collection of plaintext in the target language before it can be used.


# Tokenization
sent_tokens = nltk.sent_tokenize(raw)  # converts to list of sentences
word_tokens = nltk.word_tokenize(raw)  # converts to list of words

In [9]:
# Show the first two sentence tokens as a quick check of the sentence split.
sent_tokens[:2]

['chatbot\na chatbot is a piece of software that conducts a conversation via auditory or textual methods.',
 'agriculture\nagriculture is the process of producing food, feed, fiber and many other desired products by the cultivation of certain plants and the raising of domesticated animals (livestock).']

In [10]:
# Show the first two word tokens as a quick check of the word split.
word_tokens[:2]

['chatbot', 'a']

'''
Stemming: Stemming is the process of reducing inflected (or sometimes derived) words to their stem, 
base or root form — generally a written word form. For example, if we were to 
stem the following words: "Stems", "Stemming", "Stemmed", and "Stemtization", the result would be the 
single word "stem". 

Lemmatization: A slight variant of stemming is lemmatization. The major difference between the two is 
that stemming can often create non-existent words, whereas lemmas are actual words. So your root 
stem, meaning the word you end up with, is not necessarily something you can look up in a dictionary, 
but you can look up a lemma. Examples of lemmatization are that "run" is the base form for words like 
"running" or "ran", or that "better" and "good" are in the same lemma so they are considered the same. 
'''

In [11]:
# WordNet is a semantically-oriented dictionary of English included in NLTK;
# this lemmatizer instance is shared by every call to LemTokens.
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Lemmatize every token with the shared WordNet lemmatizer.

    Args:
        tokens: iterable of word strings.

    Returns:
        A new list holding one lemma per input token, in input order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmer.lemmatize(token))
    return lemmas
# Translation table mapping the code point of every ASCII punctuation
# character to None, so str.translate() deletes those characters.
remove_punct_dict = str.maketrans('', '', string.punctuation)

def LemNormalize(text):
    """Normalize raw text into a list of lemmas.

    The text is lowercased, stripped of the characters in
    remove_punct_dict, split into word tokens with NLTK and finally
    lemmatized through LemTokens.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return LemTokens(words)

In [12]:
# Words that count as a greeting from the user (compared in lowercase).
GREETING_INPUTS = ("hello", "hi", "greetings", "sup","hey")
# Replies the bot picks from at random when it is greeted.
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", 
                        "I am glad! You are talking to me"]
def greeting(sentence):
    """Return a random greeting reply if the sentence contains a greeting word.

    The sentence is split on whitespace; each word is lowercased and has
    leading/trailing punctuation stripped before being compared with
    GREETING_INPUTS, so inputs such as "Hello!" or "hi," are recognised.

    Args:
        sentence: the user's message.

    Returns:
        A random element of GREETING_RESPONSES when a greeting word is
        found, otherwise None.
    """
    for word in sentence.split():
        # Strip only edge punctuation so that e.g. "hi-fi" is not a greeting.
        if word.lower().strip(string.punctuation) in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)
    return None

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

Term Frequency: a score of how often a word occurs in the current document. 
TF = (Number of times term t appears in a document)/(Number of terms in the document) 

Inverse Document Frequency: a score of how rare the word is across documents. 
IDF = 1+log (N/n), 
where N is the number of documents and n is the number of documents in which term t appears.

TF-IDF weight is a weight commonly used in information retrieval and text mining. It is a statistical measure used to evaluate how important a word is to a document in a collection or corpus.


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

Cosine Similarity (d1, d2) = Dot product (d1, d2) / (||d1|| * ||d2||) 
where d1, d2 are two non-zero vectors.

In [15]:
def response(user_response):
    """Return the corpus sentence most similar to the user's message.

    The message is appended to the global sent_tokens list so that it is
    vectorized together with the corpus; this function does not remove it
    again, so the caller is responsible for that cleanup.

    Args:
        user_response: the user's message.

    Returns:
        The sentence holding the second-highest cosine similarity to the
        message (presumably the highest is the message itself — verify),
        or an apology string when that similarity is exactly 0.
    """
    sent_tokens.append(user_response)

    # Fit TF-IDF over corpus + query; the query is the last row.
    vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    matrix = vectorizer.fit_transform(sent_tokens)
    similarities = cosine_similarity(matrix[-1], matrix)

    # Index of the second-largest similarity in the single result row.
    best_idx = similarities.argsort()[0][-2]

    # Second-largest similarity value, taken from a sorted flat copy.
    ranked = similarities.flatten()
    ranked.sort()
    best_score = ranked[-2]

    if best_score == 0:
        return "I am sorry! I don't understand you"
    return sent_tokens[best_idx]

In [None]:
# Interactive chat loop: read one line at a time and answer it until the
# user says "bye" or thanks the bot.
flag=True
print("ROBO: My name is Robo. I will answer your queries about Chatbots. If you want to exit, type Bye!")
while flag:
    user_response = input().lower()
    if user_response == 'bye':
        flag=False
        print("ROBO: Bye! take care..")
    elif user_response in ('thanks', 'thank you', 'thank u'):
        # Thanking the bot also ends the conversation.
        flag=False
        print("ROBO: You are welcome..")
    else:
        # Call greeting() exactly once: it draws a random reply, so a
        # second call would repeat the scan and may pick a different reply.
        greeting_reply = greeting(user_response)
        if greeting_reply is not None:
            print("ROBO: "+greeting_reply)
        else:
            print("ROBO: ",end="")
            try:
                print(response(user_response))
            finally:
                # response() appends the query to the end of sent_tokens.
                # Drop that last element even if response() raised, and pop
                # from the end rather than remove(), which would delete the
                # first equal element (possibly a real corpus sentence).
                if sent_tokens and sent_tokens[-1] == user_response:
                    sent_tokens.pop()

ROBO: My name is Robo. I will answer your queries about Chatbots. If you want to exit, type Bye!
