### Importing all the libraries.

In [4]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... done
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=8cef1d577c9890c0aee6c964ad4f4ee662825ff0b9771b54d51eecad6934da7b
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [5]:
import random
import io
import string #Punctuation, data preprocessing
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # data encoding
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.stem import WordNetLemmatizer
import wikipedia


In [6]:
# Fetch the 'punkt_tab' NLTK data package.
# NOTE(review): presumably this backs the nltk.sent_tokenize / nltk.word_tokenize
# calls used later in the notebook — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Tokenization.

In [14]:
def tokenize(user_query):
    """Fetch the Wikipedia summary for ``user_query`` and tokenize it.

    Parameters
    ----------
    user_query : str
        Text passed verbatim to ``wikipedia.summary``.

    Returns
    -------
    tuple
        ``(sentence_tokens, word_tokens)`` — the summary text split with
        ``nltk.sent_tokenize`` and with ``nltk.word_tokenize`` respectively.

    Any exception raised by ``wikipedia.summary`` propagates to the caller.
    """
    summary_text = wikipedia.summary(user_query)
    return nltk.sent_tokenize(summary_text), nltk.word_tokenize(summary_text)

## Lemmatization

In [15]:
# Single shared WordNetLemmatizer instance; lemtokens() calls its .lemmatize().
lemmatizer = WordNetLemmatizer()

In [16]:
def lemtokens(tokens):
    """Lemmatize every token and return the results as a new list.

    Parameters
    ----------
    tokens : iterable of str
        Word tokens to normalise.

    Returns
    -------
    list
        ``lemmatizer.lemmatize(token)`` for each token, in input order.
        An empty input yields an empty list.
    """
    # Every individual token has to be lemmatized. A comprehension avoids the
    # previous local variable named `list`, which shadowed the builtin.
    return [lemmatizer.lemmatize(token) for token in tokens]

In [17]:
# Display the punctuation characters that the removal table (punct_dict) is built from.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
# Translation table for str.translate(): each punctuation code point maps to
# None, which removes that character from the translated string.
punct_dict = {ord(symbol): None for symbol in string.punctuation}

In [19]:
def lemmer(text):
    """Normalise ``text`` into a list of lemmatized word tokens.

    The text is lower-cased, stripped of punctuation via ``punct_dict``,
    split with ``nltk.word_tokenize`` and finally passed through
    ``lemtokens``. It is handed to ``TfidfVectorizer`` as its tokenizer.
    """
    cleaned = text.lower().translate(punct_dict)
    return lemtokens(nltk.word_tokenize(cleaned))

In [20]:
# Fetch the 'wordnet' NLTK data package.
# NOTE(review): presumably required by WordNetLemmatizer.lemmatize, so this must
# run before lemtokens()/lemmer() are first called — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Rule-based greeting function.


In [22]:
greeting_inputs = ['hello', 'hi', 'hey', 'greeting']
greeting_responses = ['I am a chatbot', 'hi', 'hey', 'hello', 'whats up']

def greeting(text):
    """Return a random greeting reply if ``text`` contains a greeting word.

    Parameters
    ----------
    text : str
        Raw user input.

    Returns
    -------
    str or None
        A random element of ``greeting_responses`` when any whitespace-separated
        word of ``text`` (case-insensitive, surrounding punctuation ignored)
        is listed in ``greeting_inputs``; otherwise ``None``.
    """
    for token in text.split():
        # Strip surrounding punctuation so inputs such as "Hello!" or "hi,"
        # are still recognised as greetings.
        if token.strip(string.punctuation).lower() in greeting_inputs:
            return random.choice(greeting_responses)
    return None

#### Function for generating responses for queries from the corpus.
* Data Encoding - TF-IDF
* Similarity Metrics - Cosine Similarity.
* Response selection - return the corpus sentence whose vector is most similar to the query.

In [23]:
def respond(user_query):
    """Answer ``user_query`` with the most similar sentence of its Wikipedia summary.

    The summary sentences returned by ``tokenize`` and the query itself are
    encoded with TF-IDF (using ``lemmer`` as tokenizer); the sentence with the
    highest cosine similarity to the query is returned.

    Parameters
    ----------
    user_query : str
        The user's question; also used as the Wikipedia lookup term.

    Returns
    -------
    str
        The best-matching summary sentence, or ``'I cannot understand.'`` when
        no page can be resolved for the query, the summary contains no
        sentences, or no sentence shares any term with the query.
    """
    fallback = 'I cannot understand.'

    # Tokenize. A missing or ambiguous page is an expected outcome for free-form
    # user input, so answer with the fallback instead of crashing the chat loop.
    try:
        sent_tokens, _ = tokenize(user_query)
    except (wikipedia.exceptions.PageError,
            wikipedia.exceptions.DisambiguationError):
        return fallback

    # Nothing to compare against (empty summary).
    if not sent_tokens:
        return fallback

    # Vectorizing: the query is appended as the last document so that it is
    # encoded with the same vocabulary and IDF weights as the corpus sentences.
    tfidf_obj = TfidfVectorizer(tokenizer=lemmer, stop_words="english")
    tfidf = tfidf_obj.fit_transform(sent_tokens + [user_query])

    # Cosine similarity of the query (last row) with the corpus rows only, so
    # the query can never be selected as its own best match.
    sim_values = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()

    # Selecting the sentence with maximum similarity.
    best_index = int(sim_values.argmax())
    if sim_values[best_index] == 0:
        return fallback
    return sent_tokens[best_index]

In [24]:
# Interactive chat loop: reads one line per turn until the user types 'exit'.
print("CHATBOT")
flag = 1  # 1 while the conversation is active; set to 0 by 'exit'

while flag == 1:
    # Normalise the input once: surrounding whitespace is irrelevant and the
    # comparisons below are case-insensitive.
    user_query = input().strip().lower()

    if user_query == 'exit':
        flag = 0
        print("Chatbot: Bye! Have a good day ahead.")
        continue

    # A blank line carries no query; skip it rather than sending an empty
    # lookup to Wikipedia.
    if not user_query:
        continue

    # Greeting: call greeting() once and reuse the reply, instead of drawing a
    # second random response just to print it.
    greeting_reply = greeting(user_query)
    if greeting_reply is not None:
        print("Chatbot: " + greeting_reply)
    else:
        res = respond(user_query)
        print("Chatbot: ", res)

CHATBOT
HI
Chatbot: hello
apple fruit
Chatbot:  An apple is a round, edible fruit produced by an apple tree (Malus spp., among them the domestic or orchard apple; Malus domestica).
Who is the winner of mtv hustle season 3
Chatbot:  On 10 December 2023, during episode 16 of season 3, Samantha Ruth Prabhu announced the Tamil edition of MTV Hustle Namma Pettai.
who was the winner of MTV Hustle show.
Chatbot:  MTV Hustle is an Indian rap and hip-hop reality show.
who is the king of fruits
Chatbot:  Takaya began a sequel titled Fruits Basket Another in September 2015, and the spin-off series The Three Musketeers Arc in April 2019.
exit
Chatbot: Bye! Have a good day ahead.
