In [7]:
import nltk
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity      
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# import spacy
lemmatizer = nltk.stem.WordNetLemmatizer()

# Download required NLTK data
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1040\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\1040\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\1040\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
data = pd.read_csv('chatbot dataset.txt', sep = "\t", header = None)
data.rename(columns = {0: 'Question', 1: 'Answer'}, inplace = True)
data

Unnamed: 0,Question,Answer
0,What are your interests,I am interested in all kinds of things. We can...
1,What are your favorite subjects,"My favorite subjects include robotics, compute..."
2,What are your interests,"I am interested in a wide variety of topics, a..."
3,What is your number,I don't have any number
4,What is your number,23 skiddoo!
...,...,...
561,"The Hubble Space Telescope, launched into low ...",Edwin Hubble
562,What is the name of the nearest major galaxy t...,The Andromeda Galaxy.
563,God Save the Queen is the national anthem of w...,The United Kingdom of Great Britain
564,"The Celtic Shelf, the seabed under the Celtic ...",Europe


In [9]:
# Define a function for text preprocessing (including lemmatization)
def preprocess_text(text):
    # Identifies all sentences in the data
    sentences = nltk.sent_tokenize(text)
    
    # Tokenize and lemmatize each word in each sentence
    preprocessed_sentences = []
    for sentence in sentences:
        tokens = [lemmatizer.lemmatize(word.lower()) for word in nltk.word_tokenize(sentence) if word.isalnum()]
        # Turns to basic root - each word in the tokenized word found in the tokenized sentence - if they are all alphanumeric 
        # The code above does the following:
        # Identifies every word in the sentence 
        # Turns it to a lower case 
        # Lemmatizes it if the word is alphanumeric

        preprocessed_sentence = ' '.join(tokens)
        preprocessed_sentences.append(preprocessed_sentence)
    
    return ' '.join(preprocessed_sentences)


data['tokenized Questions'] = data['Question'].apply(preprocess_text)
data

Unnamed: 0,Question,Answer,tokenized Questions
0,What are your interests,I am interested in all kinds of things. We can...,what are your interest
1,What are your favorite subjects,"My favorite subjects include robotics, compute...",what are your favorite subject
2,What are your interests,"I am interested in a wide variety of topics, a...",what are your interest
3,What is your number,I don't have any number,what is your number
4,What is your number,23 skiddoo!,what is your number
...,...,...,...
561,"The Hubble Space Telescope, launched into low ...",Edwin Hubble,the hubble space telescope launched into low e...
562,What is the name of the nearest major galaxy t...,The Andromeda Galaxy.,what is the name of the nearest major galaxy t...
563,God Save the Queen is the national anthem of w...,The United Kingdom of Great Britain,god save the queen is the national anthem of w...
564,"The Celtic Shelf, the seabed under the Celtic ...",Europe,the celtic shelf the seabed under the celtic s...


In [10]:
xtrain = data['tokenized Questions'].to_list()
xtrain

['what are your interest',
 'what are your favorite subject',
 'what are your interest',
 'what is your number',
 'what is your number',
 'what is your favorite number',
 'what can you eat',
 'why ca you eat food',
 'what is your location',
 'what is your location',
 'where are you from',
 'where are you',
 'do you have any brother',
 'do you have any brother',
 'who is your father',
 'who is your mother',
 'who is your bos',
 'what is your age',
 'what is your age',
 'what is the illuminati',
 'what is the illuminatti',
 'what is the illuminatti',
 'what is vineland',
 'what is illuminatus',
 'what is illuminatus',
 'who wrote vineland',
 'who is bilbo baggins',
 'who is geoffrey chaucer',
 'who is pier anthony',
 'have you read plato',
 'have you read frankenstein',
 'have you ever read a book',
 'have you ever read a book',
 'have you ever read a book',
 'have you read many book',
 'have you read homer',
 'ray bradbury',
 'what is mind child',
 'william gibson',
 'william gibson',
 

In [11]:
# Vectorize corpus
tfidf_vectorizer = TfidfVectorizer()
corpus = tfidf_vectorizer.fit_transform(xtrain)

print(corpus)

  (0, 239)	0.7510532735254372
  (0, 494)	0.48874989053623996
  (0, 28)	0.3138809685434418
  (0, 472)	0.3138809685434418
  (1, 422)	0.6671979309473373
  (1, 161)	0.5280519683565775
  (1, 494)	0.38890600576581785
  (1, 28)	0.2497600431750582
  (1, 472)	0.2497600431750582
  (2, 239)	0.7510532735254372
  (2, 494)	0.48874989053623996
  (2, 28)	0.3138809685434418
  (2, 472)	0.3138809685434418
  (3, 323)	0.7397733964976874
  (3, 246)	0.31529152573369384
  (3, 494)	0.5001540097563475
  (3, 472)	0.3212048289790214
  (4, 323)	0.7397733964976874
  (4, 246)	0.31529152573369384
  (4, 494)	0.5001540097563475
  (4, 472)	0.3212048289790214
  (5, 323)	0.6119933172122124
  (5, 246)	0.2608316379531582
  (5, 161)	0.561802595594419
  (5, 494)	0.4137630698764027
  :	:
  (564, 104)	0.22192768676566968
  (564, 337)	0.22192768676566968
  (564, 385)	0.22192768676566968
  (564, 452)	0.22192768676566968
  (564, 386)	0.22192768676566968
  (564, 394)	0.44385537353133936
  (564, 80)	0.44385537353133936
  (564, 324)	

In [12]:
user = input('Pls ask your question: ')

print(user)




In [13]:
preprocess_text(user)

''

In [12]:
#Vectorize user input
user_transformed = tfidf_vectorizer.transform([user])
print(user_transformed)

  (0, 494)	0.5147649035309579
  (0, 472)	0.33058811801511384
  (0, 239)	0.7910300818047242


In [13]:
#find similarity 
similarity_scores = cosine_similarity(user_transformed, corpus)
similarity_scores

array([[0.94946234, 0.28276287, 0.94946234, 0.36364823, 0.36364823,
        0.30083575, 0.11585513, 0.        , 0.35599693, 0.35599693,
        0.        , 0.        , 0.        , 0.        , 0.22005584,
        0.22005584, 0.22005584, 0.34642453, 0.34642453, 0.09948012,
        0.10351445, 0.10351445, 0.11505672, 0.11505672, 0.11505672,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.08426415, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.10959094, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [14]:
data['Answer'].iloc[similarity_scores.argmax()]

'I am interested in all kinds of things. We can talk about anything!'

In [14]:
def collector():
    user = input('Pls ask your question: ')
    pre_user = preprocess_text(user)
    vect_user = tfidf_vectorizer.transform([pre_user])
    similarity_scores = cosine_similarity(vect_user, corpus)
    most_similar_index = similarity_scores.argmax()
    
    return data['Answer'].iloc[most_similar_index]

In [16]:
collector()   #This is to call the collector function

In [15]:
def responder(user_input):
    user_input_processed = preprocess_text(user_input)
    vectorized_user_input = tfidf_vectorizer.transform([user_input_processed])
    similarity_score = cosine_similarity(vectorized_user_input, corpus)
    argument_maximum = similarity_score.argmax()
    print (data['Answer'].iloc[ argument_maximum])

bot_greetings = ['Hello user, i am a creation of zeze the great...Ask your question',
             'How far wetin dey sup?',
             'How may i help you?',
             'Why you show face, everything clear?',
             'Good day user, welcome to my world. How may i help you?']

farewell = [ 'Thanks for your usage... bye',
            'Alright sir... Hope to see you soon',
            'Oya now... e go be',
            'Everygood abi.. later things']

human_greetings = ['hi', 'hello there', 'hey', 'hello']

human_exits = ['thanks bye', 'bye', 'quit', 'exit', 'bye bye', 'close']

import random
random_greeting = random.choice(bot_greetings)
random_farewell = random.choice(bot_greetings)

while True:
    user_input = input('You: ')

    if user_input in human_greetings:
        print(random_greeting)
    elif user_input.lower() in human_exits:
        print(random_farewell)
        break
    else:
        responder(user_input)

Good day user, welcome to my world. How may i help you?
I am interested in all kinds of things. We can talk about anything!
teknolust was released in 2002.
Why you show face, everything clear?
