In [1]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.4.16-cp311-cp311-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     --------- ------------------------------ 10.2/42.0 kB ? eta -:--:--
     ------------------ ------------------- 20.5/42.0 kB 640.0 kB/s eta 0:00:01
     -------------------------------------  41.0/42.0 kB 487.6 kB/s eta 0:00:01
     -------------------------------------  41.0/42.0 kB 487.6 kB/s eta 0:00:01
     -------------------------------------  41.0/42.0 kB 487.6 kB/s eta 0:00:01
     -----------------------------------

In [2]:
import nltk
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity      
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# import spacy
# Shared lemmatizer instance; preprocess_text() calls lemmatizer.lemmatize() on every kept token.
lemmatizer = nltk.stem.WordNetLemmatizer()

# Download required NLTK data
# NOTE(review): 'punkt' presumably backs nltk.sent_tokenize / nltk.word_tokenize and 'wordnet'
# the lemmatizer; 'stopwords' is not referenced by any cell visible here — confirm it is needed.
# The value returned by the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sarah\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarah\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sarah\AppData\Roaming\nltk_data...


True

In [3]:
# Load the dialogue transcript without a header row, splitting each line on ':' so that
# column 0 holds the speaker label and column 1 the utterance (see the displayed frame).
# NOTE(review): sep=':' splits on every colon, so an utterance that itself contains ':'
# would presumably yield extra fields — confirm the file has none.
data = pd.read_csv('Samsung Dialog.txt', sep = ':', header = None)
data

Unnamed: 0,0,1
0,Customer,"Hi, I'm looking to buy a new phone, and I'm i..."
1,Sales Agent,"Great, we have a wide range of Samsung phones..."
2,Customer,"Well, I want a phone with a good camera, long..."
3,Sales Agent,Absolutely. We have a lot of great options th...
4,Customer,"No, I haven't. Tell me more about it."
5,Sales Agent,The Galaxy S21 Ultra has a 108-megapixel came...
6,Customer,That sounds great. How much does it cost?
7,Sales Agent,"The Galaxy S21 Ultra starts at $1,199, but we..."
8,Customer,"Okay, I'm interested. But I have a few more q..."
9,Sales Agent,The Galaxy S21 Ultra comes with a standard on...


In [4]:
# Pull out the utterance column (1) for each speaker and renumber from 0 so that
# the i-th customer line sits next to the i-th sales-agent line.
cust = data.loc[data[0] == 'Customer', 1].reset_index(drop=True)
sales = data.loc[data[0] == 'Sales Agent', 1].reset_index(drop=True)

# Question/Answer table: customer turns are the questions, agent turns the answers.
# 'Answer' is added by column assignment so it is aligned on the questions' index.
new_data = pd.DataFrame({'Question': cust})
new_data['Answer'] = sales

new_data.head()

Unnamed: 0,Question,Answer
0,"Hi, I'm looking to buy a new phone, and I'm i...","Great, we have a wide range of Samsung phones..."
1,"Well, I want a phone with a good camera, long...",Absolutely. We have a lot of great options th...
2,"No, I haven't. Tell me more about it.",The Galaxy S21 Ultra has a 108-megapixel came...
3,That sounds great. How much does it cost?,"The Galaxy S21 Ultra starts at $1,199, but we..."
4,"Okay, I'm interested. But I have a few more q...",The Galaxy S21 Ultra comes with a standard on...


In [7]:
# Define a function for text preprocessing (including lemmatization)
def preprocess_text(text):
    """Normalize a piece of text into a single lower-cased, lemmatized string.

    The text is split into sentences, each sentence into word tokens, and only
    tokens for which ``str.isalnum()`` is true are kept. Punctuation and any
    token containing a non-alphanumeric character are therefore dropped, so
    "I'm" comes out as just "i". Kept tokens are lower-cased and passed through
    the module-level ``lemmatizer``.

    Note: the lemmatizer is called without a part-of-speech tag, which is why
    words such as "does" show up as "doe" in the notebook output.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The processed tokens of all sentences, joined by single spaces.
    """
    cleaned_sentences = []
    for sent in nltk.sent_tokenize(text):
        lemmas = []
        for token in nltk.word_tokenize(sent):
            # Skip punctuation and anything that is not purely letters/digits
            if token.isalnum():
                lemmas.append(lemmatizer.lemmatize(token.lower()))
        cleaned_sentences.append(' '.join(lemmas))

    return ' '.join(cleaned_sentences)


# Store a normalized copy of every customer question next to the original text
new_data['tokenized Questions'] = new_data['Question'].map(preprocess_text)
new_data

Unnamed: 0,Question,Answer,tokenized Questions
0,"Hi, I'm looking to buy a new phone, and I'm i...","Great, we have a wide range of Samsung phones...",hi i looking to buy a new phone and i interest...
1,"Well, I want a phone with a good camera, long...",Absolutely. We have a lot of great options th...,well i want a phone with a good camera long ba...
2,"No, I haven't. Tell me more about it.",The Galaxy S21 Ultra has a 108-megapixel came...,no i have tell me more about it
3,That sounds great. How much does it cost?,"The Galaxy S21 Ultra starts at $1,199, but we...",that sound great how much doe it cost
4,"Okay, I'm interested. But I have a few more q...",The Galaxy S21 Ultra comes with a standard on...,okay i interested but i have a few more questi...
5,That's good to know. And what about the opera...,"Yes, the Galaxy S21 Ultra runs on Android 11,...",that good to know and what about the operating...
6,"Okay, that's good. But I'm also interested in...",Absolutely. The Galaxy A52 is a great mid-ran...,okay that good but i also interested in some o...
7,That sounds like a good option for me. How mu...,"The Galaxy A52 starts at $399, but again, we ...",that sound like a good option for me how much ...
8,"Okay, I'll think about it. But can you also t...",Of course. The Galaxy Z Fold2 is a really uni...,okay i think about it but can you also tell me...
9,"That sounds really cool, but it also sounds e...","The Galaxy Z Fold2 starts at $1,999, but agai...",that sound really cool but it also sound expen...


In [9]:
# Plain Python list of the preprocessed questions (input for the TF-IDF vectorizer)
xtrain = list(new_data['tokenized Questions'])
xtrain

['hi i looking to buy a new phone and i interested in samsung phone',
 'well i want a phone with a good camera long battery life and plenty of storage',
 'no i have tell me more about it',
 'that sound great how much doe it cost',
 'okay i interested but i have a few more question what kind of warranty come with the phone',
 'that good to know and what about the operating system doe it come with the latest version of android',
 'okay that good but i also interested in some of the other samsung phone can you tell me more about the galaxy a52',
 'that sound like a good option for me how much doe it cost',
 'okay i think about it but can you also tell me about the galaxy z fold2 i heard a lot about it and i curious',
 'that sound really cool but it also sound expensive how much doe it cost',
 'hmm i not sure that a lot of money for a phone',
 'okay can you tell me more about the galaxy a72',
 'that sound like a great option for me how much doe it cost',
 'okay i definitely consider it but

In [10]:
# Vectorized corpus
# Fit TF-IDF on the preprocessed questions; `corpus` is the resulting document-term matrix
# that later cells compare user queries against with cosine_similarity.
tfidf_vectorizer = TfidfVectorizer()
corpus = tfidf_vectorizer.fit_transform(xtrain)

# Show the stored (row, column) weights of the first question's vector
print(corpus[0])

  (0, 71)	0.3011232459093611
  (0, 41)	0.2700934382194581
  (0, 42)	0.2460248494975018
  (0, 5)	0.22635938061087743
  (0, 67)	0.4194649271573511
  (0, 58)	0.34485730351794186
  (0, 11)	0.34485730351794186
  (0, 84)	0.2700934382194581
  (0, 51)	0.34485730351794186
  (0, 37)	0.34485730351794186


In [11]:
# Read a free-text question from the user and echo it back
user = input('Pls ask your question: ')
print(user)

hello, I would like to buy a Samsung phone


In [12]:
# Preview the normalized form of the question (the result is only displayed, not stored)
preprocess_text(user)

'hello i would like to buy a samsung phone'

In [13]:
# Vectorize user input
# The vectorizer was fitted on preprocessed (lower-cased, lemmatized) questions, so the
# query must go through the same preprocess_text() step before it is transformed;
# passing the raw string would leave inflected words unmatched against the vocabulary.
user_transformed = tfidf_vectorizer.transform([preprocess_text(user)])
print(user_transformed)

  (0, 84)	0.41815182665493084
  (0, 71)	0.4661913897476909
  (0, 67)	0.3247024930794629
  (0, 49)	0.4661913897476909
  (0, 11)	0.5338993510984622


In [14]:
# Find similarity
# Cosine similarity between the query vector and every row of `corpus`;
# the displayed result is a 1 x n array holding one score per corpus question.
similarity_scores = cosine_similarity(user_transformed, corpus)
similarity_scores

array([[0.57364153, 0.06191879, 0.        , 0.        , 0.06485636,
        0.09497224, 0.17619019, 0.17598405, 0.        , 0.        ,
        0.07873825, 0.        , 0.17305635, 0.        , 0.15803535,
        0.        , 0.        ]])

In [15]:
# Look up the answer paired with the highest-scoring question.
# Note: when every score is zero, argmax() is 0 and the first answer is returned.
new_data['Answer'].iloc[similarity_scores.argmax()]

' Great, we have a wide range of Samsung phones to choose from. What features are you looking for in a phone?'

In [16]:
# Read a second free-text question from the user and echo it back
user = input('Pls ask your question: ')
print(user)

ok i want to buy a phone that has a very good camera with a long battery life and has plenty of storage


In [17]:
# Preview the normalized form of the question (the result is only displayed, not stored)
preprocess_text(user)

'ok i want to buy a phone that ha a very good camera with a long battery life and ha plenty of storage'

In [18]:
# Vectorize user input
# The vectorizer was fitted on preprocessed (lower-cased, lemmatized) questions, so the
# query must go through the same preprocess_text() step before it is transformed;
# passing the raw string would leave inflected words unmatched against the vocabulary.
user_transformed = tfidf_vectorizer.transform([preprocess_text(user)])
print(user_transformed)

  (0, 90)	0.21507530970605035
  (0, 86)	0.30147479618422807
  (0, 84)	0.23611610775606548
  (0, 81)	0.14072957131134134
  (0, 75)	0.30147479618422807
  (0, 68)	0.30147479618422807
  (0, 67)	0.183348448954359
  (0, 61)	0.19788372592927259
  (0, 50)	0.30147479618422807
  (0, 48)	0.30147479618422807
  (0, 32)	0.19788372592927259
  (0, 13)	0.30147479618422807
  (0, 11)	0.30147479618422807
  (0, 8)	0.30147479618422807
  (0, 5)	0.19788372592927259


In [19]:
# Find similarity
# Cosine similarity between the query vector and every row of `corpus`;
# the displayed result is a 1 x n array holding one score per corpus question.
similarity_scores = cosine_similarity(user_transformed, corpus)
similarity_scores

array([[0.28944028, 0.86695205, 0.        , 0.03601363, 0.12967416,
        0.23017389, 0.12735361, 0.08455341, 0.03648266, 0.02239108,
        0.12244397, 0.        , 0.02792789, 0.        , 0.23340483,
        0.02224727, 0.02412068]])

In [20]:
# Look up the answer paired with the highest-scoring question.
# Note: when every score is zero, argmax() is 0 and the first answer is returned.
new_data['Answer'].iloc[similarity_scores.argmax()]

' Absolutely. We have a lot of great options that meet those criteria. Have you considered the Samsung Galaxy S21 Ultra?'

In [22]:
def collector():
    """Prompt for one question and return the best-matching stored answer.

    The question is read with ``input()``, normalized with ``preprocess_text``,
    vectorized with the fitted ``tfidf_vectorizer`` and compared against every
    row of ``corpus`` using cosine similarity.

    Returns
    -------
    str
        The ``Answer`` whose question scores highest, or a fallback message
        when the input has no overlap with any known question.
    """
    user = input('Pls ask your question: ')
    pre_user = preprocess_text(user)
    vect_user = tfidf_vectorizer.transform([pre_user])
    similarity_scores = cosine_similarity(vect_user, corpus)

    # If every score is zero, argmax() would silently pick row 0 and return an
    # unrelated answer, so report "no match" instead.
    if similarity_scores.max() <= 0:
        return "Sorry, I don't have an answer for that. Could you rephrase your question?"

    most_similar_index = similarity_scores.argmax()
    return new_data['Answer'].iloc[most_similar_index]

In [None]:
# Ask one question interactively and display the answer that collector() returns
collector()

" Of course. The Galaxy Z Fold2 is a really unique phone that has a foldable screen. It's a little more expensive than some of our other phones, but it has some really amazing features. For example, it has a 7.6-inch foldable display, which gives you more screen real estate for multitasking and watching videos. It also has a 12-megapixel camera, a 4,500mAh battery, and up to 512GB of storage."

In [41]:
def responder(text):
    """Print the stored answer whose question is most similar to ``text``.

    ``text`` is normalized with ``preprocess_text``, vectorized with the fitted
    ``tfidf_vectorizer`` and scored against every row of ``corpus`` using
    cosine similarity.

    Parameters
    ----------
    text : str
        The user's raw message.

    Returns
    -------
    None
        The reply is printed, not returned.
    """
    user_input_processed = preprocess_text(text)
    vectorized_user_input = tfidf_vectorizer.transform([user_input_processed])
    similarity_score = cosine_similarity(vectorized_user_input, corpus)

    # If every score is zero, argmax() would silently pick row 0 and print an
    # unrelated answer, so tell the user nothing matched instead.
    if similarity_score.max() <= 0:
        print("Sorry, I don't have an answer for that. Could you rephrase your question?")
        return

    argument_maximum = similarity_score.argmax()
    print(new_data['Answer'].iloc[argument_maximum])

# Canned openers the bot can reply with when the user greets it.
bot_greetings = [
    'Hello user, I am Ayo..... Pls ask your question',
    'Howdy, what can I do for you?',
    'Whazzap, what you need?',
    'Oremi, ki lo need?',
    'Welcome user, what can I do you for?',
]

# Canned sign-offs printed when the user ends the chat.
farewell = [
    'Thanks for your usage..... Bye.',
    'Tainku, come again soon.',
    'Gracias, hope to see you again.',
    'Oshey oremi...... Odabo.',
    'Bye user, thanks for your patronage.',
]

# User messages (compared in lower case) that count as a greeting.
human_greetings = [
    'hi', 'hello', 'hey', 'whatsup',
    'good day', 'hello there', 'howdy', 'waddup',
]

# User messages (compared in lower case) that end the conversation.
# 'close' is listed twice; the duplicate has no effect on membership tests.
human_exists = [
    'bye', 'good bye', 'ciao', 'close',
    'exit', 'thanks', 'thank you', 'close',
]

import random

# Drawn once here: the same greeting and farewell are reused for the whole cell run.
random_greeting, random_farewell = random.choice(bot_greetings), random.choice(farewell)

# Interactive chat loop: greet, answer questions, and stop on an exit phrase.
while True:
    user_input = input('You: ')
    # Normalize once so stray spaces or capitals do not defeat the exact-match lists below
    message = user_input.strip().lower()

    if not message:
        # Nothing typed: ask again rather than sending an empty query to the matcher
        continue

    if message in human_greetings:
        # Draw a fresh greeting each time instead of reusing one pre-selected value
        print(random.choice(bot_greetings))
    elif message in human_exists:
        print(random.choice(farewell))
        break
    else:
        responder(user_input)

 Great, we have a wide range of Samsung phones to choose from. What features are you looking for in a phone?
 Absolutely. We have a lot of great options that meet those criteria. Have you considered the Samsung Galaxy S21 Ultra?


In [24]:
# def get_response(user_input):
#     user_input_processed = preprocess_text(user_input) # ....................... Preprocess the user's input using the preprocess_text function

#     user_input_vector = tfidf_vectorizer.transform([user_input_processed])# .... Vectorize the preprocessed user input using the TF-IDF vectorizer

#     similarity_scores = cosine_similarity(user_input_vector, corpus) # .. Calculate the score of similarity between the user input vector and the corpus (df) vector

#     most_similar_index = similarity_scores.argmax() # ..... Find the index of the most similar question in the corpus (df) based on cosine similarity

#     return new_data['Answer'].iloc[most_similar_index] # ... Retrieve the corresponding answer from the df DataFrame and return it as the chatbot's response

# # create greeting list 
# greetings = ["Hey There.... I am a creation of Ehiz Danny Agba Coder.... How can I help",
#             "Hi Human.... How can I help",
#             'Twale baba nla, wetin dey happen nah',
#             'How far Alaye, wetin happen',
#             "Good Day .... How can I help", 
#             "Hello There... How can I be useful to you today",
#             "Hi GomyCode Student.... How can I be of use"]

# exits = ['thanks bye', 'bye', 'quit', 'exit', 'bye bye', 'close']
# farewell = ['Thanks....see you soon', 'Babye, See you soon', 'Bye... See you later', 'Bye... come back soon']

# random_farewell = random.choice(farewell) # ---------------- Randomly select a farewell message from the list
# random_greetings = random.choice(greetings) # -------- Randomly select greeting message from the list

# Test your chatbot
# while True:
#     user_input = input("You: ")
#     if user_input.lower() in exits:
#         print(f"\nChatbot: {random_farewell}!")
#         break
#     if user_input.lower() in ['hi', 'hello', 'hey', 'hi there']:
#         print(f"\nChatbot: {random_greetings}!")
#     else:   
#         response = get_response(user_input)
#         print(f"\nChatbot: {response}")

KeyboardInterrupt: Interrupted by user