# Chatbot for Physician Office

In [1]:
#Import packages
from newspaper import Article
import random
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
#Download the NLTK 'punkt' data package (nltk.sent_tokenize is called on the corpus further down)
#NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm against the installed version
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#Retrieve the A-Z disease/condition index pages (the article links are extracted from them later)
import string
import requests

#Base URL of the index; the letter to list is appended as the query value
INDEX_URL = 'https://www.mayoclinic.org/diseases-conditions/index?letter='
alphabet = list(string.ascii_uppercase)

pages = []
for letter in alphabet:
    #A timeout keeps one stalled request from hanging the whole notebook
    r = requests.get(INDEX_URL + letter, timeout=30)
    #Fail loudly on HTTP errors instead of silently mixing an error page into the text
    r.raise_for_status()
    pages.append(r.text)

#Join once at the end rather than growing a string inside the loop
all_text = ''.join(pages)

In [4]:
#Parse through all text retrieved using BeautifulSoup
#all_text holds every index page concatenated together, so this single parse covers all of them
from bs4 import BeautifulSoup
soup = BeautifulSoup(all_text, 'html.parser')
#Will collect the href of every disease/condition link found in the parsed pages
unparsed = []

In [5]:
#Retrieve all links relating to a disease or condition (these URLs will have the sequence /diseases-conditions/ in them)
#The list is rebuilt from scratch (not appended to) so re-running this cell does not duplicate every link
unparsed = [
    href
    for href in (link.get('href') for link in soup.find_all('a'))
    #Anchors without an href yield None and are skipped
    if href is not None and '/diseases-conditions/' in href
]
print("\n".join(unparsed))

/diseases-conditions/index?letter=A
/es-es/diseases-conditions/index?letter=A
/ar/diseases-conditions/index?letter=A
/diseases-conditions/index?letter=A
/es-es/diseases-conditions/index?letter=A
/ar/diseases-conditions/index?letter=A
/diseases-conditions/brain-tumor/symptoms-causes/syc-20350084
/diseases-conditions/breast-cancer/symptoms-causes/syc-20352470
/diseases-conditions/colon-cancer/symptoms-causes/syc-20353669
/diseases-conditions/adult-congenital-heart-disease/symptoms-causes/syc-20355456
/diseases-conditions/heart-arrhythmia/symptoms-causes/syc-20350668
/diseases-conditions/index?letter=A
/diseases-conditions/index?letter=B
/diseases-conditions/index?letter=C
/diseases-conditions/index?letter=D
/diseases-conditions/index?letter=E
/diseases-conditions/index?letter=F
/diseases-conditions/index?letter=G
/diseases-conditions/index?letter=H
/diseases-conditions/index?letter=I
/diseases-conditions/index?letter=J
/diseases-conditions/index?letter=K
/diseases-conditions/index?letter

In [6]:
#Number of disease/condition links collected so far (duplicates are removed in a later cell)
len(unparsed)

2903

In [7]:
#Limit URLs only to those about various cancers (this is done simply to lower computing time, but it means
#the chatbot can only answer questions about cancers)
#A link is kept only when it contains 'cancer' and contains neither 'index' nor 'mayoclinic'.
#A single-pass comprehension replaces the earlier while/remove loop, which rescanned the list on every removal.
unparsed = [
    url
    for url in unparsed
    if 'cancer' in url and 'index' not in url and 'mayoclinic' not in url
]

In [8]:
#Remove duplicate links
def remove_duplicate(x):
    """Return a new list holding the items of ``x`` with repeats dropped.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    already_seen = set()
    unique_items = []
    for item in x:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

#De-duplicate the filtered cancer links; mylist is the list of article paths downloaded in a later cell
mylist = remove_duplicate(unparsed)

#Show the unique article paths
print(mylist)

['/diseases-conditions/breast-cancer/symptoms-causes/syc-20352470', '/diseases-conditions/colon-cancer/symptoms-causes/syc-20353669', '/diseases-conditions/adrenal-cancer/symptoms-causes/syc-20351026', '/diseases-conditions/ampullary-cancer/symptoms-causes/syc-20355066', '/diseases-conditions/anal-cancer/symptoms-causes/syc-20354140', '/diseases-conditions/bladder-cancer/symptoms-causes/syc-20356104', '/diseases-conditions/bone-cancer/symptoms-causes/syc-20350217', '/diseases-conditions/inflammatory-breast-cancer/symptoms-causes/syc-20355413', '/diseases-conditions/male-breast-cancer/symptoms-causes/syc-20374740', '/diseases-conditions/cancer/symptoms-causes/syc-20370588', '/diseases-conditions/cervical-cancer/symptoms-causes/syc-20352501', '/diseases-conditions/endometrial-cancer/symptoms-causes/syc-20352461', '/diseases-conditions/esophageal-cancer/symptoms-causes/syc-20356084', '/diseases-conditions/gallbladder-cancer/symptoms-causes/syc-20353370', '/diseases-conditions/stomach-canc

In [9]:
#Number of unique cancer article links remaining after de-duplication
len(mylist)

40

In [10]:
#Downloads every remaining article and combines all of their text into one string
from urllib.parse import urljoin

article_texts = []
for path in mylist:
    #urljoin resolves the path against the site root, avoiding the doubled slash that
    #plain concatenation of 'https://www.mayoclinic.org/' and a '/...' path produced
    article = Article(urljoin('https://www.mayoclinic.org/', path))
    article.download()
    article.parse()
    article_texts.append(article.text)

#A blank line between articles keeps the end of one article from running directly into
#the start of the next, which would otherwise blur sentence boundaries during tokenization
data = '\n\n'.join(article_texts)

In [11]:
#Bind the combined article text to the name used by the tokenization step below
corpus = data

In [12]:
#Display the corpus; note this prints the combined text of every downloaded article
print(corpus)

Overview

Breast anatomy Open pop-up dialog box Close Breast anatomy Breast anatomy Each breast contains 15 to 20 lobes of glandular tissue, arranged like the petals of a daisy. The lobes are further divided into smaller lobules that produce milk for breast-feeding. Small tubes (ducts) conduct the milk to a reservoir that lies just beneath your nipple.

Breast cancer is cancer that forms in the cells of the breasts.

After skin cancer, breast cancer is the most common cancer diagnosed in women in the United States. Breast cancer can occur in both men and women, but it's far more common in women.

Substantial support for breast cancer awareness and research funding has helped created advances in the diagnosis and treatment of breast cancer. Breast cancer survival rates have increased, and the number of deaths associated with this disease is steadily declining, largely due to factors such as earlier detection, a new personalized approach to treatment and a better understanding of the dis

In [13]:
#Tokenization
#Split the corpus into sentences; bot_response later builds its replies from entries of sentence_list
text = corpus
sentence_list = nltk.sent_tokenize(text) # A list of sentences

In [14]:
#Display every tokenized sentence as one Python list
print(sentence_list)



In [15]:
#Return a random greeting response
def greeting_response(text):
    """Return a random bot greeting if ``text`` contains a greeting word.

    The input is lower-cased and split on whitespace; each word is compared
    (ignoring leading/trailing punctuation) against the known user greetings.

    Parameters:
        text (str): the raw user message.

    Returns:
        str or None: one of the bot greetings chosen at random when a greeting
        word is present, otherwise None.
    """
    import string

    #Bot responses
    bot_greetings = ['hello', 'hi', 'Hi! How can I help?']

    #User greetings
    #(a missing comma used to fuse 'yo' and 'howdy' into the single entry 'yohowdy',
    #so neither word was ever recognised)
    user_greetings = ['hi', 'hello', 'yo', 'howdy', 'greetings']

    for word in text.lower().split():
        #Strip surrounding punctuation so that "Hi!" or "hello," still count as greetings
        if word.strip(string.punctuation) in user_greetings:
            return random.choice(bot_greetings)
    return None

In [16]:
#Function used for sorting similarity scores
def index_sort(list_var):
    """Return the indices of ``list_var`` ordered from largest value to smallest.

    Parameters:
        list_var: an indexable sequence of mutually comparable values
            (for example a list or a 1-D array of similarity scores).

    Returns:
        list[int]: a permutation of ``range(len(list_var))`` such that
        ``list_var[result[0]]`` is the largest value. Equal values keep their
        original relative order (the built-in sort is stable).

    The built-in sort replaces a hand-written nested-loop exchange sort that
    needed len(list_var)**2 comparisons and left the order of ties unspecified.
    """
    return sorted(range(len(list_var)), key=lambda position: list_var[position], reverse=True)

In [17]:
#Create bot responses based on user input
def bot_response(user_input, max_sentences=2):
    """Answer ``user_input`` with the corpus sentences most similar to it.

    The lower-cased input is vectorised together with every entry of the
    global ``sentence_list`` using bag-of-words counts, and the corpus
    sentences are ranked by cosine similarity to the input.

    Parameters:
        user_input (str): the user's message.
        max_sentences (int): maximum number of sentences to return. Defaults
            to 2, the number the original comments describe (the earlier loop
            only stopped after a third match had been added).

    Returns:
        str: each selected sentence preceded by a single space, or
        ' Sorry, I do not understand' when no corpus sentence shares any
        vocabulary with the input.
    """
    user_input = user_input.lower()

    #Work on a temporary list instead of appending to / removing from the global sentence_list:
    #list.remove() deletes the first equal entry, which could be a real corpus sentence
    #identical to the user's text rather than the entry that was just appended.
    candidates = sentence_list + [user_input]
    query_index = len(candidates) - 1

    #Transforms the sentences into a count matrix
    cm = CountVectorizer().fit_transform(candidates)
    #Computes similarities between the user input (last row) and every row
    similarity_scores = cosine_similarity(cm[-1], cm)
    #Reduces dimensionality of the similarity scores
    similarity_scores_list = similarity_scores.flatten()

    #Rank all rows from most to least similar and drop the user's own row by position;
    #assuming it is always ranked first breaks when a corpus sentence ties with it.
    ranked = [k for k in index_sort(similarity_scores_list) if k != query_index]

    #Keep only sentences that share at least one word with the input, most similar first
    matches = [candidates[k] for k in ranked if similarity_scores_list[k] > 0.0][:max_sentences]

    if not matches:
        return ' ' + 'Sorry, I do not understand'
    return ''.join(' ' + sentence for sentence in matches)

# The Chatbot

In [18]:
#Start the chatbot
print('Chatbot: Hello, what would you like help with today? (To exit, type: quit)')

#Messages (compared case-insensitively) that end the conversation
exit_list = ['quit', 'exit', 'close', 'go away', 'goodbye']

while True:
    user_input = input()
    #Ignore surrounding whitespace so that e.g. 'quit ' still ends the session
    if user_input.strip().lower() in exit_list:
        print('Thank you for using ChatBot, we hope you found what you were looking for.')
        break

    #Ask for a greeting once and reuse the result: the function picks its reply at random,
    #so calling it a second time just to print would do the work (and the draw) twice
    greeting = greeting_response(user_input)
    if greeting is not None:
        print('Chatbot: ' + greeting)
    else:
        print('Chatbot: ' + bot_response(user_input))

Chatbot: Hello, what would you like help with today? (To exit, type: quit)
quit
Thank you for using ChatBot, we hope you found what you were looking for.
