In [6]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from string import punctuation
from time import sleep
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [19]:
class AdvancedChatBot:
    def __init__(self):
        self.end_chat = False
        self.got_topic = False
        self.do_not_respond = True
        self.title = None
        self.text_data = []
        self.sentences = []
        self.para_indices = []
        self.current_sent_idx = None
        self.punctuation_dict = str.maketrans({p: None for p in punctuation})
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))
        self.greeting()

    def greeting(self):
        print("Initializing ChatBot ...")
        sleep(2)
        print('Type "bye" or "quit" or "exit" to end chat')
        sleep(2)
        print('\nShare your confluence link here. '
              '\nChatBot will access the confluence page if it has access to, prepare itself to '
              '\nrespond to your queries on that topic. \n')
        sleep(3)
        print('ChatBot will respond with short info. '
              '\nIf you input "more", it will give you detailed info '
              '\nYou can also jump to the next query')
        sleep(3)
        print('-'*50)
        greet = "Hello! Please share the link of the confluence page you want to explore. "
        print("ChatBot >>  " + greet)

    def chat(self):
        while not self.end_chat:
            self.receive_input()
            if self.end_chat:
                print('ChatBot >>  See you soon! Bye!')
                sleep(2)
                print('\nQuitting ChatBot ...')
            elif self.got_topic:
                if not self.do_not_respond:
                    self.respond()
                self.do_not_respond = False

    def receive_input(self):
        text = input("User    >> ")
        if text.lower().strip() in ['bye', 'quit', 'exit']:
            self.end_chat = True
        elif text.lower().strip() == 'more':
            self.do_not_respond = True
            if self.current_sent_idx is not None:
                response = self.text_data[self.para_indices[self.current_sent_idx]]
            else:
                response = "Please input your query first!"
            print("ChatBot >>  " + response)
        elif not self.got_topic:
            self.scrape_wiki(text)
        else:
            self.sentences.append(text)

    def respond(self):
        vectorizer = TfidfVectorizer(tokenizer=self.preprocess)
        tfidf = vectorizer.fit_transform(self.sentences)
        scores = cosine_similarity(tfidf[-1], tfidf)
        self.current_sent_idx = scores.argsort()[0][-2]
        scores = scores.flatten()
        scores.sort()
        value = scores[-1]
        if value != 0:
            print("ChatBot >>  " + self.sentences[self.current_sent_idx])
        else:
            print("ChatBot >>  I am not sure. Sorry!")
        del self.sentences[-1]

    def scrape_wiki(self, topic):
        topic = '_'.join(topic.lower().strip().capitalize().split(' '))
        try:
            link = f'{topic}'
            data = requests.get(link).content
            soup = BeautifulSoup(data, 'html.parser')
            p_data = soup.find_all('p')
            for tag in p_data:
                a = []
                for i in tag.contents:
                    if i.name != 'sup' and i.string is not None:
                        stripped = ' '.join(i.string.strip().split())
                        a.append(stripped)
                self.text_data.append(' '.join(a))

            for i, para in enumerate(self.text_data):
                sentences = sent_tokenize(para)
                self.sentences.extend(sentences)
                index = [i] * len(sentences)
                self.para_indices.extend(index)

            self.title = soup.find('h1').string
            self.got_topic = True
            print(f'ChatBot >>  Topic is "Confluence: {self.title}". Let\'s chat!')
        except Exception as e:
            print(f'ChatBot >>  Error: {e}. Please input some other topic!')

    def preprocess(self, text):
        text = text.lower().strip().translate(self.punctuation_dict)
        words = word_tokenize(text)
        words = [w for w in words if w not in self.stopwords]
        return [self.lemmatizer.lemmatize(w) for w in words]

In [20]:
chatbot = AdvancedChatBot()
chatbot.chat()

Initializing ChatBot ...
Type "bye" or "quit" or "exit" to end chat

Share your confluence link here. 
ChatBot will access the confluence page if it has access to, prepare itself to 
respond to your queries on that topic. 

ChatBot will respond with short info. 
If you input "more", it will give you detailed info 
You can also jump to the next query
--------------------------------------------------
ChatBot >>  Hello! Please share the link of the confluence page you want to explore. 
User    >> https://en.wikipedia.org/wiki/Coffee
ChatBot >>  Topic is "Confluence: Coffee". Let's chat!
User    >> what is coffee?
ChatBot >>  Coffee is a beverage brewed from roasted coffee beans .
User    >> health?
ChatBot >>  Results were complicated by poor study quality, and differences in age, gender, health status, and serving size.
User    >> does it contain caffeine?
ChatBot >>  Robusta strains also contain about 40–50% more caffeine than arabica.
User    >> more
ChatBot >>  Of the two main specie

KeyboardInterrupt: Interrupted by user