In [19]:
# import required libraries
import nltk
import numpy as np
import random
import string
import bs4 as bs
import urllib.request
import re
import os

os.system('python ./requirement.py')

2

In [20]:
# extract text data from the chosen url
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/National_Engineering_School_of_Sfax')
raw_html = raw_html.read()

article_html = bs.BeautifulSoup(raw_html, 'lxml')

article_paragraphs = article_html.find_all('p')

In [21]:
article_text = ''

for para in article_paragraphs:
    article_text += para.text

article_text

'The National Engineering School of Sfax (Arabic: المدرسة الوطنية للمهندسين بصفاقس) or ENIS, is a Tunisian engineering school and research establishment based in the city of Sfax located in the east of the country. It is a part of the University of Sfax.[1]\nThe National Engineering School of Sfax was founded in 1983.[1]\nThe National Engineering School of Sfax has seven independent departments:[1]\n\n\nThis article on a Tunisian institution of higher education is a stub. You can help Wikipedia by expanding it.'

In [22]:
# remove all single digits 
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
# Substituting multiple spaces with single space
article_text = re.sub(r'\s+', ' ', article_text)
# The sent_tokenize function in Python can tokenize inserted text into sentences
article_sentences = nltk.sent_tokenize(article_text)
article_sentences

['The National Engineering School of Sfax (Arabic: المدرسة الوطنية للمهندسين بصفاقس) or ENIS, is a Tunisian engineering school and research establishment based in the city of Sfax located in the east of the country.',
 'It is a part of the University of Sfax.',
 'The National Engineering School of Sfax was founded in 1983.',
 'The National Engineering School of Sfax has seven independent departments: This article on a Tunisian institution of higher education is a stub.',
 'You can help Wikipedia by expanding it.']

In [23]:
# Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a 
# single item.  
wnlemmatizer = nltk.stem.WordNetLemmatizer()
def perform_lemmatization(tokens):
    return [wnlemmatizer.lemmatize(token) for token in tokens]

# lower document caracters and remove punctuation
punctuation_removal = dict((ord(punctuation), None) for punctuation in string.punctuation)
def get_processed_text(document):
    return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def generate_response(user_input):
    Enis_bot_response = ''
    article_sentences.append(user_input)
    # process the article sentences with the user input
    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
    all_word_vectors = word_vectorizer.fit_transform(article_sentences)

    similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
    similar_sentence_number = similar_vector_values.argsort()[0][-2]
    matched_vector = similar_vector_values.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]

    if vector_matched == 0:
        Enis_bot_response = Enis_bot_response + "I am sorry, I could not understand you"
        return Enis_bot_response
    else:
        Enis_bot_response = Enis_bot_response + article_sentences[similar_sentence_number]
        return Enis_bot_response

In [25]:
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi", "whatsup")
greeting_responses = ["hey", "hey hows you?", "*nods*", "hello, how you doing", "hello", "Welcome, I am good and you"]

def generate_greeting_response(greeting):
    for token in greeting.split():
        if token.lower() in greeting_inputs:
            return random.choice(greeting_responses)

In [27]:
continue_dialogue = True
print("****** Hello, I am your friend Enis_bot. You can ask me any question regarding ENIS: ******")
while(continue_dialogue == True):
    human_text = input("your input: ")
    human_text = human_text.lower()
    if human_text != 'bye':
        if human_text == 'thanks' or human_text == 'thank you very much' or human_text == 'thank you':
            continue_dialogue = False
            print("Enis_bot: Most welcome")
        else:
            if generate_greeting_response(human_text) != None:
                print("Enis_bot: " + generate_greeting_response(human_text))
            else:
                print("Enis_bot: ", end="")
                print(generate_response(human_text))
                article_sentences.remove(human_text)
    else:
        continue_dialogue = False
        print("Enis_bot: Good bye and take care of yourself...")

****** Hello, I am your friend Enis_bot. You can ask me any question regarding ENIS: ******
your input: Hi
Enis_bot: Welcome, I am good and you
your input: What is enis?
Enis_bot: The National Engineering School of Sfax (Arabic: المدرسة الوطنية للمهندسين بصفاقس) or ENIS, is a Tunisian engineering school and research establishment based in the city of Sfax located in the east of the country.
your input: How many departments does it contain?
Enis_bot: The National Engineering School of Sfax has seven independent departments: This article on a Tunisian institution of higher education is a stub.
your input: Thank you
Enis_bot: Most welcome
