#### Import libraries

In [5]:
import random
import re
import string
from typing import Optional

import bs4 as bs
import nltk
import numpy as np
import requests

#### Creating the Corpus

In [6]:
# Download the article. The timeout stops a hung connection from blocking the
# notebook forever, and raise_for_status() fails loudly on an HTTP error
# instead of silently building the corpus from an error page.
response = requests.get('https://en.wikipedia.org/wiki/Tennis', timeout=30)
response.raise_for_status()
raw_html = response.text

# Name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed.
article_html = bs.BeautifulSoup(raw_html, 'html.parser')
article_paragraphs = article_html.find_all('p')

# Join with a space so the last word of one paragraph can never fuse with the
# first word of the next; the corpus is lower-cased for matching.
article_text = ' '.join(p.text for p in article_paragraphs).lower()

In [7]:
# Sanity check: show the first 30 characters of the scraped, lower-cased text.
print(article_text[:30])


tennis is a racket sport that


#### Text Preprocessing


In [8]:
# Two-step cleanup in one expression: the inner substitution replaces bracketed
# numeric markers such as "[12]" with a space, then the outer one squeezes
# every run of whitespace (newlines included) down to a single space.
article_text = re.sub(
    r'\s+',
    ' ',
    re.sub(r'\[[0-9]*\]', ' ', article_text),
)
print(article_text[:50])


 tennis is a racket sport that can be played indiv


#### Split into sentences


In [9]:
# Split the cleaned article into sentences; generate_response later returns
# one of these sentences as the chatbot's answer.
# NOTE(review): sent_tokenize relies on NLTK tokenizer data being installed,
# and no visible cell downloads it — confirm this runs on a fresh kernel.
article_sentence = nltk.sent_tokenize(article_text)

# Preview the first three sentences.
for sentence in article_sentence[:3]:
    print(sentence)

 tennis is a racket sport that can be played individually against a single opponent (singles) or between two teams of two players each (doubles).
each player uses a tennis racket that is strung with cord to strike a hollow rubber ball covered with felt over or around a net and into the opponent's court.
the object of the game is to maneuver the ball in such a way that the opponent is not able to play a valid return.


#### Split into words

In [10]:
# Tokenise the whole article into words and preview the first three.
# NOTE(review): article_words is not referenced by any later cell visible
# here — confirm whether it is still needed.
article_words = nltk.word_tokenize(article_text)
print(article_words[:3])

['tennis', 'is', 'a']


#### Lemmatization


In [11]:
# One shared WordNet lemmatizer instance, reused for every token.
wnlemmatizer = nltk.stem.WordNetLemmatizer()


def perform_lemmatization(tokens):
    """Lemmatize every token with the shared WordNet lemmatizer.

    Returns a new list with one lemma per input token, in the same order.
    """
    return list(map(wnlemmatizer.lemmatize, tokens))


# Translation table for str.translate: every character of string.punctuation
# is keyed by its code point and mapped to None, which deletes it.
punctuation_removal = dict.fromkeys(map(ord, string.punctuation))


def get_processed_text(document: str):
    """Turn ``document`` into a list of lemmatized word tokens.

    The text is lower-cased, punctuation is deleted via the
    ``punctuation_removal`` table, the result is split into words with NLTK,
    and each word is lemmatized.
    """
    cleaned = document.lower().translate(punctuation_removal)
    tokens = nltk.word_tokenize(cleaned)
    return perform_lemmatization(tokens)


In [12]:
# Inspect the translation table: each punctuation code point maps to None.
print(punctuation_removal)


{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


#### Responding to Greetings


In [13]:
# Phrases recognised as a greeting. They are matched case-insensitively and on
# word boundaries, so multi-word entries such as 'good morning' are usable.
greeting_inputs = (
    'hey',
    'good morning',
    'good evening',
    'morning',
    'evening',
    'hi',
    'whatsup',
)

# Canned replies; one is chosen at random for any recognised greeting.
greeting_responses = [
    'hey',
    'hey hows you?',
    '*nods*',
    'hello, how you doing',
    'hello, how you doing',
    'hello',
    'you are welcome',
]


def generate_greeting_response(greeting: str) -> Optional[str]:
    """Return a random greeting reply if ``greeting`` contains a known greeting.

    The message is lower-cased and stripped of punctuation before matching, so
    inputs like ``"Hi!"`` or ``"Hey, there"`` are recognised. Each entry of
    ``greeting_inputs`` is matched as a whole word or phrase, never as a
    substring of a longer word (``"this"`` does not match ``"hi"``).

    Args:
        greeting: The raw user message.

    Returns:
        A random element of ``greeting_responses`` when a greeting is found,
        otherwise ``None``.
    """
    without_punctuation = greeting.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    # Re-join on single spaces and pad both ends so every word, including the
    # first and last, is delimited by spaces for the boundary-aware test below.
    padded = ' ' + ' '.join(without_punctuation.split()) + ' '

    if any(f' {phrase} ' in padded for phrase in greeting_inputs):
        return random.choice(greeting_responses)
    return None


#### Responding to User Queries


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [15]:
def generate_response(user_input: str) -> str:
    """Return the article sentence most similar to ``user_input``.

    The article sentences and the query are vectorised together with TF-IDF
    (tokenised by ``get_processed_text``, English stop words removed), then
    the query is scored against every article sentence by cosine similarity.

    Args:
        user_input: The user's free-text query.

    Returns:
        The best-matching sentence from ``article_sentence``, or a fixed
        apology when the corpus is empty or no sentence has a non-zero
        similarity to the query.
    """
    fallback = "I'm sorry, I could not understand you."
    if not article_sentence:
        return fallback

    sentences = [*article_sentence, user_input]

    word_vectorizer = TfidfVectorizer(
        tokenizer=get_processed_text,
        stop_words='english'
    )

    all_word_vectors = word_vectorizer.fit_transform(sentences)

    # Score the query (last row) against the article rows only. Leaving the
    # query's own row out means a tie with an identical article sentence can
    # never select the query itself, whose index is past the end of
    # article_sentence.
    scores = cosine_similarity(all_word_vectors[-1], all_word_vectors[:-1]).flatten()
    best_match = int(scores.argmax())

    if scores[best_match] == 0:
        return fallback
    return article_sentence[best_match]




In [16]:
# Smoke test with a one-word query.
# NOTE(review): presumably 'how' is dropped as an English stop word, leaving
# nothing to match, hence the apology shown below — confirm.
generate_response('how')




"I'm sorry, I could not understand you."

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True