In [None]:
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
import inflect
p = inflect.engine()

regex = re.compile('[^+a-zA-Z]')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
URL_list = ['https://www.geeksforgeeks.org/introduction-of-object-oriented-programming/',
            'https://www.upgrad.com/blog/types-of-inheritance-in-java/',
            'https://www.geeksforgeeks.org/polymorphism-in-java/',
            'https://www.geeksforgeeks.org/javascript/',
            'https://www.geeksforgeeks.org/introduction-to-c-programming-language/',
            'https://www.geeksforgeeks.org/artificial-intelligence-an-introduction/',
            'https://www.geeksforgeeks.org/introduction-machine-learning/',
            'https://www.geeksforgeeks.org/what-is-reinforcement-learning/',
            'https://www.cuemath.com/data/permutations-and-combinations/',
            'https://www.cuemath.com/algebra/linear-algebra/',
            'https://www.cuemath.com/trigonometry/',
            'https://traveltriangle.com/blog/places-to-visit-in-the-world/',
            'https://www.hsph.harvard.edu/nutritionsource/cancer/preventing-cancer/',
            'https://www.discoverwildlife.com/animal-facts/most-venomous-animals/',
            'https://www.10news.com/10-most-livable-cities-in-the-world']

unique_words = set()  # a set that stores all unique words that appear in all webpages(URLs)
docs_words_list = []  # a list of tuples of the webpage title, and words in that webpage

In [None]:
# Word tokenization for each webpage, and extract unique words out of all webpages

for url in URL_list:
  page = requests.get(url)
  content = page.text
  soup = BeautifulSoup(content, 'lxml')
  title = soup.find('h1').get_text()

  textbox = soup.find('div', class_='text')
  if textbox is None:
    textbox = soup.find('div', class_='page_content')
  if textbox is None:
    textbox = soup.find('div', id='content')
  if textbox is None:
    textbox = soup.find('div', id='learnSection')
  if textbox is None:
    textbox = soup.find('div', class_='blog-content')
  if textbox is None:
    textbox = soup.find('div', class_='entry-content')
  if textbox is None:
    textbox = soup.find('article')
  if textbox is None:
    textbox = soup.find('div', class_='post__content')
  if textbox is None:
    textbox = soup.find('div', class_='RichTextArticleBody-body')

  textbox = regex.sub(' ', textbox.get_text())                # convert to a string while replacing all non-alphabet chars with a space

  if title == 'Reinforcement learning':                       # this is to remove irrelevant texts
    textbox = textbox[:textbox.index(' Implementation')]      # that are retrieved from the url

  tokens = nltk.tokenize.word_tokenize(textbox)               # word tokenization
  tokens = [token.lower() for token in tokens]

  temp_list = []
  for word in tokens:
    if len(word) > 1 and len(word) < 14 and word not in stop_words:
      # check if word is noun
      if p.singular_noun(word):
        if word not in ['data', 'class', 'access', 'poisonous', 'venomous', 'platypus', 'cos']:           # some words that I do not wish to convert
          word = p.singular_noun(word)                        # converting plural nouns to singular form

      temp_list.append(word)
      unique_words.add(word)

  docs_words_list.append((title, temp_list))

In [None]:
for i in docs_words_list:
  print(i)

print(len(unique_words)) # 3141 unique words in 15 webpages

('Introduction of Object Oriented Programming', ['name', 'suggest', 'object', 'oriented', 'programming', 'oop', 'refer', 'language', 'use', 'object', 'programming', 'object', 'oriented', 'programming', 'aim', 'implement', 'real', 'world', 'entity', 'like', 'inheritance', 'hiding', 'polymorphism', 'etc', 'programming', 'main', 'aim', 'oop', 'bind', 'together', 'data', 'function', 'operate', 'part', 'code', 'access', 'data', 'except', 'function', 'oop', 'concept', 'abstraction', 'passing', 'class', 'class', 'user', 'defined', 'data', 'type', 'consist', 'data', 'member', 'member', 'function', 'accessed', 'used', 'creating', 'instance', 'class', 'represent', 'set', 'property', 'method', 'common', 'object', 'one', 'type', 'class', 'like', 'blueprint', 'object', 'example', 'consider', 'class', 'car', 'may', 'many', 'car', 'different', 'name', 'brand', 'share', 'common', 'property', 'like', 'wheel', 'speed', 'limit', 'mileage', 'range', 'etc', 'car', 'class', 'wheel', 'speed', 'limit', 'milea

In [None]:
# Build TF matrix for all webpages

docs_tf_score = []
keys = list(unique_words)
values = [0]*len(unique_words)
tf_score = {k: v for k, v in zip(keys, values)}

for word_list in docs_words_list:
  tf_score_copy = tf_score.copy()
  curr_max_term_count = 0

  for each_word in word_list[1]:
      tf_score_copy[each_word] += 1
      if tf_score_copy[each_word] > curr_max_term_count:  # update the count of the most common word in the doc
        curr_max_term_count = tf_score_copy[each_word]

  tf_score_copy.update((x, y/curr_max_term_count) for x, y in tf_score_copy.items())  # Normalization
  docs_tf_score.append((word_list[0], tf_score_copy))

for i in docs_tf_score:
  print(i)

('Introduction of Object Oriented Programming', {'stronger': 0.0, 'four': 0.0, 'mapping': 0.0, 'monthsideal': 0.0, 'simula': 0.0, 'basic': 0.08, 'added': 0.0, 'delighted': 0.0, 'playing': 0.0, 'floor': 0.0, 'historical': 0.0, 'enhance': 0.0, 'noah': 0.0, 'foul': 0.0, 'led': 0.0, 'resting': 0.0, 'second': 0.0, 'eden': 0.0, 'promptly': 0.0, 'medium': 0.0, 'esophageal': 0.0, 'starting': 0.0, 'logistic': 0.0, 'electronic': 0.0, 'interestingly': 0.0, 'interrogator': 0.0, 'starch': 0.0, 'public': 0.04, 'understood': 0.0, 'ground': 0.0, 'asia': 0.0, 'maintain': 0.0, 'unleash': 0.0, 'inc': 0.0, 'star': 0.0, 'sir': 0.0, 'edge': 0.0, 'eliassen': 0.0, 'european': 0.0, 'lancet': 0.0, 'efficiently': 0.0, 'unsupervised': 0.0, 'overweight': 0.0, 'cosec': 0.0, 'introduced': 0.0, 'inactivity': 0.0, 'john': 0.0, 'desert': 0.0, 'ka': 0.0, 'deadliest': 0.0, 'malay': 0.0, 'report': 0.0, 'template': 0.0, 'stunned': 0.0, 'international': 0.0, 'aiconf': 0.0, 'simplemost': 0.0, 'beyond': 0.0, 'able': 0.0, 'boo

In [None]:
# Calculate IDF for each word
import math

idf_score = {}

N = len(docs_words_list)              # number of documents
for each_word in unique_words:
  df = 0
  for tf_score in docs_tf_score:      # list of tuples
    if tf_score[1][each_word] > 0:
      df += 1
  idf_score[each_word] = df

for key,val in idf_score.items():
  if val == 0:
    print(key)

idf_score.update((x, math.log2(N/y)) for x, y in idf_score.items())
print(idf_score)

{'stronger': 2.9068905956085187, 'four': 1.9068905956085185, 'mapping': 2.9068905956085187, 'monthsideal': 3.9068905956085187, 'simula': 3.9068905956085187, 'basic': 1.3219280948873624, 'added': 2.321928094887362, 'delighted': 3.9068905956085187, 'playing': 3.9068905956085187, 'floor': 3.9068905956085187, 'historical': 3.9068905956085187, 'enhance': 3.9068905956085187, 'noah': 3.9068905956085187, 'foul': 3.9068905956085187, 'led': 3.9068905956085187, 'resting': 3.9068905956085187, 'second': 3.9068905956085187, 'eden': 3.9068905956085187, 'promptly': 3.9068905956085187, 'medium': 3.9068905956085187, 'esophageal': 3.9068905956085187, 'starting': 2.9068905956085187, 'logistic': 3.9068905956085187, 'electronic': 2.9068905956085187, 'interestingly': 3.9068905956085187, 'interrogator': 3.9068905956085187, 'starch': 3.9068905956085187, 'public': 2.321928094887362, 'understood': 2.321928094887362, 'ground': 3.9068905956085187, 'asia': 3.9068905956085187, 'maintain': 2.9068905956085187, 'unleas

In [None]:
# Calculate TF * IDF
docs_tf_idf_score = []

for tf_score in docs_tf_score:
  tf_idf_score = {key: tf_score[1][key] * idf_score.get(key, 0) for key in tf_score[1].keys()}
  docs_tf_idf_score.append((tf_score[0], tf_idf_score))

for i in docs_tf_idf_score:
  print(i)

('Introduction of Object Oriented Programming', {'stronger': 0.0, 'four': 0.0, 'mapping': 0.0, 'monthsideal': 0.0, 'simula': 0.0, 'basic': 0.10575424759098899, 'added': 0.0, 'delighted': 0.0, 'playing': 0.0, 'floor': 0.0, 'historical': 0.0, 'enhance': 0.0, 'noah': 0.0, 'foul': 0.0, 'led': 0.0, 'resting': 0.0, 'second': 0.0, 'eden': 0.0, 'promptly': 0.0, 'medium': 0.0, 'esophageal': 0.0, 'starting': 0.0, 'logistic': 0.0, 'electronic': 0.0, 'interestingly': 0.0, 'interrogator': 0.0, 'starch': 0.0, 'public': 0.0928771237954945, 'understood': 0.0, 'ground': 0.0, 'asia': 0.0, 'maintain': 0.0, 'unleash': 0.0, 'inc': 0.0, 'star': 0.0, 'sir': 0.0, 'edge': 0.0, 'eliassen': 0.0, 'european': 0.0, 'lancet': 0.0, 'efficiently': 0.0, 'unsupervised': 0.0, 'overweight': 0.0, 'cosec': 0.0, 'introduced': 0.0, 'inactivity': 0.0, 'john': 0.0, 'desert': 0.0, 'ka': 0.0, 'deadliest': 0.0, 'malay': 0.0, 'report': 0.0, 'template': 0.0, 'stunned': 0.0, 'international': 0.0, 'aiconf': 0.0, 'simplemost': 0.0, 'be

In [None]:
# Create a function to get N important words in the document
from operator import itemgetter

def get_top_n(dict_elem, n):
    result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n])
    return result

In [None]:
# Get the top 5 words of significance

for tf_idf_score in docs_tf_idf_score:
  print(tf_idf_score[0], end=': ')
  print(get_top_n(tf_idf_score[1], 5))

Introduction of Object Oriented Programming: {'sale': 1.4064806144190667, 'section': 1.2790318620677483, 'class': 1.1104195997053843, 'car': 0.9302049905947261, 'object': 0.5849625007211562}
Types of Inheritance in Java: Single, Multiple, Multilevel & Hybrid: {'inheritance': 1.6209686700157055, 'class': 1.3219280948873624, 'java': 1.0420928550294688, 'superclas': 0.4791469598387806, 'sub': 0.40543204294050667}
Polymorphism in Java: {'helper': 1.3863160177965712, 'class': 1.3219280948873624, 'print': 1.008229831124779, 'classclas': 1.008229831124779, 'polymorphism': 0.9842015977334289}
JavaScript Tutorial: {'javascript': 2.9068905956085187, 'html': 0.8014134555094397, 'side': 0.447213937785926, 'server': 0.4007067277547198, 'client': 0.4007067277547198}
Introduction to C++ Programming Language: {'c++': 2.321928094887362, 'language': 0.6197382887286971, 'programming': 0.4997889425231429, 'library': 0.47567300655412126, 'code': 0.4326310128722277}
Artificial Intelligence | An Introduction

In [None]:
# define the query and calculate its length as a vector
# convert each webpage to a vector and calculate its length

from numpy.linalg import norm

query = 'no keywords in this query are in the corpus'        # query enters from here

# calculate query length
query_wordlist = query.split()
query_vector = [idf_score.get(word, 0) for word in query_wordlist]
query_length = norm(query_vector)
print(query_length)

# a list of (webpage_title, doc_vector_length, doc_as_vector)
docs_length = []

for tf_idf_score in docs_tf_idf_score:
  temp_list = list(tf_idf_score[1].values())
  document_vector = [tf_idf_score[1].get(word, 0) for word in query_wordlist]
  docs_length.append((tf_idf_score[0], norm(temp_list), document_vector))

for doc in docs_length:
  print(doc)

0.0
('Introduction of Object Oriented Programming', 3.700207023047578, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('Types of Inheritance in Java: Single, Multiple, Multilevel & Hybrid', 2.8333617226778807, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('Polymorphism in Java', 4.246659472949909, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('JavaScript Tutorial', 3.4345443579504487, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('Introduction to C++ Programming Language', 3.173612676233595, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('Artificial Intelligence | An Introduction', 4.775311558829512, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('An introduction to Machine Learning', 2.605242615902864, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('Reinforcement learning', 3.4542955510578883, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('Permutation and Combination', 5.537966855109425, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('Linear Algebra', 5.1157606553729575, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('Trigonometry', 7.8245798670100495, [0, 0, 0, 0, 0, 0, 0, 0, 0])
('10 Breathtaking Places To Visit In The World You Can’t Ignore', 8.

In [None]:
import numpy as np
# calculate the cosine similarity between query and all docs
docs_cosSim = []              # a list of tuples of (webpage_title, cosSim(webpage, query))
for doc in docs_length:       # doc is a tuple of (webpage_title, doc_vector_length, doc_as_vector)
  docs_cosSim.append((doc[0], np.dot(doc[2], query_vector)/(query_length*doc[1])))

# rank the cosSim values from high to low
docs_cosSim = list(sorted(docs_cosSim, key = itemgetter(1), reverse = True))

print('Query you entered:', query)
print('Here are the webpages retrieved based on your query (relevance ranked from high to low):')
i = 1
for cosSim in docs_cosSim:
  print(i,end='. ')
  print(cosSim[0] + ' | Cosine Similarity = ' + str(cosSim[1]))
  i+=1

Query you entered: no keywords in this query are in the corpus
Here are the webpages retrieved based on your query (relevance ranked from high to low):
1. Introduction of Object Oriented Programming | Cosine Similarity = nan
2. Types of Inheritance in Java: Single, Multiple, Multilevel & Hybrid | Cosine Similarity = nan
3. Polymorphism in Java | Cosine Similarity = nan
4. JavaScript Tutorial | Cosine Similarity = nan
5. Introduction to C++ Programming Language | Cosine Similarity = nan
6. Artificial Intelligence | An Introduction | Cosine Similarity = nan
7. An introduction to Machine Learning | Cosine Similarity = nan
8. Reinforcement learning | Cosine Similarity = nan
9. Permutation and Combination | Cosine Similarity = nan
10. Linear Algebra | Cosine Similarity = nan
11. Trigonometry | Cosine Similarity = nan
12. 10 Breathtaking Places To Visit In The World You Can’t Ignore | Cosine Similarity = nan
13. Preventing Cancer | Cosine Similarity = nan
14. 10 most venomous animals | Cosin

  docs_cosSim.append((doc[0], np.dot(doc[2], query_vector)/(query_length*doc[1])))
