In [9]:
import spacy
import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup

In [10]:
# Data acquisition: download the blog post and reduce it to plain text.
url = "https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html"
# A timeout keeps the cell from hanging forever on a stalled connection.
data = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx responses so an error page is never analysed as if
# it were the article (raises requests.HTTPError).
data.raise_for_status()

# Parse the raw response bytes into a BeautifulSoup tree.
soup = BeautifulSoup(data.content, "html.parser")

# Remove <script> and <style> elements so their contents do not end up in
# the extracted text.
for script in soup(["script", "style"]):
    script.extract()

# Extract only the text content of the remaining document.
text = soup.get_text()

# Break into lines and strip leading/trailing whitespace on each.
lines = (line.strip() for line in text.splitlines())
# Split each line on double spaces so that several phrases sharing one line
# each get a line of their own.
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# Drop empty chunks and join the rest back together, one chunk per line.
text = '\n'.join(chunk for chunk in chunks if chunk)

In [11]:
# Load spaCy's small English pipeline; `nlp` is the shared pipeline object
# that spacyTokenizer calls.
nlp = spacy.load(name="en_core_web_sm")

In [12]:
def spacyTokenizer(document):
    """Tokenize ``document`` with the module-level spaCy pipeline ``nlp``.

    Args:
        document: The text to run through ``nlp``.

    Returns:
        A list with the lemma of every token that is not a stop word, is not
        punctuation, and whose lemma is not empty or whitespace-only.
    """
    lemmas = []
    for token in nlp(document):
        # Skip stop words and punctuation tokens.
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_
        # Skip tokens whose lemma is blank once whitespace is removed.
        if lemma.strip() == '':
            continue
        lemmas.append(lemma)
    return lemmas

In [13]:
# Split the cleaned text into a list with one entry per line.
textSplit = text.splitlines()
# Flatten the text into a single string, collapsing every run of whitespace
# (including newlines) to one space. The previous
# `.replace('\n', ' ').replace(' ', '')` chain deleted *all* spaces, fusing
# the whole document into a single string with no word boundaries.
textContent = ' '.join(text.split())

In [14]:
# Build a TF-IDF vectorizer with scikit-learn's default settings.
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the entries of textSplit and transform them into a
# TF-IDF matrix with one row per entry.
tfidf_matrix = tfidf_vectorizer.fit_transform(textSplit)

# Pairwise cosine similarity between all rows; with a single argument
# scikit-learn compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim)

[[1.         0.30630504 0.07930533 ... 0.21496096 0.         0.        ]
 [0.30630504 1.         0.         ... 0.         0.         0.        ]
 [0.07930533 0.         1.         ... 0.368929   0.         0.        ]
 ...
 [0.21496096 0.         0.368929   ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
