### Contents of jupyter notebook:-
 - Text similarity with example
 - Image similarity with example
 - Blog post similarity with example

## TEXT SIMILARITY

In [1]:
# Turning certificate verification off
import os, ssl
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context

import requests
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
import urllib.request
import re
import numpy as np
from boilerpy3 import extractors
import gensim
from gensim.models import Word2Vec, KeyedVectors
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/jayant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jayant/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# TEXT PREPROCESSING

# The term syntax refers the grammatical structure of the text, whereas semantics refers
# to the meaning of the sentence. A sentence that is syntactically correct does not mean
# to be always semantically correct.

# Syntactic analysis (syntax analysis or parsing), analyzes natural language using formal grammar
# Grammatical rules are applied to categories and groups of words, but not to the individual words
# Example - Parsing, Lemmatization, Stemming

# Semantic analysis is the process of understanding and interpreting the words, signs, tone
# and structure of the sentence.

# Using gensim to process the sentences
def sentence_to_words(sentences):
    for sentence in sentences:
        sentence_tokenized = gensim.utils.simple_preprocess(sentence, deacc=True, min_len=2, max_len=15)      
        # Make sure we don't yield empty arrays
        if len(sentence_tokenized) > 0:
            yield sentence_tokenized

# Process the sentences manually
def sentence_to_words_from_scratch(sentences):
    for sentence in sentences:
        sentence_tokenized = [token.lower() for token in 
               word_tokenize(sentence.translate(str.maketrans('','',string.punctuation)))]
        
        # Make sure we don't yield empty arrays
        if len(sentence_tokenized) > 0:
            yield sentence_tokenized

# Remove all stopwords
stop_words = stopwords.words('english')
def remove_stopwords(tokenized_sentences):
    for sentence in tokenized_sentences:
        yield([token for token in sentence if token not in stop_words])

# Lemmatize all words
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize_words(tokenized_sentences):
    for sentence in tokenized_sentences:
        yield([wordnet_lemmatizer.lemmatize(token) for token in sentence])

snowball_stemmer = SnowballStemmer('english')
def stem_words(tokenized_sentences):
    for sentence in tokenized_sentences:
        yield([snowball_stemmer.stem(token) for token in sentence])
                
def text_preprocessing(url):
    # some sites block common non-browser user agents strings, so creaiting a temporary one to read data
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers={'User-Agent': user_agent,} 

    # Get html content of the website
    request=urllib.request.Request(url, None, headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode("utf-8")

    html_extractor = extractors.ArticleExtractor()
    # Text extraction with boilerpy3
    text = html_extractor.get_content(html)
    # Condenses all repeating newline characters into one single newline character
    final = '\n'.join([p for p in re.split('\n|\r', text) if len(p) > 0])
            
    sentences = list(sentence_to_words([final]))
    sentences = list(remove_stopwords(sentences))
    sentences = list(lemmatize_words(sentences))
    sentences = list(stem_words(sentences))
    return ' '.join(sentences[0])

In [3]:
class CheckSim:
    def __init__(self, w2v_model):
        self.w2v_model = w2v_model

    def vectorize(self, doc: str) -> np.ndarray:
        doc = doc.lower()
        words = [w for w in doc.split()]
        word_vecs = []
        for word in words:
            try:
                vec = self.w2v_model[word]
                word_vecs.append(vec)
            except KeyError:
                pass

        vector = np.mean(word_vecs, axis=0)
        return vector

    def _cosine_sim(self, vecA, vecB):
        csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
        if np.isnan(np.sum(csim)):
            return 0
        return csim

    def calculate_similarity(self, source_doc, target_docs=None):
        source_vec = self.vectorize(source_doc)
        results = []
        for doc in target_docs:
            target_vec = self.vectorize(doc)
            sim_score = self._cosine_sim(source_vec, target_vec)
            results.append(sim_score)
        return results

In [4]:
# We can also train a word2vec model ourselves, but for meaningful results we would need tons of documents
# Also, that might take a lot of time with my computation limits    
    
# Loading a pre-trained word2vec model from google - https://code.google.com/archive/p/word2vec/
# This model contains 300-dimensional vectors for 3 million words and phrases.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True)
sim = CheckSim(model)

# OOPS questions
doc1 = text_preprocessing('https://www.mygreatlearning.com/blog/oops-interview-questions/')

# OOPS questions
doc2 = text_preprocessing('https://career.guru99.com/top-50-oops-interview-questions/')

# C programming language questions
doc3 = text_preprocessing('https://www.upgrad.com/blog/c-interview-questions-answers/')

# Flask tutorial
doc4 = text_preprocessing('https://dev.to/gajesh/the-complete-flask-beginner-tutorial-124i')

# Human Values
doc5 = text_preprocessing('https://stellamaryscoe.edu.in/human-values-ethics.php')

# Psychology
doc6 = text_preprocessing('https://www.verywellmind.com/psychology-4014660')

source_doc = doc1
target_docs = [doc2, doc3, doc4, doc5, doc6]

sim_scores = sim.calculate_similarity(source_doc, target_docs)

In [5]:
print("Similarity between OOPS QUESTIONS and OOPS QUESTIONS: ", sim_scores[0])
print("Similarity between OOPS QUESTIONS and C LANGUAGE: ", sim_scores[1])  
print("Similarity between OOPS QUESTIONS and PYTHON FLASK TUTORIAL: ", sim_scores[2]) 
print("Similarity between OOPS QUESTIONS and HUMAN VALUES: ", sim_scores[3])
print("Similarity between OOPS QUESTIONS and PSYCHOLOGY: ", sim_scores[4])

Similarity between OOPS QUESTIONS and OOPS QUESTIONS:  0.9534115
Similarity between OOPS QUESTIONS and C LANGUAGE:  0.85025674
Similarity between OOPS QUESTIONS and PYTHON FLASK TUTORIAL:  0.80004436
Similarity between OOPS QUESTIONS and HUMAN VALUES:  0.73284316
Similarity between OOPS QUESTIONS and PSYCHOLOGY:  0.7983286


## IMAGE SIMILARITY

In [8]:
# Getting all images from website and saving it in respective folders
def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_images(url):
    soup = bs(requests.get(url).content, "html.parser")
    urls = []
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        img_url = img.attrs.get("src")
        if not img_url:
            # if img does not contain src attribute, just skip
            continue
        # make the URL absolute by joining domain with the URL that is just extracted
        img_url = urljoin(url, img_url)
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass
        # finally, if the url is valid
        if is_valid(img_url):
            urls.append(img_url)
    return urls

def download(idx, url, pathname):
    # if path doesn't exist, make that path dir
    if not os.path.isdir(pathname):
        os.makedirs(pathname)
    # download the body of response by chunk, not immediately
    try:
        response = requests.get(url, stream=True)
    except:
        print(url)
        return 
    # get the total file size
    file_size = int(response.headers.get("Content-Length", 0))

    # get the file name
    image_name_save = url.split('/')[-1]
    if len(image_name_save.split('.')) != 2:
        print(url)
        return 
    filename = os.path.join(pathname, image_name_save)
    if filename == pathname + '/':
        print(url)
        return     
    # progress bar, changing the unit to bytes instead of iteration (default by tqdm)
    progress = tqdm(response.iter_content(1024),
                    f"Downloading {filename}", total=file_size, unit="B", unit_scale=True, unit_divisor=1024)
    with open(filename, "wb") as f:
        for data in progress:
            # write data read to the file
            f.write(data)
            # update the progress bar manually
            progress.update(len(data))

def save_imgs_from_url(url, savepath):
    # get all images
    imgs = get_all_images(url)
    for idx, img in enumerate(imgs):
        # for each img, download it
        download(idx+1, img, savepath)

In [9]:
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.preprocessing import image
from keras.models import Model
from os.path import join
from os import listdir
from PIL import Image
import numpy as np
import io

def get_feature_vectors(img_path, model):
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return model.predict(x)

def similar_images(model, dir1, dir2):
    # Iterate over doc1 and doc2 images for similarity check
    basedir_doc1 = dir1
    basedir_doc2 = dir2
    for f1 in listdir(basedir_doc1):
        for f2 in listdir(basedir_doc2):
            # Get path names of images
            img1 = join(basedir_doc1, f1)
            img2 = join(basedir_doc2, f2)

            # Checking PIL compatibility
            try:
                im = Image.open(img1)
                im = Image.open(img2)
            except:
                pass
                #print("Not supported by PIL!")
            else:
                # Get feature vectors for each image
                img1_fv = get_feature_vectors(img1, model)
                img2_fv = get_feature_vectors(img2, model)

                # Difference
                diff = np.linalg.norm(img1_fv - img2_fv)
                if diff < 0.5:
                    print(img1, "is similar to", img2) 
                    
# Using pre-trained ResNet50 and imagenet weights for feature extraction
model = ResNet50(weights='imagenet')
similar_images(model, 'Test/images_doc1/', 'Test/images_doc2/')

Using TensorFlow backend.


Test/images_doc1/doc1_1.png is similar to Test/images_doc2/doc2_4.png
Test/images_doc1/doc1_3.png is similar to Test/images_doc2/doc2_3.png
Test/images_doc1/doc1_3.png is similar to Test/images_doc2/doc2_2.png
Test/images_doc1/doc1_3.png is similar to Test/images_doc2/doc2_1.png
Test/images_doc1/doc1_2.png is similar to Test/images_doc2/doc2_3.png
Test/images_doc1/doc1_2.png is similar to Test/images_doc2/doc2_2.png


# Blog-post Similarity

In [10]:
# Test with your own URLs
source = input("Enter URL of source website: ")
target = input("Enter URL of target webiste: ")

source_doc = text_preprocessing(source)
target_doc = text_preprocessing(target)

save_imgs_from_url(source, 'images_doc1')
save_imgs_from_url(target, 'images_doc2')

Enter URL of source website:  https://www.coachmag.co.uk/workouts/chest-workouts
Enter URL of target webiste:  https://fitpass.co.in/blog/top-10-chest-exercises-for-men


Extracting images: 100%|██████████| 45/45 [00:00<00:00, 13336.89it/s]
Downloading images_doc1/logo.png:   0%|          | 4.00/3.91k [00:00<00:04, 870B/s]
Downloading images_doc1/chest-workout-4-weeks.jpg:   0%|          | 11.0/10.6k [00:00<00:05, 1.87kB/s]
Downloading images_doc1/1-1-bench-press.jpg:   0%|          | 22.0/21.7k [00:00<00:05, 3.95kB/s]
Downloading images_doc1/incline-bench-press.jpg:   0%|          | 24.0/23.8k [00:00<00:06, 3.53kB/s]
Downloading images_doc1/1-2a-incline-dumbbell-bench-press.jpg:   0%|          | 23.0/22.2k [00:00<00:06, 3.66kB/s]
Downloading images_doc1/1-2b-incline-dumbbell-flye.jpg:   0%|          | 20.0/19.2k [00:00<00:05, 3.45kB/s]
Downloading images_doc1/press-up.jpg:   0%|          | 16.0/15.5k [00:00<00:07, 2.15kB/s]
Downloading images_doc1/2-2a-barbell-back-squat.jpg:   0%|          | 20.0/19.6k [00:00<00:09, 2.18kB/s]
Downloading images_doc1/3-1-chin-up.jpg:   0%|          | 21.0/20.4k [00:00<00:06, 3.19kB/s]
Downloading images_doc1/4-1-overhe

https://sb.scorecardresearch.com/p
https://googleads.g.doubleclick.net/pagead/viewthroughconversion/1019003578/


Extracting images: 100%|██████████| 31/31 [00:00<00:00, 19174.67it/s]


https://www.facebook.com/tr
https://www.facebook.com/tr


Downloading images_doc2/atrk.gif:   2%|▏         | 1.00/43.0 [00:00<00:00, 442B/s]
Downloading images_doc2/fitpassLogo.svg: 3.00B [00:00, 400B/s]

https://px.ads.linkedin.com/collect/



Downloading images_doc2/list_ic.svg: 1.00B [00:00, 176B/s]
Downloading images_doc2/blog_ic.svg: 1.00B [00:00, 158B/s]
Downloading images_doc2/studio_log_ic.svg: 1.00B [00:00, 194B/s]
Downloading images_doc2/fitpassLogo.svg: 3.00B [00:00, 497B/s]
Downloading images_doc2/blog_banner_EA9AA3E54E94DC9.png:   0%|          | 76.0/75.3k [00:00<01:04, 1.20kB/s]
Downloading images_doc2/share_facebook_ic@2x.png:   0%|          | 1.00/740 [00:00<00:01, 409B/s]
Downloading images_doc2/share_twitter_ic@2x.png:   0%|          | 1.00/873 [00:00<00:01, 478B/s]
Downloading images_doc2/share_linkedin_ic@2x.png:   0%|          | 1.00/813 [00:00<00:04, 163B/s]
Downloading images_doc2/gallery_image_dumbbell-pullover_D695.jpg:   0%|          | 35.0/34.6k [00:00<01:28, 402B/s]
Downloading images_doc2/gallery_image_barbell-bench-press_7A72.jpg:   0%|          | 96.0/95.4k [00:00<04:17, 379B/s]  
Downloading images_doc2/gallery_image_inclined-dumbbell-fly_9EA4.png:   0%|          | 133/133k [00:00<04:04, 555B/

In [11]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True)
similarity = CheckSim(model)
scores = similarity.calculate_similarity(source_doc, [target_doc])
print("Similarity score: ", scores[0])

model = ResNet50(weights='imagenet')
similar_images(model, 'images_doc1/', 'images_doc2/')

Similarity score:  0.91736645
images_doc1/chest-workout-4-weeks.jpg is similar to images_doc2/blog_photo_5CE701D91F40CF5111.png


  "Palette images with Transparency expressed in bytes should be "


images_doc1/chest-workout-4-weeks.jpg is similar to images_doc2/gallery_image_pushups_C09A.jpg
images_doc1/chest-workout-4-weeks.jpg is similar to images_doc2/gallery_image_chest-dips_91F4.jpg
images_doc1/chest-workout-4-weeks.jpg is similar to images_doc2/atrk.gif
images_doc1/chest-workout-4-weeks.jpg is similar to images_doc2/payment_ic.png
images_doc1/home_workout_push_up_main.jpg is similar to images_doc2/blog_photo_5CE701D91F40CF5111.png
images_doc1/home_workout_push_up_main.jpg is similar to images_doc2/gallery_image_pushups_C09A.jpg
images_doc1/home_workout_push_up_main.jpg is similar to images_doc2/gallery_image_chest-dips_91F4.jpg
images_doc1/home_workout_push_up_main.jpg is similar to images_doc2/atrk.gif
images_doc1/home_workout_push_up_main.jpg is similar to images_doc2/payment_ic.png
images_doc1/2-2a-barbell-back-squat.jpg is similar to images_doc2/gallery_image_decline-bench-press_D9BA.jpg
images_doc1/2-2a-barbell-back-squat.jpg is similar to images_doc2/gallery_image_bar

In [12]:
# Tested links
# Source - https://www.coachmag.co.uk/workouts/chest-workouts
# Target - https://fitpass.co.in/blog/top-10-chest-exercises-for-men