In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup

In [45]:
class Pipeline():
    """Rank the text fragments of a Medium article against a question.

    On construction the article at ``medium_content_url`` is downloaded and
    split into text fragments (``self.text``), and the TF-Hub module at
    ``module_url`` is loaded (``self.embed_object``).  After a question has
    been registered with :meth:`ask_question_from_text`, the fragments can be
    ranked by the inner product of their embeddings with the question's
    embedding.

    Attributes set by the methods below:
        text: list of text fragments extracted from the article.
        question: one-element list holding the question string.
        similarity_tensor: array of question-vs-fragment scores, one row per
            question.
        sorted_similarity_array: ``(n, 2)`` object array of
            ``[score, fragment]`` rows sorted by descending score.
        presentation_dataframe: the table returned by
            :meth:`information_about_similar_sentences`.
    """

    # CSS classes of the <div> that wraps the article body in the page HTML.
    _ARTICLE_CLASS = "postArticle-content js-postField js-notesSource js-trackedPost"
    # Seconds to wait for the article download before giving up.
    _REQUEST_TIMEOUT = 30

    def __init__(self, medium_content_url, module_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/3'):
        """Download the article and load the embedding module.

        Args:
            medium_content_url: URL of the article to analyse (str).
            module_url: TF-Hub handle of the sentence-embedding module (str).

        Raises:
            AssertionError: if either argument is not a string.
        """
        assert isinstance(medium_content_url, str), 'medium_content_url must be a str'
        assert isinstance(module_url, str), 'module_url must be a str'
        self.medium_content_url = medium_content_url
        self.module_url = module_url
        # Defined up front so the query methods can report a clear error
        # instead of an AttributeError when called too early.
        self.question = None
        self.presentation_dataframe = None
        self.__prepare_text_from_html(self.medium_content_url)
        self.__loading_module(self.module_url)

    # function for downloading content from the web
    def __content_downloader(self, url):
        """Fetch ``url`` and store the article container's HTML in ``self._html``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            ValueError: if the page has no element matching the article
                container classes.
        """
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "lxml")
        article = soup.find("div", {"class": self._ARTICLE_CLASS})
        if article is None:
            # str(None) would otherwise be treated as the article text 'None'.
            raise ValueError('no article content found at {}'.format(url))
        self._html = str(article)

    # cleaning the content and returning its text fragments
    def __cleaning_text(self, html_part):
        """Strip tags from ``html_part`` and return its non-empty text fragments.

        Every tag is replaced by a single space, so text on either side of
        adjacent tags ends up separated by two or more whitespace characters;
        the text is split on those runs.  This is a tag-boundary split, not a
        real sentence tokenizer.

        Returns:
            list of str with non-breaking spaces and hair-space dashes
            replaced by plain spaces, surrounding whitespace removed and
            empty fragments dropped.
        """
        without_tags = re.sub(r'<[A-Za-z/][^>]*>', ' ', str(html_part))
        fragments = []
        # Split before replacing '\xa0': it counts as whitespace for r'\s'.
        for fragment in re.split(r'\s{2,}', without_tags):
            fragment = fragment.replace('\xa0', ' ')
            fragment = fragment.replace('\u200a—\u200a', ' ')
            fragment = fragment.strip()
            # Keep every fragment that carries text rather than blindly
            # discarding the first and last pieces of the split.
            if fragment:
                fragments.append(fragment)
        return fragments

    # preparing the text for running calculations on it
    def __prepare_text_from_html(self, url):
        """Download the article at ``url`` and populate ``self.text``."""
        self.__content_downloader(url)
        self.text = self.__cleaning_text(self._html)

    # function for loading a different module
    def __loading_module(self, module_url):
        """Load the TF-Hub module at ``module_url`` into ``self.embed_object``."""
        # Import the Universal Sentence Encoder's TF Hub module
        self.embed_object = hub.Module(module_url)

    # function for running the embedding module on text
    def __run_embedding(self, embed_object, text):
        """Run ``embed_object`` on ``text`` in a fresh session.

        Args:
            embed_object: callable TF-Hub module.
            text: iterable of strings to embed.

        Returns:
            The value produced by ``session.run`` for ``embed_object(text)``.
        """
        # Reduce logging output.
        tf.logging.set_verbosity(tf.logging.ERROR)

        # NOTE(review): embed_object(...) is called on every invocation, which
        # presumably adds new ops to the default graph each time -- confirm.
        with tf.Session() as session:
            session.run([tf.global_variables_initializer(), tf.tables_initializer()])
            message_embeddings = session.run(embed_object(list(text)))

        return message_embeddings

    # function for asking a question about the text
    def ask_question_from_text(self, question):
        """Register the question to be answered from the article.

        Args:
            question: the question as a string.  A list/tuple of strings is
                also accepted and stored as a list.
        """
        if isinstance(question, str):
            # list('abc') would split the question into single characters.
            self.question = [question]
        else:
            self.question = list(question)

    # function for calculating similarity between question and text
    def __calculating_similarity_tensor(self, question, text):
        """Return the matrix of inner products between two sets of embeddings.

        Args:
            question: sequence of strings (rows of the result).
            text: sequence of strings (columns of the result).

        Returns:
            2-D numpy array with one row per ``question`` entry and one column
            per ``text`` entry.
        """
        question_embeddings = np.asarray(self.__run_embedding(self.embed_object, question))
        if text is question:
            # Same input on both sides (see plot_similarity): embed only once.
            text_embeddings = question_embeddings
        else:
            text_embeddings = np.asarray(self.__run_embedding(self.embed_object, text))
        # Plain matrix product; no extra TF variables or session needed.
        return np.matmul(question_embeddings, text_embeddings.T)

    def __question_and_text(self):
        """Return ``(question, text)`` or raise ValueError if either is missing."""
        question = getattr(self, 'question', None)
        if not question:
            raise ValueError('no question set; call ask_question_from_text() first')
        text = getattr(self, 'text', None)
        if not text:
            raise ValueError('no text was extracted from the article')
        return question, text

    # function for finding the text fragment that best answers the question
    def find_the_most_similar_sentence(self):
        """Print the best score and return the question with its best match.

        Returns:
            numpy array of shape ``(-1, 1)`` holding the question entries
            followed by the highest-scoring text fragment.

        Raises:
            ValueError: if no question was asked or no text is available.
        """
        question, text = self.__question_and_text()
        self.similarity_tensor = np.asarray(self.__calculating_similarity_tensor(question, text))
        # Row 0 holds the scores of the (first) question against every fragment.
        scores = self.similarity_tensor[0]
        best_index = int(np.argmax(scores))
        print('similarity score for the most similar sentence is {}'.format(scores[best_index]))
        return np.hstack([question, text[best_index]]).reshape(-1, 1)

    # function for tabulating the text fragments most similar to the question
    def information_about_similar_sentences(self, threshold = 0.7):
        """Tabulate the fragments whose score exceeds ``threshold``.

        Args:
            threshold: only fragments with a score strictly greater than this
                value are listed.

        Returns:
            pandas DataFrame with columns ``['sentence', 'similarity score']``.
            The first row is the question (with an empty score), followed by
            the selected fragments in descending score order.  The same frame
            is stored in ``self.presentation_dataframe``.

        Raises:
            ValueError: if no question was asked or no text is available.
        """
        question, text = self.__question_and_text()
        self.similarity_tensor = np.asarray(self.__calculating_similarity_tensor(question, text))
        scores = np.asarray(self.similarity_tensor[0], dtype=float)
        # Stable sort on the negated scores: descending, ties keep text order.
        order = np.argsort(-scores, kind='stable')
        ranked = [(float(scores[index]), text[index]) for index in order]
        self.sorted_similarity_array = np.array(ranked, dtype=object).reshape(-1, 2)

        rows = [(None, question[0])]
        rows.extend(pair for pair in ranked if pair[0] > threshold)
        presentation_dataframe = pd.DataFrame(rows, columns = ['similarity score', 'sentence'])
        presentation_dataframe = presentation_dataframe[['sentence', 'similarity score']]
        self.presentation_dataframe = presentation_dataframe
        return presentation_dataframe

    # function for plotting results
    def plot_similarity(self, rotation = 90):
        """Draw a heatmap of pairwise similarity between the tabulated sentences.

        Uses the sentences of ``self.presentation_dataframe``; if that table
        has not been built yet it is created with the default threshold.

        Args:
            rotation: rotation of the x-axis tick labels, in degrees.
        """
        presentation = getattr(self, 'presentation_dataframe', None)
        if presentation is None:
            presentation = self.information_about_similar_sentences()
        sentences = presentation['sentence'].tolist()
        scores = self.__calculating_similarity_tensor(sentences, sentences)
        sns.set(font_scale = 1.2)
        g = sns.heatmap(
                scores,
                xticklabels = sentences,
                yticklabels = sentences,
                vmin = 0,
                vmax = 1,
                cmap = "YlOrRd")
        g.set_xticklabels(sentences, rotation=rotation)
        g.set_title("Semantic Textual Similarity")

In [39]:
# Build the pipeline for the example article. The second argument is omitted so
# the default TF-Hub handle from Pipeline.__init__ is used; the literal string
# 'module_url' passed previously is not a module handle.
example = Pipeline('https://blog.insightdatascience.com/reinforcement-learning-from-scratch-819b65f074d8')