In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup

In [3]:
# function for downloading content from the web
def content_downloader(url, timeout = 30):
    """Download a web page and return the HTML of its article container.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    str
        ``str()`` of the first ``<div>`` located with the class string used
        below. When ``soup.find`` finds nothing it returns ``None``, so the
        result is then the literal string ``'None'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout = timeout)
    # Fail loudly on an error response instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml")
    article_attrs = {"class": "postArticle-content js-postField js-notesSource js-trackedPost"}
    return str(soup.find("div", article_attrs))

In [4]:
# clean the HTML content and return its sentences; NOTE: this cleaning is not complete and still needs to be improved
def cleaning_text(html_part):
    """Strip HTML tags from a fragment and split the rest into text pieces.

    Every tag is replaced by a single space, the result is split wherever two
    or more whitespace characters occur in a row, and the first and last
    pieces of that split are discarded. Non-breaking spaces and
    hair-space/em-dash/hair-space sequences inside each remaining piece are
    replaced by a plain space.

    Parameters
    ----------
    html_part : object
        HTML fragment; it is converted with ``str()`` before processing.

    Returns
    -------
    list of str
        The cleaned text pieces (possibly empty).
    """
    # Raw string: the former non-raw literal contained the invalid escape "\/",
    # which triggers a DeprecationWarning/SyntaxWarning on current Python
    # versions. The compiled pattern is unchanged.
    without_tags = re.sub(r'<[A-Za-z\/][^>]*>', ' ', str(html_part))
    # The slice drops the first and last piece of the split unconditionally.
    pieces = re.split(r'\s{2,}', without_tags)[1:-1]
    return [piece.replace('\xa0', ' ').replace('\u200a—\u200a', ' ') for piece in pieces]

In [5]:
# prepare text from a web page so that calculations can be executed on it
def prepare_text_from_html(url):
    """Fetch the page at ``url`` and return its cleaned text pieces.

    Chains ``content_downloader`` (download) and ``cleaning_text`` (tag
    stripping and splitting) and returns whatever ``cleaning_text`` yields.
    """
    return cleaning_text(content_downloader(url))

In [6]:
# function for loading different modules
def loading_module(module_url):
    """Instantiate a TF Hub module from ``module_url`` and return it.

    The callers in this notebook pass the returned object to
    ``run_embedding`` as the embedding callable.
    """
    # e.g. the Universal Sentence Encoder's TF Hub module
    return hub.Module(module_url)

In [7]:
# function for running the embedding module on text
def run_embedding(embed_object, text):
    """Evaluate ``embed_object(text)`` in a fresh TensorFlow session.

    Parameters
    ----------
    embed_object : callable
        Called with ``text`` to build the embedding op (the notebook passes
        the object returned by ``loading_module``).
    text
        Input handed unchanged to ``embed_object``.

    Returns
    -------
    The value produced by ``session.run`` for the embedding op.
    """
    # Keep TensorFlow's log output down to errors only.
    tf.logging.set_verbosity(tf.logging.ERROR)

    with tf.Session() as session:
        # Variables and lookup tables must be initialised before evaluation.
        initializers = [tf.global_variables_initializer(), tf.tables_initializer()]
        session.run(initializers)
        embedding_op = embed_object(text)
        return session.run(embedding_op)

In [8]:
# function for calculating similarity between question and text
def calculating_similarity_tensor(module_url, question, text):
    """Compute the similarity matrix between question and text embeddings.

    Both inputs are embedded with the module at ``module_url`` and the
    question embeddings are multiplied by the transposed text embeddings.

    Parameters
    ----------
    module_url : str
        TF Hub module address passed to ``loading_module``.
    question, text
        Inputs forwarded unchanged to ``run_embedding``.

    Returns
    -------
    numpy.ndarray
        Matrix product ``question_embedding @ text_embedding^T``; entry
        ``[i, j]`` is the inner product of question row ``i`` and text row ``j``.
    """
    # Load the hub module once and reuse it; the previous version instantiated
    # it separately for the question and for the text.
    embed_object = loading_module(module_url)
    question_embedding = run_embedding(embed_object, question)
    text_embedding = run_embedding(embed_object, text)
    # The embeddings are already evaluated session results, so the product is
    # computed directly with NumPy instead of wrapping them in tf.Variable
    # objects and opening another session just for one matmul.
    # swapaxes(-1, -2) mirrors tf.matmul(..., transpose_b=True).
    return np.matmul(question_embedding, np.swapaxes(text_embedding, -1, -2))

In [9]:
# function for finding the sentence in the text that best answers the question that has been asked
def find_similar_sentence(similarity_tensor, question, text):
    """Print the highest similarity score and return question + best match.

    Parameters
    ----------
    similarity_tensor : array-like
        Similarity scores; the position of its maximum (as given by
        ``np.argmax`` on the flattened data) indexes into ``text``.
    question
        Value placed in the first row of the result.
    text : sequence
        Candidate sentences.

    Returns
    -------
    numpy.ndarray
        Column array holding ``question`` followed by the selected sentence.
    """
    best_score = np.max(similarity_tensor)
    print('similarity score for the most similar sentence is {}'.format(best_score))
    best_index = np.argmax(similarity_tensor)
    question_and_match = np.hstack([question, text[best_index]])
    return question_and_match.reshape(-1, 1)

In [10]:
# function for printing information about similarity tensor and printing a stack of similar sentences from text to question
def print_information(similarity_tensor, question, text, threshold = 0.7):
    """Print the (score, sentence) pairs and return the sentences above a threshold.

    Only the first row of ``similarity_tensor`` is used. The pairs are ranked
    by score in descending order (ties fall back to comparing the sentences).

    Parameters
    ----------
    similarity_tensor : array-like
        Two-dimensional scores; ``similarity_tensor[0][j]`` belongs to ``text[j]``.
    question
        Value placed in the first row of the result.
    text : sequence of str
        Candidate sentences.
    threshold : float, optional
        Only sentences with a score strictly greater than this are returned.
        When it equals 0 the ranked scores are printed as well.

    Returns
    -------
    numpy.ndarray
        Column array with ``question`` first, followed by the selected
        sentences in descending score order. Contains only ``question`` when
        nothing exceeds the threshold.
    """
    scored_sentences = list(zip(similarity_tensor[0], text))
    print(np.array(scored_sentences))

    ranked_pairs = sorted(scored_sentences, reverse = True)
    if not ranked_pairs:
        # Nothing to rank: return just the question instead of failing on the
        # column slicing below.
        return np.atleast_2d(question).reshape(-1, 1)

    # Mixed float/str rows become a string array; column 0 holds the scores,
    # column 1 the sentences.
    ranked_array = np.array([list(pair) for pair in ranked_pairs])
    ranked_scores = ranked_array[:, 0].astype('float')
    ranked_sentences = ranked_array[:, 1:]

    if threshold == 0:
        # Previously this print ran before the scores were assigned and
        # raised UnboundLocalError.
        print(ranked_scores)

    # Boolean-mask selection keeps the (k, 1) shape even when k == 0, so
    # stacking with the question no longer fails when no sentence qualifies.
    selected = ranked_sentences[ranked_scores > threshold]
    return np.vstack([np.atleast_2d(question), selected]).reshape(-1, 1)

In [11]:
def print_phase(phase):
    """Print ``phase`` followed by a decorative separator line."""
    separator = '=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*='
    print(phase, separator, sep = '\n')

In [12]:
# function for comparing two different modules from tensorflow hub on same question and text
def calculate_different_model_accuracy(module_url_1, module_url_2, question, text):
    """Print a side-by-side comparison of two TF Hub modules on one question/text.

    For each module the similarity matrix is computed first. Then the best
    matching sentence of each module is printed, followed by the sentences
    whose similarity exceeds 0.8 for each module. Nothing is returned.
    """
    similarity_tensors = [
        calculating_similarity_tensor(module_url, question, text)
        for module_url in (module_url_1, module_url_2)
    ]

    # First report the single best match per module ...
    for similarity_tensor in similarity_tensors:
        print_phase(find_similar_sentence(similarity_tensor, question, text))

    # ... then every sentence above the 0.8 threshold per module.
    for similarity_tensor in similarity_tensors:
        print_phase(print_information(similarity_tensor, question, text, threshold = 0.8))

In [26]:
# Download the article and split it into cleaned text pieces (this performs a network request).
text = prepare_text_from_html('https://towardsdatascience.com/5-resources-to-inspire-your-next-data-science-project-ea6afbe20319')
# Query string, intended as the `question` argument of the functions defined in this notebook.
question = '5 Resources to Inspire Your Next Data Science'

In [29]:
# function for preparing the data used by the input functions of the tensorflow DNN classifier
def prepare_dataset_for_finetuning(text, question):
    """Build a labelled DataFrame from the question and the text sentences.

    The question becomes the first row and is followed by every entry of
    ``text``. Sentence ``i`` of ``text`` receives class ``i``; the question
    row receives class 0 as well.

    Returns
    -------
    pandas.DataFrame
        Columns ``'sentences'`` and ``'class'``, indexed from 1 upwards.
    """
    sentence_labels = np.arange(len(text))
    # Prepend the label for the question row.
    labels = np.insert(sentence_labels, 0, 0)
    sentences = np.hstack([question, text])
    row_index = pd.RangeIndex(1, len(sentences) + 1)
    dataset = pd.DataFrame(sentences, index = row_index, columns = ['sentences'])
    dataset['class'] = pd.Series(labels, index = dataset.index)
    return dataset

In [30]:
# function for preparing the input functions for the tensorflow DNN classifier
def prepare_input_function(data):
    """Create the training and prediction input functions for ``data``.

    Both are built with ``tf.estimator.inputs.pandas_input_fn`` using ``data``
    as features and its ``'class'`` column as labels. The training function
    shuffles and uses ``num_epochs = None``; the prediction function does not
    shuffle.

    Returns
    -------
    tuple
        ``(train_input_function, predict_input_function)``.
    """
    make_input_fn = tf.estimator.inputs.pandas_input_fn
    training_fn = make_input_fn(data, data['class'], num_epochs = None, shuffle = True)
    prediction_fn = make_input_fn(data, data['class'], shuffle = False)
    return training_fn, prediction_fn

In [31]:
# function for training a DNN classifier based on tensorflow hub text modules
def train_and_evaluate_with_module(hub_module, text, question, trainable = True, n_classes = None):
    """Train a DNN classifier on top of a TF Hub text module and score it.

    The dataset is built by ``prepare_dataset_for_finetuning``; the classifier
    is trained for 1000 steps and then evaluated on that same data.

    Parameters
    ----------
    hub_module
        Module spec handed to ``hub.text_embedding_column``.
    text, question
        Inputs forwarded to ``prepare_dataset_for_finetuning``.
    trainable : bool, optional
        Forwarded to ``hub.text_embedding_column``.
    n_classes : int, optional
        Number of output classes. When omitted it is derived from the labels
        (largest class id + 1, but at least 2) instead of the former
        hard-coded 83, which only fitted one particular article.

    Returns
    -------
    The ``'accuracy'`` entry of the estimator's evaluation result on the
    training data.
    """
    data = prepare_dataset_for_finetuning(text, question)
    # Build the input functions once and unpack them, rather than calling the
    # factory twice.
    train_input_function, predict_input_function = prepare_input_function(data)

    if n_classes is None:
        # Class ids are 0 .. max, so max + 1 classes are needed.
        # NOTE(review): the floor of 2 assumes DNNClassifier requires n_classes > 1 - confirm.
        n_classes = max(int(data['class'].max()) + 1, 2)

    embedded_text_feature_column = hub.text_embedding_column(key = 'sentences', module_spec = hub_module, trainable = trainable)

    estimator = tf.estimator.DNNClassifier(
        hidden_units = [500, 100],
        # feature_columns is meant to be an iterable of columns, not a single column.
        feature_columns = [embedded_text_feature_column],
        n_classes = n_classes,
        optimizer = tf.train.AdagradOptimizer(learning_rate = 0.003))

    estimator.train(input_fn = train_input_function, steps = 1000)

    train_eval_result = estimator.evaluate(input_fn = predict_input_function)
    training_eval_accuracy = train_eval_result['accuracy']

    return training_eval_accuracy