In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup

In [10]:
def print_phase(phase):
    """Print ``phase`` followed by a decorative separator line.

    Args:
        phase: Any printable object; it is rendered with ``print``.
    """
    separator = '=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*='
    # One call with a newline separator writes the same two lines as two prints.
    print(phase, separator, sep = '\n')

In [2]:
# function for downloading content from the web
def content_downloader(url, timeout = 30):
    """Download a web page and return its article container as an HTML string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests.get`` may wait on the server before giving up.

    Returns:
        The HTML source (``str``) of the first ``div`` whose class attribute is
        the article-container class string used below, or an empty string when
        the page holds no such element.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, timeout = timeout)
    # Fail loudly on an error page instead of silently parsing it.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')
    article = soup.find("div", {"class": "postArticle-content js-postField js-notesSource js-trackedPost"})
    if article is None:
        # str(None) would leak the literal text "None" into downstream processing.
        return ''
    return str(article)

In [3]:
# clean the downloaded content and return its sentences (not complete yet; needs further work)
def cleaning_text(html_part):
    """Strip HTML tags from ``html_part`` and split what is left into fragments.

    Every tag is replaced by a single space, the text is split wherever two or
    more whitespace characters meet, and non-breaking spaces as well as
    hair-space-wrapped em dashes inside each fragment become plain spaces.

    Args:
        html_part: HTML markup (anything accepted by ``str``); ``None`` is
            treated as "no content".

    Returns:
        List of non-empty text fragments in document order.
    """
    if html_part is None:
        return []
    # Raw string: '\/' inside a normal literal is an invalid escape sequence.
    without_tags = re.sub(r'<[A-Za-z\/][^>]*>', ' ', str(html_part))
    # Trim the outer whitespace instead of blindly dropping the first and last
    # pieces, which discarded real text whenever the markup did not begin and
    # end with a run of at least two whitespace characters.
    fragments = re.split(r'\s{2,}', without_tags.strip())
    cleaned = []
    for fragment in fragments:
        if not fragment:
            # Only happens for empty input: splitting '' yields [''].
            continue
        cleaned.append(fragment.replace('\xa0', ' ').replace('\u200a—\u200a', ' '))
    return cleaned

In [4]:
# prepare the text of a web page for the calculations below
def prepare_text_from_html(url):
    """Download the page at ``url`` and return its cleaned text.

    Args:
        url: Address handed to ``content_downloader``.

    Returns:
        Whatever ``cleaning_text`` produces for the downloaded markup.
    """
    return cleaning_text(content_downloader(url))

In [5]:
# function for loading different modules
def loading_module(module_url):
    """Create a TensorFlow Hub module object for ``module_url``.

    Args:
        module_url: Module handle passed straight to ``hub.Module``.

    Returns:
        The ``hub.Module`` instance.
    """
    return hub.Module(module_url)

In [6]:
# function for running an embedding module on text
def run_embedding(embed_object, text):
    """Evaluate ``embed_object`` on ``text`` inside a fresh TensorFlow session.

    Args:
        embed_object: Callable module (see ``loading_module``) applied to ``text``.
        text: Input passed to the module; presumably a list of strings.

    Returns:
        The value ``session.run`` yields for the module output.
    """
    # Only errors should reach the notebook output.
    tf.logging.set_verbosity(tf.logging.ERROR)

    with tf.Session() as session:
        # Variables and lookup tables must be initialised before the module runs.
        initializers = [tf.global_variables_initializer(), tf.tables_initializer()]
        session.run(initializers)
        embedding_op = embed_object(text)
        return session.run(embedding_op)

In [7]:
# function for calculating similarity between question and text
def calculating_similarity_tensor(module_url, question, text):
    """Score every question against every text sentence by embedding dot product.

    Args:
        module_url: TF Hub module handle used to embed both inputs.
        question: Question sentence(s) to embed.
        text: Candidate sentence(s) to embed.

    Returns:
        ``numpy`` array holding the product of the question embeddings with the
        transposed text embeddings (assuming 2-D embeddings: one row per
        question, one column per text sentence).
    """
    # Load the module once and reuse it for both inputs instead of loading it
    # twice per call.
    embed_object = loading_module(module_url)
    question_embeddings = np.asarray(run_embedding(embed_object, question))
    text_embeddings = np.asarray(run_embedding(embed_object, text))
    # The embeddings are already plain arrays, so multiply them with numpy
    # rather than adding new tf.Variables to the default graph and opening
    # another session on every call.
    return np.matmul(question_embeddings, text_embeddings.T)

In [8]:
# function for finding the sentence in the text that best answers the question that has been asked
def find_the_most_similar_sentence(similarity_tensor, question, text):
    """Print the highest similarity score and return the question with its best match.

    Args:
        similarity_tensor: Array of similarity scores whose last axis runs over
            ``text`` (one row per question).
        question: Sequence of question strings.
        text: Sequence of candidate sentences.

    Returns:
        Column vector (``numpy`` array of shape ``(-1, 1)``) containing the
        question entries followed by the best-matching sentence.
    """
    print('similarity score for the most similar sentence is {}'.format(np.max(similarity_tensor)))
    scores = np.atleast_2d(np.asarray(similarity_tensor))
    # argmax works on the flattened array; convert it back to a column index so
    # it stays inside ``text`` even when there is more than one question row.
    best_column = np.unravel_index(np.argmax(scores), scores.shape)[-1]
    return np.hstack([question, text[best_column]]).reshape(-1, 1)

In [13]:
# Download the article and turn it into the list of candidate sentences.
text = prepare_text_from_html('https://towardsdatascience.com/5-resources-to-inspire-your-next-data-science-project-ea6afbe20319')
# Query to look up in the article; kept as a one-element list.
question = ['5 Resources to Inspire Your Next Data Science']

In [29]:
# function for calculating jaccard similarity between two sentences
def get_Jaccard_similarity(question, sentence):
    """Return the Jaccard similarity of the whitespace-separated words of two texts.

    Args:
        question: A string, or a sequence whose first element is the string.
        sentence: A string, or a sequence whose first element is the string.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` rounded to three decimals, where ``A`` and ``B``
        are the word sets; ``0.0`` when neither text contains a word.
    """
    # isinstance also accepts str subclasses (e.g. numpy.str_), which the exact
    # type comparison mistook for a sequence and reduced to their first character.
    if not isinstance(question, str):
        question = question[0]
    if not isinstance(sentence, str):
        sentence = sentence[0]
    question_words = set(question.split())
    sentence_words = set(sentence.split())
    union_size = len(question_words | sentence_words)
    if union_size == 0:
        # Two empty texts: avoid dividing by zero.
        return 0.0
    return round(len(question_words & sentence_words) / union_size, 3)

In [9]:
# function for printing information about similarity tensor and printing a stack of similar sentences from text to question
def information_about_similar_sentences(similarity_tensor, question, text, threshold = 0.7, print_sorted = False):
    """Tabulate the sentences whose similarity to the question exceeds ``threshold``.

    Args:
        similarity_tensor: 2-D score array; only its first row is used, paired
            element-wise with ``text``.
        question: Sequence whose first element is the question string.
        text: Candidate sentences, in the same order as the scores.
        threshold: Sentences are kept only when their score is strictly greater.
        print_sorted: When true, print the full score-sorted table first.

    Returns:
        ``pandas.DataFrame`` with columns ``'sentence'`` and
        ``'similarity score'``. The first row is the question (with no score),
        followed by the qualifying sentences ordered from highest to lowest score.
    """
    # Rank (score, sentence) pairs best-first; equal scores fall back to
    # comparing the sentences, as ordinary tuple ordering does.
    ranked = sorted(
        ((float(score), sentence) for score, sentence in zip(similarity_tensor[0], text)),
        reverse = True)
    if print_sorted:
        print(pd.DataFrame(ranked, columns = ['similarity score', 'sentence']))
    # Build the rows as Python objects: going through a fixed-width numpy string
    # array truncated long questions, stringified the scores and failed outright
    # when no sentence passed the threshold.
    rows = [(question[0], None)]
    rows.extend((sentence, score) for score, sentence in ranked if score > threshold)
    return pd.DataFrame(rows, columns = ['sentence', 'similarity score'])

In [None]:
# function for filtering out the sentences that are dissimilar to the question (by Jaccard similarity)
def find_Jaccard_similarity(question, presentation_dataframe, threshold = 0.2):
    """Keep the sentences whose Jaccard similarity to ``question`` exceeds ``threshold``.

    Args:
        question: Question passed to ``get_Jaccard_similarity``.
        presentation_dataframe: Frame whose first column holds sentences; its
            first row is skipped.
        threshold: Minimum (exclusive) Jaccard score for a sentence to be kept.

    Returns:
        List of the qualifying sentences in their original order.
    """
    candidates = presentation_dataframe.values[1:, 0]
    return [
        candidate
        for candidate in candidates
        if get_Jaccard_similarity(question, candidate) > threshold
    ]

In [None]:
# function for plotting results
def plot_similarity(module_url, presentation_dataframe, rotation):
    """Draw a heatmap of the pairwise similarity between the listed sentences.

    Args:
        module_url: TF Hub module handle used to score the sentences.
        presentation_dataframe: Frame whose first column holds the sentences.
        rotation: Rotation applied to the x-axis tick labels.
    """
    labels = presentation_dataframe.values[:, 0]
    # Score the sentences against themselves to get a square matrix.
    pairwise_scores = calculating_similarity_tensor(module_url, labels, labels)
    sns.set(font_scale = 1.2)
    heatmap_options = dict(
        xticklabels = labels,
        yticklabels = labels,
        vmin = 0,
        vmax = 1,
        cmap = "YlOrRd")
    axes = sns.heatmap(pairwise_scores, **heatmap_options)
    axes.set_xticklabels(labels, rotation=rotation)
    axes.set_title("Semantic Textual Similarity")

In [11]:
# function for comparing two different modules from tensorflow hub on same question and text
def calculate_different_model_accuracy(module_url_1, module_url_2, question, text):
    """Print, for two TF Hub modules, how each matches ``question`` against ``text``.

    For every module the similarity scores are computed first; then the best
    matching sentence of each module is printed, followed by each module's
    table of sentences scoring above 0.8.

    Args:
        module_url_1: Handle of the first module.
        module_url_2: Handle of the second module.
        question: Question sentence(s).
        text: Candidate sentences.
    """
    similarity_tensors = [
        calculating_similarity_tensor(module_url, question, text)
        for module_url in (module_url_1, module_url_2)
    ]
    for similarity_tensor in similarity_tensors:
        # The helper is named find_the_most_similar_sentence; the shorter name
        # called before does not exist and raised NameError.
        print_phase(find_the_most_similar_sentence(similarity_tensor, question, text))
    for similarity_tensor in similarity_tensors:
        print_phase(information_about_similar_sentences(similarity_tensor, question, text, threshold = 0.8))

In [92]:
# function for preparing the data for the input functions of the tensorflow DNN classifier
def prepare_dataset_for_finetuning(text, question, list_of_classes):
    """Build 1-indexed train and test frames with ``'sentences'`` and ``'class'`` columns.

    Args:
        text: Training sentences; sentence ``i`` (0-based) is given class ``i``.
        question: Test sentences.
        list_of_classes: Class label for each test sentence.

    Returns:
        Tuple ``(x_train, x_test)`` of ``pandas.DataFrame`` objects.
    """
    def _labelled_frame(sentences, labels):
        # Rows are numbered from 1; labels are attached on that same index.
        frame = pd.DataFrame(
            sentences,
            index = pd.RangeIndex(1, len(sentences) + 1),
            columns = ['sentences'])
        frame['class'] = pd.Series(labels, index = frame.index)
        return frame

    training_frame = _labelled_frame(text, np.arange(len(text)))
    testing_frame = _labelled_frame(question, list_of_classes)
    return training_frame, testing_frame

In [None]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Read every ``<digits>_<digits>.txt`` file of ``directory`` into a DataFrame.

    Args:
        directory: Folder containing the text files.

    Returns:
        ``pandas.DataFrame`` with a ``'sentence'`` column (file contents) and a
        ``'sentiment'`` column (the second number of the file name, as a string).
        Files whose names do not follow the pattern are skipped.
    """
    # Raw string: '\d' and '\.' are invalid escape sequences in a normal literal.
    file_name_pattern = re.compile(r"\d+_(\d+)\.txt")
    sentences = []
    sentiments = []
    for file_name in os.listdir(directory):
        name_match = file_name_pattern.match(file_name)
        if name_match is None:
            # A stray file would otherwise abort the load with AttributeError
            # on ``None.group``.
            continue
        with tf.gfile.GFile(os.path.join(directory, file_name), "r") as f:
            sentences.append(f.read())
        sentiments.append(name_match.group(1))
    return pd.DataFrame.from_dict({"sentence": sentences, "sentiment": sentiments})

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    """Load the ``pos`` and ``neg`` sub-folders of ``directory`` as one shuffled frame.

    Args:
        directory: Folder that contains the ``pos`` and ``neg`` sub-folders.

    Returns:
        ``pandas.DataFrame`` of both folders with a ``'polarity'`` column
        (1 for ``pos``, 0 for ``neg``), rows shuffled and the index reset.
    """
    labelled_frames = [
        load_directory_data(os.path.join(directory, subdirectory)).assign(polarity = label)
        for subdirectory, label in (("pos", 1), ("neg", 0))
    ]
    return pd.concat(labelled_frames).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the aclImdb archive and load its train and test splits.

    Args:
        force_download: Accepted but never read by this function.

    Returns:
        Tuple ``(train_df, test_df)`` produced by ``load_dataset`` for the
        ``aclImdb/train`` and ``aclImdb/test`` folders next to the archive.
    """
    archive_path = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True)

    # The extracted "aclImdb" folder is looked up beside the archive file.
    dataset_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    training_frame = load_dataset(os.path.join(dataset_root, "train"))
    testing_frame = load_dataset(os.path.join(dataset_root, "test"))
    return training_frame, testing_frame

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Build the train/test DataFrames and keep them as notebook-level variables.
train_df, test_df = download_and_load_datasets()
# Last expression of the cell: show the first rows of the training frame.
train_df.head()

In [93]:
# function for preparing the input functions for the tensorflow DNN classifier
def prepare_input_function(data_train, data_test):
    """Create the estimator input functions for training and for prediction.

    Args:
        data_train: Training ``pandas.DataFrame`` containing a ``'polarity'`` column.
        data_test: Test ``pandas.DataFrame`` containing a ``'polarity'`` column.

    Returns:
        Tuple ``(train_input_function, predict_train_input_function,
        predict_test_input_function)``. The first shuffles and repeats
        indefinitely (``num_epochs=None``); the other two do not shuffle.
    """
    # Use the frames that were passed in. The previous body ignored both
    # arguments and read the notebook globals ``train_df`` / ``test_df`` instead.
    train_input_function = tf.estimator.inputs.pandas_input_fn(data_train, data_train["polarity"], num_epochs=None, shuffle=True)
    predict_train_input_function = tf.estimator.inputs.pandas_input_fn(data_train, data_train["polarity"], shuffle=False)
    predict_test_input_function = tf.estimator.inputs.pandas_input_fn(data_test, data_test["polarity"], shuffle=False)
    return train_input_function, predict_train_input_function, predict_test_input_function

In [94]:
# function for training a DNN classifier based on tensorflow hub text modules
def train_and_evaluate_with_module(hub_module, train_df, test_df, learning_rate = 0.003, steps = 1000, trainable = False):
    """Train a DNN classifier on a TF Hub text embedding and report its accuracy.

    Args:
        hub_module: Module spec/handle given to ``hub.text_embedding_column``.
        train_df: Training data handed to ``prepare_input_function``.
        test_df: Test data handed to ``prepare_input_function``.
        learning_rate: Learning rate of the Adagrad optimizer.
        steps: Number of training steps.
        trainable: Whether the embedding column is created as trainable.

    Returns:
        Dict with the keys ``"Training accuracy"`` and ``"Test accuracy"``.
    """
    (train_input_function,
     predict_train_input_function,
     predict_test_input_function) = prepare_input_function(train_df, test_df)

    # The text feature is read from the 'sentence' column.
    text_column = hub.text_embedding_column(key = 'sentence', module_spec = hub_module, trainable = trainable)

    optimizer = tf.train.AdagradOptimizer(learning_rate = learning_rate)
    classifier = tf.estimator.DNNClassifier(
        hidden_units = [500, 100],
        feature_columns = [text_column],
        n_classes = 2,
        optimizer = optimizer)

    classifier.train(input_fn = train_input_function, steps = steps)

    train_metrics = classifier.evaluate(input_fn = predict_train_input_function)
    test_metrics = classifier.evaluate(input_fn = predict_test_input_function)

    return {
        "Training accuracy": train_metrics['accuracy'],
        "Test accuracy": test_metrics['accuracy'],
    }

In [96]:
# function for comparing different state of module
def comparing_module_with_trainable_option(hub_module, train_df, test_df, learning_rate, steps):
    """Train and evaluate the same module with the embedding frozen and then trainable.

    Args:
        hub_module: Module spec/handle forwarded to ``train_and_evaluate_with_module``.
        train_df: Training data.
        test_df: Test data.
        learning_rate: Optimizer learning rate used for both runs.
        steps: Training steps used for both runs.

    Returns:
        Dict with ``'train_off'`` (``trainable = False``) and ``'train_on'``
        (``trainable = True``) holding the result of each run.
    """
    def _run(trainable):
        return train_and_evaluate_with_module(
            hub_module, train_df, test_df,
            learning_rate = learning_rate, steps = steps, trainable = trainable)

    # Dict entries are evaluated in order: frozen run first, trainable run second.
    return {
        'train_off': _run(False),
        'train_on': _run(True),
    }

In [None]:
# NOTE(review): `hub_module` is not assigned in any cell visible here — set it to the
# TF Hub module handle/spec to fine-tune before running this cell.
# The classifier helpers read the 'sentence' and 'polarity' columns, so they need the
# labelled train/test DataFrames rather than the raw `text` / `question` lists.
results = comparing_module_with_trainable_option(hub_module, train_df, test_df, learning_rate = 0.003, steps = 1000)