In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import chart_studio.plotly as py
import plotly.graph_objs as go
import spacy
from spacy.lang.en import English

In [2]:
ELMO_URL = "https://tfhub.dev/google/elmo/3"
EXCEL_FILE_PATH = '/content/elmo_data.xlsx'
DESCRIPTION_COLUMN = 'Description'
SEARCH_STRING = "what is thor's weapon"
RESULTS_RETURNED = 3

In [3]:
def load_data(file_path):
    df = pd.read_excel(file_path).reset_index(drop=True)
    return df

In [4]:
def preprocess_text(nlp, text):
    text = text.lower().replace('\n', ' ').replace('\t', ' ').replace('\xa0', ' ')
    sentences = [i.text.strip() for i in nlp(text).sents if len(i) > 1]
    return sentences

In [5]:
def calculate_embeddings(sentences):
    embed = hub.load(ELMO_URL)
    embeddings = embed.signatures["default"](tf.constant(sentences))["default"]
    x = embeddings.numpy()
    return x

In [6]:
def reduce_dimensions(x):
    pca_tsne = TSNE(n_components=2)
    y = pca_tsne.fit_transform(PCA(n_components=50).fit_transform(x))
    return y

In [7]:
def plot_embeddings(y, sentences):
    data = [
        go.Scatter(
            x=y[:, 0],
            y=y[:, 1],
            mode='markers',
            text=sentences,
            marker=dict(
                size=16,
                color=[len(i) for i in sentences],
                opacity=0.8,
                colorscale='viridis',
                showscale=False
            )
        )
    ]
    layout = dict(
        yaxis=dict(zeroline=False),
        xaxis=dict(zeroline=False)
    )
    fig = go.Figure(data=data, layout=layout)
    fig.update_layout(width=900, height=600, title_text='Elmo Embeddings represented in 2 dimensions')
    return fig

In [8]:
def find_similar_sentences(search_string, embeddings, sentences, results_returned):
    similar_scores = []
    similar_terms = []

    embeddings2 = hub.load(ELMO_URL).signatures["default"](tf.constant([search_string],))["default"]
    search_vect = embeddings2.numpy()
    cosine_similarities = pd.Series(cosine_similarity(search_vect, embeddings).flatten())

    for i, j in cosine_similarities.nlargest(int(results_returned)).iteritems():
        similar_score = j
        similar_sentence = ' '.join([word if word.lower() in search_string else word for word in sentences[i].split()])

        similar_scores.append(similar_score)
        similar_terms.append(similar_sentence)

    return similar_scores, similar_terms

In [9]:
df = load_data(EXCEL_FILE_PATH)
nlp = spacy.load('en_core_web_sm')
sentences = preprocess_text(nlp, ' '.join(df[DESCRIPTION_COLUMN]))
embeddings = calculate_embeddings(sentences)
reduced_embeddings = reduce_dimensions(embeddings)
plot = plot_embeddings(reduced_embeddings, sentences)
plot.show()

In [10]:
similar_scores, similar_terms = find_similar_sentences(SEARCH_STRING, embeddings, sentences, RESULTS_RETURNED)
similarity_df = pd.DataFrame({'Similarity Score': similar_scores, 'Similar Terms': similar_terms})


iteritems is deprecated and will be removed in a future version. Use .items instead.



In [11]:
similarity_df

Unnamed: 0,Similarity Score,Similar Terms
0,0.528599,stormbreaker is an enchanted axe used by thor.
1,0.519069,when his irresponsible and impetuous behavior ...
2,0.51235,when all the people of asgard refused to bow t...
