In [None]:
import pickle

import torch




In [None]:
# Deserialize the object stored in "scalers.sklearn" and keep it as `scalers`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must only ever come from a trusted source.
with open("scalers.sklearn", mode='rb') as scalers_file:
    scalers = pickle.load(scalers_file)

In [None]:
# Load the saved embeddings, remapping every stored tensor onto the CPU.
# NOTE(review): recent PyTorch releases changed the default of `weights_only`
# in torch.load; `embeddings` is later accessed through `.weight`, so it is
# presumably a pickled module -- confirm it still loads on the installed version.
embeddings = torch.load("embeddings.ptx", map_location="cpu")

In [None]:
# Display the loaded object to check what the checkpoint contains.
embeddings

In [None]:
def pairwise_similarity(matrix_of_vectors, eps=1e-8):
    '''
    Computes the cosine similarity between every pair of row vectors; useful for
    comparing embeddings when doing deep embedding learning.

    input:
        matrix_of_vectors: tensor with shape (n_vectors, vector_size)
        eps: lower bound applied to the product of the two norms before the
            division. It keeps all-zero vectors from producing 0/0 = NaN (their
            similarity to everything becomes 0 instead). Pass eps=0 to get the
            plain, unguarded division.

    output:
        similarities: tensor with shape (n_vectors, n_vectors)
    similarities[i, j] is the similarity of the ith vector against the jth vector, e.g.
    similarities[0, 0] is 1 (for a non-zero vector) and similarities[0, 42] is the
    similarity between the first element in the input and the 43rd element in the input.
    '''

    # Gram matrix: entry [i, j] is the dot product of vector i with vector j.
    dot_product = matrix_of_vectors @ matrix_of_vectors.t()

    # The diagonal of the Gram matrix holds each vector's squared L2 norm.
    norms = torch.sqrt(torch.einsum('ii->i', dot_product))

    # Broadcast to an (n_vectors, n_vectors) grid of ||v_i|| * ||v_j|| and keep
    # it away from zero so the division below never yields NaN.
    norm_products = (norms[None] * norms[..., None]).clamp(min=eps)

    similarities = dot_product / norm_products

    return similarities

In [None]:
# Cosine similarity of every embedding row against every other row; the
# diagonal (self-similarity) is left in place.
embedding_weights = embeddings.weight
similarities = pairwise_similarity(embedding_weights)

In [None]:
# Show the pairwise similarity matrix.
similarities

In [None]:
import json
import pandas as pd

# Parse the saved state file as JSON; `d[0]['projections']` is what the
# following cell consumes.
# The encoding is given explicitly so the read does not depend on the
# platform's locale default.
with open('MSFT-state.txt', encoding='utf-8') as state_file:
    d = json.load(state_file)

In [None]:
# Projection records of the first entry in the state file, as a DataFrame.
projection_records = d[0]['projections']
tsne_projections = pd.DataFrame(projection_records)

# Tab-separated metadata table; its "label" column is used further down.
label = pd.read_csv('metadata.tsv', sep='\t')

In [None]:
# Peek at the last rows of the metadata table.
label.tail()

In [None]:
# Index of the first metadata row whose label is "MSFT".
# NOTE(review): raises IndexError if no row carries that label.
is_msft = label["label"] == "MSFT"
matching_indices = label.index[is_msft].tolist()
idx = matching_indices[0]
print(idx)

In [None]:
# Absolute similarity of the selected row against every row, as a plain list.
selected_row = similarities[idx]
idx_similiarities = selected_row.abs().tolist()

In [None]:
# Inspect the absolute similarities of the selected row.
# NOTE(review): this prints the whole list; a slice would keep the output short.
idx_similiarities

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rc
import torch
import pandas as pd
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter, PercentFormatter
import numpy as np

In [None]:
# Matplotlib configuration: serif fonts, label size and pgf backend settings.
params = {
    "font.family": "serif",
    "font.serif": [],
    "font.sans-serif": [],
    "axes.labelsize": 11,
    "pgf.texsystem": "xelatex",
    "pgf.rcfonts": False,
}
plt.rcParams.update(params)

# Render all figure text through LaTeX, with the packages below in the preamble.
plt.rcParams["text.usetex"] = True
plt.rcParams["text.latex.preamble"] = r'\usepackage{amsmath}\usepackage[utf8]{inputenc}'

# Multiply a length in centimetres by CM to get inches (1 in = 2.54 cm);
# used for the figsize of the figure below.
CM = 1 / 2.54

# Unused alternatives, kept for reference:
# cmap = plt.cm.get_cmap("viridis")
# cmap = mpl.colormaps.get_cmap("plasma")
# plt.style.use(['science','nature'])
#
# Bright, colour-blind-safe colour scheme
# from Paul Tol's website: https://personal.sron.nl/~pault/
# mpl.rcParams['axes.prop_cycle'] = mpl.cycler('color', ['4477AA', 'EE6677', '228833', 'CCBB44', '66CCEE', 'AA3377', 'BBBBBB'])




In [None]:
# Turn each absolute similarity into a distance: 1 - |similarity|.
idx_distance = 1 - np.array(idx_similiarities)

In [None]:
# Inspect the distances derived from the similarities.
idx_distance 

In [None]:
# Display the similarity list again for comparison with the distances.
# NOTE(review): duplicates an earlier display cell; consider removing one.
idx_similiarities

In [None]:
# One integer per point: the similarity scaled by 1000 and truncated.
# NOTE(review): `zorder` is not referenced by any other visible cell.
zorder = [int(1000 * similarity) for similarity in idx_similiarities]

In [None]:
# Boolean mask of the points whose absolute similarity to the selected row
# exceeds the cut-off; these are the points highlighted and annotated in the plot.
# NOTE(review): the origin of the 0.41 cut-off is not shown here.
SIMILARITY_THRESHOLD = 0.41
idx_labels = np.array(idx_similiarities) > SIMILARITY_THRESHOLD

In [None]:
# Peek at the first rows of the projection table.
tsne_projections.head()

In [None]:
# Scatter plot of the t-SNE projection: points selected by `idx_labels` are
# coloured by `idx_distance` and annotated with their label; every other point
# is drawn as a faint background dot. The figure is saved as a PDF.
fig, ax = plt.subplots(figsize=(12 * CM, 8 * CM))

x_coords = tsne_projections['tsne-0']
y_coords = tsne_projections['tsne-1']

# Background: every point that is NOT selected.
ax.scatter(x_coords[~idx_labels], y_coords[~idx_labels], c="whitesmoke", s=5)

# Foreground: selected points, drawn on top (zorder=1000) and coloured by distance.
sc = ax.scatter(
    x_coords[idx_labels],
    y_coords[idx_labels],
    cmap='Blues_r',
    c=idx_distance[idx_labels],
    s=10,
    zorder=1000,
    marker="o",
    edgecolors="grey",
    linewidth=0.5,
)

# Axis names follow the plotted columns: x is 'tsne-0', y is 'tsne-1'
# (the labels were previously swapped).
ax.set_xlabel('$t$-SNE Axis 0')
ax.set_ylabel('$t$-SNE Axis 1')

# Annotate each selected point with its label; MSFT gets a larger font.
# NOTE(review): labels are inserted into LaTeX unescaped, so a label containing
# a LaTeX special character (e.g. "_" or "&") would presumably break rendering.
for i, is_selected in enumerate(idx_labels):
    if not is_selected:
        continue

    ticker_label = label['label'].iloc[i]
    factor = 1.3 if ticker_label == 'MSFT' else 1

    ax.annotate(
        r"\texttt{" + ticker_label + r"}",
        (x_coords.iloc[i] - 1e-7, y_coords.iloc[i] + 1e-7),
        fontsize=6 * factor,
        ha="right",
        zorder=2000,  # above the scatter points
    )

fig.colorbar(sc)

fig.tight_layout()

plt.savefig('../reports/Graphs/tsne-MSFT.pdf', bbox_inches='tight')