In [None]:

import json

from adjustText import adjust_text

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
import torch


In [None]:
# Global matplotlib configuration: serif fonts, LaTeX text rendering and a
# LaTeX preamble that loads amsmath and utf8 input encoding.
params = {
    "pgf.texsystem": "xelatex",
    "pgf.rcfonts": False,
    "font.serif": [],
    "font.family": "serif",
    "font.sans-serif": [],
    "axes.labelsize": 11,
}
plt.rcParams.update(params)

# Render all figure text through LaTeX.
plt.rcParams["text.usetex"] = True
plt.rcParams["text.latex.preamble"] = r'\usepackage{amsmath}\usepackage[utf8]{inputenc}'

# Centimetre-to-inch factor; figure sizes below are written as `<cm> * CM`.
CM = 1 / 2.54

In [None]:
# Load the embeddings from the torch-serialised file. The alternative text
# export ("../models/embeddings_ml.npy", readable with np.loadtxt followed by
# torch.from_numpy) is probably lower precision, so it is not used.
# NOTE: torch.load unpickles the file -- only load artefacts from a trusted source.
# map_location="cpu" keeps the notebook runnable on machines that lack the
# device the tensor was saved from (e.g. a GPU).
embeddings = torch.load("../models/embeddings_ml.ptx", map_location="cpu")

In [None]:
def pairwise_similarity(matrix_of_vectors, eps=1e-12):
    '''
    Computes the cosine similarity between all pairs of row vectors, useful for
    comparing embeddings when doing deep embedding learning.

    Adapted from: https://github.com/dalisson/pairwise_cosine_distance_pytorch/blob/master/pairwise_cosine_similarity.py

    input:
        matrix_of_vectors: tensor with shape (n_vectors, vector_size)
        eps: lower bound applied to the product of two vector norms before
            dividing. It prevents a 0/0 division (NaN) when a row is the zero
            vector; such rows get similarity 0 to everything, including
            themselves.

    output:
        similarities: tensor with shape (n_vectors, n_vectors)
    Each entry [i, j] is the similarity of the ith vector against the jth vector, e.g.,
    entry [0, 0] is 1 (for a non-zero vector) and entry [0, 42] is the similarity
    between the first element in the input and the 43rd element in the input.
    '''

    # Gram matrix: entry [i, j] is the dot product of vectors i and j.
    dot_product = matrix_of_vectors @ matrix_of_vectors.t()
    # Its diagonal holds the squared L2 norm of every vector.
    norms = torch.sqrt(torch.einsum('ii->i', dot_product))
    # Outer product of the norms, clamped away from zero so that zero vectors
    # do not turn a whole row and column into NaN.
    norm_products = torch.clamp(norms[None] * norms[..., None], min=eps)
    similarities = dot_product / norm_products

    return similarities

In [None]:
# Label whose neighbourhood in the embedding space is inspected below.
key = 'JPM'

# generated using https://projector.tensorflow.org/
# The file holds JSON despite its .txt extension. It is opened as UTF-8
# explicitly so parsing does not depend on the platform's default encoding.
with open(f'../models/state_{key}.txt', encoding='utf-8') as f:
    d = json.load(f)

In [None]:
# Labels come from a headerless, tab-separated file, so columns are numbered
# and the first one is addressed as `label[0]`.
label = pd.read_csv(
    '../models/metadata.tsv',
    sep='\t',
    header=None,
)

# Projections stored in the first entry of the loaded projector state.
tsne_projections = pd.DataFrame(d[0]['projections'])

In [None]:
# Index of the first metadata row whose label equals `key`.
key_matches = label.index[label[0] == key].tolist()
if not key_matches:
    # Same exception type as the former bare `[0]` lookup, but with a message
    # that names the missing key.
    raise IndexError(f"{key!r} not found in column 0 of the metadata labels")
idx = key_matches[0]
print(idx)

In [None]:
# Cosine similarity of every embedding against every other embedding.
similarities = pairwise_similarity(embeddings)
# Row belonging to `key`, converted to a plain Python list of floats.
idx_similiarities = similarities[idx, :].tolist()

In [None]:
# Cosine distance (1 - similarity) of every entry to `key`.
idx_distance = 1 - np.asarray(idx_similiarities)

In [None]:
# Integer value per entry: the similarity scaled by 1000 and truncated by int().
zorder = [int(1000 * sim) for sim in idx_similiarities]

In [None]:
# Distances to `key`, indexed by the label text from the first metadata column.
results = pd.Series(
    data=idx_distance,
    index=label[0].tolist(),
)

In [None]:
# The 11 entries with the smallest distance to `key`, closest first.
results.sort_values().iloc[:11]

In [None]:
# Select the most similar underlyings: positions of the 11 largest
# similarities (argpartition leaves them unordered among themselves).
idx_labels = np.argpartition(idx_similiarities, -11)[-11:]
# Boolean mask over all entries, True exactly at the selected positions.
mask = np.isin(np.arange(len(idx_similiarities)), idx_labels)

In [None]:
# Metadata rows of the selected entries.
label.loc[mask]

In [None]:
fig, ax = plt.subplots(figsize=(12 * CM, 8 * CM))

# Background: every entry outside the selection, drawn small and faint.
ax.scatter(
    tsne_projections['tsne-0'][~mask],
    tsne_projections['tsne-1'][~mask],
    c="whitesmoke",
    s=5,
)

# Foreground: the selected entries, coloured by their distance to `key`
# (reversed colormap, so smaller distances are darker) and drawn on top.
sc = ax.scatter(
    tsne_projections['tsne-0'][mask],
    tsne_projections['tsne-1'][mask],
    cmap='Blues_r',
    c=idx_distance[mask],
    s=10,
    zorder=1000,
    marker="o",
    edgecolors="grey",
    linewidth=0.5,
)

ax.set_xlabel('$t$-SNE Axis 1')
ax.set_ylabel('$t$-SNE Axis 2')

# One text label per selected entry; `key` itself is printed 1.5x larger.
# NOTE: labels are inserted verbatim into a LaTeX \texttt{} command, so a label
# containing LaTeX special characters (e.g. _ & % #) would need escaping.
texts = []
for i in np.flatnonzero(mask):
    name = label[0].iloc[i]
    factor = 1.5 if name == key else 1
    texts.append(
        ax.text(
            tsne_projections['tsne-0'].iloc[i],
            tsne_projections['tsne-1'].iloc[i],
            r"\texttt{" + name + r"}",
            fontsize=7 * factor,
            zorder=2000,
            ha="left",
            va="top",
        )
    )

# idx_distance is 1 - cosine similarity, hence the colorbar label.
fig.colorbar(sc, label='Cosine distance')

fig.tight_layout()

# Resolve label overlaps only after the colorbar and tight_layout have fixed
# the final axes geometry; doing it earlier adjusts against a layout that
# subsequently changes.
adjust_text(texts, ax=ax, min_arrow_len=1, arrowprops=dict(arrowstyle="-", color='k', lw=0.5))

fig.savefig(f'../reports/Graphs/categorical_embeddings_{key}.pdf', bbox_inches='tight')