In [None]:

import json
import os
import pickle
import sys
from pathlib import Path

from adjustText import adjust_text

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
import torch
import wandb

In [None]:
# Matplotlib configuration for LaTeX-rendered figures with serif fonts.
# NOTE(review): the empty font lists presumably let the fonts be inherited from
# the LaTeX setup — confirm.
params = {
    "pgf.texsystem": "xelatex",
    "pgf.rcfonts": False,
    "font.serif": [],
    "font.family": "serif",
    "font.sans-serif": [],
    "axes.labelsize": 11,
}

plt.rcParams.update(params)
# render all figure text through LaTeX
rc("text", usetex=True)

# LaTeX preamble used when text is rendered
plt.rc('text.latex', preamble=r'\usepackage{amsmath}\usepackage[utf8]{inputenc}')

# Conversion factor: matplotlib figure sizes are given in inches, so multiplying
# a length in centimetres by CM yields inches (used for `figsize` in the plot cell).
CM = 1 / 2.54

In [None]:
# Set the project name through the GCLOUD_PROJECT environment variable.
# Required to access files and artefacts.
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

In [None]:
# start a Weights & Biases run in project "thesis" under entity "fbv"
run = wandb.init(project="thesis", entity="fbv")

# name (and alias) of the model artifact to load; see w&b
# NOTE(review): `model` holds this string only until the loading cell rebinds it
# to the unpickled object.
model = "2h81aiow_TransformerClassifier_default.pkl:latest"

In [None]:
# file name inside the artifact: drop any path prefix and the ":<alias>" suffix
model_name = model.split("/")[-1].split(":")[0]

# declare the artifact as an input of the run and download it locally
artifact = run.use_artifact(model)
model_dir = artifact.download()

# NOTE(review): pickle.load can execute arbitrary code; only load artifacts from
# a trusted source.
# NOTE(review): this rebinds `model` from the artifact name (str) to the loaded
# object, so re-running this cell on its own fails at `model.split`; re-run the
# cell that defines `model` first.
with open(Path(model_dir, model_name), 'rb') as f:
    model = pickle.load(f)

# weight matrix of the categorical tokenizer's embedding layer, moved to the CPU
embeddings = model.clf.feature_tokenizer.cat_tokenizer.embeddings.weight.to("cpu")

In [None]:
# inspect the extracted embedding weights
embeddings

In [None]:
# as done https://github.com/pytorch/pytorch/issues/51445
# Write one embedding vector per line, values separated by tabs.
# The file is opened in "w" mode so that re-running the cell overwrites the
# export instead of appending a duplicate copy of every row; the context
# manager closes the handle even if writing fails.
with open("tensors.tsv", mode="w") as f:
    for x in embeddings:
        f.write("\t".join(str(i.item()) for i in x) + "\n")

In [None]:
# Import the exported file (tensors.tsv) into the embedding projector to generate the t-SNE projection:
# https://projector.tensorflow.org/

In [None]:
# The t-SNE projection is generated with the "save to bookmark" feature of
# https://projector.tensorflow.org/ ; the saved state is parsed from JSON here.
with open("../models/state.txt") as state_file:
    d = json.loads(state_file.read())

In [None]:
# projections stored in the first entry of the loaded projector state
first_state = d[0]
tsne_projections = pd.DataFrame(first_state["projections"])

# labels: column 0 of the header-less metadata file, renamed to "label"
label = pd.read_csv("../models/metadata.tsv", sep="\t", header=None).rename(
    columns={0: "label"}
)

In [None]:
def cos_dist_norm(matrix_of_vectors: torch.Tensor):
    """
    Pairwise cosine distance for row vectors that are already normalized to unit norm.

    Entry [i, j] of the result is ``1 - <v_i, v_j>``; for unit-norm rows this is
    the cosine distance, which lies in [0, 2].
    """
    gram = matrix_of_vectors @ matrix_of_vectors.T
    return 1 - gram

In [None]:
def cos_sim(matrix_of_vectors: torch.Tensor):
    """
    Compute the cosine similarity between every pair of row vectors.

    Useful for comparing embeddings with each other.

    Adapted from: https://github.com/dalisson/pairwise_cosine_distance_pytorch/blob/master/pairwise_cosine_similarity.py

    and:
    https://github.com/tensorflow/tensorboard/blob/00eeb7adcbf341ec25b49c37abee1cfe395ea1f9/tensorboard/plugins/projector/vz_projector/vz-projector-inspector-panel.ts#L398
    https://github.com/tensorflow/tensorboard/blob/00eeb7adcbf341ec25b49c37abee1cfe395ea1f9/tensorboard/plugins/projector/vz_projector/vector.ts#L64

    input:
        matrix_of_vectors: tensor with shape (n_vectors, vector_size)

    output:
        similarities: tensor with shape (n_vectors, n_vectors)
    Entry [i, j] is the similarity of the ith vector against the jth vector, e.g.,
    entry [0, 0] is 1 and entry [0, 42] is the similarity between the first
    and the 43rd vector of the input.
    """
    # Gram matrix: all pairwise dot products
    gram = matrix_of_vectors @ matrix_of_vectors.t()
    # the diagonal holds the squared vector lengths
    lengths = torch.diagonal(gram).sqrt()
    # divide each dot product by the product of the two vector lengths
    return gram / (lengths.unsqueeze(0) * lengths.unsqueeze(1))


In [None]:
def cos_dist(matrix_of_vectors: torch.Tensor):
    """
    Pairwise cosine distance between all row vectors.

    Defined as one minus the cosine similarity, hence in the range [0, 2].
    """
    similarity = cos_sim(matrix_of_vectors)
    return 1 - similarity

In [None]:
key = "XOM"
# index entries of all rows whose label equals the key; the first hit is used
matching_rows = label.index[label["label"] == key].tolist()
idx = matching_rows[0]
print(idx)

In [None]:
# pairwise cosine distances between all embeddings
distances = cos_dist(embeddings)
# distances from the selected key to every embedding, as a numpy array
idx_distances = np.array(distances[idx].tolist())

In [None]:
# distances scaled by 1000 and truncated to integers
# NOTE(review): `zorder` is not referenced by any other visible cell — confirm
# whether it is still needed.
zorder = [int(o * 1000) for o in idx_distances]

In [None]:
# distances to the key, indexed by label; show the ten smallest
label_names = label["label"].tolist()
results = pd.Series(data=idx_distances, index=label_names)
results.sort_values().head(10)

In [None]:
# Select the 11 entries with the smallest distance — presumably the key itself
# (distance of about 0) plus its 10 most similar underlyings.
n_selected = 11
idx_labels = np.argpartition(idx_distances, n_selected)[:n_selected]
# boolean mask that is True exactly at the selected positions
mask = np.isin(np.arange(len(idx_distances)), idx_labels)

In [None]:
# show the labels selected by the mask
label[mask]

In [None]:
fig, ax = plt.subplots(figsize=(12 * CM, 8 * CM))

x_all = tsne_projections["tsne-0"]
y_all = tsne_projections["tsne-1"]
far = ~mask

# background: every point outside the selection in white-grey
ax.scatter(x_all[far], y_all[far], c="whitesmoke", s=5)

# foreground: selected points, coloured by their distance to the key
sc = ax.scatter(
    x_all[mask],
    y_all[mask],
    c=idx_distances[mask],
    cmap="Blues_r",
    s=10,
    marker="o",
    edgecolors="grey",
    linewidth=0.5,
    zorder=1000,
)

ax.set_xlabel("$t$-SNE Axis 1")
ax.set_ylabel("$t$-SNE Axis 2")

# annotate the selected points with their underlyings; the key gets a larger font
texts = []
for pos, selected in enumerate(mask):
    if not selected:
        continue

    ticker = label["label"].iloc[pos]
    scale = 1.5 if ticker == key else 1
    annotation = ax.text(
        x_all.iloc[pos],
        y_all.iloc[pos],
        r"\texttt{" + ticker + r"}",
        fontsize=7 * scale,
        zorder=2000,
        ha="left",
        va="top",
    )
    texts.append(annotation)

# adjust labels automatically to avoid overlap
arrow_style = dict(arrowstyle="-", color="k", lw=0.5)
adjust_text(texts, ax=ax, min_arrow_len=1, arrowprops=arrow_style)

fig.colorbar(sc)

fig.tight_layout()

plt.savefig(f"../reports/Graphs/categorical_embeddings_{key}.pdf", bbox_inches="tight")
