In [None]:
# Imports
from os import makedirs
from os.path import join
import joblib
import numpy as np
rng_seed = 399
np.random.seed(rng_seed)
from scipy.stats import pearsonr
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import pandas as pd
import gudhi as gd
from gudhi.wasserstein import wasserstein_distance
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import squareform

from umap import UMAP
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from nltk.corpus import wordnet as wn
import annoy

import plotly.offline as pyo
pyo.init_notebook_mode()
import plotly.express as px
import plotly.graph_objects as go
from plotly import colors

# Directory constants
# All paths are relative, so they resolve against the notebook's working directory.
# Of these, only topological_data_analysis_custom_data_dir and tps_experimentation_dir
# are read by the cells visible in this part of the notebook.
topological_data_analysis_data_dir = "data"
topological_data_analysis_custom_data_dir = "custom_data"
root_code_dir = ".."
output_dir = join(root_code_dir, "output")
word2vec_training_dir = join(output_dir, "word2vec_training")
word2vec_ann_indices_dir = join(output_dir, "word2vec_ann_indices")
word2vec_cluster_analysis_dir = join(output_dir, "word2vec_cluster_analysis")
tps_experimentation_dir = join(output_dir, "topological_polysemy_experimentation")
analysis_of_embeddings_dir = join(root_code_dir, "analysis_of_embeddings")

# Extend sys path for importing custom Python files
# (presumably what lets the project-local imports that follow resolve — verify
# against the repository layout).
import sys
sys.path.extend([root_code_dir, analysis_of_embeddings_dir])

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances, words_to_vectors
from word_embeddings.word2vec import load_model_training_output
from vis_utils import plot_word_vectors
from topological_data_analysis.geometric_anomaly_detection import compute_gad
from topological_data_analysis.topological_polysemy import tps_point_cloud
from analysis_utils import transform_word_embeddings

# Prepare data

In [None]:
# Read the cyclo-octane point cloud from a header-less CSV into a NumPy array
# and display its shape as the cell output.
cyclo_octane_csv_path = join(
    topological_data_analysis_custom_data_dir, "cyclo-octane.csv"
)
cyclo_octane_data = pd.read_csv(cyclo_octane_csv_path, header=None).values
cyclo_octane_data.shape

In [None]:
# Pairwise Euclidean distances between all cyclo-octane points; passed to
# compute_gad further down through its data_points_pairwise_distances argument.
cyclo_octane_data_dists = euclidean_distances(cyclo_octane_data)

In [None]:
# Load the precomputed TPS scores for the cyclo-octane data from the TPS
# experimentation output directory and display the array shape.
cyclo_octane_tps_scores_path = join(
    tps_experimentation_dir, "cyclo_octane", "tps_scores_50.npy"
)
cyclo_octane_tps_scores = np.load(cyclo_octane_tps_scores_path)
cyclo_octane_tps_scores.shape

# Geometric anomaly detection on cyclo-octane data

In [None]:
# Run geometric anomaly detection (GAD) on the cyclo-octane point cloud.
# The annulus is given by explicit radii (use_knn_annulus=False); the kNN annulus
# sizes are still passed and are presumably ignored in this mode — TODO confirm
# against compute_gad.
gad_result = compute_gad(
    # Input points (fresh C-ordered copy) and their precomputed distances
    data_points=cyclo_octane_data.copy(order="C"),
    data_points_pairwise_distances=cyclo_octane_data_dists,
    # Manifold / annulus configuration
    manifold_dimension=2,
    use_knn_annulus=False,
    annulus_inner_radius=0.25,
    annulus_outer_radius=0.4,
    knn_annulus_inner=100,
    knn_annulus_outer=200,
    # Execution options
    n_jobs=-1,
    progressbar_enabled=True,
)

In [None]:
# Dimensionality reduction: project the cyclo-octane data to 3 components with
# each embedder. PCA and UMAP are seeded with rng_seed; Isomap is constructed
# without a seed argument.
embedders = [
    ("PCA", PCA(n_components=3, random_state=rng_seed)),
    ("UMAP", UMAP(n_components=3, random_state=rng_seed)),
    ("Isomap", Isomap(n_components=3, n_neighbors=5, n_jobs=-1)),
]

# Map embedder name -> embedded coordinates, with a progress bar over embedders.
embedding_result = {
    name: model.fit_transform(cyclo_octane_data)
    for name, model in tqdm(embedders)
}

In [None]:
# Create vector with point colors: entry i holds the GAD category key whose
# index collection contains point i. Points that appear in no category keep the
# initial None of the object array.
#
# Each category is written with a single fancy-index assignment instead of
# testing `i in gad_result[key]` for every point, which scans the index
# collection once per point and is quadratic in the number of points.
# Categories are applied in dict order, so if an index were listed under several
# keys the last key wins, exactly as in a per-point loop over the keys.
# Assumes every gad_result value is a sequence/array of integer point indices
# (the plotting cells index arrays with them in the same way).
point_colors = np.empty(len(cyclo_octane_data), dtype=object)
for gad_category, gad_indices in gad_result.items():
    # Explicit integer dtype keeps an empty category a valid (no-op) index.
    point_colors[np.asarray(gad_indices, dtype=int)] = gad_category

In [None]:
# Show one interactive 3D scatter plot per embedding (every entry of
# embedding_result), with points coloured by their GAD category.
for embedding_key, embedding in embedding_result.items():
    # PCA axes are labelled PC1..PC3; other embeddings use their own name as prefix.
    axis_prefix = "PC" if embedding_key == "PCA" else embedding_key
    axis_labels = {
        axis: f"{axis_prefix}{component}"
        for component, axis in enumerate(("x", "y", "z"), start=1)
    }
    fig = px.scatter_3d(
        x=embedding[:, 0],
        y=embedding[:, 1],
        z=embedding[:, 2],
        color=point_colors,
        opacity=0.25,
        title=embedding_key,
        labels=axis_labels,
    )
    fig.show()

## Plot with TPS_50 scores

In [None]:
# Show one interactive 3D scatter plot per embedding (every entry of
# embedding_result), with one trace per GAD category and markers coloured by
# TPS_50 score.
#
# All traces reference a single layout-level colour axis. With a per-trace
# colorscale each trace autoscales to its own min/max, so the same colour would
# stand for different TPS scores in different categories; the shared axis gives
# one common range and one colour bar for the whole figure.
for embedding_key, embedding in embedding_result.items():
    # PCA axes are labelled PC1..PC3; other embeddings use their own name as prefix.
    axis_prefix = "PC" if embedding_key == "PCA" else embedding_key
    fig = go.Figure(
        layout={
            "title": embedding_key,
            "scene": go.layout.Scene(
                xaxis={"title": f"{axis_prefix}1"},
                yaxis={"title": f"{axis_prefix}2"},
                zaxis={"title": f"{axis_prefix}3"},
            ),
            "coloraxis": {
                "colorscale": colors.sequential.Viridis,
                "colorbar": {"title": "TPS_50"},
            },
            # The colour bar sits on the right by default; move the category
            # legend to the top-left so the two do not overlap.
            "legend": {"x": 0.0, "y": 1.0, "xanchor": "left", "yanchor": "top"},
        }
    )
    for gad_category, gad_indices in gad_result.items():
        # Select this category's points and scores once instead of re-indexing
        # the embedding for every coordinate.
        category_points = embedding[gad_indices]
        category_tps_scores = cyclo_octane_tps_scores[gad_indices]
        fig.add_trace(go.Scatter3d(
            x=category_points[:, 0],
            y=category_points[:, 1],
            z=category_points[:, 2],
            mode="markers",
            marker={"color": category_tps_scores, "coloraxis": "coloraxis"},
            hovertext=category_tps_scores,
            hoverinfo="x+y+z+text",
            opacity=1,
            name=gad_category,
        ))
    fig.show()