In [1]:
# Imports
from os import makedirs
from os.path import join
import joblib
import numpy as np
# Seed NumPy's global RNG up front; rng_seed is also passed as random_state
# to the PCA and UMAP embedders further down.
rng_seed = 399
np.random.seed(rng_seed)
from scipy.stats import pearsonr
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import pandas as pd
import gudhi as gd
from gudhi.wasserstein import wasserstein_distance

from umap import UMAP
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from nltk.corpus import wordnet as wn
import annoy

import plotly.offline as pyo
pyo.init_notebook_mode()
import plotly.express as px

# Locations relative to this notebook's working directory
root_code_dir = ".."
topological_data_analysis_data_dir = "data"
analysis_of_embeddings_dir = join(root_code_dir, "analysis_of_embeddings")

# Output folders, all nested under <root>/output
output_dir = join(root_code_dir, "output")
word2vec_training_dir = join(output_dir, "word2vec_training")
word2vec_ann_indices_dir = join(output_dir, "word2vec_ann_indices")
word2vec_cluster_analysis_dir = join(output_dir, "word2vec_cluster_analysis")

# Append the code root and the embedding-analysis folder to the import search
# path so the custom modules imported right after this can be resolved
import sys
sys.path += [root_code_dir, analysis_of_embeddings_dir]

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances, words_to_vectors
from word_embeddings.word2vec import load_model_training_output
from vis_utils import plot_word_vectors
from topological_data_analysis.geometric_anomaly_detection import GeometricAnomalyDetection
from analysis_utils import transform_word_embeddings

[nltk_data] Downloading package punkt to /project/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Prepare data

In [2]:
# Load cyclo-octane data
# Read the cyclo-octane CSV (it has no header row) and keep the raw values as an array
cyclo_octane_csv_path = join(topological_data_analysis_data_dir, "cyclo-octane.csv")
cyclo_octane_data = pd.read_csv(cyclo_octane_csv_path, header=None).values
cyclo_octane_data

array([[ 1.7191,  0.8105,  0.4947, ...,  0.8475,  1.6958, -0.3811],
       [ 1.6917,  0.672 ,  0.0682, ...,  0.425 ,  0.9973, -0.7065],
       [ 1.8346,  0.7464, -0.5147, ...,  0.7227,  1.6776, -0.0601],
       ...,
       [ 1.8033,  0.7278,  0.169 , ...,  0.4813,  1.3473,  0.5921],
       [ 1.782 ,  0.604 ,  0.2199, ...,  0.4926,  1.2677,  0.6754],
       [ 1.8881,  0.7178,  0.1994, ...,  0.5371,  1.321 ,  0.548 ]])

# Geometric anomaly detection on the cyclo-octane data

In [3]:
# Settings for the geometric anomaly detection run, gathered in one place.
# NOTE(review): word_ints=None presumably means "no word subset, use every
# point" — confirm against GeometricAnomalyDetection.compute.
gad_compute_kwargs = {
    "word_ints": None,
    "manifold_dimension": 2,
    "annulus_inner_radius": 0.25,
    "annulus_outer_radius": 0.4,
    "tqdm_enabled": True,
}

# Run the detector on the cyclo-octane point cloud
gad_instance = GeometricAnomalyDetection(cyclo_octane_data)
gad_result = gad_instance.compute(**gad_compute_kwargs)

  0%|          | 0/6040 [00:00<?, ?it/s]

In [4]:
# Display the raw detection result (a dict of point-index lists).
# NOTE(review): this prints every index list in full, which makes the cell
# output very long — consider showing per-key counts instead.
gad_result

{'P_bnd': [31,
  73,
  143,
  179,
  207,
  223,
  242,
  256,
  282,
  289,
  290,
  307,
  320,
  451,
  496,
  548,
  622,
  638,
  644,
  648,
  670,
  673,
  691,
  720,
  727,
  752,
  772,
  846,
  866,
  884,
  928,
  937,
  966,
  992,
  996,
  1004,
  1009,
  1015,
  1082,
  1122,
  1178,
  1187,
  1211,
  1240,
  1256,
  1275,
  1398,
  1454,
  1464,
  1465,
  1482,
  1488,
  1497,
  1521,
  1600,
  1621,
  1636,
  1655,
  1657,
  1664,
  1697,
  1769,
  1777,
  1846,
  1847,
  1854,
  1867,
  1928,
  1978,
  1979,
  1997,
  2021,
  2038,
  2066,
  2075,
  2081,
  2134,
  2140,
  2170,
  2196,
  2283,
  2288,
  2309,
  2316,
  2408,
  2488,
  2515,
  2523,
  2537,
  2538,
  2582,
  2585,
  2632,
  2700,
  2701,
  2736,
  2760,
  2884,
  2909,
  2911,
  2925,
  2932,
  2944,
  2945,
  2947,
  2959,
  2967,
  2981,
  2997,
  3010,
  3014,
  3015,
  3025,
  3042,
  3060,
  3066,
  3136,
  3143,
  3147,
  3157,
  3162,
  3193,
  3217,
  3232,
  3265,
  3294,
  3299,
  3313,
  33

In [5]:
# Create vector with point colors: each point is labelled with the key of the
# gad_result entry that lists its index. Points listed under no key keep the
# None that np.empty gives object arrays; if an index is listed under several
# keys, the last key in iteration order wins.
# The index collections are converted to sets once, so each membership test is
# a constant-time lookup instead of a linear scan through a list.
gad_index_sets = {key: set(indices) for key, indices in gad_result.items()}
point_colors = np.empty(len(cyclo_octane_data), dtype=object)
for i in range(len(cyclo_octane_data)):
    for key, index_set in gad_index_sets.items():
        if i in index_set:
            point_colors[i] = key

In [6]:
# Dimensionality reduction: build three embedders that each map the data to 3 components.
# PCA and UMAP take the notebook-wide seed as random_state.
pca_embedder = PCA(n_components=3, random_state=rng_seed)
umap_embedder = UMAP(n_components=3, random_state=rng_seed)
isomap_embedder = Isomap(n_components=3, n_neighbors=5, n_jobs=-1)
embedders = [
    ("PCA", pca_embedder),
    ("UMAP", umap_embedder),
    ("Isomap", isomap_embedder),
]

# Fit every embedder on the cyclo-octane data and store its embedding under its name
embedding_result = {}
for embedder_key, embedder in tqdm(embedders):
    embedding_result[embedder_key] = embedder.fit_transform(cyclo_octane_data)

  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Show one interactive 3D scatter plot per embedding (PCA, UMAP and Isomap),
# coloured by the label stored for each point in point_colors
for embedding_key, embedding in embedding_result.items():
    # PCA axes are named PC1..PC3; every other embedder uses its own name as the prefix
    axis_prefix = "PC" if embedding_key == "PCA" else embedding_key
    fig = px.scatter_3d(
        x=embedding[:, 0],
        y=embedding[:, 1],
        z=embedding[:, 2],
        color=point_colors,
        opacity=0.25,
        title=embedding_key,
        labels={
            "x": f"{axis_prefix}1",
            "y": f"{axis_prefix}2",
            "z": f"{axis_prefix}3",
        },
    )
    fig.show()