In [5]:
import numpy as np 
import pandas as pd 
import numpy.typing as npt 
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP 


import matplotlib.pyplot as plt

### Utils

In [12]:
def to_df(arr: npt.NDArray) -> pd.DataFrame:
	"""Wrap a projected ``(n_samples, n_components)`` array in a DataFrame.

	Columns are named ``pc1``, ``pc2``, ... up to the number of columns in
	``arr``, so a 3-component projection still yields ``pc1``/``pc2``/``pc3``
	while 2-component (or wider) projections are accepted as well.

	Args:
		arr: 2-D array with one row per sample and one column per component.

	Returns:
		A DataFrame with a fresh 0..n-1 index and one ``pcN`` column per component.

	Raises:
		ValueError: if ``arr`` is not 2-dimensional.
	"""
	arr = np.asarray(arr)
	if arr.ndim != 2:
		raise ValueError(f"expected a 2-D array of projected points, got {arr.ndim} dimension(s)")

	# Derive the names from the actual width instead of assuming three components.
	columns = [f"pc{i}" for i in range(1, arr.shape[1] + 1)]
	return pd.DataFrame(arr, columns=columns)

def plot_2d(df: pd.DataFrame):
	"""Build a 2-D Plotly Express scatter of ``df``'s ``pc1`` vs ``pc2`` columns.

	The figure is 800x800 with half-transparent markers and the qualitative
	"Plotly" colour sequence. The figure object is returned, so leaving the
	call as the last expression of a cell renders it.
	"""
	scatter_options = dict(
		x="pc1",
		y="pc2",
		width=800,
		height=800,
		opacity=0.5,
		color_discrete_sequence=px.colors.qualitative.Plotly,
	)
	return px.scatter(df, **scatter_options)

def plot_3d(df: pd.DataFrame):
	"""Build a 3-D Plotly Express scatter of ``df``'s ``pc1``, ``pc2`` and ``pc3`` columns.

	Markers are half-transparent and use the qualitative "Plotly" colour
	sequence; no explicit width/height is set. The figure object is returned.
	"""
	axes = {"x": "pc1", "y": "pc2", "z": "pc3"}
	palette = px.colors.qualitative.Plotly
	return px.scatter_3d(df, opacity=0.5, color_discrete_sequence=palette, **axes)

#### Config

In [3]:
from enum import Enum 

class DATASET(Enum):
	"""Dataset identifiers; each value is used as the file-name prefix of the parquet files read from ``../data``."""
	CELEBA_L = "celeba_buffalo_l"
	# Not referenced by the cells visible in this notebook.
	CELEBA_S = "celeba_buffalo_s"

class DATA_CATEGORY(Enum):
	"""Kind of table stored per dataset; each value is the file-name part that follows the ``__`` separator."""
	FEATURES = "features"
	EMBEDDINGS = "embeddings"

class PROJECTION(Enum):
	"""String identifiers for projection variants. Not referenced by the cells visible in this notebook."""
	UMAP = "umap"
	# NOTE(review): this value mentions exaggeration 12, but the t-SNE cell below
	# is run with early_exaggeration=3 — confirm which one is intended.
	TSNE = "tsne_exaggeration_12"

In [4]:
# Both tables belong to the CELEBA_L dataset and live side by side under ../data;
# the paths are spelled out first so the file names are visible before any I/O happens.
features_path = f"../data/{DATASET.CELEBA_L.value}__{DATA_CATEGORY.FEATURES.value}.gzip"
embeddings_path = f"../data/{DATASET.CELEBA_L.value}__{DATA_CATEGORY.EMBEDDINGS.value}.gzip"

features = pd.read_parquet(features_path)
embeddings = pd.read_parquet(embeddings_path)

#### PCA

In [7]:
%%time 

# Fit a 3-component PCA on the full embedding table and keep the projected
# points as a DataFrame (columns pc1..pc3); the estimator stays available as `pca`.
pca = PCA(n_components=3)
data_projected_pca = to_df(pca.fit_transform(embeddings))
data_projected_pca

CPU times: user 1.23 s, sys: 258 ms, total: 1.48 s
Wall time: 1.11 s


Unnamed: 0,pc1,pc2,pc3
0,0.330265,-2.261836,2.862895
1,2.081261,0.274303,3.272021
2,-0.404353,-1.249651,-0.392623
3,-0.614032,-1.678139,-2.018283
4,2.556688,-2.731641,1.152278
...,...,...,...
30007,0.880741,-2.369417,0.737107
30008,-1.119776,-2.606303,-2.142598
30009,0.652556,-3.977205,-2.436503
30010,3.123748,2.784886,-0.802522


In [8]:
# 3-D scatter of the three PCA components (pc1, pc2, pc3).
plot_3d(data_projected_pca)

In [13]:
# 2-D scatter of the first two PCA components (pc1 vs pc2); pc3 is not drawn.
plot_2d(data_projected_pca)

#### TSNE

In [20]:
N_SAMPLES = 1000
SAMPLE_SEED = 42  # fixed so the subsample, and every projection computed from it, is reproducible

# Draw N_SAMPLES rows without replacement by shuffling row *positions*.
# .iloc selects by position, so this is correct whatever index `embeddings`
# carries (a label-based .loc lookup is only right for a default 0..n-1 index).
rng = np.random.default_rng(SAMPLE_SEED)
indices = rng.permutation(embeddings.shape[0])
embeddings_sample = embeddings.iloc[indices[:N_SAMPLES], :]

In [21]:
%%time 
# Project the sampled embeddings to 3-D with t-SNE.
# random_state pins the otherwise random optimisation so that re-running the
# cell on the same sample reproduces the same layout.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=3, perplexity=30, early_exaggeration=3, n_iter=1000, random_state=42)

# to_df builds a fresh 0..n-1 index, so row i here is the i-th row of embeddings_sample.
data_projected_tsne = to_df(tsne.fit_transform(embeddings_sample))
data_projected_tsne

CPU times: user 48.7 s, sys: 780 ms, total: 49.5 s
Wall time: 41.3 s


Unnamed: 0,pc1,pc2,pc3
0,48.756355,-17.929964,2.732470
1,16.321497,39.424263,-51.404251
2,-26.752907,37.244064,14.644180
3,24.767824,17.986984,1.501463
4,44.432339,-13.139323,33.427032
...,...,...,...
995,59.367317,-25.949978,-16.946417
996,6.887523,-62.585316,4.909607
997,-7.183874,43.410000,-12.586861
998,16.540541,-18.845787,-29.226177


In [30]:
# 3-D scatter of the t-SNE projection; to_df labels its columns pc1..pc3 as well.
plot_3d(data_projected_tsne)

In [31]:
# 2-D view of the same 3-component t-SNE result: only the pc1 and pc2 columns are drawn.
plot_2d(data_projected_tsne)

#### UMAP

In [32]:
N_SAMPLES = 1000
SAMPLE_SEED = 42  # fixed so the subsample, and every projection computed from it, is reproducible

# Draw N_SAMPLES rows without replacement by shuffling row *positions*.
# .iloc selects by position, so this is correct whatever index `embeddings`
# carries (a label-based .loc lookup is only right for a default 0..n-1 index).
rng = np.random.default_rng(SAMPLE_SEED)
indices = rng.permutation(embeddings.shape[0])
embeddings_sample = embeddings.iloc[indices[:N_SAMPLES], :]

In [29]:
# Project the sampled embeddings to 3-D with UMAP.
# random_state makes the stochastic layout repeatable across re-runs of the cell.
umap = UMAP(n_components=3, n_neighbors=15, min_dist=0.5, random_state=42)

# to_df builds a fresh 0..n-1 index, so row i here is the i-th row of embeddings_sample.
data_projected_umap = to_df(umap.fit_transform(embeddings_sample))
data_projected_umap

Unnamed: 0,pc1,pc2,pc3
0,9.612423,11.204711,6.609972
1,10.905683,12.530198,7.034265
2,8.727020,9.871257,6.376821
3,11.442682,12.780072,8.318576
4,10.384645,12.345819,8.871716
...,...,...,...
995,9.550897,10.195659,8.168948
996,10.842942,12.134320,9.145310
997,9.507042,10.893680,7.286634
998,11.997724,13.545664,7.665380


In [33]:
# 3-D scatter of the UMAP projection; to_df labels its columns pc1..pc3 as well.
plot_3d(data_projected_umap)

In [34]:
# 2-D view of the same 3-component UMAP result: only the pc1 and pc2 columns are drawn.
plot_2d(data_projected_umap)