In [None]:
pip install matplotlib scikit-learn datashader dask[dataframe] colorcet holoviews

In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import pandas as pd
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
def plot_it(points, labels, *, caption="UMAP: n_neighbors=5, min_dist=0",
            width=2048, height=2048, background='black', alpha=1):
    """Scatter-plot a 2-D point cloud on a tick-less canvas, one colour per point.

    Parameters
    ----------
    points : array-like
        Indexed as ``points[:, 0]`` (x) and ``points[:, 1]`` (y);
        ``points.shape[0]`` is taken as the number of points.
    labels : sequence
        Handed unchanged to ``Axes.scatter(c=...)``, i.e. one colour per point.
    caption : str, keyword-only
        Text drawn in the bottom-right corner. The default reproduces the
        caption that used to be hard-coded.
        NOTE(review): the calls visible in this notebook plot PCA output, so
        the default "UMAP" caption is probably stale there - pass an explicit
        caption.
    width, height : int, keyword-only
        Figure size in pixels (converted to inches via the current figure dpi).
    background : colour, keyword-only
        Axes face colour.
    alpha : float, keyword-only
        Marker opacity.

    Returns
    -------
    The matplotlib Axes the scatter was drawn on.
    """
    dpi = plt.rcParams["figure.dpi"]
    n_points = points.shape[0]
    # Markers shrink as the cloud grows; clamp the count so an empty input
    # does not divide by zero and produce an infinite marker size.
    point_size = 100.0 / np.sqrt(max(n_points, 1))

    fig = plt.figure(figsize=(width / dpi, height / dpi))
    ax = fig.add_subplot(111)
    ax.set(xticks=[], yticks=[])
    ax.set_facecolor(background)
    ax.scatter(points[:, 0], points[:, 1], s=point_size, c=labels, alpha=alpha)
    # Caption is positioned in axes-fraction coordinates (bottom-right corner).
    ax.text(0.99,
            0.01,
            caption,
            transform=ax.transAxes,
            horizontalalignment="right",
            color='white',
    )
    return ax



In [None]:
from safetensors.torch import load_file
# Read the safetensors file and keep only the tensor stored under the "full" key;
# the bare name on the last line displays it.
embeddings = load_file("/workspace/full_embeddings.safetensors")["full"]
embeddings

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import HDBSCAN

In [None]:
# Two separately configured PCA reducers with the same seed:
# the 3-component one feeds the clustering cells, the 2-component one is what gets plotted.
pca_dimensional = PCA(n_components=3, random_state=42)
pca = PCA(n_components=2, random_state=42)

In [None]:
# Fit each reducer on the embeddings and keep the projected coordinates.
# NOTE(review): `embeddings` comes from the safetensors torch loader - presumably a CPU tensor
# laid out as (n_samples, n_features); confirm scikit-learn accepts it as-is.
res_dimensional = pca_dimensional.fit_transform(embeddings)
res = pca.fit_transform(embeddings)

In [None]:
# Quick shape check of the 2-component projection.
res.shape

In [None]:
# Density-based clustering, fitted on the 3-component PCA output rather than on the raw embeddings.
hdbscan = HDBSCAN(min_cluster_size=20, min_samples=1).fit(res_dimensional)


In [None]:
# Shape of the label array and the distinct cluster ids HDBSCAN assigned.
(hdbscan.labels_.shape, np.unique(hdbscan.labels_))

In [None]:
def get_colors(labels):
    """Translate cluster labels into per-point hex colour strings.

    The distinct labels (sorted by ``np.unique``) are spread evenly over the
    "tab20" colormap; each input label is then replaced by the hex colour of
    its distinct value.

    Returns a ``pd.Series`` with one hex string per entry of ``labels``.
    """
    distinct = np.unique(labels)
    palette = plt.get_cmap("tab20")
    # One RGBA row per distinct label, sampled at evenly spaced positions in [0, 1].
    rgba_rows = palette(np.linspace(0, 1, distinct.size))
    hex_by_label = dict(zip(distinct, map(matplotlib.colors.to_hex, rgba_rows)))
    return pd.Series(labels).map(hex_by_label)

In [None]:
# Scatter of the 2-component projection, coloured by the HDBSCAN labels
# (which were fitted on the 3-component projection).
plot_it(res, get_colors(hdbscan.labels_))

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fixed-k alternative to HDBSCAN: 30 clusters on the same 3-component projection.
# `kmeans_res` holds whatever `.fit()` returns; later cells read `.labels_` from it.
kmeans = KMeans(n_clusters=30, random_state=42)
kmeans_res = kmeans.fit(res_dimensional)

In [None]:
# Same 2-component scatter, this time coloured by the KMeans labels.
plot_it(res, get_colors(kmeans_res.labels_))

In [None]:
from datasets import load_dataset

In [None]:
# NOTE(review): `dataset` is not referenced by any later cell visible here - confirm it is still needed.
dataset = load_dataset("mwarchalowski/grants", "full_clean_emb_mapped")

In [None]:
# Inspect the per-point hex colours derived from the KMeans labels.
get_colors(kmeans_res.labels_)

In [None]:
# 2-D PCA coordinates plus the KMeans cluster id as a categorical `label` column;
# this frame is what the datashader cell aggregates with `ds.count_cat('label')`.
respd = pd.DataFrame(res, columns=('x','y')).assign(
    label=pd.Categorical(kmeans_res.labels_)
)

In [None]:
# Distinct cluster ids present in the frame; a later cell uses them to size the colour key.
all_labels = np.unique(respd['label'])
all_labels

In [None]:
import colorcet
from datashader.colors import colormap_select, Greys9
from functools import partial


In [None]:
background = 'black'
# `cm` is `colormap_select` with its `reverse` argument pre-bound.
# NOTE(review): the comparison is against "blackz", which never equals the 'black' set above,
# so `reverse` is always True here - looks like a manual toggle; confirm that is intended.
cm = partial(colormap_select, reverse=(background!="blackz"))

In [None]:
# Run the glasbey_dark palette through `cm`, then keep one colour per cluster label
# (the first len(all_labels) entries, in palette order).
full_map = cm(colorcet.glasbey_dark)
ckey = [full_map[idx] for idx in range(len(all_labels))]

In [None]:
# Rasterise the labelled points with datashader: aggregate per-category counts on an
# 800x600 canvas, shade them with the colour key built above, then set the background colour.
min_alpha = 100
cvs = ds.Canvas(plot_width=800, plot_height=600)
agg = cvs.points(respd, 'x', 'y', ds.count_cat('label'))
img = ds.tf.shade(agg, color_key=ckey, how='eq_hist', min_alpha=min_alpha)
# Alternative kept for reference: shade only the cluster with label 1.
# img = ds.tf.shade(agg.sel(label=[1]), color_key=cm(colorcet.glasbey_dark), how='eq_hist', min_alpha=min_alpha)
r_img = tf.set_background(img, background)
r_img

In [None]:
# 99 grey (v, v, v) triples spanning 0..255, passed through `cm`, with "red" appended as the final entry.
grays2 = cm([(level, level, level) for level in np.linspace(0,255,99)]) + ["red"]

In [None]:
import holoviews as hv
from holoviews.element import tiles
from holoviews.operation.datashader import datashade, dynspread, rasterize
from holoviews import opts
# Initialise HoloViews with the bokeh and matplotlib backends named here.
hv.extension('bokeh', 'matplotlib')


In [None]:
# Notebook-wide defaults for Overlay elements: 1024x1024, both axes set to None.
opts.defaults(opts.Overlay(width=1024, height=1024, xaxis=None, yaxis=None))

In [None]:
# NOTE(review): `_color_key` and `races` are only referenced by the commented-out lines below
# in the visible notebook - they look like leftovers; confirm before removing.
_color_key = {'w':'aqua', 'b':'lime',  'a':'red', 'h':'fuchsia', 'o':'yellow'}
races = {'w':'White', 'b':'Black', 'a':'Asian', 'h':'Hispanic', 'o':'Other'}

# Earlier variants, kept for reference:
# color_points = hv.NdOverlay({races[k]: hv.Points([webm(-80,40)]).opts(color=v, size=0) for k, v in _color_key.items()})
# color_points = hv.NdOverlay({races[k]: hv.Points([1,1]).opts(color=v, size=0) for k, v in _color_key.items()})
# One zero-size Points element per palette colour, keyed by the colour's index as a string.
color_points = hv.NdOverlay({str(k): hv.Points([10,10]).opts(color=v, size=0) for k, v in enumerate(cm(colorcet.glasbey_dark))})

# Last expression of the cell: display a 100x100 clone of the overlay (clone=True leaves `color_points` itself untouched).
color_points.opts(clone=True, height=100, width=100)


In [29]:
import datashader as ds
import numpy as np
import holoviews as hv
import pandas as pd
import numpy as np

from holoviews import opts
from holoviews.operation.datashader import datashade, rasterize, shade, dynspread, spread
from holoviews.operation.resample import ResampleOperation2D
from holoviews.operation import decimate

# Initialise HoloViews with the bokeh and matplotlib backends (extra `width=100` option passed through).
hv.extension('bokeh','matplotlib', width=100)

# Default values suitable for this notebook
# These are assigned on the operation classes themselves, so presumably every later
# decimate / dynspread / rasterize call in the kernel picks them up - confirm if that matters.
decimate.max_samples=1000
dynspread.max_px=20
dynspread.threshold=0.5
ResampleOperation2D.width=500
ResampleOperation2D.height=500

def random_walk(n, f=5000):
    """Random walk in a 2D space, smoothed with a filter of length f.

    Draws ``n`` normal steps per axis from NumPy's global random state,
    smooths them with a length-``f`` moving-average kernel (full convolution),
    and accumulates them into positions. A sinusoidal wobble is added to x,
    and small measurement noise to both axes.

    Parameters
    ----------
    n : int
        Number of raw random steps per axis.
    f : int
        Length of the moving-average filter.

    Returns
    -------
    np.ndarray of shape ``(n - 1 + f, 2)`` with columns ``[x, y]``.
    """
    total = n - 1 + f          # length of the full convolution of n samples with f taps
    kernel = np.ones(f) / f    # moving-average filter, shared by both axes
    xs = np.convolve(np.random.normal(0, 0.1, size=n), kernel).cumsum()
    ys = np.convolve(np.random.normal(0, 0.1, size=n), kernel).cumsum()
    xs += 0.1 * np.sin(0.1 * np.arange(total))    # add wobble on x axis
    xs += np.random.normal(0, 0.005, size=total)  # add measurement noise
    ys += np.random.normal(0, 0.005, size=total)
    return np.column_stack([xs, ys])

def random_cov():
    """Random covariance for use in generating 2D Gaussian distributions.

    Draws a 2x2 standard-normal matrix from NumPy's global random state and
    returns its product with its own transpose, which is symmetric by construction.
    """
    factor = np.random.randn(2, 2)
    return factor @ factor.T

def time_series(T = 1, N = 100, mu = 0.1, sigma = 0.1, S0 = 20):
    """Parameterized noisy time series (geometric Brownian motion).

    Samples ``N`` standard-normal increments from NumPy's global random state,
    accumulates them into a Brownian path over ``N`` evenly spaced times in
    ``[0, T]``, and returns ``S0 * exp(drift * t + sigma * W)`` as a length-``N`` array.
    """
    step = float(T) / N
    grid = np.linspace(0, T, N)
    # standard brownian motion: running sum of N(0, 1) draws, scaled by sqrt(step)
    brownian = np.cumsum(np.random.standard_normal(size=N)) * np.sqrt(step)
    log_path = (mu - 0.5 * sigma**2) * grid + sigma * brownian
    # geometric brownian motion
    return S0 * np.exp(log_path)

In [30]:
# Small demo: 1,000 correlated Gaussian points next to one smoothed random walk.
# The seed is reset first so both elements are reproducible.
np.random.seed(1)
points = hv.Points(
    np.random.multivariate_normal(mean=(0, 0), cov=[[0.1, 0.1], [0.1, 1.0]], size=(1000,)),
    label="Points",
)
paths = hv.Path(
    [random_walk(2000, 30)],
    kdims=["u", "v"],
    label="Paths",
)

points + paths

In [28]:
# Same demo at scale: one million points and ten long walks (each scaled by 0.15),
# displayed as decimated points, rasterized points, and rasterized paths side by side.
np.random.seed(1)
points = hv.Points(
    np.random.multivariate_normal(mean=(0, 0), cov=[[0.1, 0.1], [0.1, 1.0]], size=(1000000,)),
    label="Points",
)
paths = hv.Path(
    [0.15 * random_walk(100000) for _walk in range(10)],
    kdims=["u", "v"],
    label="Paths",
)

(
    decimate(points).relabel("Decimated Points")
    + rasterize(points).relabel("Rasterized Points").opts(colorbar=True, width=350)
    + rasterize(paths).relabel("Rasterized Paths")
)