In [None]:
import pandas as pd

# url encoding
from urllib.parse import quote 

# import requests
from tqdm.auto import tqdm
import numpy as np

from umap import UMAP
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
# Plot styling: apply 'ggplot' first, then 'seaborn-v0_8-colorblind' on top of it.
# Order matters: for any setting both styles define, the later call wins.
plt.style.use('ggplot')
plt.style.use('seaborn-v0_8-colorblind')


In [None]:
def count_bits(x: int) -> int:
    """Return the number of 1 bits in the binary representation of ``x``.

    Uses ``int.bit_count`` when the interpreter provides it (Python 3.10 or
    later) and otherwise counts the ``"1"`` characters of ``bin(x)``, so the
    function also works on Python 3.9 and earlier.

    Args:
        x: Integer whose set bits are counted. For negative values both code
            paths count the bits of the absolute value.

    Returns:
        The population count of ``x``.
    """
    try:
        return x.bit_count()  # python 3.10 or later
    except AttributeError:
        # python 3.9 or earlier: the "0b" / "-0b" prefix contains no "1",
        # so counting over the whole string is safe.
        return bin(x).count("1")

# Show the Python version (relevant to the version note in count_bits above).
# NOTE(review): `!python` runs whichever `python` the shell resolves, which may
# not be the interpreter backing this kernel — confirm, or inspect sys.version.
!python --version

In [None]:
## load data

# Read the pigment fingerprint CSV and discard the 'Unnamed: 0' and 'Column1'
# columns in one chained expression.
df = (
    pd.read_csv('../../../dim-bridge-data/semion-pigments/pigments_fp.csv')
    .drop(columns=['Unnamed: 0', 'Column1'])
)

In [None]:
# Fingerprints as a plain Python list, one entry per row of df.
fp = df['fp'].to_list()

In [None]:
## compute similarity & pairwise distance
#
# Each fingerprint is a string of '0'/'1' characters that is parsed as a
# base-2 integer. Similarity is the Jaccard index
#     s = popcount(a & b) / popcount(a | b)
# and the distance fed to the embedding is 1 - s.
#
# Alternatives that were tried and are not active:
#   * normalized Hamming: d = popcount(a ^ b) / l, s = 1 - d
#   * rescaled distance:  d = (1 - s) / (s + eps)
eps = 1e-2  # only used by the rescaled-distance alternative above

n_fp = len(fp)
sim = np.zeros([n_fp, n_fp])
pdist = np.zeros([n_fp, n_fp])
# fingerprint length in characters; only needed by the Hamming alternative.
# Guarded so an empty fingerprint list does not raise IndexError.
l = len(fp[0]) if fp else 0

# Parse every bit string once instead of re-parsing inside the O(n^2) loop.
fp_int = [int(f, 2) for f in fp]

for i, b1 in enumerate(tqdm(fp_int)):
    # Both matrices are symmetric, so only the upper triangle (j >= i) is
    # computed and each value is mirrored into the lower triangle.
    for j in range(i, n_fp):
        b2 = fp_int[j]
        union = count_bits(b1 | b2)
        if union == 0:
            # Neither fingerprint has any bit set: the ratio is 0/0. Treat the
            # two empty sets as identical (similarity 1, distance 0) rather
            # than raising ZeroDivisionError.
            s = 1.0
        else:
            s = count_bits(b1 & b2) / union

        sim[i, j] = sim[j, i] = s          # similarity matrix
        pdist[i, j] = pdist[j, i] = 1 - s  # distance matrix

print(pdist)

# heat map of the similarity matrix
plt.imshow(sim)
plt.colorbar()
plt.show()

In [None]:
# fp = np.array([[int(xi) for xi in x] for x in fp])

In [None]:
# UMAP / t-SNE

# Embed the precomputed distance matrix with UMAP; random_state is fixed so
# repeated runs use the same seed.
reducer = UMAP(n_neighbors=15, min_dist=0.5, metric='precomputed', random_state=0)
xy = reducer.fit_transform(pdist)
# Alternatives kept for reference:
# xy = UMAP(min_dist=0.9, metric='jaccard').fit_transform(fp)
# xy = TSNE( metric='precomputed').fit_transform(pdist)

# Scatter plot of the first two embedding coordinates with equal axis scaling.
fig, ax = plt.subplots()
ax.scatter(xy[:, 0], xy[:, 1], s=10)
ax.axis('equal')
plt.show()

In [None]:
## prepare output table

# Build the export frame in one step: all columns of df plus the embedding
# coordinates and an image URL derived from each row's position (0.png, 1.png, ...).
# (An earlier variant stored a bare '{i}.png' under 'image_filename' instead.)
n_rows = len(df)
df_out = df.assign(
    x=xy[:, 0],
    y=xy[:, 1],
    image_url=[
        f'http://localhost:9001/static/pigments/images/{row}.png'
        for row in range(n_rows)
    ],
)

# write the table to CSV without the index column
df_out.to_csv('pigments_umap.csv', index=False)
# df_out.head(5)

In [None]:
# move to dim-bridge server
# Shell escape: relocates the pigments_umap.csv written by the export step into
# ../../datasets/pigments/ (relative to the notebook's working directory).
# NOTE(review): because the file is moved, not copied, re-running only this cell
# will presumably fail until the export cell has been run again — confirm.

!mv pigments_umap.csv ../../datasets/pigments/