In [1]:
# Import necessary libraries
import json 
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

In [2]:
with open("out10x.json", 'r') as fdata:
    data = json.load(fdata)

Convert the shepard scores into [0-1] so that 0 is better so it is comparable to other scores

In [3]:
for dataset, res in data.items():
    for key in res.keys():
        if "sheppard" in key: 
            res[key] = 1 - (res[key] + 1) / 2

In [4]:
df = pd.DataFrame(data)

In [5]:
data = df.transpose()
data = data.drop([col for col in data.columns if "UMAP" in col],axis=1)
data

Unnamed: 0,MDS_raw,MDS_norm,MDS_scalenorm,MDS_kruskal,MDS_sheppard,TSNE_raw,TSNE_norm,TSNE_scalenorm,TSNE_kruskal,TSNE_sheppard,RANDOM_raw,RANDOM_norm,RANDOM_scalenorm,RANDOM_kruskal,RANDOM_sheppard
swissroll_0,1.043930e+08,0.596613,0.257206,0.253553,0.107624,2.285075e+11,27.913064,0.454226,0.406530,0.325235,1.528716e+08,0.721972,0.533419,0.429519,0.498075
swissroll_1,1.045109e+08,0.596950,0.258967,0.255297,0.109225,2.256214e+11,27.736229,0.474373,0.423470,0.333070,1.526856e+08,0.721533,0.533961,0.430017,0.498224
swissroll_2,1.044004e+08,0.596634,0.257318,0.253673,0.107727,2.302966e+11,28.022118,0.447100,0.399200,0.323767,1.523274e+08,0.720686,0.532830,0.428646,0.497850
swissroll_3,1.044781e+08,0.596856,0.258484,0.254848,0.108778,2.343174e+11,28.265686,0.451345,0.402439,0.326035,1.526716e+08,0.721500,0.533445,0.429532,0.497965
swissroll_4,1.044668e+08,0.596824,0.258314,0.254684,0.108623,2.204093e+11,27.413987,0.437464,0.392344,0.326245,1.533933e+08,0.723203,0.532843,0.429324,0.497104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
secom_5,1.182158e+09,8.444901,0.347911,0.298442,0.140499,2.928999e+11,132.927994,0.425688,0.380896,0.197223,1.103357e+07,0.815859,0.451265,0.428587,0.497039
secom_6,1.182553e+09,8.446314,0.347483,0.297519,0.139340,2.223938e+11,115.829239,0.415941,0.371536,0.188761,1.113561e+07,0.819622,0.451951,0.429452,0.495110
secom_7,1.182787e+09,8.447150,0.347227,0.296760,0.138496,2.236999e+11,116.168861,0.404372,0.361408,0.194441,1.111968e+07,0.819036,0.453947,0.430692,0.499150
secom_8,1.182857e+09,8.447400,0.347152,0.296771,0.138537,2.602426e+11,125.298553,0.421617,0.374709,0.184354,1.088708e+07,0.810425,0.449933,0.429469,0.494120


In [6]:
data[[col for col in data.columns if "raw" in col]].head()

Unnamed: 0,MDS_raw,TSNE_raw,RANDOM_raw
swissroll_0,104393000.0,228507500000.0,152871600.0
swissroll_1,104510900.0,225621400000.0,152685600.0
swissroll_2,104400400.0,230296600000.0,152327400.0
swissroll_3,104478100.0,234317400000.0,152671600.0
swissroll_4,104466800.0,220409300000.0,153393300.0


In [7]:
metric_names = ["_raw", "_norm", "_sheppard", "_kruskal", "_scalenorm"]

In [8]:
# Calculate how often metrics agree with “correct order”
res = dict()
for mname in metric_names: 
    metric = data[(col for col in data.columns if mname in col)]
    name_lookup = ["MDS", "TSNE", "RANDOM"]
    orderCount = {
        ("MDS", "TSNE", "RANDOM"): 0,
        ("MDS", "RANDOM", "TSNE"): 0,
        ("TSNE", "MDS", "RANDOM"): 0,
        ("TSNE", "RANDOM", "MDS"): 0,
        ("RANDOM", "MDS", "TSNE"): 0,
        ("RANDOM", "TSNE", "MDS"): 0,                    
    }

    for ind,row in metric.iterrows():
        tup_row = sorted([(row[i], name_lookup[i]) for i in range(3)])
        order = tuple([tup[1] for tup in tup_row])
        orderCount[order] += 1

    totalvals = sum(orderCount.values())
    for key in orderCount.keys():
        orderCount[key] /= totalvals
        
    res[mname] = orderCount

  tup_row = sorted([(row[i], name_lookup[i]) for i in range(3)])
  tup_row = sorted([(row[i], name_lookup[i]) for i in range(3)])
  tup_row = sorted([(row[i], name_lookup[i]) for i in range(3)])
  tup_row = sorted([(row[i], name_lookup[i]) for i in range(3)])
  tup_row = sorted([(row[i], name_lookup[i]) for i in range(3)])


In [9]:
res

{'_raw': {('MDS', 'TSNE', 'RANDOM'): 0.0,
  ('MDS', 'RANDOM', 'TSNE'): 0.08333333333333333,
  ('TSNE', 'MDS', 'RANDOM'): 0.125,
  ('TSNE', 'RANDOM', 'MDS'): 0.0,
  ('RANDOM', 'MDS', 'TSNE'): 0.7916666666666666,
  ('RANDOM', 'TSNE', 'MDS'): 0.0},
 '_norm': {('MDS', 'TSNE', 'RANDOM'): 0.0,
  ('MDS', 'RANDOM', 'TSNE'): 0.08333333333333333,
  ('TSNE', 'MDS', 'RANDOM'): 0.125,
  ('TSNE', 'RANDOM', 'MDS'): 0.0,
  ('RANDOM', 'MDS', 'TSNE'): 0.7916666666666666,
  ('RANDOM', 'TSNE', 'MDS'): 0.0},
 '_sheppard': {('MDS', 'TSNE', 'RANDOM'): 0.9166666666666666,
  ('MDS', 'RANDOM', 'TSNE'): 0.0,
  ('TSNE', 'MDS', 'RANDOM'): 0.08333333333333333,
  ('TSNE', 'RANDOM', 'MDS'): 0.0,
  ('RANDOM', 'MDS', 'TSNE'): 0.0,
  ('RANDOM', 'TSNE', 'MDS'): 0.0},
 '_kruskal': {('MDS', 'TSNE', 'RANDOM'): 0.8208333333333333,
  ('MDS', 'RANDOM', 'TSNE'): 0.17916666666666667,
  ('TSNE', 'MDS', 'RANDOM'): 0.0,
  ('TSNE', 'RANDOM', 'MDS'): 0.0,
  ('RANDOM', 'MDS', 'TSNE'): 0.0,
  ('RANDOM', 'TSNE', 'MDS'): 0.0},
 '_scaleno

In [10]:
stab = " & " + " & ".join(mname.replace("_","") for mname in metric_names)
stab += "\\hline \\\\ \n"
stab

' & raw & norm & sheppard & kruskal & scalenorm\\hline \\\\ \n'

In [11]:
blue = [0x4b, 0xae, 0xd6]
white = [0xff] * 3


def color_fn(x):  # f: [0,1] -> [0,1]
    return x  # changing the function allows for tweaking the strength of the gradients

# retrieve HTML hex code for a color given a value between low and hi
def get_color(num, low, hi):
    num = color_fn((num - low) / (hi - low))
    c = [round(num * blue[i] + (1-num) * white[i]) for i in range(3)]
    return ''.join([hex(x)[2:] for x in c]).upper()

In [12]:
# Convert results to latex table
stab = " & " + " & ".join(mname.replace("_","") for mname in metric_names)
stab += "\\hline \\\\ \n"
for row in res['_raw'].keys():
    stab += "$<$".join(el for el in row)
    stab += " & "
    for metric in res.keys(): 
        stab += f"\\cellcolor[HTML]{{{get_color(round(res[metric][row], 3)*100, 0, 100)}}} {round(res[metric][row], 3)*100}\\%"
        if metric != "_scalenorm": 
            stab += " & "
    stab += "\\\\ \\hline\n"
    

In [13]:
print(stab)

 & raw & norm & sheppard & kruskal & scalenorm\hline \\ 
MDS$<$TSNE$<$RANDOM & \cellcolor[HTML]{FFFFFF} 0.0\% & \cellcolor[HTML]{FFFFFF} 0.0\% & \cellcolor[HTML]{5AB5D9} 91.7\% & \cellcolor[HTML]{6BBCDD} 82.1\% & \cellcolor[HTML]{5CB5DA} 90.8\%\\ \hline
MDS$<$RANDOM$<$TSNE & \cellcolor[HTML]{F0F8FC} 8.3\% & \cellcolor[HTML]{F0F8FC} 8.3\% & \cellcolor[HTML]{FFFFFF} 0.0\% & \cellcolor[HTML]{DFF1F8} 17.9\% & \cellcolor[HTML]{F0F8FC} 8.3\%\\ \hline
TSNE$<$MDS$<$RANDOM & \cellcolor[HTML]{E8F5FA} 12.5\% & \cellcolor[HTML]{E8F5FA} 12.5\% & \cellcolor[HTML]{F0F8FC} 8.3\% & \cellcolor[HTML]{FFFFFF} 0.0\% & \cellcolor[HTML]{FEFEFF} 0.8\%\\ \hline
TSNE$<$RANDOM$<$MDS & \cellcolor[HTML]{FFFFFF} 0.0\% & \cellcolor[HTML]{FFFFFF} 0.0\% & \cellcolor[HTML]{FFFFFF} 0.0\% & \cellcolor[HTML]{FFFFFF} 0.0\% & \cellcolor[HTML]{FFFFFF} 0.0\%\\ \hline
RANDOM$<$MDS$<$TSNE & \cellcolor[HTML]{70BFDF} 79.2\% & \cellcolor[HTML]{70BFDF} 79.2\% & \cellcolor[HTML]{FFFFFF} 0.0\% & \cellcolor[HTML]{FFFFFF} 0.0\% & \cell

In [60]:
data.filter([col for col in data.columns if "_raw" in col]).head()

Unnamed: 0,MDS_raw,TSNE_raw,RANDOM_raw
har_0,385776.085765,412911392.0,9191817.0
har_1,389925.192557,380623712.0,9201894.0
har_2,398919.830629,413760896.0,9168915.0
har_3,390429.235642,364554592.0,9212982.0
har_4,398528.705062,348111072.0,9216956.0


In [61]:
import scipy.stats as ss

In [62]:
#Template for extracting rank vector for data
raw_values = data.filter([col for col in data.columns if "_raw" in col]).to_numpy()
raw_ranks = np.array([ss.rankdata(row) for row in raw_values]).flatten()

In [63]:
ranks = dict()
for metric in metric_names:
    values = data.filter([col for col in data.columns if metric in col]).to_numpy()
    metric_rank = np.array([ss.rankdata(row) for row in values]).flatten()
    ranks[metric] = metric_rank

In [64]:
rank_table = dict()
for col in ranks.keys():
    rank_table[col] = dict()
    for row in ranks.keys():
        if col == row: 
            rank_table[col][row] = 1.0
            continue 
        rx = ranks[col]
        ry = ranks[row]
        cov = np.cov(rx,ry)
        rank_table[col][row] = cov[0,1] / (rx.std() * ry.std())

In [65]:
pd.DataFrame(rank_table)

Unnamed: 0,_raw,_norm,_sheppard,_kruskal,_scalenorm
_raw,1.0,1.001391,0.506954,0.500695,0.52573
_norm,1.001391,1.0,0.506954,0.500695,0.52573
_sheppard,0.506954,0.506954,1.0,0.849096,0.922114
_kruskal,0.500695,0.500695,0.849096,1.0,0.947149
_scalenorm,0.52573,0.52573,0.922114,0.947149,1.0


In [66]:
corr_res = rank_table

In [92]:
# Convert results to latex table
stab = " & " + " & ".join(mname.replace("_", "") for mname in metric_names)
stab += "\\\\ \\hline\n"
for row in corr_res:
    stab += row.replace("_", "")
    stab += " & "
    stab += " & ".join(f"\\cellcolor[HTML]{{{get_color(round(corr_res[metric][row], 3)*100, 0, 100)}}} {round(corr_res[metric][row], 3)}" for metric in corr_res.keys())
    stab += "\\\\ \\hline\n"

In [93]:
print(stab)

 & raw & norm & sheppard & kruskal & scalenorm\\ \hline
raw & \cellcolor[HTML]{4BAED6} 1.0 & \cellcolor[HTML]{4BAED6} 1.001 & \cellcolor[HTML]{A4D6EA} 0.507 & \cellcolor[HTML]{A5D6EA} 0.501 & \cellcolor[HTML]{A0D4E9} 0.526\\ \hline
norm & \cellcolor[HTML]{4BAED6} 1.001 & \cellcolor[HTML]{4BAED6} 1.0 & \cellcolor[HTML]{A4D6EA} 0.507 & \cellcolor[HTML]{A5D6EA} 0.501 & \cellcolor[HTML]{A0D4E9} 0.526\\ \hline
sheppard & \cellcolor[HTML]{A4D6EA} 0.507 & \cellcolor[HTML]{A4D6EA} 0.507 & \cellcolor[HTML]{4BAED6} 1.0 & \cellcolor[HTML]{66BADC} 0.849 & \cellcolor[HTML]{59B4D9} 0.922\\ \hline
kruskal & \cellcolor[HTML]{A5D6EA} 0.501 & \cellcolor[HTML]{A5D6EA} 0.501 & \cellcolor[HTML]{66BADC} 0.849 & \cellcolor[HTML]{4BAED6} 1.0 & \cellcolor[HTML]{55B2D8} 0.947\\ \hline
scalenorm & \cellcolor[HTML]{A0D4E9} 0.526 & \cellcolor[HTML]{A0D4E9} 0.526 & \cellcolor[HTML]{59B4D9} 0.922 & \cellcolor[HTML]{55B2D8} 0.947 & \cellcolor[HTML]{4BAED6} 1.0\\ \hline

