In [2]:
# Import necessary libraries
import json 
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

In [84]:
with open("out.json", 'r') as fdata:
    data = json.load(fdata)

Convert the shepard scores into [0-1] so that 0 is better so it is comparable to other scores

In [85]:
for dataset, res in data.items():
    for key in res.keys():
        if "sheppard" in key: 
            res[key] = 1 - (res[key] + 1) / 2

In [86]:
df = pd.DataFrame(data)

In [87]:
data = df.transpose()
data = data.drop([col for col in data.columns if "UMAP" in col],axis=1)
data

Unnamed: 0,MDS_raw,MDS_norm,MDS_scalenorm,MDS_kruskal,MDS_sheppard,TSNE_raw,TSNE_norm,TSNE_scalenorm,TSNE_kruskal,TSNE_sheppard,RANDOM_raw,RANDOM_norm,RANDOM_scalenorm,RANDOM_kruskal,RANDOM_sheppard
har_0,3.857761e+05,0.189285,0.189285,0.145995,0.026139,4.129114e+08,6.192660,0.272679,0.239012,0.065667,9.191817e+06,0.923952,0.541566,0.430894,0.495010
har_1,3.899252e+05,0.190300,0.190300,0.147328,0.027278,3.806237e+08,5.945615,0.262972,0.228254,0.061541,9.201894e+06,0.924459,0.542527,0.429335,0.498207
har_2,3.989198e+05,0.192483,0.192483,0.149706,0.028350,4.137609e+08,6.199028,0.269953,0.228089,0.060864,9.168915e+06,0.922800,0.542241,0.428391,0.498622
har_3,3.904292e+05,0.190423,0.190423,0.147558,0.027250,3.645546e+08,5.818756,0.261281,0.229166,0.062110,9.212982e+06,0.925015,0.539967,0.429214,0.492180
har_4,3.985287e+05,0.192388,0.192388,0.149430,0.028204,3.481111e+08,5.686012,0.269934,0.243324,0.067920,9.216956e+06,0.925215,0.542142,0.429073,0.498795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bank_5,3.311263e+06,0.356144,0.356144,0.316203,0.182134,6.186895e+09,15.394465,0.441362,0.410775,0.244368,1.912656e+07,0.855947,0.446756,0.429838,0.498196
bank_6,3.312068e+06,0.356187,0.356187,0.315632,0.182071,5.521767e+09,14.543445,0.424638,0.391157,0.234997,1.919261e+07,0.857423,0.445422,0.428854,0.496147
bank_7,3.317415e+06,0.356474,0.356474,0.316715,0.183114,5.315046e+09,14.268612,0.422871,0.388687,0.230314,1.911926e+07,0.855783,0.446744,0.430161,0.496030
bank_8,3.299065e+06,0.355487,0.355487,0.314383,0.179581,5.791994e+09,14.895061,0.443708,0.408166,0.236405,1.914834e+07,0.856434,0.446389,0.429642,0.496979


In [88]:
data[[col for col in data.columns if "raw" in col]].head()

Unnamed: 0,MDS_raw,TSNE_raw,RANDOM_raw
har_0,385776.085765,412911392.0,9191817.0
har_1,389925.192557,380623712.0,9201894.0
har_2,398919.830629,413760896.0,9168915.0
har_3,390429.235642,364554592.0,9212982.0
har_4,398528.705062,348111072.0,9216956.0


In [89]:
metric_names = ["_raw", "_norm", "_sheppard", "_kruskal", "_scalenorm"]

In [90]:
# Calculate how often metrics agree with “correct order”
res = dict()
for mname in metric_names: 
    metric = data[(col for col in data.columns if mname in col)]
    name_lookup = ["MDS", "TSNE", "RANDOM"]
    orderCount = {
        ("MDS", "TSNE", "RANDOM"): 0,
        ("MDS", "RANDOM", "TSNE"): 0,
        ("TSNE", "MDS", "RANDOM"): 0,
        ("TSNE", "RANDOM", "MDS"): 0,
        ("RANDOM", "MDS", "TSNE"): 0,
        ("RANDOM", "TSNE", "MDS"): 0,                    
    }

    for ind,row in metric.iterrows():
        tup_row = sorted([(row[i], name_lookup[i]) for i in range(3)])
        order = tuple([tup[1] for tup in tup_row])
        orderCount[order] += 1

    totalvals = sum(orderCount.values())
    for key in orderCount.keys():
        orderCount[key] /= totalvals
        
    res[mname] = orderCount

In [91]:
res

{'_raw': {('MDS', 'TSNE', 'RANDOM'): 0.041666666666666664,
  ('MDS', 'RANDOM', 'TSNE'): 0.8333333333333334,
  ('TSNE', 'MDS', 'RANDOM'): 0.125,
  ('TSNE', 'RANDOM', 'MDS'): 0.0,
  ('RANDOM', 'MDS', 'TSNE'): 0.0,
  ('RANDOM', 'TSNE', 'MDS'): 0.0},
 '_norm': {('MDS', 'TSNE', 'RANDOM'): 0.041666666666666664,
  ('MDS', 'RANDOM', 'TSNE'): 0.8333333333333334,
  ('TSNE', 'MDS', 'RANDOM'): 0.125,
  ('TSNE', 'RANDOM', 'MDS'): 0.0,
  ('RANDOM', 'MDS', 'TSNE'): 0.0,
  ('RANDOM', 'TSNE', 'MDS'): 0.0},
 '_sheppard': {('MDS', 'TSNE', 'RANDOM'): 0.9166666666666666,
  ('MDS', 'RANDOM', 'TSNE'): 0.0,
  ('TSNE', 'MDS', 'RANDOM'): 0.08333333333333333,
  ('TSNE', 'RANDOM', 'MDS'): 0.0,
  ('RANDOM', 'MDS', 'TSNE'): 0.0,
  ('RANDOM', 'TSNE', 'MDS'): 0.0},
 '_kruskal': {('MDS', 'TSNE', 'RANDOM'): 0.8208333333333333,
  ('MDS', 'RANDOM', 'TSNE'): 0.17916666666666667,
  ('TSNE', 'MDS', 'RANDOM'): 0.0,
  ('TSNE', 'RANDOM', 'MDS'): 0.0,
  ('RANDOM', 'MDS', 'TSNE'): 0.0,
  ('RANDOM', 'TSNE', 'MDS'): 0.0},
 '_scale

In [92]:
stab = " & " + " & ".join(mname.replace("_","") for mname in metric_names)
stab += "\\hline \\\\ \n"
stab

' & raw & norm & sheppard & kruskal & scalenorm\\hline \\\\ \n'

In [93]:
# Convert results to latex table
stab = " & " + " & ".join(mname.replace("_","") for mname in metric_names)
stab += "\\hline \\\\ \n"
for row in res['_raw'].keys():
    stab += ",".join(el for el in row)
    stab += " & "
    for metric in res.keys(): 
        stab += f"{round(res[metric][row], 3)}"
        if metric != "sheppard": 
            stab += " & "
    stab += "\\\\ \n"
    

In [94]:
print(stab)

 & raw & norm & sheppard & kruskal & scalenorm\hline \\ 
MDS,TSNE,RANDOM & 0.042 & 0.042 & 0.917 & 0.821 & 0.908 & \\ 
MDS,RANDOM,TSNE & 0.833 & 0.833 & 0.0 & 0.179 & 0.083 & \\ 
TSNE,MDS,RANDOM & 0.125 & 0.125 & 0.083 & 0.0 & 0.008 & \\ 
TSNE,RANDOM,MDS & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & \\ 
RANDOM,MDS,TSNE & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & \\ 
RANDOM,TSNE,MDS & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & \\ 



In [95]:
data.filter([col for col in data.columns if "_raw" in col]).head()

Unnamed: 0,MDS_raw,TSNE_raw,RANDOM_raw
har_0,385776.085765,412911392.0,9191817.0
har_1,389925.192557,380623712.0,9201894.0
har_2,398919.830629,413760896.0,9168915.0
har_3,390429.235642,364554592.0,9212982.0
har_4,398528.705062,348111072.0,9216956.0


In [96]:
import scipy.stats as ss

In [97]:
#Template for extracting rank vector for data
raw_values = data.filter([col for col in data.columns if "_raw" in col]).to_numpy()
raw_ranks = np.array([ss.rankdata(row) for row in raw_values]).flatten()

In [98]:
ranks = dict()
for metric in metric_names:
    values = data.filter([col for col in data.columns if metric in col]).to_numpy()
    metric_rank = np.array([ss.rankdata(row) for row in values]).flatten()
    ranks[metric] = metric_rank

In [99]:
rank_table = dict()
for col in ranks.keys():
    rank_table[col] = dict()
    for row in ranks.keys():
        if col == row: 
            rank_table[col][row] = 1.0
            continue 
        rx = ranks[col]
        ry = ranks[row]
        cov = np.cov(rx,ry)
        rank_table[col][row] = cov[0,1] / (rx.std() * ry.std())

In [100]:
pd.DataFrame(rank_table)

Unnamed: 0,_raw,_norm,_sheppard,_kruskal,_scalenorm
_raw,1.0,1.001391,0.506954,0.500695,0.52573
_norm,1.001391,1.0,0.506954,0.500695,0.52573
_sheppard,0.506954,0.506954,1.0,0.849096,0.922114
_kruskal,0.500695,0.500695,0.849096,1.0,0.947149
_scalenorm,0.52573,0.52573,0.922114,0.947149,1.0


In [101]:
# # Determine correct ranking for all metrics in each trial
# rankings = pd.DataFrame(index=data.index, columns=metric_names)
# for index, row in data.iterrows():
#     print(row)
#     break
#     # for metric in metric_names:
#     #     values = row.filter(like=metric)
        
#     #     if metric == "_sheppard":
#     #         rankings.loc[index, metric] = ', '.join(values.sort_values(
#     #             ascending=False).index.str.replace(metric, ''))
#     #     else:
#     #         rankings.loc[index, metric] = ', '.join(values.sort_values(
#     #             ascending=True).index.str.replace(metric, ''))

In [102]:
# # Calculate the correlation between the rankings produced by the stress metrics
# corr_matrix, _ = spearmanr(rankings)
# corr_matrix = np.triu(corr_matrix)

# corr_res = pd.DataFrame(
#     corr_matrix, index=rankings.keys(), columns=rankings.keys())

In [103]:
corr_res = rank_table

In [110]:
# Convert results to latex table
stab = " & " + " & ".join(mname.replace("_", "") for mname in metric_names)
stab += "\\\\ \\hline\n"
for row in corr_res:
    stab += row.replace("_", "")
    stab += " & "
    # for metric in corr_res.keys():
    #     stab += f"{round(corr_res[metric][row], 3)}"
    #     # if metric != "_sheppard":
    #     #     stab += " & "
    stab += " & ".join([str(round(corr_res[metric][row], 3)) for metric in corr_res.keys()])
    stab += "\\\\ \n"

In [111]:
print(stab)

 & raw & norm & sheppard & kruskal & scalenorm\\ \hline
raw & 1 & 1 & 1 & 1 & 1\\ 
norm & 1 & 1 & 1 & 1 & 1\\ 
sheppard & 1 & 1 & 1 & 1 & 1\\ 
kruskal & 1 & 1 & 1 & 1 & 1\\ 
scalenorm & 1 & 1 & 1 & 1 & 1\\ 

