# Begin

### Import Statements

In [1]:
import pandas as pd
import itertools
from scipy.stats import spearmanr

### Load DataFrames

In [2]:
min_kls = pd.read_csv('test-kl-figures/min_kls_new.csv', index_col=[0, 1, 2])
min_kls.columns.name = 'Algorithm'

In [3]:
y_normalized_kls = pd.read_csv('test-kl-figures/y_normalized_kls.csv', index_col=[0, 1, 2])

In [4]:
kl_at_scales_5_20 = pd.read_csv('test-kl-figures/kl_at_5_and_20.csv', index_col=[0, 1, 2])
kl_at_scales_1_10 = pd.read_csv('test-kl-figures/kl_at_1_and_10.csv', index_col=[0, 1, 2])
kl_at_scales = pd.concat([kl_at_scales_1_10, kl_at_scales_5_20], axis=0)
kl_at_scales.columns.name = 'Algorithm'
kl_at_scales = kl_at_scales.unstack().reorder_levels(["Scale", "Algorithm"], axis=1).sort_index(axis=1)

### Common Functions

In [5]:
def get_kl_stats(df):
    """
    Calculate statistics (min, max, mean, standard deviation, median) for each algorithm in `df`
    """
    kl_stats = df.agg([pd.Series.min, pd.Series.max, pd.Series.mean, pd.Series.std, pd.Series.median]).T

    kl_stats.index.rename('Statistic', level=1, inplace=True)

    colors = {
        "min": "background-color: #2F2D2E; color: white",
        "max": "background-color: #808080; color: white",
        "mean": "background-color: #536878; color: white",
        "std": "background-color: #493D31; color: white",
        "median": "background-color: #323F48: color: white",
    }


    def make_pretty(styler):
        styler.set_caption('Statistics')
        styler.apply(lambda row : [colors.get(row.name[1], "")] * len(row), axis=1)
        styler.map_index(lambda stat : colors.get(stat, ""), axis=0, level=1)
        styler.set_table_styles(
        [{'selector': 'td, th', 'props': [('border', '1px solid black')]}]
    )
        return styler

    # Apply the Styler
    return kl_stats.style.pipe(make_pretty)

In [6]:
def percentage_per_order(df: pd.DataFrame, drop_UMAP: bool):
    """
    Find the percentages of instances where the metric in `df` follows a certain order (e.g. MDS >= TSNE >= RANDOM)
    """
    results = {}

    if drop_UMAP:
        permutations = itertools.permutations(['TSNE', 'MDS', 'RANDOM'])
    else:
        permutations = itertools.permutations(['TSNE', 'UMAP', 'MDS', 'RANDOM'])


    for perm in permutations:
        condition = pd.Series(True, index=df.index)
        for i in range(1, len(perm)):
            condition &= df[perm[i-1]] <= df[perm[i]]

        percentage = (condition).mean() * 100
        # results[" < ".join(perm)] = f"{percentage:.3}%"
        results[" < ".join(perm)] = percentage


    return results

In [7]:
def ranks_for_spearman(df: pd.DataFrame, drop_UMAP: bool):
    """
    Gives each entry a rank depending on the order of least KL between the algorithms
    """
    if drop_UMAP:
        permutations = itertools.permutations(['TSNE', 'MDS', 'RANDOM'])
    else:
        permutations = reversed(list(itertools.permutations(df.columns)))

    ranked_df = pd.DataFrame(100, index=df.index, columns=["Rank"])

    for rank, perm in enumerate(permutations):
        condition = pd.Series(True, index=df.index)
        for i in range(1, len(perm)):
            condition &= df[perm[i-1]] <= df[perm[i]]

        ranked_df[condition] = rank

    return ranked_df
        

# Analysis: Min KLs

### Intro

In [8]:
min_kls.info()
min_kls.head(15)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 460 entries, ('auto-mpg', 'Run 0', 'x') to ('wine', 'Run 9', 'y')
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RANDOM  460 non-null    float64
 1   MDS     460 non-null    float64
 2   UMAP    460 non-null    float64
 3   TSNE    460 non-null    float64
dtypes: float64(4)
memory usage: 17.1+ KB


Unnamed: 0_level_0,Unnamed: 1_level_0,Algorithm,RANDOM,MDS,UMAP,TSNE
Dataset,Run,Coord,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
auto-mpg,Run 0,x,0.334751,6.365166,0.518053,0.197916
auto-mpg,Run 0,y,3.653443,2.659041,2.74419,2.559557
auto-mpg,Run 1,x,0.144634,6.404407,0.494673,0.200003
auto-mpg,Run 1,y,3.654084,2.654081,2.746081,2.564123
auto-mpg,Run 2,x,6e-06,6.470418,0.549763,0.19361
auto-mpg,Run 2,y,3.654109,2.649762,2.707762,2.58086
auto-mpg,Run 3,x,6e-06,6.4342,0.483282,0.192236
auto-mpg,Run 3,y,3.654109,2.652285,2.774298,2.567548
auto-mpg,Run 4,x,5e-06,6.566125,0.475995,0.200524
auto-mpg,Run 4,y,3.654109,2.639707,2.757787,2.540521


### Check within search range

* Since minimum-finding function (`scipy.optimize.minimize_scalar`) was bounded by (0, 300), should check if the minimum actually lies beyond 300 for any embedding

In [9]:
minimum_beyond_bound = ((min_kls.loc[:, :, 'x'] > 250)).any(axis=None)
print("There are some minima beyond 250: ", minimum_beyond_bound)

There are some minima beyond 250:  False


* For which datasets do the graphs need to be drawn for scales greater than 15?

In [10]:
thresh = 15
# set([idx[0] for idx in min_kls.loc[:, :, 'x'][min_kls.loc[:, :, 'x'] > thresh].dropna(how='all').index])
min_kls.loc[:, :, 'x'][min_kls.loc[:, :, 'x'] > thresh].dropna(how='all').style.background_gradient('viridis', vmin=thresh, vmax=250)

Unnamed: 0_level_0,Algorithm,RANDOM,MDS,UMAP,TSNE
Dataset,Run,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
epileptic,Run 0,,221.901708,24.573164,
epileptic,Run 1,,208.626467,23.912558,
epileptic,Run 2,,232.028268,25.064733,
epileptic,Run 3,,244.407178,24.330217,
epileptic,Run 4,,201.327047,24.446405,
epileptic,Run 5,,213.995798,24.234971,
epileptic,Run 6,,214.672051,24.773188,
epileptic,Run 7,,210.26797,24.428381,
epileptic,Run 8,,241.808194,24.742296,
epileptic,Run 9,,241.801079,24.413691,


### KL Statistics of Each Algorithm

In [11]:
get_kl_stats(min_kls.groupby('Coord'))

Unnamed: 0_level_0,Coord,x,y
Algorithm,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1
RANDOM,min,3e-06,2.194635
RANDOM,max,0.57812,6.82521
RANDOM,mean,0.093754,4.964358
RANDOM,std,0.119864,1.279163
RANDOM,median,7e-06,5.314331
MDS,min,0.462498,0.465993
MDS,max,244.407178,6.003137
MDS,mean,17.9422,3.459133
MDS,std,45.857566,1.490983
MDS,median,2.093331,3.367187


# Analysis: Y-Normalized KLs

### Intro

In [12]:
y_normalized_kls.info()
y_normalized_kls.head(15)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 460 entries, ('auto-mpg', 'Run 0', 'x') to ('wine', 'Run 9', 'y')
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RANDOM  460 non-null    float64
 1   MDS     460 non-null    float64
 2   UMAP    460 non-null    float64
 3   TSNE    460 non-null    float64
dtypes: float64(4)
memory usage: 17.1+ KB


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,RANDOM,MDS,UMAP,TSNE
Dataset,Run,Coord,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
auto-mpg,Run 0,x,0.736095,0.504811,0.037664,0.01768
auto-mpg,Run 0,y,3.65956,3.496947,3.457386,3.381452
auto-mpg,Run 1,x,0.743537,0.516091,0.040224,0.018109
auto-mpg,Run 1,y,3.664046,3.491325,3.413803,3.377061
auto-mpg,Run 2,x,0.73176,0.508052,0.051974,0.017953
auto-mpg,Run 2,y,3.666392,3.495301,3.380314,3.371496
auto-mpg,Run 3,x,0.745632,0.506356,0.047378,0.016292
auto-mpg,Run 3,y,3.665285,3.495957,3.375667,3.389427
auto-mpg,Run 4,x,0.749013,0.508962,0.04834,0.017269
auto-mpg,Run 4,y,3.664596,3.494545,3.351805,3.385543


### KL Statistics of Each Algorithm

In [13]:
get_kl_stats(y_normalized_kls.groupby('Coord'))

Unnamed: 0_level_0,Coord,x,y
Unnamed: 0_level_1,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1
RANDOM,min,0.711309,2.199078
RANDOM,max,0.82222,6.835538
RANDOM,mean,0.729639,4.974573
RANDOM,std,0.016457,1.278445
RANDOM,median,0.724328,5.325002
MDS,min,0.025113,1.985249
MDS,max,0.663372,6.81172
MDS,mean,0.253329,4.840405
MDS,std,0.218615,1.289692
MDS,median,0.177071,5.26425


# Analysis: KL at Scales (1 and 10)

### Intro

In [14]:
kl_at_scales.info()
kl_at_scales.head(15)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 230 entries, ('auto-mpg', 'Run 0') to ('wine', 'Run 9')
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   (1, MDS)      230 non-null    float64
 1   (1, RANDOM)   230 non-null    float64
 2   (1, TSNE)     230 non-null    float64
 3   (1, UMAP)     230 non-null    float64
 4   (5, MDS)      230 non-null    float64
 5   (5, RANDOM)   230 non-null    float64
 6   (5, TSNE)     230 non-null    float64
 7   (5, UMAP)     230 non-null    float64
 8   (10, MDS)     230 non-null    float64
 9   (10, RANDOM)  230 non-null    float64
 10  (10, TSNE)    230 non-null    float64
 11  (10, UMAP)    230 non-null    float64
 12  (20, MDS)     230 non-null    float64
 13  (20, RANDOM)  230 non-null    float64
 14  (20, TSNE)    230 non-null    float64
 15  (20, UMAP)    230 non-null    float64
dtypes: float64(16)
memory usage: 30.4+ KB


Unnamed: 0_level_0,Scale,1,1,1,1,5,5,5,5,10,10,10,10,20,20,20,20
Unnamed: 0_level_1,Algorithm,MDS,RANDOM,TSNE,UMAP,MDS,RANDOM,TSNE,UMAP,MDS,RANDOM,TSNE,UMAP,MDS,RANDOM,TSNE,UMAP
Dataset,Run,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
auto-mpg,Run 0,3.258463,3.672997,2.957726,2.810229,2.671279,4.161121,3.593373,3.576499,2.698553,4.570605,3.848491,4.074403,2.884478,4.986967,4.209013,4.552697
auto-mpg,Run 1,3.258211,3.679646,2.959572,2.817809,2.666992,4.199261,3.604868,3.606241,2.692613,4.619551,3.867803,4.129957,2.877432,5.040332,4.230974,4.651954
auto-mpg,Run 2,3.257867,3.683661,2.99039,2.763595,2.663695,4.196272,3.655485,3.517261,2.686477,4.607569,3.945222,4.024686,2.869343,5.013873,4.347048,4.518174
auto-mpg,Run 3,3.257554,3.681004,2.972977,2.848668,2.665593,4.193227,3.604901,3.629723,2.689819,4.609424,3.861281,4.150589,2.873127,5.023361,4.224017,4.659922
auto-mpg,Run 4,3.256776,3.679377,2.936701,2.831104,2.655259,4.181618,3.586441,3.604224,2.673962,4.595642,3.850438,4.106868,2.853428,5.008583,4.209997,4.597458
auto-mpg,Run 5,3.258408,3.678597,2.932687,2.767105,2.666965,4.177248,3.581731,3.542717,2.688451,4.583897,3.847234,4.052495,2.868815,4.988555,4.207216,4.557358
auto-mpg,Run 6,3.257174,3.679913,2.999546,2.69538,2.653555,4.193887,3.674461,3.459896,2.67041,4.61254,3.95276,3.977362,2.849332,5.031755,4.312349,4.487866
auto-mpg,Run 7,3.258667,3.680985,2.942217,2.846319,2.668027,4.194356,3.580406,3.57991,2.691199,4.614375,3.835239,4.094509,2.873424,5.036935,4.193775,4.604635
auto-mpg,Run 8,3.257886,3.681088,2.968367,2.795132,2.656456,4.19435,3.603866,3.550324,2.674497,4.607555,3.861908,4.073102,2.854037,5.015844,4.226895,4.618066
auto-mpg,Run 9,3.257652,3.681807,2.968792,2.749498,2.660888,4.209832,3.599729,3.522231,2.681895,4.646386,3.854367,4.042906,2.863397,5.091042,4.216586,4.574389


### KL Statistics For Each Algorithm

In [15]:
get_kl_stats(kl_at_scales.stack(level=0, future_stack=True).groupby('Scale'))

Unnamed: 0_level_0,Scale,1,5,10,20
Algorithm,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MDS,min,1.756479,0.686447,0.474066,0.533392
MDS,max,6.689422,7.938931,6.889937,8.987083
MDS,mean,4.104777,4.193977,3.901952,4.6434
MDS,std,1.364754,1.796769,1.659857,2.074197
MDS,median,3.97693,4.024436,3.712291,4.465689
RANDOM,min,2.208674,2.654912,3.062566,3.486724
RANDOM,max,6.852598,7.369282,7.786726,8.202947
RANDOM,mean,4.990843,5.498898,5.911721,6.323967
RANDOM,std,1.278945,1.277438,1.278227,1.275761
RANDOM,median,5.342521,5.862578,6.282032,6.699373


# Performance of Each Metric

### Percentages of Each Algorithm Ranking

In [16]:
def get_table(drop_UMAP: bool):
    data = {
        "Min-KL" : percentage_per_order(min_kls.loc[:, :, 'y'], drop_UMAP=drop_UMAP),
        "Normalized-KL": percentage_per_order(y_normalized_kls.loc[:, :, 'y'], drop_UMAP=drop_UMAP),
        "KL at Scale = 1": percentage_per_order(kl_at_scales[1], drop_UMAP=drop_UMAP),
        "KL at Scale = 5": percentage_per_order(kl_at_scales[5], drop_UMAP=drop_UMAP),
        "KL at Scale = 10": percentage_per_order(kl_at_scales[10], drop_UMAP=drop_UMAP),
        "KL at Scale = 20": percentage_per_order(kl_at_scales[20], drop_UMAP=drop_UMAP)
    }
    return pd.DataFrame(data).style.background_gradient('hot', vmin=0, vmax=100).format(lambda x : f"{x:.2f}%")


##### **Comparing t-SNE, MDS, Random**

In [17]:
get_table(drop_UMAP=True)

Unnamed: 0,Min-KL,Normalized-KL,KL at Scale = 1,KL at Scale = 5,KL at Scale = 10,KL at Scale = 20
TSNE < MDS < RANDOM,95.65%,80.00%,86.96%,60.00%,61.74%,47.39%
TSNE < RANDOM < MDS,0.00%,0.00%,0.00%,4.78%,3.91%,0.43%
MDS < TSNE < RANDOM,4.35%,20.00%,4.35%,5.22%,16.96%,24.78%
MDS < RANDOM < TSNE,0.00%,0.00%,8.70%,13.48%,13.04%,9.57%
RANDOM < TSNE < MDS,0.00%,0.00%,0.00%,0.87%,0.00%,0.00%
RANDOM < MDS < TSNE,0.00%,0.00%,0.00%,15.65%,4.35%,17.83%


##### **Comparing t-SNE, UMAP, MDS, Random**

In [18]:
get_table(drop_UMAP=False)

Unnamed: 0,Min-KL,Normalized-KL,KL at Scale = 1,KL at Scale = 5,KL at Scale = 10,KL at Scale = 20
TSNE < UMAP < MDS < RANDOM,82.17%,33.91%,82.17%,48.70%,22.17%,15.22%
TSNE < UMAP < RANDOM < MDS,0.00%,0.00%,0.00%,4.35%,3.91%,0.43%
TSNE < MDS < UMAP < RANDOM,12.17%,12.61%,0.43%,0.00%,0.00%,0.00%
TSNE < MDS < RANDOM < UMAP,0.00%,2.17%,0.00%,0.00%,0.00%,0.00%
TSNE < RANDOM < UMAP < MDS,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
TSNE < RANDOM < MDS < UMAP,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
UMAP < TSNE < MDS < RANDOM,1.30%,31.30%,4.35%,11.30%,39.57%,32.17%
UMAP < TSNE < RANDOM < MDS,0.00%,0.00%,0.00%,0.43%,0.00%,0.00%
UMAP < MDS < TSNE < RANDOM,0.00%,7.83%,0.00%,0.87%,8.26%,11.30%
UMAP < MDS < RANDOM < TSNE,0.00%,0.00%,3.04%,0.43%,0.00%,1.30%


### Correlation of Orders

In [19]:
def get_ranks(drop_UMAP: bool):
    ranks = {
        "Min-KL" : ranks_for_spearman(min_kls.loc[:, :, 'y'], drop_UMAP=drop_UMAP).values,
        "Y-Normalized-KL": ranks_for_spearman(y_normalized_kls.loc[:, :, 'y'], drop_UMAP=drop_UMAP).values,
        "KL at Scale = 1": ranks_for_spearman(kl_at_scales[1], drop_UMAP=drop_UMAP).values,
        "KL at Scale = 5": ranks_for_spearman(kl_at_scales[5], drop_UMAP=drop_UMAP).values,
        "KL at Scale = 10": ranks_for_spearman(kl_at_scales[10], drop_UMAP=drop_UMAP).values,
        "KL at Scale = 20": ranks_for_spearman(kl_at_scales[20], drop_UMAP=drop_UMAP).values,
    }

    correlation = pd.DataFrame(pd.NA, index=ranks.keys(), columns=ranks.keys()).astype("Float64")

    methods = list(ranks.keys())
    for i in range(len(methods)):
        for j in range(i, len(methods)):
            correlation.loc[methods[i], methods[j]] = spearmanr(ranks[methods[i]], ranks[methods[j]])[0]

    return correlation.style.set_caption("Without UMAP" if drop_UMAP else "With UMAP").background_gradient('Oranges_r', vmin=0, vmax=1).format(lambda x: '-' if pd.isna(x) else f'{x:.2f}')



#### **Considering only t-SNE, MDS, and Random**

In [20]:
get_ranks(drop_UMAP=True)


Unnamed: 0,Min-KL,Y-Normalized-KL,KL at Scale = 1,KL at Scale = 5,KL at Scale = 10,KL at Scale = 20
Min-KL,1.00,-0.11,0.58,0.22,0.33,0.11
Y-Normalized-KL,-,1.00,0.10,0.29,0.19,0.24
KL at Scale = 1,-,-,1.00,0.41,0.60,0.31
KL at Scale = 5,-,-,-,1.00,0.41,0.81
KL at Scale = 10,-,-,-,-,1.00,0.49
KL at Scale = 20,-,-,-,-,-,1.0


#### **Considering t-SNE, UMAP, MDS, Random**

* Orders are not meaningfully ranked, so correlation analysis is in question

In [21]:
get_ranks(drop_UMAP=False)

Unnamed: 0,Min-KL,Y-Normalized-KL,KL at Scale = 1,KL at Scale = 5,KL at Scale = 10,KL at Scale = 20
Min-KL,1.00,0.30,0.02,0.48,0.57,0.55
Y-Normalized-KL,-,1.00,0.28,0.15,0.32,0.33
KL at Scale = 1,-,-,1.00,0.14,0.14,0.15
KL at Scale = 5,-,-,-,1.00,0.47,0.6
KL at Scale = 10,-,-,-,-,1.00,0.81
KL at Scale = 20,-,-,-,-,-,1.0


#### **For Which Datasets is Expected Order Not Observed for Min-KL?**

In [22]:
def incorrect_datasets(df: pd.DataFrame, drop_UMAP=True):
    if drop_UMAP:
        order = ['TSNE', 'MDS', 'RANDOM']
    else:
        order = ['TSNE', 'UMAP', 'MDS', 'RANDOM']

    condition = pd.Series(False, index=df.index)
    for i in range(1, len(order)):
        condition |= df[order[i-1]] > df[order[i]]


    datasets = sorted(set(df[condition].index.get_level_values(level=0)))
    max_kl = df[condition].max(axis=None)
    not_preserved_kls = df[condition].style.background_gradient('inferno', vmax=max_kl)

    return datasets, not_preserved_kls

incorrect_datasets(min_kls.loc[:, :, 'y'])
bad_datasets, not_preserved_kls = incorrect_datasets(min_kls.loc[:, :, 'y'], drop_UMAP=True)
print(bad_datasets)
not_preserved_kls

['penguins']


Unnamed: 0_level_0,Algorithm,RANDOM,MDS,UMAP,TSNE
Dataset,Run,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
penguins,Run 0,3.474382,2.848743,2.857584,2.894744
penguins,Run 1,3.47438,2.848078,2.954606,2.874477
penguins,Run 2,3.474364,2.849051,2.98827,2.906045
penguins,Run 3,3.474381,2.823071,2.931761,2.909283
penguins,Run 4,3.474312,2.849991,2.956805,2.950817
penguins,Run 5,3.474342,2.855042,2.935944,2.920333
penguins,Run 6,3.474382,2.833601,2.979287,2.903101
penguins,Run 7,3.474382,2.846731,2.910349,2.923358
penguins,Run 8,3.474357,2.869498,2.911648,2.879733
penguins,Run 9,3.474152,2.869061,2.907207,2.90305
