In [1]:
from hdimvis.algorithms.spring_force_algos.hybrid_algo.Hybrid import Hybrid
from hdimvis.create_low_d_layout.LayoutCreation import LayoutCreation
from hdimvis.data_fetchers.DataFetcher import DataFetcher
from experiments.cube.Cube import Cube
from pathlib import Path
from definitions import PROJECT_ROOT
import pickle
from hdimvis.visualise_layouts_and_metrics.plot import show_layout,show_generation_metrics


In [2]:
rna = DataFetcher.fetch_data('rna N3k')
dataset = rna

####################
Fetching the "rna N3k" dataset
####################
Dataset loaded
Dataset shape: (3000, 50)
####################


In [3]:
num_repeats = 20
layouts = [[],[]]


for i in range(num_repeats):
    hybrid_correct = Hybrid(dataset=dataset, alpha=0.6,
                        use_knnd=False, sample_set_size=10, neighbour_set_size=5, use_correct_interpolation_error=True,
                            use_random_sample=True)
    hybrid_layout_correct = LayoutCreation().create_layout(hybrid_correct, optional_metric_collection=None)
    layouts[0].append(hybrid_layout_correct)


    hybrid = Hybrid(dataset=dataset, alpha=0.6,
                        use_knnd=False, sample_set_size=10, neighbour_set_size=5, use_correct_interpolation_error=True,
                    use_random_sample=False)
    hybrid_layout = LayoutCreation().create_layout(hybrid, optional_metric_collection=None)
    layouts[1].append(hybrid_layout)

output_dir= (Path(PROJECT_ROOT).joinpath(Path(f"experiments/hybrid/out/"))).resolve().absolute()

path_to_pickle_arr = (Path(output_dir).joinpath(Path(f"layouts_sampling.pickle"))).resolve()
with open(path_to_pickle_arr, 'wb') as pickle_out:
    pickle.dump(layouts, pickle_out)


####################
The algorithm will use a random initialization for the low D embedding/layout
 sample:355
 idices: 355
sample size 355
####################
A 2D layout of the "rna N3k" dataset will be created 
using the "Hybrid" algorithm
####################
The HD distance measure used is: euclidean
####################
####################
No metrics will be collected during layout creation. 
To change this use the 'metric collection' parameter of the layout 
####################
All stress calculations will be performed using the euclidian norm
####################
Spring constant is set to  0.5 
Damping constant is set to 0 
Spring constant scaling factor is set to 0.03333333333333333 
####################
####################
The algorithm will use a random initialization for the low D embedding/layout
[65, 134, 66, 95, 126, 147, 68, 97, 16, 76, 40, 29, 142, 148, 6, 138, 238, 223, 226, 183, 248, 180, 204, 170, 234, 180, 285, 238, 273, 243, 240, 162, 180, 399, 448, 341, 304, 

In [4]:
import numpy as np
from experiments.utils.get_f1_for_best_k_with_knn import get_f1_for_best_k_with_knn
from experiments.utils.get_kmeans_f1 import get_kmeans_f1



results_arr = np.zeros((2,20,4))
for j in range(2):
    for i,layout in enumerate(layouts[j]):
        print(f"Layout: {i}")
        stress = layout.get_final_stress()
        kmeans_f1 = get_kmeans_f1(layout.get_final_positions(), layout.labels)
        k, f1_for_best_k_with_knn =  get_f1_for_best_k_with_knn(lower_bound=1, upper_bound=30,
                                                            cross_validation_folds=10,
                                                             data=layout.data,
                                                             labels=layout.labels)

        results_arr[j,i,0] = stress
        results_arr[j,i,1] = kmeans_f1
        results_arr[j,i,2],results_arr[j,i,3] =k,  f1_for_best_k_with_knn

print(results_arr)
output_dir= (Path(PROJECT_ROOT).joinpath(
    Path(f"experiments/hybrid/out/"))).resolve().absolute()

path_to_pickle_arr = (Path(output_dir).joinpath(Path(f"results_arr_sampling.pickle"))).resolve()
with open(path_to_pickle_arr, 'wb') as pickle_out:
    pickle.dump(results_arr, pickle_out)

Layout: 0

 Computing vectorised euclidean stress 





Layout: 1

 Computing vectorised euclidean stress 





Layout: 2

 Computing vectorised euclidean stress 





Layout: 3

 Computing vectorised euclidean stress 





Layout: 4

 Computing vectorised euclidean stress 





Layout: 5

 Computing vectorised euclidean stress 





Layout: 6

 Computing vectorised euclidean stress 





Layout: 7

 Computing vectorised euclidean stress 





Layout: 8

 Computing vectorised euclidean stress 





Layout: 9

 Computing vectorised euclidean stress 





Layout: 10

 Computing vectorised euclidean stress 





Layout: 11

 Computing vectorised euclidean stress 





Layout: 12

 Computing vectorised euclidean stress 





Layout: 13

 Computing vectorised euclidean stress 





Layout: 14

 Computing vectorised euclidean stress 





Layout: 15

 Computing vectorised euclidean stress 





Layout: 16

 Computing vectorised euclidean stress 





Layout: 17

 Computing vectorised euclidean stress 





Layout: 18

 Computing vectorised euclidean stress 





Layout: 19

 Computing vectorised euclidean stress 





Layout: 0

 Computing vectorised euclidean stress 





Layout: 1

 Computing vectorised euclidean stress 





Layout: 2

 Computing vectorised euclidean stress 





Layout: 3

 Computing vectorised euclidean stress 





Layout: 4

 Computing vectorised euclidean stress 





Layout: 5

 Computing vectorised euclidean stress 





Layout: 6

 Computing vectorised euclidean stress 





Layout: 7

 Computing vectorised euclidean stress 





Layout: 8

 Computing vectorised euclidean stress 





Layout: 9

 Computing vectorised euclidean stress 





Layout: 10

 Computing vectorised euclidean stress 





Layout: 11

 Computing vectorised euclidean stress 





Layout: 12

 Computing vectorised euclidean stress 





Layout: 13

 Computing vectorised euclidean stress 





Layout: 14

 Computing vectorised euclidean stress 





Layout: 15

 Computing vectorised euclidean stress 





Layout: 16

 Computing vectorised euclidean stress 





Layout: 17

 Computing vectorised euclidean stress 





Layout: 18

 Computing vectorised euclidean stress 





Layout: 19

 Computing vectorised euclidean stress 





[[[0.27460983 0.15155824 1.         0.79959343]
  [0.29046032 0.16552226 1.         0.79959343]
  [0.2825331  0.15282719 1.         0.79959343]
  [0.30964321 0.16014444 1.         0.79959343]
  [0.27160309 0.15487746 1.         0.79959343]
  [0.29968755 0.15218202 1.         0.79959343]
  [0.28599391 0.14839798 1.         0.79959343]
  [0.27706968 0.1427729  1.         0.79959343]
  [0.27703078 0.15407066 1.         0.79959343]
  [0.27645244 0.15366189 1.         0.79959343]
  [0.30553699 0.17725114 1.         0.79959343]
  [0.28361287 0.14242741 1.         0.79959343]
  [0.26930078 0.1457312  1.         0.79959343]
  [0.26755238 0.16084095 1.         0.79959343]
  [0.29214392 0.15485022 1.         0.79959343]
  [0.28632719 0.16688406 1.         0.79959343]
  [0.30806447 0.15312816 1.         0.79959343]
  [0.27894533 0.14850369 1.         0.79959343]
  [0.29441995 0.17111638 1.         0.79959343]
  [0.27155779 0.14882083 1.         0.79959343]]

 [[0.27405784 0.16748062 1.         0.

In [6]:
from scipy import stats
for j,metric in enumerate(["stress", "kmeans", "k", "knn"]):

        if j != 2:
            mean_squad = np.mean(results_arr[0,:,j])
            sd_squad = np.std(results_arr[0,:,j])

            mean_96 = np.mean(results_arr[1,:,j])
            sd_96 = np.std(results_arr[1,:,j])

            if j == 3:
                tstat, pvalue = stats.ttest_ind(results_arr[0,:,j].round(decimals=4), results_arr[1,:,j].round(decimals=4), equal_var=False, alternative='greater')
                result = np.format_float_scientific(pvalue, precision=2,min_digits=2)

            else:
                tstat, pvalue = stats.ttest_ind(results_arr[0,:,j].round(decimals=4), results_arr[1,:,j].round(decimals=4), equal_var=False, alternative='smaller')
                result = np.format_float_scientific(pvalue, precision=2,min_digits=2)
            print(20*"#")
            print(f"Metric : {metric}")
            print(20*"#")


            print(f"random mean : {mean_squad.round(decimals=2)} (SD {sd_squad.round(decimals=2)})")
            print(f"stratified mean : {mean_96.round(decimals=2)} (SD {sd_96.round(decimals=2)})")
            print(f"test: {result}")
        else:
            print(f"random mean k: {np.mean(results_arr[0,:,j])}")
            print(f"stratified mean k: {np.mean(results_arr[1,:,j])}")


####################
Metric : stress
####################
random mean : 0.29 (SD 0.01)
stratified mean : 0.28 (SD 0.01)
test: 6.96e-02
####################
Metric : kmeans
####################
random mean : 0.16 (SD 0.01)
stratified mean : 0.16 (SD 0.01)
test: 8.95e-01
random mean k: 1.0
stratified mean k: 1.0
####################
Metric : knn
####################
random mean : 0.8 (SD 0.0)
stratified mean : 0.8 (SD 0.0)
test: nan


  tstat, pvalue = stats.ttest_ind(results_arr[0,:,j].round(decimals=4), results_arr[1,:,j].round(decimals=4), equal_var=False, alternative='two-sided')
