In [2]:
%load_ext autoreload

%autoreload 2

%matplotlib inline

import os
import sys
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pacmap
import umap
import trimap
import ivis
import phate

from scipy.stats import chisquare, chi2_contingency, pearsonr
from scipy.stats import kendalltau,spearmanr, weightedtau, theilslopes, wilcoxon, ttest_rel
from scipy.spatial import distance
import dcor

from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap, MDS, SpectralEmbedding
from sklearn.manifold import LocallyLinearEmbedding as LLE, TSNE, smacof, trustworthiness


from sklearn.metrics import rand_score, adjusted_mutual_info_score, adjusted_rand_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.cluster import KMeans

import numpy.linalg as la
import torch 

from numba import njit

import faiss

from hembedder.utils import distance, hyperparameter_tuning, quality_metrics

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Potential Embeddings to test:
* PCA (sklearn)
* SMACOF (sklearn)
* **LMDS** with exemplars for landmarking (local):
* LMVU (local):
* Sammon (local):
* GPLVM: 
* RankVisu:
* **UMAP**, [docs](https://umap-learn.readthedocs.io/en/latest/index.html), [paper](https://arxiv.org/abs/1802.03426)
* **Parametric UMAP**, [docs](https://umap-learn.readthedocs.io/en/latest/parametric_umap.html), [paper](https://arxiv.org/abs/2009.12981)
* **densMAP**, part of the UMAP library, [paper](https://www.biorxiv.org/content/10.1101/2020.05.12.077776v1)
* **IVIS**, [github](https://github.com/beringresearch/ivis), [paper](https://www.nature.com/articles/s41598-019-45301-0)
* **PHATE**, [docs](https://phate.readthedocs.io/en/stable/), [paper](https://www.nature.com/articles/s41587-019-0336-3)
* **PACMAP**, [github](https://github.com/YingfanWang/PaCMAP), [paper](https://jmlr.org/papers/v22/20-1061.html)
* **TriMap**, [github](https://github.com/eamid/trimap), [paper](https://arxiv.org/abs/1910.00204)
* HOPE, [github](https://github.com/palash1992/GEM), [paper](https://dl.acm.org/doi/abs/10.1145/2939672.2939751)
* opt-SNE, [github](https://github.com/omiq-ai/Multicore-opt-SNE), [paper](https://www.biorxiv.org/content/10.1101/451690v3)
* FactorizedEmbeddings,[github](https://github.com/TrofimovAssya/FactorizedEmbeddings), [paper](https://academic.oup.com/bioinformatics/article/36/Supplement_1/i417/5870511)
* **MCML**, [github](https://github.com/pachterlab/MCML)

Distances:
* Manhattan
* Poincare
* Mahalanobis
* Euclidean

Data:
* celldyn; 3 million samples, 100+ dimensions
* RNA expression; 1,200 samples; 40,000 dimensions
* synthetic datasets

Metrics:
* **Distance correlation**
* Rank preservation
* **Jaccard-index** 
* Calinski-Harabasz
* Davies-Bouldin
* Adjusted Rand score
* **Adjusted mutual information**
* Trustworthiness
* **random triplet score**: rank preservation among random triplets
* **Overall rank preservation**

Table with the embeddings and their quality metrics, as a function of:
* ```n_components```
* ```n_neighbors```
* ```n_samples```

#### celldyn data

In [5]:
# Location of the transformed CellDyn feather file.
# Written as a raw string: the Windows path contains backslash sequences
# (\l, \A, \B, \d, \C, \c) that are invalid escapes in a normal string literal
# and trigger a DeprecationWarning/SyntaxWarning. The resulting path is unchanged.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
CELLDYN_FEATHER_PATH = r"L:\laupodteam\AIOS\Bram\data\CellDyn\celldyn_FULL_transformed_df.feather"

# Load the full CellDyn table into a DataFrame.
celldyn_full = pd.read_feather(CELLDYN_FEATHER_PATH)

# Column groups, each selected by substring match on the column name.
# Columns whose name contains 'c_b' or 'COMBO_' (presumably the measurement columns - confirm).
meas_columns = [c for c in celldyn_full.columns if 'c_b' in c or 'COMBO_' in c]
# Columns whose name contains 'c_m' (presumably the mode columns - confirm).
mode_columns = [c for c in celldyn_full.columns if 'c_m' in c]
# Columns whose lower-cased name contains 'alrt' (case-insensitive match).
alrt_columns = [c for c in celldyn_full.columns if 'alrt' in c.lower()]
# Columns whose lower-cased name contains 'c_s_' (case-insensitive match).
c_s_columns = [c for c in celldyn_full.columns if 'c_s_' in c.lower()]
# Fixed list of demographic column names; not checked against the DataFrame here.
demo_columns = ['gender', 'age']

## Parameter sweep definition

In [17]:
# Values swept for n_samples: doubles at every step, from 1,000 up to 512,000.
sample_sweep = [1000 * 2 ** step for step in range(10)]

# Values swept for n_components.
component_sweep = [
    2, 3, 4,
    8, 12, 16,
    32, 64,
]

# Values swept for n_neighbors.
neighbors_sweep = [
    10, 20, 40, 60, 80,
    100, 200,
]

# Names of the embedding quality metrics to record.
# NOTE(review): presumably these match keys produced by the quality-metric helpers - confirm.
embedding_metrics = [
    'triplet',
    'knn',
    'nkept',
    'distpres',
]

## 