In [2]:
import anndata as ad
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

import phate
import scanpy as sc


In [3]:
# Locations of the cell metadata table and the multiome input matrices.
DATA_DIR = "../../data/multi_cite/"
FP_CELL_METADATA, FP_TRAIN_MULTI_INPUTS, FP_TEST_MULTI_INPUTS = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("metadata.csv", "train_multi_inputs.h5", "test_multi_inputs.h5")
)

# Load the per-cell metadata table (cell_id, day, donor, cell_type, technology).
df_meta = pd.read_csv(FP_CELL_METADATA)
df_meta

Unnamed: 0,cell_id,day,donor,cell_type,technology
0,c2150f55becb,2,27678,HSC,citeseq
1,65b7edf8a4da,2,27678,HSC,citeseq
2,c1b26cb1057b,2,27678,EryP,citeseq
3,917168fa6f83,2,27678,NeuP,citeseq
4,2b29feeca86d,2,27678,EryP,citeseq
...,...,...,...,...,...
281523,96a60b026659,10,31800,hidden,multiome
281524,d493e546991e,10,31800,hidden,multiome
281525,05666c99aa48,10,31800,hidden,multiome
281526,121f946642b5,10,31800,hidden,multiome


In [4]:
# Restrict the metadata to multiome cells of donor 13176 on the selected days.
days_to_keep = [2, 3, 4, 5, 6, 7]
is_multiome = df_meta['technology'].eq('multiome')
is_target_donor = df_meta['donor'].eq(13176)
is_kept_day = df_meta['day'].isin(days_to_keep)
df_meta = df_meta.loc[is_multiome & is_target_donor & is_kept_day]
df_meta

Unnamed: 0,cell_id,day,donor,cell_type,technology
193722,cc1db26a8022,2,13176,HSC,multiome
193723,9cd4f248468c,2,13176,HSC,multiome
193724,891a65593475,2,13176,HSC,multiome
193725,9958bac2678b,2,13176,HSC,multiome
193726,35311a9bd7d5,2,13176,HSC,multiome
...,...,...,...,...,...
229113,5a58b52115cd,7,13176,NeuP,multiome
229114,ccb68da1a889,7,13176,MasP,multiome
229115,6b419951cda1,7,13176,HSC,multiome
229116,2766dd8180cd,7,13176,MasP,multiome


In [5]:
# Distinct cell ids (in order of first appearance) that survived the metadata filter.
cell_ids_of_interest = df_meta['cell_id'].drop_duplicates().tolist()
n_cells_of_interest = len(cell_ids_of_interest)
print('Number of cells of interest:', n_cells_of_interest)

Number of cells of interest: 35396


In [6]:
# Summarize the contents (keys and shapes) of the train and test HDF5 stores.
with pd.HDFStore(FP_TRAIN_MULTI_INPUTS, mode='r') as train_store:
    train_info = train_store.info()
print(f"Train file info: {train_info}")

with pd.HDFStore(FP_TEST_MULTI_INPUTS, mode='r') as test_store:
    test_info = test_store.info()
print(f"Test file info: {test_info}")


Train file info: <class 'pandas.io.pytables.HDFStore'>
File path: ../../data/multi_cite/train_multi_inputs.h5
/train_multi_inputs            frame        (shape->[105942,228942])
Test file info: <class 'pandas.io.pytables.HDFStore'>
File path: ../../data/multi_cite/test_multi_inputs.h5
/test_multi_inputs            frame        (shape->[55935,228942])


In [7]:
# Read the training matrix in row blocks of `chunk_size`.
# NOTE: every block is kept in memory; filtering to the cells of interest happens in a later cell.
# `train_total_rows` is the row count reported by the HDF store info printed above.
chunk_size = 5000
train_total_rows = 105942
df_train_list = [
    pd.read_hdf(FP_TRAIN_MULTI_INPUTS, start=row_offset, stop=row_offset + chunk_size)
    for row_offset in range(0, train_total_rows, chunk_size)
]

In [10]:
# Peek at the first chunk (rows are indexed by cell_id, columns are genomic-region features).
df_train_list[0]

gene_id,GL000194.1:114519-115365,GL000194.1:55758-56597,GL000194.1:58217-58957,GL000194.1:59535-60431,GL000195.1:119766-120427,GL000195.1:120736-121603,GL000195.1:137437-138345,GL000195.1:15901-16653,GL000195.1:22357-23209,GL000195.1:23751-24619,...,chrY:7722278-7723128,chrY:7723971-7724880,chrY:7729854-7730772,chrY:7731785-7732664,chrY:7810142-7811040,chrY:7814107-7815018,chrY:7818751-7819626,chrY:7836768-7837671,chrY:7869454-7870371,chrY:7873814-7874709
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56390cf1b95e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.428336,0.0,0.0,0.0,0.0
fc0c60183c33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9b4a87e22ad0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
81cccad8cd81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
15cb3d85c232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24c23196a4f1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
893c0baff2b3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2f5abd4205d5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
e3159611e631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [14]:
# Keep, chunk by chunk, only the rows whose cell_id is among the cells of interest.
# The shape before and after filtering is printed for each chunk.
df_train_filtered = []
for chunk in df_train_list:
    print(chunk.shape)
    keep_mask = chunk.index.isin(cell_ids_of_interest)
    kept_rows = chunk.loc[keep_mask]
    print(kept_rows.shape)
    df_train_filtered.append(kept_rows)

(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(1299, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(5000, 228942)
(4097, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(5000, 228942)
(0, 228942)
(942, 228942)
(0, 228942)


In [15]:
# Stack the per-chunk filtered frames into a single frame of the selected cells.
df_train = pd.concat(df_train_filtered)
print(df_train)

gene_id       GL000194.1:114519-115365  GL000194.1:55758-56597   
cell_id                                                          
cc1db26a8022                       0.0                     0.0  \
9cd4f248468c                       0.0                     0.0   
891a65593475                       0.0                     0.0   
9958bac2678b                       0.0                     0.0   
35311a9bd7d5                       0.0                     0.0   
...                                ...                     ...   
5a58b52115cd                       0.0                     0.0   
ccb68da1a889                       0.0                     0.0   
6b419951cda1                       0.0                     0.0   
2766dd8180cd                       0.0                     0.0   
bd4b7f1a3b17                       0.0                     0.0   

gene_id       GL000194.1:58217-58957  GL000194.1:59535-60431   
cell_id                                                        
cc1db26a8022 

In [16]:
# Wrap the filtered training frame in an AnnData container.
adata = ad.AnnData(df_train)


In [19]:
# Use the frame's cell_id index as the observation names.
adata.obs_names = df_train.index
adata

AnnData object with n_obs × n_vars = 35396 × 228942

In [21]:
# PCA with 50 components on the AnnData built from df_train.
# NOTE(review): no output was recorded for this cell — confirm it completes on the 35396 × 228942 matrix.
sc.pp.pca(adata, n_comps=50)
adata

: 

### Visualize the donor-specific training subset (`df_train_donor`)

In [28]:
# Load a previously saved AnnData file; this rebinds `adata`, replacing the object built above.
# NOTE(review): the code that produced this file is not shown in this notebook — confirm its provenance.
adata = ad.read_h5ad("adata_multi_train_donor-13176_size-20000.h5ad")
adata

AnnData object with n_obs × n_vars = 20000 × 228942
    obs: 'donor', 'technology', 'cell_id'
    var: 'feature_name'

In [7]:
# # adata = ad.AnnData(df_train_donor)
# adata.var_names = df_train_donor.columns
# adata.obs_names = df_train_donor.index
# adata.obs['donor'] = donor
# adata.obs['technology'] = 'multiome'
# adata.obs['cell_id'] = df_train_donor.index
# adata.var['feature_name'] = df_train_donor.columns
# adata

In [30]:
# Sanity check: which donors and days remain in the filtered metadata.
for meta_column in ('donor', 'day'):
    print(df_meta[meta_column].unique())

[13176]
[ 2  3  4  7 10]


In [31]:
# Look up each cell's day in the metadata and attach it to adata.obs.
# A cell_id missing from df_meta raises KeyError.
cellid2day = dict(zip(df_meta['cell_id'], df_meta['day']))
adata.obs['day'] = list(map(cellid2day.__getitem__, adata.obs['cell_id']))

adata

AnnData object with n_obs × n_vars = 20000 × 228942
    obs: 'donor', 'technology', 'cell_id', 'day'
    var: 'feature_name'

In [35]:
# Distinct day values present among the loaded cells.
adata.obs['day'].unique()

array([3, 7, 4, 2])

In [36]:
# Encode each distinct day as a consecutive integer class, ordered by day value.
day_class = np.unique(adata.obs['day'].tolist())
print(day_class)
day2class = dict(zip(day_class, range(len(day_class))))
adata.obs['day_class'] = list(map(day2class.__getitem__, adata.obs['day']))

print(adata)

[2 3 4 7]
AnnData object with n_obs × n_vars = 20000 × 228942
    obs: 'donor', 'technology', 'cell_id', 'day', 'day_class'
    var: 'feature_name'


In [38]:
# Show the AnnData summary after adding the 'day' and 'day_class' annotations.
adata

AnnData object with n_obs × n_vars = 20000 × 228942
    obs: 'donor', 'technology', 'cell_id', 'day', 'day_class'
    var: 'feature_name'

In [39]:
# Flag the 1000 most variable features; adds 'highly_variable' and dispersion statistics to adata.var.
# NOTE(review): the default flavor presumably expects log-normalized input — confirm for this matrix.
sc.pp.highly_variable_genes(adata, n_top_genes=1000)
adata

AnnData object with n_obs × n_vars = 20000 × 228942
    obs: 'donor', 'technology', 'cell_id', 'day', 'day_class'
    var: 'feature_name', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'

In [40]:
# PCA with 100 components (only the first 50 are passed to PHATE later in the notebook).
sc.tl.pca(adata, n_comps=100)
adata

AnnData object with n_obs × n_vars = 20000 × 228942
    obs: 'donor', 'technology', 'cell_id', 'day', 'day_class'
    var: 'feature_name', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [41]:
# Show the AnnData summary after PCA (obsm 'X_pca' and varm 'PCs' are now present).
adata

AnnData object with n_obs × n_vars = 20000 × 228942
    obs: 'donor', 'technology', 'cell_id', 'day', 'day_class'
    var: 'feature_name', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [42]:
from scipy.spatial.distance import pdist, squareform
from scipy import sparse as sp

def convert_data(X, colors, seed=42, test_size=0.1, knn=5, t='auto', n_components=3):
    """Embed ``X`` with PHATE and bundle the inputs, embedding and distances.

    Parameters
    ----------
    X : array-like or scipy sparse matrix, one row per sample
        Input features. Sparse input is converted to a dense array first.
    colors : array-like
        Per-sample labels; stored unchanged in the result. When it has a
        length, that length must equal the number of rows of ``X``.
    seed : int
        Forwarded to ``phate.PHATE`` as ``random_state``.
    test_size : float
        Currently unused; kept so existing callers keep working.
    knn, t, n_components
        Forwarded unchanged to ``phate.PHATE``.

    Returns
    -------
    dict
        ``data`` (the dense ``X``), ``colors``, ``dist`` (square matrix of
        pairwise distances between rows of the fitted operator's
        ``diff_potential``) and ``phate`` (the PHATE embedding of ``X``).

    Raises
    ------
    ValueError
        If ``colors`` and ``X`` disagree on the number of samples.
    """
    # if X is sparse, convert to dense
    if sp.issparse(X):
        X = X.toarray()

    # Fail fast on mismatched labels: otherwise the mismatch only surfaces
    # (or is silently saved) after the expensive PHATE fit.
    x_shape = np.shape(X)
    colors_shape = np.shape(colors)
    if x_shape and colors_shape and x_shape[0] != colors_shape[0]:
        raise ValueError(
            f"colors has {colors_shape[0]} entries but X has {x_shape[0]} samples"
        )

    phate_op = phate.PHATE(random_state=seed, t=t, n_components=n_components, knn=knn)
    phate_data = phate_op.fit_transform(X)

    # Dense n_samples x n_samples matrix; memory grows quadratically with the sample count.
    dists = squareform(pdist(phate_op.diff_potential))

    return dict(
        data=X,
        colors=colors,
        dist=dists,
        phate=phate_data
    )

# Run PHATE on the first 50 principal components, labelled by day class.
pca_top50 = adata.obsm['X_pca'][:, :50]
day_labels = np.array(adata.obs['day_class'].tolist())
data = convert_data(pca_top50, day_labels)

Calculating PHATE...
  Running PHATE on 20000 observations and 50 variables.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 8.72 seconds.
    Calculating affinities...



Detected zero distance between 62966 pairs of samples. Consider removing duplicates to avoid errors in downstream processing.


overflow encountered in power



    Calculated affinities in 17.30 seconds.
  Calculated graph and diffusion operator in 26.38 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 6.54 seconds.
    Calculating KMeans...
    Calculated KMeans in 3.26 seconds.
  Calculated landmark operator in 10.24 seconds.
  Calculating optimal t...
    Automatically selected t = 5
  Calculated optimal t in 2.72 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.24 seconds.
  Calculating metric MDS...






  Calculated metric MDS in 49.62 seconds.
Calculated PHATE in 89.20 seconds.


: 

In [21]:
# Report the shape of every array in the bundle, then persist them together.
bundle_keys = ('data', 'colors', 'dist', 'phate')
for bundle_key in bundle_keys:
    print(data[bundle_key].shape)

# save npz
np.savez(
    "../../data/multi_D-50_d-3_pca.npz",
    **{bundle_key: data[bundle_key] for bundle_key in bundle_keys},
)


(10000, 50)
(10000,)
(10000, 10000)
(10000, 3)


In [23]:
import plotly.graph_objects as go

# 3D scatter of the PHATE embedding, one trace per day class.
fig = go.Figure()
day_class_labels = adata.obs['day_class'].to_numpy()
for i in range(adata.obs['day_class'].nunique()):
    # Compute the class mask once and reuse it for all three coordinates.
    class_points = data['phate'][day_class_labels == i]
    fig.add_scatter3d(x=class_points[:, 0],
                      y=class_points[:, 1],
                      z=class_points[:, 2],
                      mode='markers', marker=dict(size=2, color=i),
                      name=f"day_class {i}")  # label the trace so the legend is readable
fig.show()


### Visualization