In [1]:
import anndata as ad
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

import phate
import scanpy as sc


In [2]:
# Meta data: file locations (relative to this notebook) and the per-cell metadata table.
DATA_DIR = "../../data/multi_cite/"
FP_CELL_METADATA = os.path.join(DATA_DIR,"metadata.csv")
# The multiome input files are not used in this notebook; paths kept for reference.
# FP_TRAIN_MULTI_INPUTS = os.path.join(DATA_DIR,"train_multi_inputs.h5")
# FP_TEST_MULTI_INPUTS = os.path.join(DATA_DIR,"test_multi_inputs.h5")
FP_TRAIN_MULTI_TARGETS = os.path.join(DATA_DIR,"train_multi_targets.h5")

# One row per cell; the columns used below are cell_id, day, donor and technology.
df_meta = pd.read_csv(FP_CELL_METADATA)
df_meta

Unnamed: 0,cell_id,day,donor,cell_type,technology
0,c2150f55becb,2,27678,HSC,citeseq
1,65b7edf8a4da,2,27678,HSC,citeseq
2,c1b26cb1057b,2,27678,EryP,citeseq
3,917168fa6f83,2,27678,NeuP,citeseq
4,2b29feeca86d,2,27678,EryP,citeseq
...,...,...,...,...,...
281523,96a60b026659,10,31800,hidden,multiome
281524,d493e546991e,10,31800,hidden,multiome
281525,05666c99aa48,10,31800,hidden,multiome
281526,121f946642b5,10,31800,hidden,multiome


In [3]:
# Restrict the metadata to multiome cells of donor 13176 sampled on the chosen days.
days_to_keep = [2,3,4,7]
df_meta = df_meta.loc[
    df_meta['technology'].eq('multiome')
    & df_meta['donor'].eq(13176)
    & df_meta['day'].isin(days_to_keep)
]
df_meta

Unnamed: 0,cell_id,day,donor,cell_type,technology
193722,cc1db26a8022,2,13176,HSC,multiome
193723,9cd4f248468c,2,13176,HSC,multiome
193724,891a65593475,2,13176,HSC,multiome
193725,9958bac2678b,2,13176,HSC,multiome
193726,35311a9bd7d5,2,13176,HSC,multiome
...,...,...,...,...,...
229113,5a58b52115cd,7,13176,NeuP,multiome
229114,ccb68da1a889,7,13176,MasP,multiome
229115,6b419951cda1,7,13176,HSC,multiome
229116,2766dd8180cd,7,13176,MasP,multiome


In [4]:
# De-duplicated cell ids of the filtered metadata, in first-occurrence order.
cell_ids_of_interest = df_meta['cell_id'].drop_duplicates().tolist()
print('Number of cells of interest:', len(cell_ids_of_interest))

Number of cells of interest: 35396


In [9]:
# Print the summary of the HDF5 store (key, storage kind and shape of each entry).
# The same check for the multiome input files is kept commented out below.
# with pd.HDFStore(FP_TRAIN_MULTI_INPUTS, mode='r') as store:
#     print(f"Train file info: {store.info()}")

# with pd.HDFStore(FP_TEST_MULTI_INPUTS, mode='r') as store:
#     print(f"Test file info: {store.info()}")

# Opened read-only inside a context manager so the file is closed again afterwards.
with pd.HDFStore(FP_TRAIN_MULTI_TARGETS, mode='r') as store:
    print(f"Train file info: {store.info()}")


Train file info: <class 'pandas.io.pytables.HDFStore'>
File path: ../../data/multi_cite/train_multi_targets.h5
/train_multi_targets            frame        (shape->[105942,23418])


In [5]:
# Earlier experiment, kept for reference: read the multiome *inputs* in chunks of
# 5000 rows and keep only the cells of interest from each chunk.
# df_train_list = []
# chunk_size = 5000
# train_total_rows = 105942
# for start in range(0, train_total_rows, chunk_size):
#     df_train = pd.read_hdf(FP_TRAIN_MULTI_INPUTS, start=start, stop=start+chunk_size)
#     df_train_list.append(df_train)
# Filter df_train_list
# df_train_filtered = []
# for df in df_train_list:
#     print(df.shape)
#     filtered_df = df[df.index.isin(cell_ids_of_interest)]
#     print(filtered_df.shape)
#     df_train_filtered.append(filtered_df)

# Load the whole targets table in a single read (the store summary reports
# shape [105942, 23418]); it is filtered down to the cells of interest afterwards.
df_train = pd.read_hdf(FP_TRAIN_MULTI_TARGETS)

In [6]:
print(df_train)

# Keep only the rows whose index (cell_id) is among the selected cells.
df_train = df_train.loc[df_train.index.isin(cell_ids_of_interest)]
print(df_train.shape)

gene_id       ENSG00000121410  ENSG00000268895  ENSG00000175899   
cell_id                                                           
56390cf1b95e              0.0         0.000000              0.0  \
fc0c60183c33              0.0         0.000000              0.0   
9b4a87e22ad0              0.0         0.000000              0.0   
81cccad8cd81              0.0         4.507936              0.0   
15cb3d85c232              0.0         0.000000              0.0   
...                       ...              ...              ...   
063cead1a4ea              0.0         0.000000              0.0   
553bca99ba78              0.0         0.000000              0.0   
00783f28b463              0.0         0.000000              0.0   
e7abb1a0f251              0.0         0.000000              0.0   
193992d571a5              0.0         0.000000              0.0   

gene_id       ENSG00000245105  ENSG00000166535  ENSG00000256661   
cell_id                                                      

In [7]:
# Wrap the filtered targets in an AnnData container: rows become observations
# (cells), columns become variables (genes).
adata = ad.AnnData(df_train)

In [8]:
# Use the cell ids as observation names so later lookups can key on them.
# NOTE(review): AnnData built from a DataFrame presumably already adopts the
# frame's index as obs_names, which would make this assignment redundant — confirm.
adata.obs_names = df_train.index
adata

AnnData object with n_obs × n_vars = 35396 × 23418

### Visualize the donor-filtered `df_train`

In [9]:
adata.obs_names

Index(['cc1db26a8022', '9cd4f248468c', '891a65593475', '9958bac2678b',
       '35311a9bd7d5', '0c7590316e10', '81b91aab870b', '166e9908ab52',
       '38ed64b751a8', '57ccbbd8b419',
       ...
       '7a8e4d5cc54e', 'f5012a716bd4', '34c9e765f50d', '0df4d0998c02',
       'd4df16ea98b2', '5a58b52115cd', 'ccb68da1a889', '6b419951cda1',
       '2766dd8180cd', 'bd4b7f1a3b17'],
      dtype='object', name='cell_id', length=35396)

In [10]:
# Sanity check: the filtered metadata should contain one donor and only the kept days.
print(df_meta['donor'].unique())
print(df_meta['day'].unique())

[13176]
[2 3 4 7]


In [11]:
# Attach the sampling day to every cell through a cell_id -> day lookup table.
# A cell id missing from the metadata raises KeyError rather than yielding NaN.
cellid2day = dict(zip(df_meta['cell_id'], df_meta['day']))
adata.obs['day'] = [cellid2day[obs_name] for obs_name in adata.obs_names]

adata

AnnData object with n_obs × n_vars = 35396 × 23418
    obs: 'day'

In [12]:
adata.obs['day'].unique()

array([2, 3, 4, 7])

In [13]:
# Encode every distinct day as a consecutive integer class; np.unique returns the
# days sorted ascending, so class 0 is the earliest day.
day_class = np.unique(adata.obs['day'].tolist())
print(day_class)
day2class = dict(zip(day_class, range(len(day_class))))
adata.obs['day_class'] = adata.obs['day'].map(day2class)

print(adata)

[2 3 4 7]
AnnData object with n_obs × n_vars = 35396 × 23418
    obs: 'day', 'day_class'


In [14]:
adata

AnnData object with n_obs × n_vars = 35396 × 23418
    obs: 'day', 'day_class'

In [15]:
# hvg: flag the 1000 most variable genes. This only annotates adata.var
# ('highly_variable', 'means', 'dispersions', 'dispersions_norm'); no genes are
# removed, so n_vars stays at 23418.
sc.pp.highly_variable_genes(adata, n_top_genes=1000)
adata

AnnData object with n_obs × n_vars = 35396 × 23418
    obs: 'day', 'day_class'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'

In [16]:
# pca, 100 components; the scores land in adata.obsm['X_pca'] and the loadings
# in adata.varm['PCs'].
# NOTE(review): scanpy's PCA may restrict itself to the genes flagged in
# var['highly_variable'] when that column exists — confirm for the installed version.
sc.tl.pca(adata, n_comps=100)
adata

AnnData object with n_obs × n_vars = 35396 × 23418
    obs: 'day', 'day_class'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [17]:
adata

AnnData object with n_obs × n_vars = 35396 × 23418
    obs: 'day', 'day_class'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [18]:
from scipy.spatial.distance import pdist, squareform
from scipy import sparse as sp

def convert_data(X, colors, seed=42, test_size=0.1, knn=5, t='auto', n_components=3):
    """Run PHATE on ``X`` and bundle the inputs with the resulting arrays.

    Parameters
    ----------
    X : array-like or scipy sparse matrix
        Feature matrix with one row per observation. Sparse input is densified.
    colors : array-like
        Per-observation labels; passed through unchanged.
    seed : int
        Forwarded to PHATE as ``random_state``.
    test_size : float
        Accepted for interface compatibility; not used by this function.
    knn, t, n_components
        Forwarded to the PHATE constructor unchanged.

    Returns
    -------
    dict
        ``data``   - the (dense) input matrix,
        ``colors`` - the labels as given,
        ``dist``   - square matrix of pairwise distances (``pdist`` default
                     metric) between rows of the fitted operator's
                     ``diff_potential``,
        ``phate``  - the embedding returned by ``fit_transform``.

    Notes
    -----
    ``dist`` is a dense n x n matrix, so memory grows quadratically with the
    number of observations.
    """
    dense_X = X.toarray() if sp.issparse(X) else X

    embedder = phate.PHATE(random_state=seed, t=t, n_components=n_components, knn=knn)
    embedding = embedder.fit_transform(dense_X)

    # diff_potential is only available once the operator has been fitted above.
    potential_dists = squareform(pdist(embedder.diff_potential))

    return {
        'data': dense_X,
        'colors': colors,
        'dist': potential_dists,
        'phate': embedding,
    }

# Embed the PCA scores with PHATE, labelled by day class. The [:, :100] slice keeps
# every column here because PCA was computed with n_comps=100.
data = convert_data(adata.obsm['X_pca'][:,:100], np.array(adata.obs['day_class'].tolist()))

Calculating PHATE...
  Running PHATE on 35396 observations and 100 variables.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 82.58 seconds.
    Calculating affinities...
    Calculated affinities in 37.32 seconds.
  Calculated graph and diffusion operator in 119.93 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 2.00 seconds.
    Calculating KMeans...
    Calculated KMeans in 3.52 seconds.
  Calculated landmark operator in 6.17 seconds.
  Calculating optimal t...
    Automatically selected t = 17
  Calculated optimal t in 2.24 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.54 seconds.
  Calculating metric MDS...




  Calculated metric MDS in 10.30 seconds.
Calculated PHATE in 139.18 seconds.


In [19]:
# Report the shape of every array in the bundle, one per line.
print(*(data[key].shape for key in ('data', 'colors', 'dist', 'phate')), sep='\n')

# save npz (uncompressed; 'dist' is a dense n x n matrix, so the file is large)
np.savez(
    "../../data/multi_D-100_d-3_pca.npz",
    data=data['data'],
    colors=data['colors'],
    dist=data['dist'],
    phate=data['phate'],
)


(35396, 100)
(35396,)
(35396, 35396)
(35396, 3)


In [17]:
import plotly.graph_objects as go

# 3-D PHATE embedding with one trace per time point.
#
# No numeric marker colour is passed. Plotly reads a bare number in
# `marker.color` as a value on the trace's colorscale, not as a category, so
# `color=i` does not give each day class its own colour. Leaving the colour
# unset lets every trace take the next entry of the default colorway, and
# `name` puts the actual day (not "trace N") in the legend.
fig = go.Figure()

# day_class -> day, ordered by class index (same order as range(n_classes)).
day_by_class = adata.obs.groupby('day_class')['day'].first()

for i, day in day_by_class.items():
    # Compute the boolean mask once per class instead of once per axis.
    in_class = (adata.obs['day_class'] == i).to_numpy()
    coords = data['phate'][in_class]
    fig.add_scatter3d(x=coords[:, 0],
                      y=coords[:, 1],
                      z=coords[:, 2],
                      mode='markers', marker=dict(size=2),
                      name=str(day))

# Constant legend marker size keeps the size-2 points visible in the legend.
fig.update_layout(legend_title_text='day', legend_itemsizing='constant')
fig.show()


: 

### Visualization