In [1]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so code changes take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Basic setup

In [2]:
import time
import warnings
from pathlib import Path

import concord as ccd
import scanpy as sc
import torch
# Silence every library warning for the rest of the session.
warnings.filterwarnings('ignore')

# --- Project identity and input/output directories ---------------------------
proj_name = "CBCEcombineN2"
data_dir = Path('../data/CBCEcombineN2')
data_dir.mkdir(parents=True, exist_ok=True)

# Read the clock once so the dated output folder and the per-run file suffix
# always agree, even when the cell happens to run across midnight.
run_started = time.localtime()
save_dir = Path('../save') / f"dev_{proj_name}-{time.strftime('%b%d', run_started)}"
save_dir.mkdir(parents=True, exist_ok=True)
file_suffix = time.strftime('%b%d-%H%M', run_started)

# --- Compute device ------------------------------------------------------------
# GPU 3 is the preferred card. On hosts that expose fewer GPUs, fall back to
# GPU 0 instead of building a device handle that fails at first use; with no
# CUDA at all, run on the CPU.
preferred_gpu = 3
if torch.cuda.is_available():
    gpu_index = preferred_gpu if torch.cuda.device_count() > preferred_gpu else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')

# Random seed shared by the training and UMAP cells.
seed = 0

In [22]:
# Input AnnData files: the N2 dataset and the combined CB/CE dataset.
n2_h5ad_path = '../data/celegans_binyamin/N2_outs/concord_celN2_Jun12-1457.h5ad'
cbce_h5ad_path = '../data/CE_CB/adata_cbce_Jan30-1028.h5ad'

n2_adata = sc.read_h5ad(n2_h5ad_path)
cbce_adata = sc.read_h5ad(cbce_h5ad_path)

In [23]:
# --- Give the N2 dataset the obs columns used on the CBCE side ----------------
n2_label = 'BZ_N2'
n2_adata.obs['embryo.time'] = n2_adata.obs['raw.embryo.time']
for batch_col in ('batch', 'batch_fine', 'batch_broad'):
    n2_adata.obs[batch_col] = n2_label
n2_adata.obs['species'] = 'C.elegans'

# Tag each cell name with the dataset label. Names that already carry the tag
# are left alone so that re-running this cell does not stack "-BZ_N2" suffixes.
n2_suffix = f"-{n2_label}"
n2_adata.obs_names = [
    name if name.endswith(n2_suffix) else f"{name}{n2_suffix}"
    for name in n2_adata.obs_names
]

# --- Derive fine / broad batch labels for the CBCE dataset --------------------
cbce_adata.obs['batch_fine'] = cbce_adata.obs['batch'].copy()
cbce_adata.obs['batch_broad'] = cbce_adata.obs['dataset3'].astype(str).copy()

# --- Combine both datasets; the new 'lab' column records the source -----------
# NOTE(review): AnnData.concatenate appears to be deprecated in recent anndata
# releases in favour of anndata.concat, whose defaults differ — confirm before
# upgrading anndata.
adata = cbce_adata.concatenate(n2_adata, batch_key='lab', batch_categories=['Murray_CBCE', 'Gartner_BZ'])

# Reset X from the "counts" layer, compute QC metrics on it, then normalise
# and log-transform X in place.
adata.X = adata.layers["counts"].copy()
sc.pp.calculate_qc_metrics(adata, inplace=True)
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# Save the combined, normalised object (no embeddings have been computed yet).
combined_h5ad_path = data_dir / f"{proj_name}_{file_suffix}.h5ad"
adata.write_h5ad(combined_h5ad_path)
print(f"Saved adata to {combined_h5ad_path}")

Saved adata to ../data/CBCEcombineN2/CBCEcombineN2_Jun26-1610.h5ad


In [None]:
# Save Packer dataset + N2 dataset separately
unique_batches = adata.obs['batch_broad'].unique()
print(f"Unique batches: {unique_batches}")

# Keep every broad batch whose name mentions Waterston, Murray or BZ_N2.
celsub_keywords = ('Waterston', 'Murray', 'BZ_N2')
filtered_batches = [
    batch
    for batch in unique_batches
    if any(keyword in batch for keyword in celsub_keywords)
]
print(f"Filtered batches: {filtered_batches}")

# Materialise the subset as an independent object before writing it out.
in_celsub = adata.obs['batch_broad'].isin(filtered_batches)
adata_celsub = adata[in_celsub].copy()
print(f"adata_celsub shape: {adata_celsub.shape}")

celsub_h5ad_path = data_dir / f"adata_celsub_{file_suffix}.h5ad"
adata_celsub.write_h5ad(celsub_h5ad_path)
print(f"Saved adata_celsub to {celsub_h5ad_path}")

Unique batches: ['Waterston_300_minutes', 'Waterston_400_minutes', 'Waterston_500_1_minutes', 'Waterston_500_2_minutes', 'Ce_M03D44_300_minutes', ..., 'batch_300', 'batch_360', 'batch_400', 'batch_500', 'BZ_N2']
Length: 22
Categories (22, object): ['BZ_N2', 'Ce_M03D44_300_minutes', 'Ce_M03D44_500_minutes', 'Ce_ceh9_300_minutes', ..., 'batch_300', 'batch_360', 'batch_400', 'batch_500']
Filtered batches: ['Waterston_300_minutes', 'Waterston_400_minutes', 'Waterston_500_1_minutes', 'Waterston_500_2_minutes', 'Murray_b01', 'Murray_b02', 'Murray_r17', 'BZ_N2']
adata_celsub shape: (94276, 13405)


In [29]:
# Split the combined object by species and write one file per species.
is_celegans = adata.obs['species'] == 'C.elegans'
is_cbriggsae = adata.obs['species'] == 'C.briggsae'

adata_celegans = adata[is_celegans].copy()
adata_cbriggsae = adata[is_cbriggsae].copy()

for species_tag, species_adata in (
    ('celegans', adata_celegans),
    ('cbriggsae', adata_cbriggsae),
):
    species_adata.write_h5ad(data_dir / f"adata_{species_tag}_{file_suffix}.h5ad")

## Run Concord

In [None]:

# Resume from a fixed, previously written snapshot of the combined dataset
# rather than from the file written earlier in the current session.
combined_snapshot_path = "../data/CBCEcombineN2/CBCEcombineN2_Jun26-1610.h5ad"
adata = sc.read_h5ad(combined_snapshot_path)

In [None]:
# Loosely select a large feature set with the Seurat v3 method so that enough
# information is preserved for training.
# NOTE(review): the snapshot loaded for this section was presumably written
# after normalize_total + log1p, so X is likely log-normalised here, while
# scanpy's 'seurat_v3' flavor is documented as expecting raw counts — confirm
# how ccd.ul.select_features handles this.
feature_list = ccd.ul.select_features(adata, n_top_features=10000, flavor='seurat_v3')

# Keyword arguments forwarded to ccd.Concord(...).
concord_args = dict(
    adata=adata,
    input_feature=feature_list,
    domain_key='batch',
    batch_size=64,                # training batch size; adjust as needed
    latent_dim=300,               # size of the latent space; adjust as needed
    encoder_dims=[1000],          # recommended to be larger than latent_dim
    use_decoder=False,            # set to True to train with a decoder
    decoder_dims=[1000],          # ignored when use_decoder is False
    augmentation_mask_prob=0.3,   # feature-masking probability; 0.2-0.5 recommended
    clr_temperature=0.3,          # contrastive-loss temperature; 0.1-0.5 recommended
    sampler_emb=None,
    p_intra_knn=0.3,              # intra-neighborhood sampling probability; must be < 0.5
    sampler_knn=300,              # neighborhood size for intra-neighborhood sampling
    # Enrichment probability for intra-domain sampling (0.85-1.0 recommended).
    # The lower the value, the more dataset-specific information (which may
    # contain batch effects) is preserved.
    p_intra_domain=.95,
    n_epochs=15,                  # number of training epochs; adjust as needed
    verbose=True,                 # True gives more detailed output
    seed=seed,                    # random seed for reproducibility
    device=device,                # training device: 'cpu', 'cuda' or 'mps'
    save_dir=save_dir,            # where the model and results are saved
)

In [None]:
# Train Concord and store the embedding under obsm[output_key].
output_key = 'Concord'
cur_ccd = ccd.Concord(**concord_args)
cur_ccd.fit_transform(output_key=output_key)

# Persist the obsm entries of `adata` next to the other run outputs.
obsm_h5_path = save_dir / f"obsm_{file_suffix}.h5"
ccd.ul.save_obsm_to_hdf5(adata, obsm_h5_path)

In [None]:
import numpy as np

# Count the cells whose raw embryo time is negative.
negative_time_mask = adata.obs['raw.embryo.time'] < 0
np.sum(negative_time_mask)

In [None]:
# Number of cells in each batch.
batch_counts = adata.obs['batch'].value_counts()
batch_counts

In [None]:
# For BZ_N2 cells, fill 'embryo.time' from 'raw.embryo.time', cast to float
# for consistency.
is_bz_n2 = adata.obs['batch'] == 'BZ_N2'
bz_n2_raw_time = adata.obs.loc[is_bz_n2, 'raw.embryo.time'].astype(float)
adata.obs.loc[is_bz_n2, 'embryo.time'] = bz_n2_raw_time

In [None]:
# Work on plain strings so the label comparisons and assignments below are simple.
adata.obs['batch'] = adata.obs['batch'].astype(str)
from_bz_n2 = adata.obs['batch'] == 'BZ_N2'
adata.obs.loc[from_bz_n2, 'species'] = 'C.elegans'

adata.obs['plot.cell.type'] = adata.obs['plot.cell.type'].astype(str)

# NOTE(review): this mask tests the cell-type column itself for 'BZ_N2'; if the
# intent was "cells from the BZ_N2 batch", the mask should presumably be built
# from 'batch' instead — confirm.
labelled_bz_n2 = adata.obs['plot.cell.type'] == 'BZ_N2'
adata.obs.loc[labelled_bz_n2, 'plot.cell.type'] = 'Unannotated'

# Turn the literal string 'nan' (presumably produced by the string cast of
# missing values) back into a real missing value.
was_missing = adata.obs['plot.cell.type'] == 'nan'
adata.obs.loc[was_missing, 'plot.cell.type'] = np.nan

In [None]:
# 2D UMAP of the Concord embedding, coloured by several obs columns.
basis = 'Concord'
show_basis = f'{basis}_UMAP'
ccd.ul.run_umap(
    adata,
    source_key=basis,
    result_key=show_basis,
    n_components=2,
    n_neighbors=30,
    min_dist=0.1,
    metric='euclidean',
    random_state=seed,
)

# Columns to colour by, and the palette used for each of them.
show_cols = ['plot.cell.type', 'embryo.time', "batch", 'lab', 'species']
pal = {
    'plot.cell.type': 'tab20',
    'embryo.time': 'BlueGreenRed',
    "batch": 'tab20',
    'lab': 'Set2',
    'species': 'Set1',
}

umap_png_path = save_dir / f"{show_basis}_{file_suffix}.png"
ccd.pl.plot_embedding(
    adata, show_basis, show_cols,
    figsize=(13, 9), dpi=600, ncols=3, font_size=5, point_size=.3,
    legend_loc='on data',
    pal=pal,
    save_path=umap_png_path,
)

In [None]:
# Write the current obsm entries of `adata` to a second HDF5 file.
obsm_snapshot_path = save_dir / f"obsm_{file_suffix}_pca_init.h5"
ccd.ul.save_obsm_to_hdf5(adata, obsm_snapshot_path)

In [None]:
import plotly.io as pio

# 3D UMAP of the same embedding, this time with the cosine metric.
show_basis = f'{basis}_UMAP_3D_cosine'
ccd.ul.run_umap(
    adata,
    source_key=basis,
    result_key=show_basis,
    n_components=3,
    n_neighbors=30,
    min_dist=0.1,
    metric='cosine',
    random_state=seed,
)

pio.renderers.default = 'notebook'

# One interactive HTML figure per colouring column.
for col in show_cols:
    html_path = save_dir / f'{show_basis}_{col}_{file_suffix}.html'
    ccd.pl.plot_embedding_3d(
        adata,
        basis=show_basis,
        color_by=col,
        pal=pal[col],
        save_path=html_path,
        point_size=1,
        opacity=0.8,
        width=1500,
        height=1000,
    )

In [None]:
import pandas as pd

# Cross-tabulate batch against species to check the species labels per batch.
pd.crosstab(index=adata.obs['batch'], columns=adata.obs['species'])

In [None]:
# Save the adata object with the encoded embeddings.
final_h5ad_path = data_dir / f"{proj_name}_{file_suffix}.h5ad"
adata.write_h5ad(final_h5ad_path)
print(f"Saved adata to {final_h5ad_path}")


In [None]:
# Export the annotated object as a VisCello project directory.
# NOTE(review): organism='hsa' looks like a human code while this dataset is
# labelled C.elegans / C.briggsae — confirm which organism codes
# anndata_to_viscello accepts and whether 'hsa' is intended here.
viscello_dir = data_dir / f"cello_{proj_name}_{file_suffix}"
ccd.ul.anndata_to_viscello(adata, viscello_dir, project_name=proj_name, organism='hsa')
print(f"Saved viscello to {viscello_dir}")