In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Basic setup

In [3]:
import concord as ccd
import scanpy as sc
import torch
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')
import time
from pathlib import Path
# Use the first CUDA device when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
# Random seed reused by the stochastic steps in this notebook.
seed = 0

In [4]:
import time
from pathlib import Path
# Project identity: the same string names the dataset files and the folders.
proj_name = "huycke_intestine"
file_name = proj_name
# Timestamp tag (month-day-hourminute, e.g. "Jul17-1624") appended to outputs.
file_suffix = time.strftime('%b%d-%H%M')
seed = 0

# Notebook-relative folders for results and for input/intermediate data;
# both are created up front if they do not exist yet.
save_dir = Path(f"../save/{proj_name}")
data_dir = Path(f"../data/{proj_name}")
for _dir in (save_dir, data_dir):
    _dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Load the raw annotated dataset from the project data folder.
# The path is built from data_dir (defined in the setup cell) rather than
# repeating the folder name as a literal, so it stays in sync with proj_name
# and matches how the same file is loaded in the "Collect results" section.
data_path = data_dir / 'adata_huycke_Mar31-2314.h5ad'
adata = sc.read(data_path)

In [None]:
# Flag highly variable genes in adata.var without dropping any columns yet
# (subset=False); the subsetting itself happens in the next cell.
# NOTE(review): the shape displayed after reloading the cached file shows
# 10000 genes, not 5000 — confirm which n_top_genes produced the file on disk.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=5000,
    flavor='seurat_v3',
    subset=False,
)

In [None]:
# Keep only the genes flagged as highly variable, then cache the subset on disk.
hvg_path = data_dir / f"{file_name}_preprocessed_HVG.h5ad"
adata = adata[:, adata.var.highly_variable].copy()
adata.write_h5ad(hvg_path)
print(f"✅ Preprocessed data saved to {hvg_path}")

In [5]:
# Reload the cached highly-variable-gene subset from the project data folder.
hvg_file = data_dir / f"{file_name}_preprocessed_HVG.h5ad"
adata = sc.read_h5ad(hvg_file)

In [6]:
# Display the dimensions of the reloaded object as a sanity check.
adata.shape

(65468, 10000)

In [None]:
# CONCORD overrides for the benchmark jobs; serialised to JSON when the job
# scripts are generated.
concord_args = dict(
    n_epochs=15,                     # number of training epochs, adjust as needed
    save_dir='../' + str(save_dir),  # model/result folder, one extra "../" above the notebook-relative save_dir
)

In [None]:
import subprocess, json

# Settings handed to generate_py_jobs.py. Everything is passed on the command
# line, so numeric values and the "no state key" marker are kept as strings.
py_methods = ["scvi", "harmony", "scanorama", "liger", "unintegrated", "concord_hcl", "concord_knn", "contrastive"]
output_dir = '../jobs'
device = 'auto'       # NOTE: rebinds `device`, which held a torch.device after the setup cell
conda_env = 'cellpath'
batch_key = 'LaneID'
state_key = 'None'    # forwarded as the literal string "None"
latent_dim = '50'     # the direct CONCORD runs later in this notebook also use latent_dim=50

# Run the job-script generator. check=True raises CalledProcessError when the
# script exits with a non-zero status, instead of letting the cell finish as
# if the job scripts had been written.
subprocess.run(
    [
        "python", "./generate_py_jobs.py",
        "--proj_name", proj_name,
        "--adata_filename", f"{file_name}_preprocessed_HVG.h5ad",
        "--methods", *py_methods,
        "--batch_key", batch_key,
        "--state_key", state_key,
        "--latent_dim", latent_dim,
        "--output_dir", output_dir,
        "--device", device,
        "--mem", "32G",  # Adjust memory as needed
        "--conda_env", conda_env,
        "--runtime", "2:00:00",
        "--mode", "wynton",
        "--concord_kwargs", json.dumps(concord_args),
    ],
    check=True,
)


In [None]:
# Export adata into a "viscello_<proj>" folder inside the project data
# directory; the Seurat script generation below points at this same folder.
viscello_dir = data_dir / f"viscello_{proj_name}"
ccd.ul.anndata_to_viscello(
    adata,
    output_dir=viscello_dir,
    project_name=proj_name,
    organism='mmu',
)

In [None]:
# Generate script for Seurat
import subprocess

# R-based integration methods; batch_key, state_key, latent_dim and conda_env
# are reused from the Python job-generation cell above.
r_methods = ["seurat_cca", "seurat_rpca"]
output_dir = '../jobs'
device = 'auto'

# check=True raises CalledProcessError when the generator exits with a
# non-zero status, so a failed run cannot pass unnoticed.
subprocess.run(
    [
        "python", "./generate_seurat_script.py",
        "--proj_name", proj_name,
        "--eset_dir", '../' + str(data_dir / f"viscello_{proj_name}"),   # <- folder w/ eset.rds
        "--methods", *r_methods,
        "--batch_key", batch_key,
        "--state_key", state_key,
        "--latent_dim", latent_dim,
        "--mem", "100G",  # Adjust memory as needed
        "--runtime", "36:00:00",
        "--output_dir", output_dir,
        "--device", device,
        "--conda_env", conda_env,
    ],
    check=True,
)

In [None]:
# Folder holding this project's job scripts: ../jobs/benchmark_<proj>
proj_folder = Path(output_dir) / f"benchmark_{proj_name}"
# parents=True so the cell also works when the jobs folder itself is missing.
proj_folder.mkdir(parents=True, exist_ok=True)

# Write a helper script that queues every benchmark_<proj>_*.sh file found in
# the folder. Its own name does not match that pattern, so it never lists itself.
submit_all = proj_folder / f"submit_all_{proj_name}.sh"
# Explicit UTF-8: the header line contains a non-ASCII dash, which a
# locale-dependent default encoding may be unable to write.
with submit_all.open("w", encoding="utf-8") as script:
    script.write("#!/bin/bash\n")
    script.write("# Auto-generated — submits every job for this project\n")
    script.write("# Run from this folder, or let the script cd into it.\n\n")
    script.write('cd "$(dirname "$0")"\n\n')  # run from the script's own directory
    # Sorted so the submission order is stable between runs.
    for job_script in sorted(proj_folder.glob(f"benchmark_{proj_name}_*.sh")):
        script.write(f'qsub "{job_script.name}"\n')

# Make the helper executable (rwxr-xr-x).
submit_all.chmod(0o755)
print(f"📌  Run “{submit_all}” to queue every job.")

### Collect results

In [None]:
# Start again from the raw annotated dataset before attaching benchmark results.
# Alternative input kept for reference: data_dir / "adata_huycke_Jul07-1048.h5ad"
data_path = data_dir / 'adata_huycke_Mar31-2314.h5ad'
adata = sc.read(data_path)

In [None]:
from benchmark_utils import collect_benchmark_logs

# Every method that was submitted, Python- and R-based alike.
methods = [
    "scvi", "harmony", "scanorama", "liger", "unintegrated",
    "concord_hcl", "concord_knn", "contrastive",
    "seurat_cca", "seurat_rpca",
]

bench_df = collect_benchmark_logs(file_name, methods)

# Store the collected table as a tab-separated file next to the other results.
summary_path = save_dir / f"benchmark_summary_{file_suffix}.tsv"
bench_df.to_csv(summary_path, sep="\t", index=False)
print(f"✅ Benchmark summary saved to: {summary_path}")

# Draw the benchmark figure; the font override only applies inside the context.
from benchmark_utils import plot_benchmark_performance
import matplotlib.pyplot as plt
custom_rc = {'font.family': 'Arial'}
with plt.rc_context(rc=custom_rc):
    plot_benchmark_performance(
        bench_df,
        figsize=(8,2),
        dpi=300,
        save_path=save_dir / f"benchmark_plot_{file_suffix}.pdf",
    )


In [None]:
# List the embeddings currently stored on the object before they are cleared below.
adata.obsm.keys()

In [None]:
# Remove every entry from adata.obsm so the embeddings added next start from a
# clean slate. The keys are snapshotted first so the mapping is not mutated
# while it is being iterated.
for obsm_key in tuple(adata.obsm.keys()):
    del adata.obsm[obsm_key]

In [None]:
from benchmark_utils import add_embeddings

# Methods whose embeddings are attached to adata ("unintegrated" is left out here).
methods = [
    "scvi", "harmony", "scanorama", "liger",
    "concord_hcl", "concord_knn", "contrastive",
    "seurat_cca", "seurat_rpca",
]
adata = add_embeddings(adata, proj_name=proj_name, methods=methods)

In [None]:
# Compute a 2-D and a 3-D UMAP for every method embedding present in adata.obsm.
# Both runs share the same neighbourhood settings and seed.
umap_settings = dict(n_neighbors=30, min_dist=0.1, metric='euclidean', random_state=seed)
for basis in methods:
    print("Running UMAP for", basis)
    if basis in adata.obsm:
        ccd.ul.run_umap(adata, source_key=basis, result_key=f'{basis}_UMAP', n_components=2, **umap_settings)
        ccd.ul.run_umap(adata, source_key=basis, result_key=f'{basis}_UMAP_3D', n_components=3, **umap_settings)
    else:
        # The embedding for this method was not loaded; report it and move on.
        print(f"{basis} not found.")

# Persist the embeddings on their own and the full object, both tagged with
# the session timestamp.
final_path = data_dir / f"adata_final_{file_suffix}.h5ad"
ccd.ul.save_obsm_to_hdf5(adata, save_dir / f"obsm_{file_suffix}.h5")
adata.write_h5ad(final_path)
print(f"✅ Saved adata with embeddings to {final_path}")

In [None]:
# Colour palettes for every metadata column used in the plots below.
adata.obs['stage'] = adata.obs['MouseAge_combined']


def _palette_for(column, cmap, rng_seed=seed):
    """Return the third value of ccd.pl.get_color_mapping, used here as the palette."""
    _, _, mapping = ccd.pl.get_color_mapping(adata, column, pal=cmap, seed=rng_seed)
    return mapping


celltype_pal = _palette_for('cell_type', 'Paired')
broad_celltype_pal = _palette_for('broad_cell_type', 'tab20')
broad_celltype_pal['Doublet-like'] = '#757575'  # fixed grey for this category
mes_pal = _palette_for('mes_subtype', 'Paired')
epi_pal = _palette_for('epi_subtype', 'Paired')
batch_pal = _palette_for('batch', 'Set1')
phase_pal = _palette_for('phase', 'Set1')
seg_pal = _palette_for('seg_classify', 'tab10')
lane_pal = _palette_for('LaneID', 'Paired', rng_seed=7)  # the only palette built with a different seed

# Hand-picked colours for the stage labels.
stage_pal = {
    '12.5': "midnightblue",
    '13.5': "dodgerblue",
    '14.5': "seagreen",
    '15.5': "#00C000",
    '16.5': "#EEC900",
    '17.5': "#FF7F00",
    '18.5': "#FF0000",
}

# Lookup from metadata column name to its palette, passed to the plotting helpers.
pal = {
    "cell_type": celltype_pal,
    "broad_cell_type": broad_celltype_pal,
    "mes_subtype": mes_pal,
    "epi_subtype": epi_pal,
    "batch": batch_pal,
    'phase': phase_pal,
    'stage': stage_pal,
    'seg_classify': seg_pal,
    'LaneID': lane_pal,
}

In [None]:
# Plot the UMAP of every method, coloured by each metadata column in show_cols.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams

# Font override applied only while the figure is drawn.
custom_rc = {'font.family': 'Arial'}

# What to draw.
show_keys = methods
show_cols = ['batch', 'broad_cell_type', 'cell_type', 'mes_subtype', 'phase', 'seg_classify', 'LaneID', 'stage']
basis_types = ['UMAP']

# How to draw it: one column per method, 1.5 units of figure width each.
font_size = 10
point_size = .1
alpha = 0.8
ncols = len(show_keys)
figsize = (ncols * 1.5, 1.5)
nrows = int(np.ceil(len(show_keys) / ncols))

plot_options = dict(
    color_bys=show_cols,
    basis_types=basis_types,
    pal=pal,
    font_size=font_size,
    point_size=point_size,
    alpha=alpha,
    figsize=figsize,
    ncols=ncols,
    seed=seed,
    save_dir=save_dir,
    file_suffix=file_suffix,
    dpi=600,
    save_format='svg',
)
with plt.rc_context(rc=custom_rc):
    ccd.pl.plot_all_embeddings(adata, show_keys, **plot_options)


### Run CONCORD

In [None]:
# Load a previously saved object that already carries the benchmark embeddings.
# Alternative input kept for reference: data_dir / 'adata_huycke_Mar31-2314.h5ad'
data_path = data_dir / "adata_final_Jul17-1624.h5ad"
adata = sc.read(data_path)

In [None]:
# CONCORD run stored under "concord_knn": 5000 selected features, with the
# intra-domain / intra-kNN sampling probabilities set explicitly.
feature_list = ccd.ul.select_features(adata, n_top_features=5000, flavor='seurat_v3')
concord_args = dict(
    adata=adata,
    input_feature=feature_list,
    domain_key='LaneID',
    batch_size=256,
    latent_dim=50,
    element_mask_prob=0.4,
    feature_mask_prob=0.0,
    clr_beta=0.0,
    clr_temperature=0.3,
    p_intra_domain=0.85,
    p_intra_knn=0.1,
    n_epochs=15,  # number of training epochs, adjust as needed
    save_dir=save_dir,
    load_data_into_memory=True,
)

# The result key is also read by the UMAP cell further down.
output_key = 'concord_knn'
cur_ccd = ccd.Concord(**concord_args)
cur_ccd.fit_transform(output_key=output_key)


In [None]:
# CONCORD run stored under "concord_hcl": 10000 selected features, non-zero
# feature masking and clr_beta, and no explicit sampling probabilities.
feature_list = ccd.ul.select_features(adata, n_top_features=10000, flavor='seurat_v3')
concord_args = dict(
    adata=adata,
    input_feature=feature_list,
    domain_key='LaneID',
    batch_size=256,
    latent_dim=50,
    element_mask_prob=0.3,
    feature_mask_prob=0.2,
    clr_beta=0.5,
    clr_temperature=0.3,
    n_epochs=15,  # number of training epochs, adjust as needed
    save_dir=save_dir,
    load_data_into_memory=True,
)

# The result key is also read by the UMAP cell that follows.
output_key = 'concord_hcl'
cur_ccd = ccd.Concord(**concord_args)
cur_ccd.fit_transform(output_key=output_key)


In [None]:
# Embed whichever CONCORD latent was produced last: output_key is rebound by
# each CONCORD cell, so this follows the most recently executed one.
basis = output_key
# Fresh timestamp for the files saved from this point on.
file_suffix = time.strftime('%b%d-%H%M')
shared_umap_args = dict(n_neighbors=30, min_dist=0.1, metric='euclidean', random_state=seed)
ccd.ul.run_umap(adata, source_key=basis, result_key=f'{basis}_UMAP', n_components=2, **shared_umap_args)
ccd.ul.run_umap(adata, source_key=basis, result_key=f'{basis}_UMAP_3D', n_components=3, **shared_umap_args)

In [None]:
# 2-D UMAP of the concord_hcl latent, one panel per metadata column, saved as a PDF.
basis = 'concord_hcl'
show_basis = f'{basis}_UMAP'
show_cols = [
    'batch', 'broad_cell_type', 'cell_type',
    'mes_subtype', 'epi_subtype', 'phase',
    'seg_classify', 'LaneID', 'stage',
]
ccd.pl.plot_embedding(
    adata,
    show_basis,
    show_cols,
    figsize=(13,12),
    dpi=600,
    ncols=3,
    font_size=6,
    point_size=1,
    legend_loc="on data",
    pal=pal,
    save_path=save_dir / f"{show_basis}_{file_suffix}.pdf",
)

In [None]:
# Interactive 3-D UMAP of the current basis, one HTML file per metadata column.
show_cols = ['batch', 'broad_cell_type', 'cell_type', 'mes_subtype', 'phase', 'seg_classify', 'LaneID', 'stage']
# The embedding key is the same for every colouring, so build it once.
show_basis = f'{basis}_UMAP_3D'
for col in show_cols:
    html_path = save_dir / f'{show_basis}_{col}_{file_suffix}.html'
    ccd.pl.plot_embedding_3d(
        adata,
        basis=show_basis,
        color_by=col,
        pal=pal,
        save_path=html_path,
        point_size=1,
        opacity=0.8,
        width=1300,
        height=1000,
        autosize=True,
        static=False,
    )