In [None]:
%load_ext autoreload
%autoreload 2

## Basic setup

In [None]:
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import Concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

from matplotlib import font_manager, rcParams
# Per-figure rc overrides, applied further down via `plt.rc_context(rc=custom_rc)`.
custom_rc = {'font.family': 'Arial'}

# Global export settings: leave SVG text as text (not outlines) and embed
# TrueType (Type 42) fonts in PDFs so labels stay editable in vector editors.
mpl.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
})

In [3]:
# Folder holding the intestine-development data, and the AnnData file analysed below.
data_dir = Path('../data/intestine_dev/')
data_path = data_dir / 'adata_huycke_Jan08-1324.h5ad'

adata = sc.read(data_path)


In [4]:
# Embeddings saved by an earlier benchmark run.
latent_obsm = ccd.ul.load_obsm_from_hdf5(
    '../save/dev_benchmark_Huycke-Jan10/obsm_Jan10-1504.h5'
)

# Copy over only the embeddings `adata` does not have yet; existing keys are left untouched.
for key in latent_obsm.keys():
    if key in adata.obsm.keys():
        continue
    adata.obsm[key] = latent_obsm[key]

In [None]:
proj_name = "benchmark_Huycke_cc_"

# Outputs go to a per-day folder under ../save/, created on demand.
save_dir = Path(f"../save/dev_{proj_name}-{time.strftime('%b%d')}/")
save_dir.mkdir(parents=True, exist_ok=True)

# Use the MPS backend when it is available, otherwise fall back to the CPU.
#device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

seed = 0
ccd.ul.set_seed(seed)

# Minute-resolution timestamp appended to the files written by this session.
file_suffix = time.strftime('%b%d-%H%M')

### PCA space of the cell cycle genes

In [None]:
# Cell-cycle marker genes, written as upper-case (human-style) symbols and
# converted to mouse symbols with `ccd.ul.get_mouse_genes` below.
s_genes = [
    'MCM5', 'PCNA', 'TYMS', 'FEN1', 'MCM2', 'MCM4', 'RRM1', 'UNG', 'GINS2', 'MCM6', 'CDCA7',
    'DTL', 'PRIM1', 'UHRF1', 'MLF1IP', 'HELLS', 'RFC2', 'RPA2', 'NASP', 'RAD51AP1', 'GMNN',
    'WDR76', 'SLBP', 'CCNE2', 'UBR7', 'POLD3', 'MSH2', 'ATAD2', 'RAD51', 'RRM2', 'CDC45',
    'CDC6', 'EXO1', 'TIPIN', 'DSCC1', 'BLM', 'CASP8AP2', 'USP1', 'CLSPN', 'POLA1', 'CHAF1B',
    'BRIP1', 'E2F8'
]
g2m_genes = [
    'HMGB2', 'CDK1', 'NUSAP1', 'UBE2C', 'BIRC5', 'TPX2', 'TOP2A', 'NDC80', 'CKS2', 'NUF2',
    'CKS1B', 'MKI67', 'TMPO', 'CENPF', 'TACC3', 'FAM64A', 'SMC4', 'CCNB2', 'CKAP2L', 'CKAP2',
    'AURKB', 'BUB1', 'KIF11', 'ANP32E', 'TUBB4B', 'GTSE1', 'KIF20B', 'HJURP', 'CDCA3', 'HN1',
    'CDC20', 'TTK', 'CDC25C', 'KIF2C', 'RANGAP1', 'NCAPD2', 'DLGAP5', 'CDCA2', 'CDCA8', 'ECT2',
    'KIF23', 'HMMR', 'AURKA', 'PSRC1', 'ANLN', 'LBR', 'CKAP5', 'CENPE', 'CTCF', 'NEK2', 'G2E3',
    'GAS2L3', 'CBX5', 'CENPA'
]

s_genes = ccd.ul.get_mouse_genes(s_genes)
g2m_genes = ccd.ul.get_mouse_genes(g2m_genes)

# S-phase markers first, then G2/M markers, keeping only those measured in this dataset.
cc_genes = [gene for gene in s_genes + g2m_genes if gene in adata.var_names]

# Independent copy of `adata` restricted to the cell-cycle genes.
adata_cc = adata[:, cc_genes].copy()

# Remove erythrocytes
#adata_cc = adata_cc[~adata_cc.obs['broad_cell_type'].isin(['Erythrocyte'])]

In [None]:
import pandas as pd

# NOTE: from here on `cc_genes` is a DataFrame indexed by gene (read from an
# exported feature table), replacing the plain list built in the previous cell.
fmeta_path = data_dir / 'cello_benchmark_Huycke_cc__Jan09-1846/fmeta.csv'
cc_genes = pd.read_csv(fmeta_path, index_col=0)
cc_genes

In [205]:
# Restrict to the features listed in the `cc_genes` table.
# Take an explicit copy rather than a view of `adata`: the following cells write
# PCA/UMAP results, neighbours, clusters and new obs columns into `adata_cc`.
adata_cc = adata[:, cc_genes.index].copy()


In [None]:
# PCA of the cell-cycle feature matrix; the result is stored under the key 'PCA_cc'.
ccd.ul.run_pca(
    adata_cc,
    source_key='X',
    result_key='PCA_cc',
    n_pc=30,
    random_state=seed,
)

In [207]:
# UMAP computed from the cell-cycle PCA; the result is stored under the key 'UMAP_cc'.
ccd.ul.run_umap(
    adata_cc,
    source_key='PCA_cc',
    result_key='UMAP_cc',
    min_dist=0.1,
    metric='cosine',
    random_state=seed,
)

In [6]:
# Colour palettes for every annotation plotted in this notebook.
adata.obs['stage'] = adata.obs['MouseAge_combined']


def _obs_palette(obs_key, cmap, palette_seed):
    """Return the third value of `ccd.pl.get_color_mapping` for `obs_key`, used below as its palette."""
    return ccd.pl.get_color_mapping(adata, obs_key, pal=cmap, seed=palette_seed)[2]


celltype_pal = _obs_palette('cell_type', 'Paired', seed)
broad_celltype_pal = _obs_palette('broad_cell_type', 'tab20', seed)
broad_celltype_pal['Doublet-like'] = '#757575'  # fixed grey for doublet-like cells
mes_pal = _obs_palette('mes_subtype', 'Paired', seed)
batch_pal = _obs_palette('batch', 'Set1', seed)
phase_pal = _obs_palette('phase', 'Set1', seed)
seg_pal = _obs_palette('seg_classify', 'tab10', seed)
lane_pal = _obs_palette('LaneID', 'Paired', 7)  # this palette uses its own seed (7)

# Hand-picked colours, one per stage label.
stage_pal = {
    '12.5': "midnightblue",
    '13.5': "dodgerblue",
    '14.5': "seagreen",
    '15.5': "#00C000",
    '16.5': "#EEC900",
    '17.5': "#FF7F00",
    '18.5': "#FF0000",
}

# Lookup from obs column name to its palette, passed as `pal=` to the plotting helpers.
pal = {
    "cell_type": celltype_pal,
    "broad_cell_type": broad_celltype_pal,
    "mes_subtype": mes_pal,
    "batch": batch_pal,
    'phase': phase_pal,
    'stage': stage_pal,
    'seg_classify': seg_pal,
    'LaneID': lane_pal,
}

concord_keys = ["Concord", 'Concord-decoder']
other_keys = ["Unintegrated", "Scanorama", "Liger", "Harmony", "scVI", "Seurat"]
combined_keys = other_keys + concord_keys

# Broad cell types to iterate over: doublet-like cells dropped, erythrocytes moved to the end.
unique_broad_cell_types = adata.obs['broad_cell_type'].unique()
all_cts = [
    ct for ct in list(unique_broad_cell_types)
    if ct not in ['Doublet-like', 'Erythrocyte']
]
all_cts = all_cts + ['Erythrocyte']

In [209]:
# Neighbour graph on the cell-cycle PCA, then Leiden clusters stored in obs['leiden_cc'].
sc.pp.neighbors(
    adata_cc,
    n_neighbors=30,
    use_rep='PCA_cc',
    metric='cosine',
)
sc.tl.leiden(
    adata_cc,
    resolution=0.5,
    key_added='leiden_cc',
)

In [210]:
# Split the 'G2M' phase label into 'M' (G2M cells in Leiden cluster '4') and 'G2' (all other G2M cells).
# NOTE(review): cluster id '4' is tied to one particular clustering result — re-check it after re-running Leiden.
phase_refined = adata_cc.obs['phase'].astype(str)
is_g2m = phase_refined == 'G2M'
in_cluster_4 = adata_cc.obs['leiden_cc'] == '4'
phase_refined[is_g2m & in_cluster_4] = 'M'
phase_refined[is_g2m & ~in_cluster_4] = 'G2'
adata_cc.obs['phase_refined'] = phase_refined

# Mirror the refined labels onto the full dataset.
adata.obs['phase_refined'] = adata_cc.obs['phase_refined']

In [211]:
# Fixed colours for the four refined phase labels.
pal['phase_refined'] = {
    'G1': '#4daf4a',
    'S': '#e41a1c',
    'G2': '#377eb8',
    'M': '#984ea3',
}

In [None]:
# Cell-cycle PCA coloured by each annotation, one panel per column, saved as a PDF.
show_emb = 'PCA_cc'
show_cols = ['phase', 'broad_cell_type', 'leiden_cc', 'phase_refined']

ccd.pl.plot_embedding(
    adata_cc,
    show_emb,
    show_cols,
    pal=pal,
    figsize=(6, 1.5),
    dpi=600,
    ncols=len(show_cols),
    font_size=3,
    point_size=1,
    legend_loc='on data',
    save_path=save_dir / f"embeddings_{show_emb}_{file_suffix}.pdf",
)

In [None]:
# Same panels as for the PCA, drawn on the cell-cycle UMAP (reuses `show_cols` from the previous cell).
show_emb = 'UMAP_cc'

ccd.pl.plot_embedding(
    adata_cc,
    show_emb,
    show_cols,
    pal=pal,
    figsize=(6, 1.5),
    dpi=600,
    ncols=len(show_cols),
    font_size=3,
    point_size=1,
    legend_loc='on data',
    save_path=save_dir / f"embeddings_{show_emb}_{file_suffix}.pdf",
)

In [None]:
# Export the cell-cycle AnnData through `anndata_to_viscello` into a timestamped folder under `data_dir`.
ccd.ul.anndata_to_viscello(
    adata_cc,
    data_dir / f"cello_{proj_name}_{file_suffix}",
    project_name=proj_name,
    organism='mmu',
)

In [None]:
show_emb = 'PCA_cc'
show_cols = ['phase', 'phase_refined']

# One panel per broad cell type, all drawn in the cell-cycle PCA space.
ncols = len(all_cts)

for col in show_cols:
    fig, axs = plt.subplots(1, ncols, figsize=(13, 1.5))
    axs = axs.flatten()

    for ax, ct in zip(axs, all_cts):
        # Plot only the cells of this broad cell type.
        adata_sub = adata_cc[adata_cc.obs['broad_cell_type'] == ct].copy()
        sc.pl.embedding(
            adata_sub, basis=show_emb, color=col, ax=ax, show=False, s=2,
            legend_loc='on data', legend_fontsize=7, palette=pal[col],
        )
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.set_title(ct, fontsize=10)

    plt.tight_layout()
    plt.savefig(save_dir / f"embeddings_{show_emb}_{col}_{file_suffix}.png", dpi=600)
    plt.show()

In [None]:
# Ad-hoc inspection: display the stored 'Seurat' embedding (the whole object is rendered).
adata.obsm['Seurat']

### Latent correlation with cell cycle

In [None]:
import pandas as pd
# Embedding to inspect; the alternatives are kept as comments for quick switching.
#use_key = 'Concord-decoder'
use_key = 'Concord'
#use_key = 'Seurat'
# use_key = 'scVI'
latent = adata.obsm[use_key]

# Preview of the latent matrix as a DataFrame with a fresh 0..n-1 row index.
data = pd.DataFrame(latent).reset_index(drop=True)
data

In [None]:
import pandas as pd
# Embedding to analyse; the alternatives are kept as comments for quick switching.
#use_key = 'Concord-decoder'
use_key = 'Concord'
#use_key = 'Seurat'
# use_key = 'scVI'

# Latent dimensions as columns, rows re-indexed 0..n-1 so they line up with the scores below.
latent = pd.DataFrame(adata.obsm[use_key])
data = latent.reset_index(drop=True)

# Cell-cycle scores on the same positional index, plus their two signed differences.
s_score = adata.obs['S_score'].reset_index(drop=True)
g2m_score = adata.obs['G2M_score'].reset_index(drop=True)
data['S'] = s_score
data['G2M'] = g2m_score
data['S-G2M'] = s_score - g2m_score
data['G2M-S'] = g2m_score - s_score

correlation_matrix = data.corr()

# Keep only the score-vs-latent part: rows are the four scores, columns the latent dimensions.
correlations = correlation_matrix.loc[['S', 'G2M', 'G2M-S', 'S-G2M'], latent.columns]

correlations


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fcluster
import numpy as np

# First pass: cluster the correlation matrix only to obtain the column linkage.
# The figure itself is not needed, so close it instead of leaving a duplicate open.
clustermap = sns.clustermap(correlations, cmap='coolwarm', center=0, annot=False, fmt=".2f", figsize=(10, 4))
linkage_matrix = clustermap.dendrogram_col.linkage  # column (latent-dimension) dendrogram
plt.close(clustermap.ax_heatmap.figure)

# Cut the column dendrogram at a fixed distance to group latent dimensions.
max_distance = 0.3
clusters = fcluster(linkage_matrix, t=max_distance, criterion='distance')

# One row per latent dimension (by position in `correlations`) with its cluster id.
latent_dimension_clusters = pd.DataFrame(
    data={
        "latent": [f"{i}" for i in range(len(clusters))],
        "cluster": clusters
    }
)

# One colour per cluster; `fcluster` labels start at 1, hence the `c - 1`.
cluster_colors = sns.color_palette("tab10", len(np.unique(clusters)))
cluster_colors_mapped = [cluster_colors[c - 1] for c in clusters]

# Second pass: the same clustermap with the cluster assignment shown as a column colour bar.
clustermap = sns.clustermap(
    correlations,
    cmap='coolwarm',
    center=0,
    annot=False,
    fmt=".2f",
    figsize=(10, 4),
    col_colors=cluster_colors_mapped,
    xticklabels=True,
    yticklabels=True
)

# Title the grid through one of its own axes: `plt.title` would act on whichever
# axes happens to be current, which for a clustermap is not the heatmap.
clustermap.ax_col_dendrogram.set_title(f"Correlation matrix with clusters in {use_key}")
# Save through the grid object so exactly this figure is written.
clustermap.savefig(save_dir / f"correlation_matrix_clusters_{use_key}_{file_suffix}.png", dpi=600)


In [None]:
# Ad-hoc inspection: values of column 32 of the 'Concord' embedding.
adata.obsm['Concord'][:,32]

In [None]:
show_basis = 'Concord_UMAP'

# One small figure per 'Concord' latent dimension: copy the dimension into
# obs as 'N<i>' and colour the embedding by it.
concord_latent = adata.obsm['Concord']
for neuron_id in range(concord_latent.shape[1]):
    neuron_col = f'N{neuron_id}'
    adata.obs[neuron_col] = concord_latent[:, neuron_id]
    show_cols = [neuron_col]
    with plt.rc_context(rc=custom_rc):
        ccd.pl.plot_embedding(
            adata,
            show_basis,
            show_cols,
            pal='RdBu_r',
            vmax_quantile=.995,
            figsize=(1.09, .9),
            dpi=600,
            ncols=len(show_cols),
            font_size=6,
            point_size=.1,
            legend_loc='on data',
            save_path=save_dir / f"embeddings__{show_basis}_{show_cols[0]}_{file_suffix}.pdf",
        )

In [None]:
# corrs_df = correlations.T
# corrs_df['neuron'] = corrs_df.index.astype(int)
# #corrs_df = corrs_df.sort_values('neuron')
# top_threshold = 0.35
# bottom_threshold = 0.2

# top_nodes = corrs_df[(corrs_df['S'].abs() > top_threshold) | (corrs_df['G2M'].abs() > top_threshold)]
# top_nodes_index = top_nodes['neuron']

# bottom_nodes = corrs_df[(corrs_df['S'].abs() < bottom_threshold) & (corrs_df['G2M'].abs() < bottom_threshold)]
# bottom_nodes_index = bottom_nodes['neuron']

# top_nodes, bottom_nodes
# cc_latents = top_nodes_index
# noncc_latents = bottom_nodes_index

In [None]:
# # barplot the correlation between the S_score, G2M_score, G2M-S_score and the Concord latent
# fig, ax = plt.subplots(1, 1, figsize=(6, 4))
# corrs_df.plot(x='neuron', y=['S', 'G2M'], kind='bar', ax=ax)
# # plot the threshold lines
# plt.axhline(y=top_threshold, color='r', linestyle='--')
# plt.axhline(y=-top_threshold, color='r', linestyle='--')
# plt.axhline(y=bottom_threshold, color='b', linestyle=':')
# plt.axhline(y=-bottom_threshold, color='b', linestyle=':')
# plt.ylabel('Correlation')
# plt.xlabel('Node index')
# plt.title('Correlation between S, G2M and vConcord latent')

# # highlight the top nodes' xticks
# for i in range(len(top_nodes_index)):
#     ax.get_xticklabels()[top_nodes_index[i]].set_color('r')
#     ax.get_xticklabels()[top_nodes_index[i]].set_fontweight('bold')

# # highlight the bottom nodes' xticks
# for i in range(len(bottom_nodes_index)):
#     ax.get_xticklabels()[bottom_nodes_index[i]].set_color('b')
#     ax.get_xticklabels()[bottom_nodes_index[i]].set_fontweight('bold')

# plt.tight_layout()

In [None]:
# Number of latent dimensions assigned to each dendrogram cluster.
latent_dimension_clusters['cluster'].value_counts()

In [None]:
# Extract the latent belonging to cluster
all_clusters = latent_dimension_clusters['cluster'].unique()
#cc_clusters = [3,4,5]
cc_clusters = [1,2,3,4,5,6]
#cc_clusters = [1,3,4,5]
noncc_clusters = all_clusters[~np.isin(all_clusters, cc_clusters)]
cc_latents = np.where(latent_dimension_clusters['cluster'].isin(cc_clusters))[0]
noncc_latents = np.where(latent_dimension_clusters['cluster'].isin(noncc_clusters))[0]
print(f"Cell cycle related latent dimensions: {cc_latents}")
print(f"Non-cell cycle related latent dimensions: {noncc_latents}")

In [None]:
# Number of principal components kept for each latent sub-space (also used by the per-cell-type loop).
n_pc = 3

# Store the two column subsets of the chosen embedding as separate embeddings.
full_latent = adata.obsm[use_key]
adata.obsm[f'{use_key}_cc'] = full_latent[:, cc_latents]
adata.obsm[f'{use_key}_noncc'] = full_latent[:, noncc_latents]

# PCA of each sub-space across all cells ("global", as opposed to per cell type).
ccd.ul.run_pca(
    adata,
    source_key=f'{use_key}_cc',
    result_key=f'{use_key}_cc_PCA_global',
    n_pc=n_pc,
    random_state=seed,
)
ccd.ul.run_pca(
    adata,
    source_key=f'{use_key}_noncc',
    result_key=f'{use_key}_noncc_PCA_global',
    n_pc=n_pc,
    random_state=seed,
)

In [82]:
import re
# For every broad cell type: PCA and UMAP of the cc / non-cc latent sub-spaces
# computed within that cell type only, then written to its own .h5ad file.
for ct in all_cts:
    sanitized_ct = re.sub(r'[^\w\-]', '_', ct)  # file-name safe version of the label
    adata_subset = adata[adata.obs['broad_cell_type'] == ct].copy()
    #adata_subset = sc.read(data_dir / f"adata_cc_{sanitized_ct}_Jan10-1241.h5ad")

    ccd.ul.run_pca(
        adata_subset, source_key=f'{use_key}_cc', result_key=f'{use_key}_cc_PCA',
        n_pc=n_pc, random_state=seed,
    )
    ccd.ul.run_umap(
        adata_subset, source_key=f'{use_key}_cc', result_key=f'{use_key}_cc_UMAP',
        metric='cosine', min_dist=0.1, random_state=seed,
    )
    ccd.ul.run_pca(
        adata_subset, source_key=f'{use_key}_noncc', result_key=f'{use_key}_noncc_PCA',
        n_pc=n_pc, random_state=seed,
    )
    ccd.ul.run_umap(
        adata_subset, source_key=f'{use_key}_noncc', result_key=f'{use_key}_noncc_UMAP',
        metric='cosine', min_dist=0.1, random_state=seed,
    )

    adata_subset.write_h5ad(data_dir / f"adata_cc_{sanitized_ct}_{file_suffix}.h5ad")

In [None]:
# Embeddings to draw: global PCA, per-cell-type PCA and per-cell-type UMAP,
# first for the cell-cycle sub-space and then for the non-cell-cycle one.
show_embs = [f'{use_key}_cc_PCA_global', f'{use_key}_cc_PCA', f'{use_key}_cc_UMAP'] + [f'{use_key}_noncc_PCA_global', f'{use_key}_noncc_PCA', f'{use_key}_noncc_UMAP']
show_cols = ['phase', 'phase_refined']
ncols = len(all_cts)

# Suffix of the per-cell-type files to plot. Defaults to the files written in this
# session (`adata_cc_<cell type>_<file_suffix>.h5ad`) so the notebook runs top to
# bottom; set it to an earlier run's timestamp (e.g. 'Jan11-1338') to re-plot that run.
cc_subset_suffix = file_suffix

# Read each per-cell-type file once instead of once per (embedding, column) figure.
cc_subsets = {}
for ct in all_cts:
    sanitized_ct = re.sub(r'[^\w\-]', '_', ct)
    cc_subsets[ct] = sc.read(data_dir / f"adata_cc_{sanitized_ct}_{cc_subset_suffix}.h5ad")

for show_emb in show_embs:
    for col in show_cols:
        # squeeze=False always returns a 2-D array of axes, so a single cell type also works.
        fig, axs = plt.subplots(1, ncols, figsize=(13, 1.5), squeeze=False)
        axs = axs.flatten()

        for i, ct in enumerate(all_cts):
            adata_subset = cc_subsets[ct]
            sc.pl.embedding(adata_subset, basis=show_emb, color=col, ax=axs[i], show=False, s=2, legend_loc='on data', legend_fontsize=7, palette=pal[col])
            axs[i].set_xlabel('')
            axs[i].set_ylabel('')
            axs[i].set_title(ct, fontsize=10)

        plt.tight_layout()
        plt.savefig(save_dir / f"embeddings_{show_emb}_{col}_{file_suffix}.pdf", dpi=600)
        plt.show()
        # Release the figure; this loop creates one per embedding/column pair.
        plt.close(fig)

## Interpretation of neurons with importance analysis

In [272]:
# Store the 'Seurat' embedding as a plain NumPy array.
# `np.asarray` accepts a DataFrame as well as an array, so re-running this cell is
# harmless (calling `.to_numpy()` a second time would fail on the ndarray).
adata.obsm['Seurat'] = np.asarray(adata.obsm['Seurat'])

In [None]:
# Embedding shown in the heatmap; the alternatives are kept as comments for quick switching.
#show_basis = 'Concord'
#show_basis = 'Concord-decoder'
#show_basis = 'Seurat'
show_basis = 'scVI'

# Downsample for plotting. A local generator seeded with `seed` makes the sampled
# cells independent of how many random draws earlier cells made; subsetting first and
# copying afterwards avoids duplicating the whole dataset.
ncells = min(1000, adata.n_obs)
rng = np.random.default_rng(seed)
sampled_cells = rng.choice(adata.n_obs, ncells, replace=False)
adata_ds = adata[sampled_cells, :].copy()

ccd.pl.heatmap_with_annotations(adata_ds, val=show_basis, transpose=True, obs_keys=['broad_cell_type', 'phase'],
                             cmap='viridis', vmin=None, vmax=None,
                             cluster_rows=True, cluster_cols=True, pal=pal, add_color_legend=True,
                             value_annot=False, title=None, title_fontsize=8, annot_fontsize=8,
                             yticklabels=False, xticklabels=False,
                             use_clustermap=True,
                             cluster_method='ward',
                             cluster_metric='euclidean',
                             rasterize=True,
                             ax=None,
                             figsize=(4.5,1.8),
                             seed = seed,
                             dpi=600, show=True, save_path=save_dir / f"heatmap_{show_basis}_{file_suffix}.pdf")

In [287]:
import re
import numpy as np

# Saved run whose configuration and final weights are reused below.
config_file = '../save/dev_benchmark_Huycke-Jan07/config_Jan07-1248.json'
model_file = '../save/dev_benchmark_Huycke-Jan07/final_model_Jan07-1248.pt'

# Load the run's configuration and point it at the saved model file.
concord_args = ccd.ul.load_json(str(config_file))
concord_args['pretrained_model'] = model_file

# Passed as `layer_index=` to `compute_feature_importance` in the importance loop.
layer_index = 6

# Downsample data to a small subset for fast estimation of feature contribution to the latent space
#adata_subset = adata.copy()[np.random.choice(adata.n_obs, 10000, replace=False), cur_ccd.config.input_feature]


### Importance analysis across cell types

In [None]:
# Broad cell types to analyse: doublet-like cells dropped, erythrocytes moved to the end.
unique_broad_cell_types = adata.obs['broad_cell_type'].unique()
all_cts = [
    ct for ct in list(unique_broad_cell_types)
    if ct not in ['Doublet-like', 'Erythrocyte']
]
all_cts = all_cts + ['Erythrocyte']

# Feature-importance matrix per cell type, keyed by the cell-type label.
importance_results = {}
for ct in all_cts:
    print(ct)
    sanitized_ct = re.sub(r'[^\w\-]', '_', ct)

    # Per-cell-type data saved earlier, restricted to the model's input features.
    #adata_subset = adata[adata.obs['broad_cell_type'] == ct].copy()
    adata_subset = sc.read(data_dir / f"adata_huycke_{sanitized_ct}_Jan08-1324.h5ad")
    adata_subset = adata_subset[:, concord_args['input_feature']]

    # Rebuild the model from the saved configuration for this subset.
    cur_ccd = ccd.Concord(adata=adata_subset, **concord_args)
    cur_ccd.init_model()
    cur_ccd.init_dataloader(
        input_layer_key='X',
        preprocess=True,
        train_frac=1.0,
        use_sampler=False,
    )

    # `.toarray()` assumes X is stored as a sparse matrix.
    dense_x = adata_subset.X.toarray()
    input_tensors = torch.tensor(dense_x).to(cur_ccd.config.device)
    importance_matrix = ccd.ul.compute_feature_importance(
        cur_ccd.model,
        input_tensors,
        layer_index=layer_index,
    )
    importance_results[ct] = importance_matrix


Order neuron activation genes for each cell type

In [395]:
# Thresholds applied to every neuron's ranked gene table (same for all cell types).
# Despite its name, `min_zero_frac` is a lower bound on the "Nonzero Fraction" column.
min_zero_frac = 0.03
min_expression_level = 0


def filter_genes(df, min_zero_frac, min_expression_level):
    """Keep the rows of a ranked gene table that pass both thresholds.

    Parameters
    ----------
    df : DataFrame with "Nonzero Fraction" and "Expression Level" columns.
    min_zero_frac : rows are kept only if "Nonzero Fraction" is strictly greater than this.
    min_expression_level : rows are kept only if "Expression Level" is strictly greater than this.

    Returns
    -------
    The filtered rows of `df`, in their original order.
    """
    return df[(df["Nonzero Fraction"] > min_zero_frac) & (df["Expression Level"] > min_expression_level)]


# Filtered, ranked gene tables: cell type -> {neuron -> DataFrame}.
ranked_gene_lists = {}
for ct in all_cts:
    sanitized_ct = re.sub(r'[^\w\-]', '_', ct)
    adata_subset = sc.read(data_dir / f"adata_huycke_{sanitized_ct}_Jan08-1324.h5ad")
    adata_subset = adata_subset[:, concord_args['input_feature']]
    ranked_lists = ccd.ul.prepare_ranked_list(importance_results[ct], adata=adata_subset, expr_level=True)

    # Apply the filter to all neuron lists
    filtered_gene_lists = {key: filter_genes(df, min_zero_frac, min_expression_level) for key, df in ranked_lists.items()}
    ranked_gene_lists[ct] = filtered_gene_lists

Plotting the top genes for each cell type

In [None]:
show_cts = all_cts

# Neurons to plot. A live definition is required here: previously every assignment
# was commented out, so the `np.unique` call below failed on a fresh kernel.
# 'Neuron 46' is the neuron examined in the following cells; swap in the longer
# list to reproduce the multi-neuron panels.
# show_neurons = ['Neuron 32', 'Neuron 47',  # Global
#                 'Neuron 11', 'Neuron 39', 'Neuron 46', 'Neuron 8', 'Neuron 23', 'Neuron 2', # ENS
#                 'Neuron 42', 'Neuron 40', 'Neuron 31', 'Neuron 46', 'Neuron 37', 'Neuron 3', 'Neuron 19', 'Neuron 0'] # Epitheial
show_neurons = ['Neuron 46']
show_neurons = np.unique(show_neurons)  # drop duplicate entries

show_basis = 'Concord_UMAP_sub'
for ct in show_cts:
    sanitized_ct = re.sub(r'[^\w\-]', '_', ct)
    # Ranked gene tables of the selected neurons for this cell type.
    show_gene_lists = ranked_gene_lists[ct]
    show_gene_lists = {key: show_gene_lists[key] for key in show_neurons}
    adata_subset = sc.read(data_dir / f"adata_huycke_{sanitized_ct}_Jan08-1324.h5ad")
    ccd.pl.plot_top_genes_embedding(adata_subset, show_gene_lists, show_basis, top_x=8, figsize=(7.5, 1), point_size=1,
                                    font_size=7, colorbar_loc=None, vmax_quantile=.99,
                                    save_path=save_dir / f"{sanitized_ct}_embeddings_{show_basis}")

In [397]:
# Select a neuron and collect its ranked gene table from every cell type.
show_neuron = 'Neuron 46'
show_cts = all_cts

# cell type -> ranked gene table of `show_neuron` (the file-name-safe label is not needed here).
activation_gene_lists = {ct: ranked_gene_lists[ct][show_neuron] for ct in show_cts}

In [None]:
# Plot the selected neuron's top genes for each cell type and save the figure as a PDF.
ccd.pl.plot_top_genes_per_neuron(
    activation_gene_lists,
    figsize=(4, 3),
    save_path=save_dir / f"top_genes_neuron_{show_neuron}_{file_suffix}.pdf",
)

In [None]:
# Top genes of the selected neuron, drawn on one cell type's own embedding.
show_basis = 'Concord_UMAP_sub'
show_ct = 'Epithelial'
#show_ct = 'ENS'

show_gene_list = {show_neuron: activation_gene_lists[show_ct]}
sanitized_ct = re.sub(r'[^\w\-]', '_', show_ct)
adata_subset = sc.read(data_dir / f"adata_huycke_{sanitized_ct}_Jan08-1324.h5ad")

ccd.pl.plot_top_genes_embedding(
    adata_subset,
    show_gene_list,
    show_basis,
    top_x=10,
    figsize=(9, 1),
    point_size=1,
    font_size=7,
    colorbar_loc=None,
    vmax_quantile=.99,
    save_path=save_dir / f"{sanitized_ct}_embeddings_{show_basis}",
)

In [399]:
import pandas as pd

# Tab-separated ortholog table (adjust the path if it lives elsewhere).
orthologs = pd.read_csv("../resources/HOM_MouseHumanSequence.rpt", sep="\t")

# Mapping for every gene in `adata`, requested from the helper as a dict.
mouse_to_human_dict = ccd.ul.get_human_genes_offline(
    adata.var.index,
    orthologs=orthologs,
    return_type='dict',
)

In [None]:
import pandas as pd
import copy

# Work on a deep copy so the tables in `activation_gene_lists` stay unchanged.
gsea_gene_lists = copy.deepcopy(activation_gene_lists)

for df in gsea_gene_lists.values():
    # Keep the original symbol in 'Gene_mouse' ...
    df['Gene_mouse'] = df['Gene'].copy()
    # ... and replace 'Gene' by its mapped symbol, falling back to the original where no mapping exists.
    mapped = df['Gene_mouse'].map(mouse_to_human_dict)
    df['Gene'] = mapped.fillna(df['Gene_mouse'])

gsea_gene_lists


In [None]:
import gseapy as gp
import os
all_gsea_results = {}
gene_sets='GO_Biological_Process_2021'

# Cell type whose ranked gene table is tested for enrichment.
condition_ct = 'Epithelial'
#condition_ct = 'ENS'
ranked_list = gsea_gene_lists[condition_ct]

# Keep the genes above the 95th percentile of importance, i.e. the top 5%.
# NOTE(review): an earlier comment here said "top 2.5%" — confirm the intended cutoff.
importance_cutoff = ranked_list['Importance'].quantile(0.95)
top_genes = ranked_list[ranked_list['Importance'] > importance_cutoff]

with plt.rc_context(rc=custom_rc):
    ccd.ul.compute_go(
        top_genes['Gene_mouse'],
        organism="mouse",
        font_size=12,
        figsize=(7, 3),
        dpi=600,
        save_path=save_dir / f"gsea_{condition_ct}_{show_neuron}_{file_suffix}.pdf",
    )

In [527]:
# Cell type to score; the gene set itself still comes from `condition_ct`.
#show_ct = 'Epithelial'
show_ct = 'ENS'

sanitized_ct = re.sub(r'[^\w\-]', '_', show_ct)
adata_subset = sc.read(data_dir / f"adata_huycke_{sanitized_ct}_Jan08-1324.h5ad")

# Score every cell of `show_ct` for the top genes selected in the previous cell.
score_name = f'{condition_ct}_{show_neuron}_top_gene_score'
sc.tl.score_genes(
    adata_subset,
    gene_list=top_genes['Gene_mouse'],
    score_name=score_name,
)

In [None]:
# Draw the gene-set score on the selected cell type's embedding and save it as a PDF.
show_basis = 'Concord_UMAP_sub'
show_cols = [score_name]

with plt.rc_context(rc=custom_rc):
    ccd.pl.plot_embedding(
        adata_subset,
        show_basis,
        show_cols,
        pal='RdBu_r',
        vmax=.2,
        figsize=(1.09, .9),
        dpi=600,
        ncols=len(show_cols),
        font_size=6,
        point_size=1,
        legend_loc='on data',
        save_path=save_dir / f"embeddings_{show_ct}_{show_basis}_{condition_ct}_score_{file_suffix}.pdf",
    )