# defaulting

In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0" # Change to -1 if you want to use CPU!

import warnings
warnings.filterwarnings('ignore')

import scenvi
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import scanpy as sc
import colorcet
import umap.umap_ as umap

An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.


In [2]:
cell_type_palette = {'0': (0.843137, 0.0, 0.0, 1.0),
                     '1': (0.54902, 0.235294, 1.0, 1.0),
                     '2': (0.007843, 0.533333, 0.0, 1.0),
                     '3': (0.0, 0.67451, 0.780392, 1.0),
                     '4': (0.596078, 1.0, 0.0, 1.0),
                     '5': (1.0, 0.498039, 0.819608, 1.0),
                     '6': (0.423529, 0.0, 0.309804, 1.0),
                     '7': (1.0, 0.647059, 0.188235, 1.0),
                     '8': (0.345098, 0.231373, 0.0, 1.0),
                     '9': (0.0, 0.341176, 0.34902, 1.0),
                     '10': (0.0, 0.0, 0.866667, 1.0),
                     '11': (0.0, 0.992157, 0.811765, 1.0),
                     '12': (0.631373, 0.458824, 0.415686, 1.0),
                     '13': (0.737255, 0.717647, 1.0, 1.0),
                     '14': (0.584314, 0.709804, 0.470588, 1.0),
                     '15': (0.752941, 0.015686, 0.72549, 1.0),
                     '16': (0.392157, 0.329412, 0.454902, 1.0),
                     '17': (0.47451, 0.0, 0.0, 1.0),
                     '18': (0.027451, 0.454902, 0.847059, 1.0),
                     'Sncg': (0.996078, 0.960784, 0.564706, 1.0),
                     'Sst': (0.0, 0.294118, 0.0, 1.0),
                     'VLMC': (0.560784, 0.478431, 0.0, 1.0),
                     'Vip': (1.0, 0.447059, 0.4, 1.0)}

cell_label_palette = {'GABAergic': (0.843137, 0.0, 0.0, 1.0),
                      'Glutamatergic': (0.54902, 0.235294, 1.0, 1.0),
                      'Non-Neuronal': (0.007843, 0.533333, 0.0, 1.0)}

# loading

In [6]:
print
# sc_data=sc.read_h5ad("/data/kjc2/projects/P330.CSA/rds/downsampled_100_25-04-15-08-34.h5ad")
st_data=sc.read_h5ad("/data/kjc2/projects/P330.CSA/rds/250210_CRC_BJM_0050585_Region1_25-04-14-16-53.h5ad")
# sc_data=sc.read_h5ad("/data/kjc2/projects/P330.CSA/rds/downsampled_crc_after_gene_name_change25-04-14-23-04.h5ad")
sc_data=sc.read_h5ad("/data/kjc2/projects/P330.CSA/rds/downsampled_100_25-04-15-08-34.h5ad")

print(sc_data)
print(st_data)

AnnData object with n_obs × n_vars = 42649 × 28476
    obs: 'dataset', 'medical_condition', 'cancer_type', 'sample_id', 'sample_type', 'tumor_source', 'replicate', 'sample_tissue', 'anatomic_region', 'anatomic_location', 'tumor_stage', 'tumor_stage_TNM', 'tumor_stage_TNM_T', 'tumor_stage_TNM_N', 'tumor_stage_TNM_M', 'tumor_size', 'tumor_dimensions', 'tumor_grade', 'histological_type', 'microsatellite_status', 'mismatch_repair_deficiency_status', 'MLH1_promoter_methylation_status', 'MLH1_status', 'KRAS_status', 'BRAF_status', 'APC_status', 'TP53_status', 'PIK3CA_status', 'SMAD4_status', 'NRAS_status', 'MSH6_status', 'FBXW7_status', 'NOTCH1_status', 'MSH2_status', 'PMS2_status', 'POLE_status', 'ERBB2_status', 'STK11_status', 'HER2_status', 'CTNNB1_status', 'BRAS_status', 'patient_id', 'sex', 'age', 'treatment_status_before_resection', 'treatment_drug', 'treatment_response', 'RECIST', 'platform', 'platform_fine', 'cellranger_version', 'reference_genome', 'matrix_type', 'enrichment_cell_ty

# 변수 확인

In [6]:
sc_data.uns
st_data.uns

OrderedDict()

# 메모리 확인

In [8]:
import psutil
import os

# 현재 프로세스 메모리 사용량 (바이트 단위)
current_process = psutil.Process(os.getpid())
memory_info = current_process.memory_info()

print(f"현재 사용 중인 메모리 (RSS): {memory_info.rss / (1024 ** 2):.2f} MB")
print(f"가상 메모리 크기 (VMS): {memory_info.vms / (1024 ** 2):.2f} MB")

현재 사용 중인 메모리 (RSS): 3387.67 MB
가상 메모리 크기 (VMS): 31130.57 MB


In [9]:
import resource

# 소프트 제한과 하드 제한 확인 (바이트 단위)
soft, hard = resource.getrlimit(resource.RLIMIT_AS)

if soft == resource.RLIM_INFINITY:
    print("소프트 제한: 무제한")
else:
    print(f"소프트 제한: {soft / (1024 ** 3):.2f} GB")
    
if hard == resource.RLIM_INFINITY:
    print("하드 제한: 무제한")
else:
    print(f"하드 제한: {hard / (1024 ** 3):.2f} GB")

소프트 제한: 무제한
하드 제한: 무제한


In [10]:
# 모든 객체의 메모리 사용량 확인
%whos

# 메모리 사용량 모니터링
%memit 명령어 # memory_profiler 패키지 필요

Variable             Type       Data/Info
-----------------------------------------
cell_label_palette   dict       n=3
cell_type_palette    dict       n=23
colorcet             module     <module 'colorcet' from '<...>es/colorcet/__init__.py'>
current_process      Process    psutil.Process(pid=411624<...>ing', started='09:01:40')
hard                 int        -1
matplotlib           module     <module 'matplotlib' from<...>/matplotlib/__init__.py'>
memory_info          pmem       pmem(rss=3552231424, vms=<...>ata=11580604416, dirty=0)
np                   module     <module 'numpy' from '/ho<...>kages/numpy/__init__.py'>
os                   module     <module 'os' from '/home/<...>vi/lib/python3.10/os.py'>
pd                   module     <module 'pandas' from '/h<...>ages/pandas/__init__.py'>
plt                  module     <module 'matplotlib.pyplo<...>es/matplotlib/pyplot.py'>
psutil               module     <module 'psutil' from '/h<...>ages/psutil/__init__.py'>
resource        

UsageError: Line magic function `%memit` not found.


In [11]:
import psutil

total_memory = psutil.virtual_memory().total / (1024 ** 3)
available_memory = psutil.virtual_memory().available / (1024 ** 3)

print(f"총 시스템 메모리: {total_memory:.2f} GB")
print(f"사용 가능한 메모리: {available_memory:.2f} GB")

총 시스템 메모리: 1006.54 GB
사용 가능한 메모리: 35.64 GB


# python script 리뷰

In [None]:
#!/usr/bin/env python
# coding: utf-8

import os
import argparse
import warnings
import logging
import time
from datetime import datetime

# --- Argument Parsing ---
parser = argparse.ArgumentParser(description="Run Scanpy preprocessing and SCENVI analysis on spatial and single-cell data.")
parser.add_argument('--st_h5ad', type=str, required=True, help='Path to the spatial AnnData file (.h5ad)')
parser.add_argument('--sc_h5ad', type=str, required=True, help='Path to the single-cell AnnData file (.h5ad) with Ensembl IDs.')
parser.add_argument('--output_dir', type=str, required=True, help='Directory to save outputs and logs.')
parser.add_argument('--output_prefix', type=str, default='scenvi_result', help='Prefix for output files.')
parser.add_argument('--gpu_id', type=str, default='0', help='GPU ID to use (e.g., "0"). Set to "-1" to use CPU.')
parser.add_argument('--n_pcs', type=int, default=30, help='Number of principal components to use.')
parser.add_argument('--leiden_res', type=float, default=0.8, help='Resolution for Leiden clustering.')
parser.add_argument('--n_neighbors', type=int, default=15, help='Number of neighbors for graph construction.')
parser.add_argument('--n_jobs', type=int, default=4, help='Number of CPU cores for Scanpy parallel tasks.')
parser.add_argument('--downsample_sc', type=float, default=None, help='Fraction to downsample scRNA-seq data (e.g., 0.1 for 10%). No downsampling if not provided.')
parser.add_argument('--save_processed_st', action='store_true', help='Save the processed spatial AnnData before SCENVI.')
parser.add_argument('--save_converted_sc', action='store_true', help='Save the gene name converted (and optionally downsampled) scRNA-seq AnnData.')

args = parser.parse_args()

# --- GPU/CPU Setup ---
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
use_gpu = False if args.gpu_id == "-1" else True

# --- Logging Setup ---
os.makedirs(args.output_dir, exist_ok=True)
log_filename = os.path.join(args.output_dir, f"{args.output_prefix}_log_{datetime.now().strftime('%y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler() # Also print logs to console
    ]
)
logger = logging.getLogger()
start_time = time.time()

logger.info(f"Starting script with arguments: {args}")

# --- Import Libraries ---
# Moved imports here to log potential import errors
try:
    import warnings
    warnings.filterwarnings('ignore')
    import scenvi
    import matplotlib
    matplotlib.use('Agg') # Use non-interactive backend for HPC
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np
    import pandas as pd
    import scanpy as sc
    import mygene
    # import colorcet # Assuming this was used for palettes, define manually if needed
    # import umap.umap_ as umap # Usually not needed directly if using sc.tl.umap
except ImportError as e:
    logger.error(f"Failed to import libraries: {e}", exc_info=True)
    exit(1)

logger.info("Libraries imported successfully.")

# --- Settings ---
sc.settings.verbosity = 3  # Log scanpy actions
sc.settings.n_jobs = args.n_jobs
sc.settings.figdir = args.output_dir # Save figures here
logger.info(f"Scanpy settings: verbosity=3, n_jobs={args.n_jobs}, figdir={args.output_dir}")

# --- Define Palettes (if needed, replace colorcet) ---
# Example: Use Scanpy's default or define manually
# cell_type_palette = ... # Define your palette dictionary here if needed

# === ST Data Preprocessing Function ===
def preprocess_spatial_data(adata_path, output_dir, output_prefix, n_pcs, n_neighbors, leiden_res):
    logger.info(f"--- Starting Spatial Data Preprocessing: {adata_path} ---")
    try:
        adata = sc.read_h5ad(adata_path)
        logger.info(f"Loaded spatial data: {adata}")

        # Basic QC and Filtering (adjust parameters as needed)
        sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
        min_genes = 20
        min_counts = 50
        min_cells = 5
        logger.info(f"Cells before filtering: {adata.n_obs}")
        sc.pp.filter_cells(adata, min_genes=min_genes)
        sc.pp.filter_cells(adata, min_counts=min_counts)
        logger.info(f"Genes before filtering: {adata.n_vars}")
        sc.pp.filter_genes(adata, min_cells=min_cells)
        logger.info(f"Data shape after filtering: {adata.shape}")

        # Normalization and Log Transform
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        # Highly Variable Genes
        sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor='seurat_v3')
        n_hvg = adata.var['highly_variable'].sum()
        logger.info(f"Found {n_hvg} highly variable genes.")
        # Optional: Save HVG plot
        # sc.pl.highly_variable_genes(adata, save=f"_{output_prefix}_st_hvg.png", show=False)

        # Scale data (important for PCA)
        sc.pp.scale(adata, max_value=10)

        # Dimensionality Reduction and Clustering
        sc.tl.pca(adata, svd_solver='arpack', use_highly_variable=True)
        # Optional: Save PCA variance plot
        # sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50, save=f"_{output_prefix}_st_pca_variance.png", show=False)
        sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
        sc.tl.leiden(adata, resolution=leiden_res, key_added=f'leiden_res{leiden_res}')
        sc.tl.umap(adata)

        logger.info("Spatial data preprocessing complete.")
        logger.info(f"Final spatial AnnData object: {adata}")

        # Optional: Save UMAP plot
        sc.pl.umap(adata, color=[f'leiden_res{leiden_res}'], save=f"_{output_prefix}_st_umap_leiden.png", show=False, title=f'Leiden Clusters (res={leiden_res})')

        # Optional: Save spatial plot (ensure palette is defined if needed)
        # plt.figure(figsize=(10,10))
        # sns.scatterplot(x=adata.obsm['spatial'][:, 0], y=adata.obsm['spatial'][:, 1], hue=adata.obs[f'leiden_res{leiden_res}'], s=12, legend=None) # palette=cell_type_palette
        # plt.axis('equal'); plt.axis('off'); plt.title("Spatial Data Clusters")
        # plt.savefig(os.path.join(output_dir, f"{output_prefix}_st_spatial_leiden.png"), bbox_inches='tight', dpi=150)
        # plt.close()

        return adata

    except Exception as e:
        logger.error(f"Error during spatial data preprocessing: {e}", exc_info=True)
        raise

# === SC Data Gene Name Conversion Function ===
def convert_sc_gene_names(adata_path):
    logger.info(f"--- Starting SC Data Gene Name Conversion: {adata_path} ---")
    try:
        sc_data = sc.read_h5ad(adata_path)
        logger.info(f"Loaded scRNA-seq data: {sc_data}")
        logger.info(f"Original SC var names (first 5): {sc_data.var_names[:5].tolist()}")
        logger.info(f"Original SC index is unique: {sc_data.var.index.is_unique}")

        # Backup and process Ensembl IDs
        sc_data.var['original_ensembl_id_with_version'] = sc_data.var_names
        sc_data.var['base_ensembl_id'] = sc_data.var_names.str.split('.').str[0]
        unique_base_ids = sc_data.var['base_ensembl_id'].unique().tolist()
        logger.info(f"Number of unique base Ensembl IDs to query: {len(unique_base_ids)}")

        # Query mygene
        mg = mygene.MyGeneInfo()
        logger.info(f"Querying {len(unique_base_ids)} unique Ensembl IDs using mygene.info...")
        query_start_time = time.time()
        gene_info = mg.querymany(unique_base_ids, scopes='ensembl.gene', fields='symbol', species='human', as_dataframe=True, returnall=True)
        logger.info(f"MyGene query finished in {time.time() - query_start_time:.2f} seconds.")

        # Create map and apply
        symbol_map = gene_info['out']['symbol'].dropna().to_dict()
        logger.info(f"Created map for {len(symbol_map)} Ensembl IDs to symbols.")
        sc_data.var['gene_symbol'] = sc_data.var['base_ensembl_id'].map(symbol_map)

        # Handle missing symbols
        missing_symbols_mask = sc_data.var['gene_symbol'].isna()
        n_missing = missing_symbols_mask.sum()
        logger.info(f"Number of genes with missing symbols (will use base Ensembl ID): {n_missing}")
        if n_missing > 0:
            sc_data.var.loc[missing_symbols_mask, 'gene_symbol'] = sc_data.var.loc[missing_symbols_mask, 'base_ensembl_id']

        # Update var_names and make unique
        logger.info("Updating sc_data.var_names with gene symbols...")
        sc_data.var_names = sc_data.var['gene_symbol'].astype(str)
        if not sc_data.var_names.is_unique:
            logger.warning("Duplicate gene symbols found. Making var_names unique...")
            sc_data.var_names_make_unique()
        else:
            logger.info("Gene symbols are unique.")

        logger.info(f"Updated SC var names (first 5): {sc_data.var_names[:5].tolist()}")

        # Clean up columns and index name for saving
        if 'gene_symbol' in sc_data.var.columns:
            sc_data.var.drop(columns=['gene_symbol'], inplace=True)
        sc_data.var.index.name = None
        logger.info("Cleaned up var metadata for saving.")

        logger.info("SC data gene name conversion complete.")
        return sc_data

    except Exception as e:
        logger.error(f"Error during SC data gene name conversion: {e}", exc_info=True)
        raise

# === Main Execution ===
try:
    # --- 1. Load and Preprocess Spatial Data ---
    # Note: Preprocessing might have already been done. If so, load processed file directly.
    # Here we assume preprocessing is needed based on the provided script blocks.
    st_data = preprocess_spatial_data(args.st_h5ad, args.output_dir, args.output_prefix, args.n_pcs, args.n_neighbors, args.leiden_res)
    if args.save_processed_st:
        st_processed_path = os.path.join(args.output_dir, f"{args.output_prefix}_st_processed.h5ad")
        logger.info(f"Saving processed spatial data to {st_processed_path}")
        st_data.write_h5ad(st_processed_path)

    # --- 2. Load, Convert (and optionally Downsample) SC Data ---
    sc_data = convert_sc_gene_names(args.sc_h5ad)
    sc_data = preprocess_spatial_data(args.sc_data, args.output_dir, args.output_prefix, args.n_pcs, args.n_neighbors, args.leiden_res)

    # Optional Downsampling
    if args.downsample_sc is not None and 0 < args.downsample_sc < 1:
        n_original_cells = sc_data.n_obs
        logger.info(f"Downsampling scRNA-seq data to {args.downsample_sc*100:.1f}% fraction.")
        sc.pp.subsample(sc_data, fraction=args.downsample_sc, random_state=0)
        logger.info(f"Cells after downsampling: {sc_data.n_obs} (from {n_original_cells})")
    elif args.downsample_sc is not None:
        logger.warning(f"Invalid downsample fraction provided: {args.downsample_sc}. Skipping downsampling.")

    # Ensure log1p is applied if not already present (crucial for scenvi)
    # Assuming the input sc_h5ad might contain raw counts
    # Check if data looks log-transformed, apply if necessary
    # A simple check: if max value is very large and mostly integers, it's likely counts
    is_logged = False
    if 'log1p' in sc_data.uns:
         logger.info("Found 'log1p' in sc_data.uns, assuming data is log-transformed.")
         is_logged = True
    elif np.issubdtype(sc_data.X.dtype, np.floating) and sc_data.X.max() < 50: # Heuristic check
         logger.info("sc_data.X seems to contain log-like values.")
         is_logged = True

    if not is_logged:
         logger.warning("sc_data.X might not be log-transformed. Applying sc.pp.log1p(). Ensure input data format is as expected.")
         # Need normalization first if applying log1p to raw counts
         sc.pp.normalize_total(sc_data, target_sum=1e4)
         sc.pp.log1p(sc_data)

    if args.save_converted_sc:
        sc_converted_path = os.path.join(args.output_dir, f"{args.output_prefix}_sc_converted_processed.h5ad")
        logger.info(f"Saving converted/processed scRNA-seq data to {sc_converted_path}")
        # Ensure index name is None before saving
        sc_data.var.index.name = None
        sc_data.write_h5ad(sc_converted_path)


    # --- 3. Run SCENVI ---
    logger.info("--- Starting SCENVI Analysis ---")
    scenvi_start_time = time.time()

    # Check shared genes before running
    shared_genes = st_data.var_names.intersection(sc_data.var_names)
    logger.info(f"Number of shared genes between ST and SC data: {len(shared_genes)}")
    if len(shared_genes) == 0:
        logger.error("No shared genes found between spatial and single-cell data after processing. Cannot run SCENVI.")
        raise ValueError("No shared genes for SCENVI.")

    logger.info("Initializing SCENVI model...")
    # Check scenvi documentation for exact GPU parameter if needed, assuming it detects automatically or uses env var
    envi_model = scenvi.ENVI(spatial_data = st_data, sc_data = sc_data)

    logger.info("Training SCENVI model...")
    # Add use_gpu=use_gpu if the train method supports it explicitly
    # envi_model.train(use_gpu=use_gpu)
    envi_model.train() # Assuming it uses CUDA_VISIBLE_DEVICES

    logger.info("Running SCENVI auxiliary functions...")
    envi_model.impute_genes()
    envi_model.infer_niche_covet()
    envi_model.infer_niche_celltype()

    logger.info(f"SCENVI analysis finished in {time.time() - scenvi_start_time:.2f} seconds.")

    # --- 4. Add SCENVI Results back to AnnData ---
    logger.info("Adding SCENVI results to AnnData objects.")
    # For Spatial Data
    st_data.obsm['envi_latent'] = envi_model.spatial_data.obsm['envi_latent']
    st_data.obsm['COVET'] = envi_model.spatial_data.obsm['COVET']
    st_data.obsm['COVET_SQRT'] = envi_model.spatial_data.obsm['COVET_SQRT']
    st_data.uns['COVET_genes'] = envi_model.CovGenes
    st_data.obsm['imputation'] = envi_model.spatial_data.obsm['imputation']
    st_data.obsm['cell_type_niche'] = envi_model.spatial_data.obsm['cell_type_niche']

    # For SC Data
    sc_data.obsm['envi_latent'] = envi_model.sc_data.obsm['envi_latent']
    sc_data.obsm['COVET'] = envi_model.sc_data.obsm['COVET']
    sc_data.obsm['COVET_SQRT'] = envi_model.sc_data.obsm['COVET_SQRT']
    sc_data.obsm['cell_type_niche'] = envi_model.sc_data.obsm['cell_type_niche']
    sc_data.uns['COVET_genes'] = envi_model.CovGenes # Redundant but consistent

    # --- 5. Save Final AnnData Objects ---
    st_final_path = os.path.join(args.output_dir, f"{args.output_prefix}_st_final_with_scenvi.h5ad")
    sc_final_path = os.path.join(args.output_dir, f"{args.output_prefix}_sc_final_with_scenvi.h5ad")

    logger.info(f"Saving final spatial data with SCENVI results to {st_final_path}")
    st_data.var.index.name = None # Ensure index name is None
    st_data.write_h5ad(st_final_path)

    logger.info(f"Saving final scRNA-seq data with SCENVI results to {sc_final_path}")
    sc_data.var.index.name = None # Ensure index name is None
    sc_data.write_h5ad(sc_final_path)

    total_time = time.time() - start_time
    logger.info(f"--- Script finished successfully in {total_time:.2f} seconds ---")

except Exception as e:
    logger.error(f"An error occurred during the main execution: {e}", exc_info=True)
    total_time = time.time() - start_time
    logger.info(f"--- Script failed after {total_time:.2f} seconds ---")
    exit(1)

# py-script2

In [None]:
#!/usr/bin/env python
# coding: utf-8

import os
import argparse
import warnings
import logging
import time
from datetime import datetime

# --- Argument Parsing ---
parser = argparse.ArgumentParser(description="Run Scanpy preprocessing and/or SCENVI analysis.")
# Inputs
parser.add_argument('--input_st_h5ad', type=str, required=True, help='Path to the input spatial AnnData file (.h5ad)')
parser.add_argument('--input_sc_h5ad', type=str, required=True, help='Path to the input single-cell AnnData file (.h5ad)')
# Outputs
parser.add_argument('--output_dir', type=str, required=True, help='Directory to save outputs and logs.')
parser.add_argument('--output_prefix', type=str, default='scenvi_result', help='Prefix for output files.')
# Processing Flags
parser.add_argument('--skip_st_preprocessing', action='store_true', help='Skip spatial data preprocessing (QC, norm, HVG, PCA, UMAP, etc.). Assumes input ST is ready.')
parser.add_argument('--skip_sc_gene_conversion', action='store_true', help='Skip scRNA-seq gene name conversion (Ensembl to Symbol). Assumes input SC has correct symbols.')
parser.add_argument('--downsample_sc', type=float, default=None, help='Fraction to downsample scRNA-seq data (e.g., 0.1 for 10%). Skips if not provided.')
# SCENVI Parameters
parser.add_argument('--run_scenvi', action='store_true', help='Run the SCENVI analysis steps.')
parser.add_argument('--n_pcs', type=int, default=30, help='Number of PCs (used in ST preprocessing if not skipped).')
parser.add_argument('--leiden_res', type=float, default=0.8, help='Leiden resolution (used in ST preprocessing if not skipped).')
parser.add_argument('--n_neighbors', type=int, default=15, help='Number of neighbors (used in ST preprocessing if not skipped).')
# Resource/Save Flags
parser.add_argument('--gpu_id', type=str, default='0', help='GPU ID to use for SCENVI ("-1" for CPU).')
parser.add_argument('--n_jobs', type=int, default=4, help='Number of CPU cores for Scanpy parallel tasks.')
parser.add_argument('--save_processed_st', action='store_true', help='Save the spatial AnnData after preprocessing (if done).')
parser.add_argument('--save_processed_sc', action='store_true', help='Save the scRNA-seq AnnData after conversion/downsampling (if done).')


args = parser.parse_args()

# --- GPU/CPU Setup ---
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
use_gpu = False if args.gpu_id == "-1" else True

# --- Logging Setup ---
os.makedirs(args.output_dir, exist_ok=True)
log_filename = os.path.join(args.output_dir, f"{args.output_prefix}_log_{datetime.now().strftime('%y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(process)d - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()
start_time = time.time()

logger.info(f"Starting script with arguments: {args}")

# --- Import Libraries ---
try:
    import warnings
    warnings.filterwarnings('ignore')
    import matplotlib
    matplotlib.use('Agg') # Use non-interactive backend
    import matplotlib.pyplot as plt
    # import seaborn as sns # Keep if plotting needed
    import numpy as np
    import pandas as pd
    import scanpy as sc
    if not args.skip_sc_gene_conversion:
        import mygene
    if args.run_scenvi:
        import scenvi
except ImportError as e:
    logger.error(f"Failed to import libraries: {e}", exc_info=True)
    exit(1)

logger.info("Libraries imported successfully.")

# --- Settings ---
sc.settings.verbosity = 3
sc.settings.n_jobs = args.n_jobs
sc.settings.figdir = args.output_dir
logger.info(f"Scanpy settings: verbosity=3, n_jobs={args.n_jobs}, figdir={args.output_dir}")

# === Preprocessing/Conversion Functions (Keep them available) ===

def preprocess_spatial_data(adata_path, output_dir, output_prefix, n_pcs, n_neighbors, leiden_res):
    # (이전 답변의 preprocess_spatial_data 함수 내용과 동일)
    logger.info(f"--- Starting Spatial Data Preprocessing: {adata_path} ---")
    try:
        adata = sc.read_h5ad(adata_path)
        logger.info(f"Loaded spatial data: {adata}")

        # QC, Filtering, Normalization, Log, HVG, Scale, PCA, Neighbors, Leiden, UMAP
        sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
        min_genes = 20; min_counts = 50; min_cells = 5
        logger.info(f"Cells before filtering: {adata.n_obs}")
        sc.pp.filter_cells(adata, min_genes=min_genes)
        sc.pp.filter_cells(adata, min_counts=min_counts)
        logger.info(f"Genes before filtering: {adata.n_vars}")
        sc.pp.filter_genes(adata, min_cells=min_cells)
        logger.info(f"Data shape after filtering: {adata.shape}")
        if adata.n_obs == 0 or adata.n_vars == 0:
             raise ValueError("Data is empty after filtering.")

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor='seurat_v3')
        n_hvg = adata.var['highly_variable'].sum()
        logger.info(f"Found {n_hvg} highly variable genes.")
        if n_hvg == 0:
            logger.warning("No highly variable genes found. Using all genes for PCA.")
            adata.var['highly_variable'] = True # Use all if none found

        sc.pp.scale(adata, max_value=10)
        sc.tl.pca(adata, svd_solver='arpack', use_highly_variable=True)
        # Ensure enough PCs were computed
        actual_n_pcs = min(n_pcs, adata.obsm['X_pca'].shape[1])
        if actual_n_pcs < n_pcs:
             logger.warning(f"Requested {n_pcs} PCs, but only {actual_n_pcs} could be computed.")
        if actual_n_pcs == 0:
             raise ValueError("PCA could not be computed.")

        sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=actual_n_pcs)
        sc.tl.leiden(adata, resolution=leiden_res, key_added=f'leiden_res{leiden_res}')
        sc.tl.umap(adata)

        logger.info("Spatial data preprocessing complete.")
        logger.info(f"Final spatial AnnData object after preprocessing: {adata}")
        # Optional: Save UMAP plot
        # sc.pl.umap(adata, color=[f'leiden_res{leiden_res}'], save=f"_{output_prefix}_st_umap_leiden.png", show=False, title=f'Leiden Clusters (res={leiden_res})')
        return adata

    except Exception as e:
        logger.error(f"Error during spatial data preprocessing: {e}", exc_info=True)
        raise

def convert_sc_gene_names(adata_path):
    # (이전 답변의 convert_sc_gene_names 함수 내용과 거의 동일, mygene 임포트는 위에서 처리)
    logger.info(f"--- Starting SC Data Gene Name Conversion: {adata_path} ---")
    try:
        sc_data = sc.read_h5ad(adata_path)
        logger.info(f"Loaded scRNA-seq data: {sc_data}")
        if not sc_data.var_names.str.startswith('ENSG').any():
             logger.warning("Input SC data does not seem to have Ensembl IDs starting with 'ENSG'. Skipping conversion.")
             return sc_data

        # Backup, process Ensembl IDs, query mygene, map, handle missing, update var_names
        sc_data.var['original_ensembl_id_with_version'] = sc_data.var_names
        sc_data.var['base_ensembl_id'] = sc_data.var_names.str.split('.').str[0]
        unique_base_ids = sc_data.var['base_ensembl_id'].unique().tolist()
        logger.info(f"Number of unique base Ensembl IDs to query: {len(unique_base_ids)}")
        mg = mygene.MyGeneInfo()
        logger.info(f"Querying {len(unique_base_ids)} unique Ensembl IDs...")
        query_start_time = time.time()
        gene_info = mg.querymany(unique_base_ids, scopes='ensembl.gene', fields='symbol', species='human', as_dataframe=True, returnall=True)
        logger.info(f"MyGene query finished in {time.time() - query_start_time:.2f} seconds.")

        symbol_map = gene_info['out']['symbol'].dropna().to_dict()
        logger.info(f"Created map for {len(symbol_map)} Ensembl IDs to symbols.")
        sc_data.var['gene_symbol'] = sc_data.var['base_ensembl_id'].map(symbol_map)
        missing_symbols_mask = sc_data.var['gene_symbol'].isna()
        n_missing = missing_symbols_mask.sum()
        logger.info(f"Number of genes with missing symbols (will use base Ensembl ID): {n_missing}")
        if n_missing > 0:
            sc_data.var.loc[missing_symbols_mask, 'gene_symbol'] = sc_data.var.loc[missing_symbols_mask, 'base_ensembl_id']

        logger.info("Updating sc_data.var_names with gene symbols...")
        sc_data.var_names = sc_data.var['gene_symbol'].astype(str)
        if not sc_data.var_names.is_unique:
            logger.warning("Duplicate gene symbols found. Making var_names unique...")
            sc_data.var_names_make_unique()
        else:
            logger.info("Gene symbols are unique.")

        # Clean up
        if 'gene_symbol' in sc_data.var.columns:
            sc_data.var.drop(columns=['gene_symbol'], inplace=True)
        sc_data.var.index.name = None
        logger.info("Cleaned up var metadata for saving.")
        logger.info("SC data gene name conversion complete.")
        return sc_data
    except Exception as e:
        logger.error(f"Error during SC data gene name conversion: {e}", exc_info=True)
        raise

# === Main Execution ===
try:
    # --- 1. Load/Process Spatial Data ---
    if args.skip_st_preprocessing:
        logger.info(f"Skipping spatial preprocessing. Loading directly: {args.input_st_h5ad}")
        st_data = sc.read_h5ad(args.input_st_h5ad)
        logger.info(f"Loaded spatial data: {st_data}")
        # Basic check for spatial coordinates
        if 'spatial' not in st_data.obsm_keys():
             logger.warning("'spatial' key not found in st_data.obsm. SCENVI might fail.")
    else:
        st_data = preprocess_spatial_data(args.input_st_h5ad, args.output_dir, args.output_prefix, args.n_pcs, args.n_neighbors, args.leiden_res)
        if args.save_processed_st:
            st_processed_path = os.path.join(args.output_dir, f"{args.output_prefix}_st_processed.h5ad")
            logger.info(f"Saving processed spatial data to {st_processed_path}")
            st_data.var.index.name = None # Ensure index name is None
            st_data.write_h5ad(st_processed_path)

    # --- 2. Load/Process SC Data ---
    if args.skip_sc_gene_conversion:
        logger.info(f"Skipping scRNA-seq gene conversion. Loading directly: {args.input_sc_h5ad}")
        sc_data = sc.read_h5ad(args.input_sc_h5ad)
        logger.info(f"Loaded scRNA-seq data: {sc_data}")
        if sc_data.var_names.str.startswith('ENSG').any():
             logger.warning("Input SC data seems to have Ensembl IDs, but conversion was skipped.")
    else:
        sc_data = convert_sc_gene_names(args.input_sc_h5ad)

    # Optional Downsampling (Applied after potential conversion)
    if args.downsample_sc is not None and 0 < args.downsample_sc < 1:
        n_original_cells = sc_data.n_obs
        logger.info(f"Downsampling scRNA-seq data to {args.downsample_sc*100:.1f}% fraction.")
        sc.pp.subsample(sc_data, fraction=args.downsample_sc, random_state=0)
        logger.info(f"Cells after downsampling: {sc_data.n_obs} (from {n_original_cells})")
    elif args.downsample_sc is not None:
        logger.warning(f"Invalid downsample fraction provided: {args.downsample_sc}. Skipping downsampling.")

    # Check/Apply Log Normalization for SC data (Crucial if SCTransform wasn't used externally)
    is_logged = False
    if 'log1p' in sc_data.uns:
         logger.info("Found 'log1p' in sc_data.uns, assuming SC data is log-transformed.")
         is_logged = True
    # Add more robust check if possible, e.g., check data range and type
    elif isinstance(sc_data.X, np.ndarray) and np.issubdtype(sc_data.X.dtype, np.floating) and sc_data.X.max() < 50: # Heuristic for dense
         logger.info("sc_data.X (dense) seems to contain log-like values.")
         is_logged = True
    elif hasattr(sc_data.X, 'dtype') and np.issubdtype(sc_data.X.dtype, np.floating) and sc_data.X.max() < 50: # Heuristic for sparse
         logger.info("sc_data.X (sparse) seems to contain log-like values.")
         is_logged = True


    if not is_logged:
         logger.warning("SC data does not appear to be log-transformed. Applying sc.pp.normalize_total() and sc.pp.log1p().")
         sc.pp.normalize_total(sc_data, target_sum=1e4)
         sc.pp.log1p(sc_data)

    if args.save_processed_sc:
        sc_processed_path = os.path.join(args.output_dir, f"{args.output_prefix}_sc_processed.h5ad")
        logger.info(f"Saving processed scRNA-seq data to {sc_processed_path}")
        sc_data.var.index.name = None # Ensure index name is None
        sc_data.write_h5ad(sc_processed_path)


    # --- 3. Run SCENVI (if requested) ---
    if args.run_scenvi:
        logger.info("--- Starting SCENVI Analysis ---")
        scenvi_start_time = time.time()

        # Check shared genes VERY IMPORTANT
        shared_genes = st_data.var_names.intersection(sc_data.var_names)
        logger.info(f"Number of shared genes between final ST and SC data: {len(shared_genes)}")
        if len(shared_genes) == 0:
            logger.error("No shared genes found. Cannot run SCENVI.")
            raise ValueError("No shared genes for SCENVI.")
        elif len(shared_genes) < 100: # Arbitrary threshold, but very few genes might be problematic
             logger.warning(f"Only {len(shared_genes)} shared genes found. SCENVI results might be suboptimal.")

        logger.info("Initializing SCENVI model...")
        envi_model = scenvi.ENVI(spatial_data = st_data, sc_data = sc_data)

        logger.info("Training SCENVI model...")
        # Assuming train detects GPU via CUDA_VISIBLE_DEVICES
        # Check docs if explicit use_gpu=use_gpu needed
        envi_model.train()

        logger.info("Running SCENVI auxiliary functions...")
        envi_model.impute_genes()
        envi_model.infer_niche_covet()
        envi_model.infer_niche_celltype()

        logger.info(f"SCENVI analysis finished in {time.time() - scenvi_start_time:.2f} seconds.")

        # Add results back and save final objects
        logger.info("Adding SCENVI results to AnnData objects.")
        st_data.obsm['envi_latent'] = envi_model.spatial_data.obsm['envi_latent']
        st_data.obsm['COVET'] = envi_model.spatial_data.obsm['COVET']
        st_data.obsm['COVET_SQRT'] = envi_model.spatial_data.obsm['COVET_SQRT']
        st_data.uns['COVET_genes'] = envi_model.CovGenes
        st_data.obsm['imputation'] = envi_model.spatial_data.obsm['imputation']
        st_data.obsm['cell_type_niche'] = envi_model.spatial_data.obsm['cell_type_niche']

        sc_data.obsm['envi_latent'] = envi_model.sc_data.obsm['envi_latent']
        sc_data.obsm['COVET'] = envi_model.sc_data.obsm['COVET']
        sc_data.obsm['COVET_SQRT'] = envi_model.sc_data.obsm['COVET_SQRT']
        sc_data.obsm['cell_type_niche'] = envi_model.sc_data.obsm['cell_type_niche']
        sc_data.uns['COVET_genes'] = envi_model.CovGenes

        st_final_path = os.path.join(args.output_dir, f"{args.output_prefix}_st_final_with_scenvi.h5ad")
        sc_final_path = os.path.join(args.output_dir, f"{args.output_prefix}_sc_final_with_scenvi.h5ad")

        logger.info(f"Saving final spatial data with SCENVI results to {st_final_path}")
        st_data.var.index.name = None
        st_data.write_h5ad(st_final_path)

        logger.info(f"Saving final scRNA-seq data with SCENVI results to {sc_final_path}")
        sc_data.var.index.name = None
        sc_data.write_h5ad(sc_final_path)
    else:
        logger.info("Skipping SCENVI analysis as --run_scenvi flag was not provided.")


    total_time = time.time() - start_time
    logger.info(f"--- Script finished successfully in {total_time:.2f} seconds ---")

except Exception as e:
    logger.error(f"An error occurred during the main execution: {e}", exc_info=True)
    total_time = time.time() - start_time
    logger.info(f"--- Script failed after {total_time:.2f} seconds ---")
    exit(1)

# function - st preprocess

In [None]:
adata_path="/data/kjc2/projects/P330.CSA/rds/250210_CRC_BJM_0050585_Region1_25-04-14-16-53.h5ad"
output_dir="/data/kjc2/projects/P330.CSA/rds/"
output_prefix="scenvi_result"
n_pcs=30
n_neighbors=15
leiden_res=0.8
preprocess_spatial_data(adata_path, output_dir, output_prefix, n_pcs, n_neighbors, leiden_res)

--- Starting Spatial Data Preprocessing: /data/kjc2/projects/P330.CSA/rds/250210_CRC_BJM_0050585_Region1_25-04-14-16-53.h5ad ---
Loaded spatial data: AnnData object with n_obs × n_vars = 62941 × 4999
    obs: 'x_centroid', 'y_centroid', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'n_genes_by_counts', 'n_genes', 'n_counts', 'leiden_res0.8'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'mean', 'std'
    uns: 'hvg', 'leiden_res0.8', 'leiden_res0.8_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
Cells before filtering: 62941
Genes bef

In [40]:
import scanpy as sc
import pandas as pd
import numpy as np

# --- 1. 데이터 로딩 및 메타데이터 추가 ---
# Xenium 출력 폴더 경로
xenium_folder = '/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_1__20250205__113422/'

# Cell-feature matrix 로딩
adata = sc.read_10x_h5(f'{xenium_folder}/cell_feature_matrix.h5')
adata.var_names_make_unique() # 유전자 이름 고유하게 만들기
# adata.var -> gene_ids(ENSG gene name), feature_types="Gene Expression", genome="Unknown"


In [56]:
adata.obs_names
# Index(['aaaagckh-1', 'aaaaghho-1', 'aaaalnmj-1', 'aaacelao-1', 'aaacjopg-1',
#        'aaadfbpe-1', 'aaaecfmd-1', 'aaagfiei-1', 'aaajbacd-1', 'aaakiamn-1',
#        ...
#        'oioloked-1', 'oiomafdd-1', 'oiommibn-1', 'oiompanb-1', 'oiompmli-1',
#        'oiooahog-1', 'oioomlkl-1', 'oiopjhfe-1', 'oiopjmpb-1', 'oiopochn-1'],
#       dtype='object', length=67645)

Index(['aaaagckh-1', 'aaaaghho-1', 'aaaalnmj-1', 'aaacelao-1', 'aaacjopg-1',
       'aaadfbpe-1', 'aaaecfmd-1', 'aaagfiei-1', 'aaajbacd-1', 'aaakiamn-1',
       ...
       'oioloked-1', 'oiomafdd-1', 'oiommibn-1', 'oiompanb-1', 'oiompmli-1',
       'oiooahog-1', 'oioomlkl-1', 'oiopjhfe-1', 'oiopjmpb-1', 'oiopochn-1'],
      dtype='object', length=67645)

In [44]:
cells_df = pd.read_parquet(f'{xenium_folder}/cells.parquet')
#cells_df.keys()
# Index(['cell_id', 'x_centroid', 'y_centroid', 'transcript_counts',
#        'control_probe_counts', 'genomic_control_counts',
#        'control_codeword_counts', 'unassigned_codeword_counts',
#        'deprecated_codeword_counts', 'total_counts', 'cell_area',
#        'nucleus_area', 'nucleus_count', 'segmentation_method'],
#       dtype='object')

In [None]:
# 세포 메타데이터 및 공간 좌표 로딩 (Parquet 기준)
cells_df = pd.read_parquet(f'{xenium_folder}/cells.parquet')

# AnnData의 obs 인덱스와 cells_df의 cell_id가 일치하는지 확인 필요
# 만약 다르다면, cells_df의 인덱스를 cell_id로 설정; 실제로 cells_df의 인덱스는 1~세포수이고, cell_id는 adata(h5)에 있는 cell name과 같다.
if 'cell_id' in cells_df.columns:
    cells_df = cells_df.set_index('cell_id')

# AnnData의 obs 인덱스와 정렬하여 메타데이터 추가
adata.obs = adata.obs.join(cells_df, how='left')

# 공간 좌표 추가 (x_centroid, y_centroid 컬럼 이름 확인 필요)
if 'x_centroid' in adata.obs.columns and 'y_centroid' in adata.obs.columns:
    adata.obsm['spatial'] = adata.obs[['x_centroid', 'y_centroid']].to_numpy()
else:
    print("Warning: Could not find 'x_centroid' or 'y_centroid' in adata.obs to populate adata.obsm['spatial']")
    # 필요시 다른 좌표 컬럼 사용 (예: fov_x, fov_y - 이는 FOV 내 상대좌표일 수 있음)

# 원하는 obs 컬럼 추가 (cells.parquet에 있는 정보 활용)
# 예시: fovID -> fov, center_x -> x_centroid, center_y -> y_centroid 등 이름 매핑
# 예시: adata.obs['fovID'] = adata.obs['fov']
# 다른 정보(slice_id, sample_id, label 등)는 실험 설계에 따라 수동으로 추가하거나 다른 파일에서 로드 필요

print("Initial AnnData object:")
print(adata)

# --- 2. 품질 관리 (QC) ---
# QC 지표 계산 (필요시 미토콘드리아 유전자 리스트 제공)
# mt_genes = adata.var_names.str.startswith('MT-') # 예시, 패널에 따라 다름
# sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'] if np.any(mt_genes) else None, percent_top=None, log1p=False, inplace=True)
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)


# QC 시각화 (예시)
# sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'], jitter=0.4, multi_panel=True)
# sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

# QC 필터링 (값은 데이터에 맞게 조정 필요)
min_genes = 20  # 예시: 최소 유전자 수
min_counts = 50 # 예시: 최소 전사체 수
print(f"Cells before filtering: {adata.n_obs}")
sc.pp.filter_cells(adata, min_genes=min_genes)
sc.pp.filter_cells(adata, min_counts=min_counts)

min_cells = 5 # 예시: 최소 세포 수
print(f"Genes before filtering: {adata.n_vars}")
sc.pp.filter_genes(adata, min_cells=min_cells)
print(f"Cells after filtering: {adata.n_obs}")
print(f"Genes after filtering: {adata.n_vars}")


# --- 3. 정규화 ---
# 총 count에 대해 정규화 (target_sum=1e4는 일반적)
sc.pp.normalize_total(adata, target_sum=1e4)

# 로그 변환
sc.pp.log1p(adata)


# --- 4. 고변동 유전자 식별 ---
# HVG 식별 (flavor='seurat_v3'가 많이 사용됨)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor='seurat_v3')
# HVG 결과 시각화
# sc.pl.highly_variable_genes(adata)

# HVG만 남기려면 (선택 사항, 원본 보존하려면 복사 후 수행)
# adata.raw = adata # 원본 데이터를 adata.raw에 저장
# adata = adata[:, adata.var.highly_variable]


# --- 5. 스케일링 ---
# 데이터 스케일링 (주로 HVG에 대해 수행)
sc.pp.scale(adata, max_value=10) # max_value로 값 제한 가능


# --- 최종 AnnData 객체 확인 ---
print("\nProcessed AnnData object:")
print(adata)

# 원하는 obs, var 필드가 생성되었는지 확인
print("\nAvailable obs columns:")
print(list(adata.obs.columns))
print("\nAvailable var columns:")
print(list(adata.var.columns))
print("\nAvailable obsm keys:")
print(list(adata.obsm.keys()))



In [57]:
adata

AnnData object with n_obs × n_vars = 67645 × 5001
    var: 'gene_ids', 'feature_types', 'genome'

In [58]:
sc.pp.normalize_total(adata, target_sum=1e4)



In [59]:
adata

AnnData object with n_obs × n_vars = 67645 × 5001
    var: 'gene_ids', 'feature_types', 'genome'

In [60]:
sc.pp.log1p(adata)

In [61]:
adata

AnnData object with n_obs × n_vars = 67645 × 5001
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'log1p'

In [63]:
plt.figure()
plt.hist(
    adata.obs['n_genes_by_counts'],
    bins=50
)  # 막대형 히스토그램 생성&#8203;:contentReference[oaicite:1]{index=1}
plt.xlabel('Total counts per cell')
plt.ylabel('Number of cells')
plt.title('Library size distribution')
plt.tight_layout()
plt.show()

KeyError: 'n_genes_by_counts'

<Figure size 640x480 with 0 Axes>

In [64]:
ax = adata.obs['n_genes_by_counts'].plot.density()  # KDE 플롯 생성&#8203;:contentReference[oaicite:2]{index=2}
ax.set_xlabel('Total counts per cell')
ax.set_title('Library size density')
plt.tight_layout()
plt.show()

KeyError: 'n_genes_by_counts'

In [65]:
adata.obs

aaaagckh-1
aaaaghho-1
aaaalnmj-1
aaacelao-1
aaacjopg-1
...
oiooahog-1
oioomlkl-1
oiopjhfe-1
oiopjmpb-1
oiopochn-1


# test

In [1]:
urls={"S1R1":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_1__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_1__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide1(0050568)_Region1"],
"S1R2":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_2__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_2__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide1(0050568)_Region2"],
"S1R3":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_3__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_3__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide1(0050568)_Region3"],
"S1R4":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_4__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_4__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide1(0050568)_Region4"],
"S1R5":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_5__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_5__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide1(0050568)_Region5"],
"S1R6":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_6__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_6__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide1(0050568)_Region6"],
"S1R7":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_7__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_7__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide1(0050568)_Region7"],
"S1R8":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_8__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050586__Region_8__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide1(0050568)_Region8"],
"S2R1":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_1__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_1__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide2(0050585)_Region1"],
"S2R2":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_2__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_2__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide2(0050585)_Region2"],
"S2R3":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_3__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_3__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide2(0050585)_Region3"],
"S2R4":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_4__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_4__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide2(0050585)_Region4"],
"S2R5":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_5__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_5__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide2(0050585)_Region5"],
"S2R6":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_6__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_6__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide2(0050585)_Region6"],
"S2R7":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_7__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_7__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide2(0050585)_Region7"],
"S2R8":["/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_8__20250205__113422/cell_feature_matrix.h5",
        "/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_8__20250205__113422/cells.parquet",
        "/data/ARPAH/250210_CRC_BJM/cell_ID/slide2(0050585)_Region8"]}

In [6]:
for key in urls:
    print(key)

S1R1
S1R2
S1R3
S1R4
S1R5
S1R6
S1R7
S1R8
S2R1
S2R2
S2R3
S2R4
S2R5
S2R6
S2R7
S2R8
