# defaulting, installation

In [1]:
%load_ext autoreload
%autoreload 2

import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="-1" # Change to -1 if you want to use CPU!

import warnings
warnings.filterwarnings('ignore')

import scenvi

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import scanpy as sc
import colorcet
import umap.umap_ as umap

import rpy2.robjects as robjects
from rpy2.robjects import r

An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.


In [3]:
cell_type_palette = {'Astro': (0.843137, 0.0, 0.0, 1.0),
                     'Endo': (0.54902, 0.235294, 1.0, 1.0),
                     'L23_IT': (0.007843, 0.533333, 0.0, 1.0),
                     'L45_IT': (0.0, 0.67451, 0.780392, 1.0),
                     'L56_NP': (0.596078, 1.0, 0.0, 1.0),
                     'L5_ET': (1.0, 0.498039, 0.819608, 1.0),
                     'L5_IT': (0.423529, 0.0, 0.309804, 1.0),
                     'L5_PT': (1.0, 0.647059, 0.188235, 1.0),
                     'L6_CT': (0.345098, 0.231373, 0.0, 1.0),
                     'L6_IT': (0.0, 0.341176, 0.34902, 1.0),
                     'L6_IT_Car3': (0.0, 0.0, 0.866667, 1.0),
                     'L6b': (0.0, 0.992157, 0.811765, 1.0),
                     'Lamp5': (0.631373, 0.458824, 0.415686, 1.0),
                     'Microglia': (0.737255, 0.717647, 1.0, 1.0),
                     'OPC': (0.584314, 0.709804, 0.470588, 1.0),
                     'Oligo': (0.752941, 0.015686, 0.72549, 1.0),
                     'Pericytes': (0.392157, 0.329412, 0.454902, 1.0),
                     'Pvalb': (0.47451, 0.0, 0.0, 1.0),
                     'SMC': (0.027451, 0.454902, 0.847059, 1.0),
                     'Sncg': (0.996078, 0.960784, 0.564706, 1.0),
                     'Sst': (0.0, 0.294118, 0.0, 1.0),
                     'VLMC': (0.560784, 0.478431, 0.0, 1.0),
                     'Vip': (1.0, 0.447059, 0.4, 1.0)}

cell_label_palette = {'GABAergic': (0.843137, 0.0, 0.0, 1.0),
                      'Glutamatergic': (0.54902, 0.235294, 1.0, 1.0),
                      'Non-Neuronal': (0.007843, 0.533333, 0.0, 1.0)}

# File loading

In [22]:
crc_sc = sc.read_h5ad("/data/kjc2/rds/downsampled.h5ad")

In [15]:
print(crc_sc)
"""
AnnData object with n_obs × n_vars = 426492 × 28476
    obs: 'dataset', 'medical_condition', 'cancer_type', 'sample_id', 'sample_type', 'tumor_source', 'replicate', 'sample_tissue', 'anatomic_region', 'anatomic_location', 'tumor_stage', 'tumor_stage_TNM', 'tumor_stage_TNM_T', 'tumor_stage_TNM_N', 'tumor_stage_TNM_M', 'tumor_size', 'tumor_dimensions', 'tumor_grade', 'histological_type', 'microsatellite_status', 'mismatch_repair_deficiency_status', 'MLH1_promoter_methylation_status', 'MLH1_status', 'KRAS_status', 'BRAF_status', 'APC_status', 'TP53_status', 'PIK3CA_status', 'SMAD4_status', 'NRAS_status', 'MSH6_status', 'FBXW7_status', 'NOTCH1_status', 'MSH2_status', 'PMS2_status', 'POLE_status', 'ERBB2_status', 'STK11_status', 'HER2_status', 'CTNNB1_status', 'BRAS_status', 'patient_id', 'sex', 'age', 'treatment_status_before_resection', 'treatment_drug', 'treatment_response', 'RECIST', 'platform', 'platform_fine', 'cellranger_version', 'reference_genome', 'matrix_type', 'enrichment_cell_types', 'tissue_cell_state', 'tissue_processing_lab', 'hospital_location', 'country', 'NCBI_BioProject_accession', 'SRA_sample_accession', 'GEO_sample_accession', 'ENA_sample_accession', 'synapse_sample_accession', 'study_id', 'study_doi', 'study_pmid', 'original_obs_names', 'cell_type_coarse_study', 'cell_type_middle_study', 'cell_type_study', 'n_counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mito', 'S_score', 'G2M_score', 'phase', 'SOLO_doublet_prob', 'SOLO_singlet_prob', 'SOLO_doublet_status', 'cell_type_predicted', 'cell_type_coarse', 'cell_type_middle', 'cell_type_fine', 'CMS_type', 'immune_infiltration_type', 'is_primary_data', 'suspension_type', 'tissue_type', 'donor_id', 'disease', 'disease_ontology_term_id', 'assay', 'assay_ontology_term_id', 'tissue', 'tissue_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'organism', 'organism_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id'
    var: 'var_names', 'ensembl', 'Geneid', 'GeneSymbol', 'Chromosome', 'Start', 'End', 'Class', 'Strand', 'Length', 'Version', 'Dataset_25pct_Overlap', 'n_cells', 'n_counts', 'n_cells_by_counts', 'mean_counts', 'total_counts', 'highly_variable'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'
    layers: 'counts'
"""

AnnData object with n_obs × n_vars = 426492 × 28476
    obs: 'dataset', 'medical_condition', 'cancer_type', 'sample_id', 'sample_type', 'tumor_source', 'replicate', 'sample_tissue', 'anatomic_region', 'anatomic_location', 'tumor_stage', 'tumor_stage_TNM', 'tumor_stage_TNM_T', 'tumor_stage_TNM_N', 'tumor_stage_TNM_M', 'tumor_size', 'tumor_dimensions', 'tumor_grade', 'histological_type', 'microsatellite_status', 'mismatch_repair_deficiency_status', 'MLH1_promoter_methylation_status', 'MLH1_status', 'KRAS_status', 'BRAF_status', 'APC_status', 'TP53_status', 'PIK3CA_status', 'SMAD4_status', 'NRAS_status', 'MSH6_status', 'FBXW7_status', 'NOTCH1_status', 'MSH2_status', 'PMS2_status', 'POLE_status', 'ERBB2_status', 'STK11_status', 'HER2_status', 'CTNNB1_status', 'BRAS_status', 'patient_id', 'sex', 'age', 'treatment_status_before_resection', 'treatment_drug', 'treatment_response', 'RECIST', 'platform', 'platform_fine', 'cellranger_version', 'reference_genome', 'matrix_type', 'enrichment_cell_t

"\nAnnData object with n_obs × n_vars = 426492 × 28476\n    obs: 'dataset', 'medical_condition', 'cancer_type', 'sample_id', 'sample_type', 'tumor_source', 'replicate', 'sample_tissue', 'anatomic_region', 'anatomic_location', 'tumor_stage', 'tumor_stage_TNM', 'tumor_stage_TNM_T', 'tumor_stage_TNM_N', 'tumor_stage_TNM_M', 'tumor_size', 'tumor_dimensions', 'tumor_grade', 'histological_type', 'microsatellite_status', 'mismatch_repair_deficiency_status', 'MLH1_promoter_methylation_status', 'MLH1_status', 'KRAS_status', 'BRAF_status', 'APC_status', 'TP53_status', 'PIK3CA_status', 'SMAD4_status', 'NRAS_status', 'MSH6_status', 'FBXW7_status', 'NOTCH1_status', 'MSH2_status', 'PMS2_status', 'POLE_status', 'ERBB2_status', 'STK11_status', 'HER2_status', 'CTNNB1_status', 'BRAS_status', 'patient_id', 'sex', 'age', 'treatment_status_before_resection', 'treatment_drug', 'treatment_response', 'RECIST', 'platform', 'platform_fine', 'cellranger_version', 'reference_genome', 'matrix_type', 'enrichment_ce

In [None]:
import spatialdata as sd
from spatialdata_io import xenium


zarr_path = "./Xenium.zarr"

# Xenium 데이터 로드
sdata = xenium(xenium_path)

# Zarr 형식으로 저장
xenium.write(zarr_path)

# Zarr 파일에서 데이터 읽기
sdata = sd.read_zarr(zarr_path)


In [None]:
## rds loading for xenium

In [10]:
import pyreadr

# rds 파일 읽기 (경로는 실제 파일 위치에 맞게 수정)
crc_xe = pyreadr.read_r('/data/kjc2/projects/P330.CSA/rds/250210_CRC_BJM_0050585_Region1_25-04-14-13-14.rds')

# 읽은 결과는 딕셔너리로 반환됩니다.
# 만약 하나의 R 객체만 저장된 경우, 딕셔너리의 키는 None이 됩니다.
# print("키 목록:", list(result.keys()))

# # 데이터 객체 가져오기 (키가 None이면)
# data = result[None]
# print(data)

crc_


LibrdataError: The file contains an unrecognized object

In [3]:
import pandas as pd
import scanpy as sc


xenium_folder="/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_1__20250205__113422/"
# 세포 단위 발현 데이터를 불러오기
cells_df = pd.read_parquet(xenium_folder+"cells.parquet")
transcripts_df = pd.read_parquet(xenium_folder+"transcripts.parquet")

# 발현 데이터 정리 및 행렬 구성 (필요한 경우)
# gene expression matrix 구성
expr_matrix = transcripts_df.pivot(index='cell_id', columns='feature_name', values='counts').fillna(0)

# AnnData 객체로 변환
adata = sc.AnnData(X=expr_matrix)


KeyError: 'feature_name'

In [20]:
import numpy

In [5]:
xenium_folder="/data/ARPAH/250210_CRC_BJM/output-XETG00274__0050585__Region_1__20250205__113422/"
# 세포 단위 발현 데이터를 불러오기
cells_df = pd.read_parquet(xenium_folder+"cells.parquet")
transcripts_df = pd.read_parquet(xenium_folder+"transcripts.parquet")

# 발현 데이터 정리 및 행렬 구성 (필요한 경우)
# gene expression matrix 구성
expr_matrix = cells_df.pivot(index='cell_id', columns='feature_name', values='counts').fillna(0)

KeyError: 'feature_name'

In [23]:
pip install --upgrade numpy


Collecting numpy
  Using cached numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scvi-tools 1.1.6.post2 requires numpy<2.0,>=1.21.0, but you have numpy 2.0.2 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.0.2
Note: you may need to restart the kernel to use updated packages.


In [9]:
# gene expression matrix 구성
expr_matrix = transcripts_df.pivot(index='cell_id', columns='feature_name', values='counts').fillna(0)

# AnnData 객체로 변환
adata = sc.AnnData(X=expr_matrix)

KeyError: 'counts'

In [1]:
import numpy as np
print(np.__version__)


1.26.4


In [28]:
pip install numpy==1.26.4 --force-reinstall --no-deps


Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.


In [7]:
print(cells_df.columns)


Index(['cell_id', 'x_centroid', 'y_centroid', 'transcript_counts',
       'control_probe_counts', 'genomic_control_counts',
       'control_codeword_counts', 'unassigned_codeword_counts',
       'deprecated_codeword_counts', 'total_counts', 'cell_area',
       'nucleus_area', 'nucleus_count', 'segmentation_method'],
      dtype='object')


In [8]:
print(transcripts_df.columns)


Index(['transcript_id', 'cell_id', 'overlaps_nucleus', 'feature_name',
       'x_location', 'y_location', 'z_location', 'qv', 'fov_name',
       'nucleus_distance', 'codeword_index', 'codeword_category', 'is_gene'],
      dtype='object')


# scENVI 튜토리알

In [10]:
st_data = sc.read_h5ad('/data/kjc2/projects/P330.CSA/rds/st_data.h5ad')

In [11]:
print(st_data)

AnnData object with n_obs × n_vars = 276556 × 254
    obs: 'fovID', 'fov_x', 'fov_y', 'volume', 'center_x', 'center_y', 'slice_id', 'sample_id', 'label', 'subclass', 'class_label', 'cell_id', 'cell_type', 'batch', 'cell_label', 'Layer_Depth', 'Depth'
    var: 'n_iso', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    obsm: 'spatial'
