In [1]:
import scanpy as sc
import pandas as pd

In [11]:
# AnnData object (.h5ad) that this notebook prepares
inpath = './20251008_Cevrim_XMens_Only_Fib_Dec_Cells.h5ad'

# per-species gene tables (.csv); one of them is merged into .var further down
mouse_genes_path = './filter_transcripts/gene_transcript_csvs/mm_gene_names.csv'
human_genes_path = './filter_transcripts/gene_transcript_csvs/hs_gene_names.csv'

# fibroblast labels per species, for the fibroblast-only object case
hs_fibro_labels = [
    'endometrial fibroblasts',
    'decidual cells',
]
mm_fibro_labels = [
    'decidual cells',
    'endometrial fibroblasts',
]

In [3]:
# load the AnnData object from the configured .h5ad path
adata = sc.read_h5ad(filename=inpath)

In [7]:
# tag every cell as 'mouse'; the species-detection cell reads this column to pick
# the gene table. NOTE(review): this overwrites any 'species' column already
# stored in the file — confirm that is intended before reusing on other objects.
adata.obs['species'] = 'mouse'

In [14]:
# determine the species; a single gene table is applied to every gene below,
# so the object must carry exactly one species value
species_column = adata.obs['species']
if species_column.nunique(dropna=False) != 1:
    raise ValueError(
        "adata.obs['species'] must hold exactly one distinct value, "
        f"found {species_column.nunique(dropna=False)}"
    )
species = species_column.iloc[0]

# read in the appropriate gene list .csv file
gene_csv_by_species = {
    'mouse': mouse_genes_path,
    'human': human_genes_path,
}
if species not in gene_csv_by_species:
    # previously an unknown species fell through both branches, leaving
    # genes_list undefined (or stale from an earlier run of this cell)
    raise ValueError(
        f"unsupported species {species!r}; "
        f"expected one of {sorted(gene_csv_by_species)}"
    )
genes_list = pd.read_csv(gene_csv_by_species[species])

In [15]:
# refuse to run twice on the same in-memory object: after the first run the
# index holds transcript ids instead of gene names, so the lookup below cannot work
if 'canonical_transcript_id' in adata.var.columns:
    raise RuntimeError(
        "adata.var already has a 'canonical_transcript_id' column; "
        "reload the object before re-running this cell"
    )

# create gene_name .var column from the current index
adata.var['gene_name'] = adata.var.index.tolist()

# keep one row per gene_name in the lookup table (first listed row wins) so the
# left merge below cannot add rows to .var and break alignment with the matrix
gene_annotations = genes_list.drop_duplicates(subset='gene_name', keep='first')

# merge the .csv along gene_name; how='left' keeps every gene in its original
# order and leaves NaN where the table has no entry for it
adata.var = adata.var.merge(
    gene_annotations,
    on='gene_name',
    how='left'
)

# delete adata.raw if it exists
del adata.raw

# copy the 'log1p' layer into .X (expected to hold log-normalized counts)
adata.X = adata.layers['log1p'].copy()

# make indices canonical transcript ids for SAMap; genes without a match get
# NaN here and are dropped in the next cell
adata.var.index = adata.var['canonical_transcript_id'].tolist()

In [16]:
# keep only the genes whose index entry (transcript id) is not missing
non_na = adata.var.index.notna().tolist()
adata_non_na = adata[:, non_na].copy()

In [None]:
# final visual check of the subset object before it is saved
sc.pl.embedding(
    adata_non_na,
    basis='X_umap_scVI',
    color='cell_type_coarse',
)

In [19]:
# display the per-gene (.var) table of the subset object for a manual check
adata_non_na.var

Unnamed: 0,mito,ribo,highly_variable,highly_variable_rank,highly_variable_nbatches,mean,std,gene_name,gene_id,canonical_transcript_id
ENSMUST00000070533,False,False,True,836.0,16.0,0.190801,0.469210,Xkr4,ENSMUSG00000051951,ENSMUST00000070533
ENSMUST00000161581,False,False,True,1535.0,13.0,0.035377,0.178709,Gm1992,ENSMUSG00000089699,ENSMUST00000161581
ENSMUST00000192692,False,False,False,,,0.012358,0.099997,Gm19938,ENSMUSG00000102331,ENSMUST00000192692
ENSMUST00000192427,False,False,True,1885.0,5.0,0.000049,0.004389,Gm37381,ENSMUSG00000102343,ENSMUST00000192427
ENSMUST00000027032,False,False,True,178.0,11.0,0.000682,0.024410,Rp1,ENSMUSG00000025900,ENSMUST00000027032
...,...,...,...,...,...,...,...,...,...,...
ENSMUST00000082419,True,False,False,,,0.040539,0.193989,mt-Nd6,ENSMUSG00000064368,ENSMUST00000082419
ENSMUST00000082421,True,False,False,,,0.017810,0.125365,mt-Cytb,ENSMUSG00000064370,ENSMUST00000082421
ENSMUST00000177783,False,False,False,,,0.000000,1.000000,Vmn1r186,ENSMUSG00000096776,ENSMUST00000177783
ENSMUST00000078827,False,False,False,,,0.000127,0.011349,Csprs,ENSMUSG00000062783,ENSMUST00000078827


In [20]:
# write it out to disk; the output name is derived from inpath so that pointing
# the notebook at another object cannot overwrite an earlier export
h5ad_suffix = '.h5ad'
inpath_stem = inpath[:-len(h5ad_suffix)] if inpath.endswith(h5ad_suffix) else inpath
outpath = f'{inpath_stem}_for_samap{h5ad_suffix}'
adata_non_na.write_h5ad(outpath)