In [1]:
from scipy import sparse, io
import numpy as np
import pandas as pd
import cellex
import matplotlib

In [2]:
# Load the SCT-normalized count matrix for the endothelial cells of the
# atherosclerosis scRNA meta-analysis from its Matrix Market file. Reading the
# sparse file takes only about 4-5 minutes, as opposed to 1-2 hours with
# pd.read_csv.
endo_matrix_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Endothelial/input_matrix_files/rpca_endo_sct_sparse_matrix.txt"
endo_sparse_matrix = io.mmread(endo_matrix_path)

# Densify the matrix; the dense array is what the CELLEX input DataFrame is
# built from in a later cell.
endo_mat_dense = endo_sparse_matrix.toarray()

In [3]:
# Load the row and column labels that accompany the sparse matrix. Both files
# are parsed as plain strings.
endo_rownames_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Endothelial/input_matrix_files/rpca_endo_matrix_rownames.txt"
endo_colnames_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Endothelial/input_matrix_files/rpca_endo_matrix_colnames.txt"

endo_row_names = np.genfromtxt(endo_rownames_path, dtype=str)
endo_col_names = np.genfromtxt(endo_colnames_path, dtype=str)

In [4]:
# Wrap the dense matrix in a labelled DataFrame, which is the input format
# handed to CELLEX below: row names become the index, column names the columns.
endo_data = pd.DataFrame(
    data=endo_mat_dense,
    index=endo_row_names,
    columns=endo_col_names,
)
endo_data.head()

Unnamed: 0,AAACGCTAGCGAAACC_1,AAAGGATCAAGGGTCA_1,AAAGGGCGTACGATGG_1,AACCTTTAGCTGCCTG_1,AAGCATCAGGCCTGCT_1,ACCATTTAGGGATCAC_1,ACGCACGTCTTCGCTG_1,ACTATCTCATAATGAG_1,AGAAGTACATTCACAG_1,AGATGAAGTCAGTCTA_1,...,TTGGCAAAGTCCGGTC-1_15,TTGTAGGAGCGGATCA-1_15,TTGTAGGCAATTGCTG-1_15,TTGTAGGGTGGTTTCA-1_15,TTTACTGGTTCCCTTG-1_15,TTTCCTCAGGGCACTA-1_15,TTTGCGCAGACATAAC-1_15,TTTGGTTAGCTGTCTA-1_15,TTTGGTTCAGCTGCTG-1_15,TTTGGTTTCGTCCGTT-1_15
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL669831.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LINC00115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM41C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL645608.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Read the per-cell annotations (cell barcode and cell_type) for the
# endothelial cells.
endo_metadata_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Endothelial/input_matrix_files/rpca_endo_level2_annotations_metadata.csv"
endo_metadata = pd.read_csv(endo_metadata_path)
endo_metadata.head()

Unnamed: 0.1,Unnamed: 0,cell_type
0,AAACGCTAGCGAAACC_1,Angiogenic/Vasa_vasorum_EC
1,AAAGGATCAAGGGTCA_1,EndoMT_EC
2,AAAGGGCGTACGATGG_1,EndoMT_EC
3,AACCTTTAGCTGCCTG_1,Intimal_EC
4,AAGCATCAGGCCTGCT_1,EndoMT_EC


In [6]:
# Count the cells annotated to each endothelial cell type.
cells_per_type = endo_metadata.groupby("cell_type")["cell_type"].count()
print(cells_per_type)

cell_type
Angiogenic/Vasa_vasorum_EC    6406
EndoMT_EC                     2888
Inflammatory_EC                937
Intimal_EC                    2142
Lymphatic_EC                   306
Name: cell_type, dtype: int64


In [7]:
# Use the first column (the cell barcodes, read in as "Unnamed: 0") as the
# index, so that the metadata frame holds only the cell_type column with the
# cell barcodes as row names.
#
# The guard keeps this cell idempotent: once the column has been moved into the
# index, re-running the cell would otherwise fail with a KeyError because
# "Unnamed: 0" is no longer a column.
if "Unnamed: 0" in endo_metadata.columns:
    endo_metadata = endo_metadata.set_index("Unnamed: 0")
endo_metadata.head()

Unnamed: 0_level_0,cell_type
Unnamed: 0,Unnamed: 1_level_1
AAACGCTAGCGAAACC_1,Angiogenic/Vasa_vasorum_EC
AAAGGATCAAGGGTCA_1,EndoMT_EC
AAAGGGCGTACGATGG_1,EndoMT_EC
AACCTTTAGCTGCCTG_1,Intimal_EC
AAGCATCAGGCCTGCT_1,EndoMT_EC


In [8]:
# Build the CELLEX ESObject from the expression DataFrame and the cell-type
# annotations of the endothelial compartment, then compute Expression
# Specificity. verbose=True prints the progress log for each step.
eso = cellex.ESObject(
    annotation=endo_metadata,
    data=endo_data,
    verbose=True,
)
eso.compute(verbose=True)

Preprocessing - checking input ... input parsed in 0 min 0 sec
Preprocessing - running remove_non_expressed ... excluded 4743 / 23381 genes in 0 min 0 sec
Preprocessing - normalizing data ... data normalized in 0 min 5 sec
Preprocessing - running ANOVA ... excluded 10414 / 18638 genes in 0 min 5 sec
Computing DET ... 
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing EP ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing GES ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 1 sec
Computing NSI ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing ESmu ...
    finished in 0 min 0 sec
Computing ESsd ...
    finished in 0 min 0 sec
Computed ['det.esw', 'det.esw_null', 'det.pvals', 'det.esw_s', 'ep.esw', 'ep.esw_null', 'ep.pvals', 'ep.esw_s', 'ges.esw', 'ges.esw_null', 'ges.pvals', 'ges.esw_s', 'nsi.esw', 'nsi.esw_null', 'nsi.pvals', 'nsi

In [24]:
# Check the expression specificity (ESmu) results for some known endothelial
# marker genes. The lookup is the last expression of the cell so that it is the
# value the notebook displays; the earlier bare `.head()` call was dropped
# because its result was discarded without ever being shown.
endo_marker_genes = [
    "PECAM1", "CDH5", "SELE", "VWF", "VCAM1", "ICAM1", "S100A4",
    "FN1", "CRTAC1", "VIM", "COL1A2", "COL3A1", "MMP2", "LYVE1",
]
eso.results["esmu"].loc[endo_marker_genes]

Unnamed: 0_level_0,Angiogenic/Vasa_vasorum_EC,EndoMT_EC,Inflammatory_EC,Intimal_EC,Lymphatic_EC
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PECAM1,0.164201,0.21134,0.0,0.144833,0.0
CDH5,0.0,0.136176,0.0,0.378366,0.014922
SELE,0.142154,0.0,0.949924,0.0,0.0
VWF,0.15714,0.262693,0.070452,0.0,0.0
VCAM1,0.0,0.0,0.747749,0.0,0.0
ICAM1,0.135053,0.0,0.841732,0.0,0.0
S100A4,0.0,0.656762,0.0,0.025763,0.0
FN1,0.0,0.726301,0.0,0.0,0.0
CRTAC1,0.0,0.62008,0.0,0.0,0.0
VIM,0.234221,0.249186,0.226962,0.240902,0.074725


In [25]:
# Save the expression specificity results while they are still indexed by gene
# symbol (the symbol -> Ensembl ID mapping only happens in a later cell).
eso.save_as_csv(file_prefix="rpca_endo_sct_v3_level2_annotations_expression_specificity_gene_symbols", 
                path="/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Endothelial/CELLEX_outputs/")

# Preview the table that was just written. This must be the final expression of
# the cell: a notebook only displays the last expression, so a `.head()` placed
# before the save call is evaluated and silently thrown away.
eso.results["esmu"].head()

In [26]:
# Convert the gene symbols of the ESmu results to human Ensembl gene IDs, which
# CELLECT requires. Genes without a mapping are dropped (drop_unmapped=True).
# NOTE(review): the Ensembl-ID save cell below reads eso.results["esmu"], not
# `ensembl_ids`, which suggests the mapping rewrites that table in place and
# that `ensembl_ids` may not hold a usable result - confirm against the CELLEX
# documentation.
ensembl_ids = cellex.utils.mapping.human_symbol_to_human_ens(
    eso.results["esmu"],
    drop_unmapped=True,
    verbose=True,
)

Mapping: human gene symbols --> human ensembl gene id's ...
3.79 pct of genes are unmapped ...
Removed 312 unmapped genes ...


In [1]:
# Save the expression specificity results with Ensembl IDs.
# This cell needs the `eso` object created and mapped by the earlier cells of
# the same kernel session; run the notebook top to bottom. The recorded
# NameError came from executing this cell on a freshly restarted kernel.
eso.save_as_csv(file_prefix="rpca_endo_sct_v3_level2_annotations_expression_specificity_Ensembl_IDs", 
                path="/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Endothelial/CELLEX_outputs/")

# Preview the saved table as the final expression so the notebook actually
# displays it (a `.head()` placed before the save call is never shown).
eso.results["esmu"].head()

NameError: name 'eso' is not defined