In [1]:
from scipy import sparse, io
import numpy as np
import pandas as pd
import cellex
import matplotlib

In [3]:
# Load the SCT-normalized counts matrix of the Myeloid cells from the meta-analyzed athero scRNA data.
# Reading the sparse file with io.mmread takes only 4-5 mins, as opposed to 1-2 hours with pd.read_csv.
myeloid_matrix_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Myeloid/input_matrix_files/rpca_myeloid_sct_sparse_matrix.txt"
mac_sparse_matrix = io.mmread(myeloid_matrix_path)
# Expand the sparse matrix into a dense array
mac_mat_dense = mac_sparse_matrix.toarray()

In [6]:
# The row labels (genes) and column labels (cell barcodes) of the matrix live in separate text files
myeloid_rownames_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Myeloid/input_matrix_files/rpca_myeloid_matrix_rownames.txt"
myeloid_colnames_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Myeloid/input_matrix_files/rpca_myeloid_matrix_colnames.txt"
# dtype=str keeps every entry as text
mac_row_names = np.genfromtxt(myeloid_rownames_path, dtype=str)
mac_col_names = np.genfromtxt(myeloid_colnames_path, dtype=str)

In [8]:
# Wrap the dense matrix in a labelled DataFrame (rows labelled by mac_row_names, columns by
# mac_col_names); this is the table passed to CELLEX as its input data
mac_data = pd.DataFrame(
    data=mac_mat_dense,
    index=mac_row_names,
    columns=mac_col_names,
)
mac_data.head()

Unnamed: 0,AAAGGATCACAAGTTC_1,AAAGTCCGTCTGTCCT_1,AAAGTGAAGTGCTCGC_1,AAATGGATCAAGGAGC_1,AACAAAGCACTTCAAG_1,AACCACATCGCATGAT_1,AACGTCATCTAGTACG_1,AAGAACATCCACAGCG_1,AAGCATCTCGAGAAAT_1,AAGCGAGTCATAAGGA_1,...,TTTCCTCCATCCTAGA-1_15,TTTCCTCCATGCGCAC-1_15,TTTCCTCGTTATTCTC-1_15,TTTGCGCTCGCTTGTC-1_15,TTTGGTTCACATCCAA-1_15,TTTGGTTGTGGCGAAT-1_15,TTTGGTTTCTAGCACA-1_15,TTTGTCAAGGAATGGA-1_15,TTTGTCAAGTGGGATC-1_15,TTTGTCATCGGCATCG-1_15
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL669831.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LINC00115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM41C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL645608.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Read the cell-type annotation table of the Myeloid cells
myeloid_metadata_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Myeloid/input_matrix_files/rpca_myeloid_level2_annotations_metadata.csv"
mac_metadata = pd.read_csv(myeloid_metadata_path)
mac_metadata.head()

Unnamed: 0.1,Unnamed: 0,cell_type
0,AAAGGATCACAAGTTC_1,Monocytes/DC
1,AAAGTCCGTCTGTCCT_1,Monocytes/DC
2,AAAGTGAAGTGCTCGC_1,Foamy_Mac2
3,AAATGGATCAAGGAGC_1,Monocytes/DC
4,AACAAAGCACTTCAAG_1,Monocytes/DC


In [10]:
# Tally the number of cells assigned to each cell type of the Myeloid compartment
cells_per_type = mac_metadata.groupby("cell_type")["cell_type"].count()
print(cells_per_type)

cell_type
Foamy_Mac1               1857
Foamy_Mac2               2052
Inflammatory_Mac         8222
Mast_cell                1228
Monocytes                2440
Monocytes/DC             3048
NAMPT_Neutrophils         139
Phagocytosis_Mac         1685
Proliferating_myeloid      74
Tissue_resident_Mac      1392
cDC                      2005
Name: cell_type, dtype: int64


In [11]:
# Use the cell barcodes (read in as the "Unnamed: 0" column) as the row index, so that the
# annotation table is left with only the cell_type column and the barcodes as row names.
# The membership check keeps this cell re-runnable: after the first run "Unnamed: 0" is the
# index rather than a column, and an unconditional set_index would raise a KeyError.
if "Unnamed: 0" in mac_metadata.columns:
    mac_metadata = mac_metadata.set_index("Unnamed: 0")
elif mac_metadata.index.name != "Unnamed: 0":
    # Neither a column nor the current index: the table does not have the expected layout
    raise KeyError("'Unnamed: 0' is neither a column nor the index of mac_metadata")
mac_metadata.head()

Unnamed: 0_level_0,cell_type
Unnamed: 0,Unnamed: 1_level_1
AAAGGATCACAAGTTC_1,Monocytes/DC
AAAGTCCGTCTGTCCT_1,Monocytes/DC
AAAGTGAAGTGCTCGC_1,Foamy_Mac2
AAATGGATCAAGGAGC_1,Monocytes/DC
AACAAAGCACTTCAAG_1,Monocytes/DC


In [24]:
# Build the CELLEX ESObject from the expression table and the cell-type annotation,
# then compute Expression Specificity for the cells of the Myeloid compartment
eso = cellex.ESObject(
    data=mac_data,
    annotation=mac_metadata,
    verbose=True,
)
eso.compute(verbose=True)

Preprocessing - checking input ... input parsed in 0 min 0 sec
Preprocessing - running remove_non_expressed ... excluded 4393 / 23381 genes in 0 min 1 sec
Preprocessing - normalizing data ... data normalized in 0 min 10 sec
Preprocessing - running ANOVA ... excluded 6136 / 18988 genes in 0 min 10 sec
Computing DET ... 
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 2 sec
Computing EP ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing GES ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 4 sec
Computing NSI ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing ESmu ...
    finished in 0 min 0 sec
Computing ESsd ...
    finished in 0 min 0 sec
Computed ['det.esw', 'det.esw_null', 'det.pvals', 'det.esw_s', 'ep.esw', 'ep.esw_null', 'ep.pvals', 'ep.esw_s', 'ges.esw', 'ges.esw_null', 'ges.pvals', 'ges.esw_s', 'nsi.esw', 'nsi.esw_null', 'nsi.pvals', 'ns

In [26]:
# Check the expression specificity (ESmu) results for some known Myeloid markers
myeloid_markers = [
    "IL1B", "CXCL2", "TNF", "NFKB1", "APOE", "APOC1", "ABCA1", "CD68", "CD163",
    "VCAN", "CCL2", "TREM2", "FABP5", "LYVE1", "S100A8", "S100A9", "LYZ", "NAMPT",
]
# Only the last expression of a cell is displayed, so the marker table is the sole statement here.
# NOTE(review): in current pandas, .loc with a list raises a KeyError if any listed gene is
# missing from the results index (e.g. filtered out during preprocessing) - confirm all are present.
eso.results["esmu"].loc[myeloid_markers]

Unnamed: 0_level_0,Foamy_Mac1,Foamy_Mac2,Inflammatory_Mac,Mast_cell,Monocytes,Monocytes/DC,NAMPT_Neutrophils,Phagocytosis_Mac,Proliferating_myeloid,Tissue_resident_Mac,cDC
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
IL1B,0.0,0.0,0.7518,0.0,0.248008,0.0,0.0,0.0,0.19341,0.0,0.327159
CXCL2,0.0,0.0,0.74634,0.0,0.039238,0.0,0.0,0.0,0.102913,0.0,0.0
TNF,0.0,0.0,0.835464,0.0,0.0,0.0,0.0,0.315199,0.289063,0.0,0.0
NFKB1,0.0,0.0,0.837271,0.0,0.0,0.0,0.0,0.0,0.183739,0.0,0.020258
APOE,0.587757,0.931774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
APOC1,0.754265,0.934518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ABCA1,0.413746,0.221003,0.466537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CD68,0.273654,0.312204,0.20383,0.0,0.114608,0.230914,0.0,0.226022,0.0,0.229146,0.0
CD163,0.011596,0.12167,0.271014,0.0,0.0,0.22416,0.0,0.227895,0.0,0.438736,0.0
VCAN,0.89898,0.0,0.0,0.0,0.893531,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Save the expression specificity results while they are still indexed by gene symbol
# (the mapping to Ensembl IDs has not been applied at this point)
eso.save_as_csv(
    file_prefix="rpca_myeloid_sct_v3_level2_annotations_expression_specificity_gene_symbols",
    path="/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Myeloid/CELLEX_outputs/",
)

In [28]:
# Convert the gene symbols indexing the ESmu results to human Ensembl gene IDs, dropping the
# genes that cannot be mapped. Ensembl IDs are required for CELLECT.
# NOTE(review): the later save with the "Ensembl_IDs" prefix relies on eso.results["esmu"] being
# re-indexed in place by this call; the value bound to ensembl_ids is then presumably None -
# confirm against the CELLEX documentation.
ensembl_ids = cellex.utils.mapping.human_symbol_to_human_ens(
    eso.results["esmu"],
    drop_unmapped=True,
    verbose=True,
)

Mapping: human gene symbols --> human ensembl gene id's ...
3.78 pct of genes are unmapped ...
Removed 486 unmapped genes ...


In [29]:
# Save the expression specificity results with Ensembl IDs
eso.save_as_csv(
    file_prefix="rpca_myeloid_sct_v3_level2_annotations_expression_specificity_Ensembl_IDs",
    path="/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Myeloid/CELLEX_outputs/",
)
# Show the first rows to check the index of the saved table. This has to be the last expression
# of the cell: a bare expression placed before another statement is evaluated but never displayed.
eso.results["esmu"].head()