In [1]:
from scipy import sparse, io
import numpy as np
import pandas as pd
import cellex
import matplotlib

In [2]:
# Load the SCT-normalized counts for lesion cells (atherosclerosis scRNA meta-analysis)
# from a Matrix Market file. Author's note: reading the sparse file takes about 4-5 min,
# as opposed to 1-2 hours when the same data is loaded with pd.read_csv.
lesion_sparse_matrix = io.mmread(
    "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Lesion_status/lesion/input_matrix_files/rpca_lesion_sct_sparse_matrix.txt"
)

# Expand to a dense ndarray; this materializes every zero entry in memory.
lesion_mat_dense = lesion_sparse_matrix.toarray()

In [3]:
# Load the matrix row labels and column labels from their companion text files.
# dtype=str keeps every entry as a string.
lesion_row_names = np.genfromtxt(
    "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Lesion_status/lesion/input_matrix_files/rpca_lesion_matrix_rownames.txt",
    dtype=str,
)
lesion_col_names = np.genfromtxt(
    "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Lesion_status/lesion/input_matrix_files/rpca_lesion_matrix_colnames.txt",
    dtype=str,
)

In [5]:
# Wrap the dense matrix in a labelled DataFrame (rows = row names, columns = column names);
# this is the expression input handed to CELLEX further down.
lesion_data = pd.DataFrame(
    data=lesion_mat_dense,
    index=lesion_row_names,
    columns=lesion_col_names,
)
lesion_data.head(5)

Unnamed: 0,AAACCCACAAAGGATT_1,AAACCCAGTCACCACG_1,AAACCCAGTGTGTGGA_1,AAACCCATCTTGGTCC_1,AAACGAAAGATGTAGT_1,AAACGAACACGCTTAA_1,AAACGAAGTTCGAGCC_1,AAACGAATCTGCGGAC_1,AAACGCTAGCGAAACC_1,AAAGGATCAAGGGTCA_1,...,TTTGCGCAGAACTCGG.8_10,TTTGCGCCACCCAGTG.8_10,TTTGCGCGTACAAGTA.8_10,TTTGCGCGTAGGCATG.8_10,TTTGGTTAGCTAGGCA.8_10,TTTGGTTCAATTGCTG.8_10,TTTGGTTTCTCTGTCG.8_10,TTTGTCACAAAGTGCG.8_10,TTTGTCACAGGACCCT.8_10,TTTGTCAGTCACCTAA.8_10
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL669831.5,0.0,0.0,0.693147,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LINC00115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM41C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL645608.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Read the level-1 cell type annotations for the lesion cells.
# The barcodes arrive in a column that pandas names "Unnamed: 0"; a default integer index is used.
lesion_metadata = pd.read_csv(
    "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Lesion_status/lesion/input_matrix_files/rpca_lesion_level1_annotations_metadata.csv"
)
lesion_metadata.head(5)

Unnamed: 0.1,Unnamed: 0,cell_type
0,AAACCCACAAAGGATT_1,SMC
1,AAACCCAGTCACCACG_1,SMC
2,AAACCCAGTGTGTGGA_1,SMC
3,AAACCCATCTTGGTCC_1,SMC
4,AAACGAAAGATGTAGT_1,SMC


In [7]:
# Number of lesion cells annotated with each cell type.
print(lesion_metadata.groupby("cell_type")["cell_type"].count())

cell_type
B_cell          1919
Endothelial     8342
Fibroblast      3563
Macrophage     12540
Mast_cell        780
Neuron           267
Pericyte        2588
Plasma_cell      420
SMC            11781
T_NK           17342
pDC              149
Name: cell_type, dtype: int64


In [9]:
# Move the cell barcodes (the "Unnamed: 0" column) into the index so the annotation table
# contains only the cell_type column, with barcodes as row names. Otherwise the barcode
# column would be passed along as if it were part of the annotation.
#
# The guard keeps this cell safe to re-run: once the column has been moved into the index,
# calling set_index("Unnamed: 0") a second time would raise KeyError.
if "Unnamed: 0" in lesion_metadata.columns:
    lesion_metadata = lesion_metadata.set_index("Unnamed: 0")
lesion_metadata.head()

Unnamed: 0_level_0,cell_type
Unnamed: 0,Unnamed: 1_level_1
AAACCCACAAAGGATT_1,SMC
AAACCCAGTCACCACG_1,SMC
AAACCCAGTGTGTGGA_1,SMC
AAACCCATCTTGGTCC_1,SMC
AAACGAAAGATGTAGT_1,SMC


In [10]:
# Build the CELLEX ESObject from the lesion expression table and the cell type annotation,
# then run the Expression Specificity computation. verbose=True prints progress for each step.
eso = cellex.ESObject(
    data=lesion_data,
    annotation=lesion_metadata,
    verbose=True,
)
eso.compute(verbose=True)

Preprocessing - checking input ... input parsed in 0 min 0 sec
Preprocessing - running remove_non_expressed ... excluded 4024 / 23381 genes in 0 min 4 sec
Preprocessing - normalizing data ... data normalized in 0 min 25 sec
Preprocessing - running ANOVA ... excluded 1702 / 19357 genes in 0 min 20 sec
Computing DET ... 
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 8 sec
Computing EP ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing GES ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 12 sec
Computing NSI ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing ESmu ...
    finished in 0 min 0 sec
Computing ESsd ...
    finished in 0 min 0 sec
Computed ['det.esw', 'det.esw_null', 'det.pvals', 'det.esw_s', 'ep.esw', 'ep.esw_null', 'ep.pvals', 'ep.esw_s', 'ges.esw', 'ges.esw_null', 'ges.pvals', 'ges.esw_s', 'nsi.esw', 'nsi.esw_null', 'nsi.pvals', 'n

In [11]:
# Sanity-check the ESmu results on a panel of known marker genes.
marker_genes = ["MYH11", "CNN1", "ACTA2", "TNFRSF11B", "KRT7", "IGFBP2", "TNFAIP6", "VCAN", "CRTAC1", "FN1", "AEBP1", "TNFRSF11B", "CD14", "FBLN1", "PECAM1", "CDH5", "NKG7", "CD8A", "C7", "C3",
                "IGHM", "JCHAIN", "MZB1", "IGLC3", "IL1B", "IBSP", "PECAM1", "LMOD1", "COL4A1", "COL4A2", "COL6A3", "TCF21"]

# The panel lists some genes twice; dict.fromkeys drops the repeats while keeping
# first-seen order, so each gene yields exactly one row.
marker_genes = list(dict.fromkeys(marker_genes))

esmu_lesion = eso.results["esmu"]

# Genes excluded during CELLEX preprocessing are absent from the index, and .loc with a
# list of labels raises KeyError if any label is missing. Report them instead of failing.
missing_markers = [gene for gene in marker_genes if gene not in esmu_lesion.index]
if missing_markers:
    print(f"Marker genes not present in ESmu results: {missing_markers}")

# Last expression of the cell, so the notebook renders it.
esmu_lesion.loc[[gene for gene in marker_genes if gene in esmu_lesion.index]]

Unnamed: 0_level_0,B_cell,Endothelial,Fibroblast,Macrophage,Mast_cell,Neuron,Pericyte,Plasma_cell,SMC,T_NK,pDC
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
MYH11,0.0,0.0,0.0,0.0,0.0,0.0,0.692349,0.0,0.835107,0.0,0.0
CNN1,0.0,0.0,0.0,0.0,0.0,0.0,0.915337,0.0,0.893368,0.0,0.0
ACTA2,0.0,0.0,0.0,0.0,0.0,0.0,0.754401,0.0,0.718195,0.0,0.0
TNFRSF11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.893487,0.0,0.0
KRT7,0.0,0.351597,0.0,0.0,0.0,0.0,0.0,0.0,0.909178,0.0,0.0
IGFBP2,0.0,0.500338,0.0,0.0,0.0,0.0,0.400981,0.0,0.863666,0.0,0.0
TNFAIP6,0.0,0.0,0.837076,0.0,0.0,0.312199,0.0,0.0,0.429746,0.0,0.0
VCAN,0.0,0.0,0.532348,0.400927,0.0,0.0,0.0,0.0,0.811837,0.0,0.0
CRTAC1,0.0,0.828373,0.0,0.0,0.0,0.0,0.0,0.0,0.404776,0.0,0.0
FN1,0.0,0.016245,0.499068,0.0,0.0,0.0,0.240548,0.0,0.739275,0.0,0.0


In [12]:
# Save the expression specificity results under the gene-symbol file prefix, then preview ESmu.
# The preview is placed last because a notebook only renders the final expression of a cell;
# when .head() came before save_as_csv its value was silently discarded.
eso.save_as_csv(file_prefix="rpca_lesion_sct_v3_level1_annotations_expression_specificity_gene_symbols", 
                path="/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Lesion_status/lesion/CELLEX_outputs/")
eso.results["esmu"].head()

In [13]:
# Translate the gene symbols in the ESmu table to human Ensembl gene IDs, which CELLECT requires.
# drop_unmapped=True discards genes that have no Ensembl match.
# NOTE(review): the helper appears to relabel the passed DataFrame in place (the later save
# cell reads eso.results["esmu"] again) - confirm; if so, `ensembl_ids` may simply be None.
ensembl_ids = cellex.utils.mapping.human_symbol_to_human_ens(
    eso.results["esmu"],
    drop_unmapped=True,
    verbose=True,
)

Mapping: human gene symbols --> human ensembl gene id's ...
6.85 pct of genes are unmapped ...
Removed 1209 unmapped genes ...


In [15]:
# Save the results under the Ensembl-ID file prefix, then preview ESmu to check the conversion.
# The preview is placed last because a notebook only renders the final expression of a cell;
# when .head() came before save_as_csv its value was silently discarded.
# NOTE(review): this assumes the symbol -> Ensembl mapping step updated eso.results["esmu"]
# in place - confirm the index shows Ensembl IDs in the preview below.
eso.save_as_csv(file_prefix="rpca_lesion_sct_v3_level1_annotations_expression_specificity_Ensembl_IDs", 
                path="/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/Lesion_status/lesion/CELLEX_outputs/")
eso.results["esmu"].head()