In [2]:
from scipy import sparse, io
import numpy as np
import pandas as pd
import cellex
import matplotlib

In [3]:
# Goal of this notebook: build the gene expression specificity matrix for the
# level 2 SMC annotations, which is required for the LDSC-SEG analysis.

# Load the SCT-normalized counts of the meta-analyzed athero scRNA data from a
# sparse matrix file. Reading the data this way takes only 4-5 min, as opposed
# to 1-2 hours with pd.read_csv.
sct_counts_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/LDSC_Peri_SMC_Fibro/rpca_smc_fibro_peri_sct_sparse_matrix.txt"
sparse_matrix = io.mmread(sct_counts_path)

# Expand to a dense array (the DataFrame below is built from it). Every zero
# entry is materialized here, so this step is memory-hungry.
m_dense = sparse_matrix.toarray()

In [4]:
# Load the row and column labels of the matrix from their text files,
# keeping every entry as a string.
rownames_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/LDSC_Peri_SMC_Fibro/rpca_smc_fibro_peri_sct_matrix_rownames.txt"
colnames_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/LDSC_Peri_SMC_Fibro/rpca_smc_fibro_peri_sct_matrix_colnames.txt"

row_names = np.genfromtxt(rownames_path, dtype=str)
col_names = np.genfromtxt(colnames_path, dtype=str)

In [5]:
# Wrap the dense matrix in a labelled DataFrame, the input format CELLEX uses
# (row_names become the index, col_names the column labels)
data = pd.DataFrame(data=m_dense, index=row_names, columns=col_names)
data.head()

Unnamed: 0,AAACCCACAAAGGATT_1,AAACCCAGTCACCACG_1,AAACCCAGTGTGTGGA_1,AAACCCATCTTGGTCC_1,AAACGAAAGATGTAGT_1,AAACGAACACGCTTAA_1,AAACGAAGTTCGAGCC_1,AAACGAATCTGCGGAC_1,AAAGGATCACCCTGAG_1,AAAGGATCAGACAAAT_1,...,TTTCCTCCAGGACCCT-1_15,TTTCCTCTCCGTAGTA-1_15,TTTGCGCAGTCATCCA-1_15,TTTGCGCGTTGCGCAC-1_15,TTTGCGCTCCTGCAGG-1_15,TTTGGTTAGGAGTACC-1_15,TTTGGTTCAGACGCTC-1_15,TTTGGTTGTAAAGGAG-1_15,TTTGGTTGTCGACTAT-1_15,TTTGTCAGTTAAGACA-1_15
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL669831.5,0.0,0.0,0.693147,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LINC00115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.0
FAM41C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL645608.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Read the per-cell annotation table (metadata) and preview its first rows
metadata_path = "/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/LDSC_Peri_SMC_Fibro/rpca_smc_fibro_prelim_annotations_metadata.csv"
metadata = pd.read_csv(metadata_path)
metadata.head()

Unnamed: 0.1,Unnamed: 0,cell_type
0,AAACCCACAAAGGATT_1,Transitional-ECM-SMC
1,AAACCCAGTCACCACG_1,Transitional-ECM-SMC
2,AAACCCAGTGTGTGGA_1,Transitional-ECM-SMC
3,AAACCCATCTTGGTCC_1,Unknown
4,AAACGAAAGATGTAGT_1,Pericyte2


In [7]:
# Count the number of cells assigned to each cell type
cells_per_type = metadata.groupby("cell_type")["cell_type"].count()
print(cells_per_type)

cell_type
Contractile_SMC          7141
Fibroblast               3368
Fibrochondrocyte         1097
Fibromyocyte             2975
Foam-like                1021
Pericyte1                2610
Pericyte2                3449
SMC2                     1205
Transitional-ECM-SMC    11061
Unknown                  3105
Name: cell_type, dtype: int64


In [8]:
# Move the first column ("Unnamed: 0", the cell barcodes) into the index.
# Per the original author's note, the metadata file should contain only the
# cell_type column, with the cell barcodes as row names, so the barcodes must
# not remain as a regular column.
#
# The guard makes this cell safe to re-run: once the column has been moved into
# the index, an unguarded second set_index("Unnamed: 0") raises KeyError.
if "Unnamed: 0" in metadata.columns:
    metadata = metadata.set_index("Unnamed: 0")
metadata.head()

Unnamed: 0_level_0,cell_type
Unnamed: 0,Unnamed: 1_level_1
AAACCCACAAAGGATT_1,Transitional-ECM-SMC
AAACCCAGTCACCACG_1,Transitional-ECM-SMC
AAACCCAGTGTGTGGA_1,Transitional-ECM-SMC
AAACCCATCTTGGTCC_1,Unknown
AAACGAAAGATGTAGT_1,Pericyte2


In [9]:
# Build the CELLEX ESObject from the expression matrix and the cell
# annotations, then run the Expression Specificity computation
eso = cellex.ESObject(annotation=metadata, data=data, verbose=True)
eso.compute(verbose=True)

Preprocessing - checking input ... input parsed in 0 min 0 sec
Preprocessing - running remove_non_expressed ... excluded 3836 / 23381 genes in 0 min 3 sec
Preprocessing - normalizing data ... data normalized in 0 min 19 sec
Preprocessing - running ANOVA ... excluded 6547 / 19545 genes in 0 min 24 sec
Computing DET ... 
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 4 sec
Computing EP ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing GES ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 9 sec
Computing NSI ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing ESmu ...
    finished in 0 min 0 sec
Computing ESsd ...
    finished in 0 min 0 sec
Computed ['det.esw', 'det.esw_null', 'det.pvals', 'det.esw_s', 'ep.esw', 'ep.esw_null', 'ep.pvals', 'ep.esw_s', 'ges.esw', 'ges.esw_null', 'ges.pvals', 'ges.esw_s', 'nsi.esw', 'nsi.esw_null', 'nsi.pvals', 'ns

In [35]:
# Check expression specificity (ESmu) results for a panel of marker genes
# Note: the unknown annotation was later changed to "SMC3"
#
# Only the last expression of a cell is rendered, so the former leading
# `.head()` call (which displayed nothing) was dropped. "TNFRSF11B" used to be
# listed twice, which made `.loc` return its row twice; it is listed once now.
marker_genes = [
    "MYH11", "CNN1", "ACTA2", "LMOD1", "MYOCD", "RGS5", "TNFRSF11B", "KRT17",
    "IGFBP2", "TNFAIP6", "VCAN", "CRTAC1", "FN1", "COMP", "MGP", "CYTL1",
    "CLU", "COL1A2", "COL1A1", "COL6A3", "SOX9", "RUNX2", "APOE", "APOC1",
    "FTL", "FBLN1", "APOD", "SERPINF1",
]
# .loc with a list raises KeyError if any listed gene is absent from the index
eso.results["esmu"].loc[marker_genes]

Unnamed: 0_level_0,Contractile_SMC,Fibroblast,Fibrochondrocyte,Fibromyocyte,Foam-like,Pericyte1,Pericyte2,SMC2,Transitional-ECM-SMC,Unknown
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MYH11,0.578203,0.0,0.0,0.228699,0.0,0.0,0.458285,0.452368,0.221283,0.238101
CNN1,0.842983,0.0,0.0,0.0,0.0,0.0,0.566888,0.034104,0.08444,0.17833
ACTA2,0.370171,0.0,0.0,0.237008,0.0,0.231151,0.27234,0.327368,0.226971,0.238462
LMOD1,0.613843,0.0,0.0,0.0989,0.0,0.0,0.336678,0.36584,0.111618,0.252894
MYOCD,0.894696,0.0,0.0,0.0,0.0,0.0,0.541672,0.367249,0.0,0.0
RGS5,0.57759,0.0,0.0,0.0,0.065642,0.237487,0.0,0.303551,0.20572,0.229808
TNFRSF11B,0.0,0.0,0.0,0.867405,0.0,0.0,0.0,0.0,0.901235,0.397815
KRT17,0.0,0.0,0.134249,0.912309,0.38412,0.0,0.0,0.019166,0.638465,0.108774
IGFBP2,0.194294,0.0,0.0,0.507328,0.170262,0.0,0.0,0.241122,0.611731,0.489019
TNFAIP6,0.0,0.535407,0.842139,0.548953,0.371849,0.0,0.0,0.0,0.234511,0.0


In [34]:
new_df = eso.results["esmu"][["Contractile_SMC", "Transitional-ECM-SMC", "Fibromyocyte", "Fibrochondrocyte"]]
new_df.loc[["MYH11", "CNN1", "ACTA2", "LMOD1", "MYOCD", "RGS5", "TNFRSF11B", "KRT17", "IGFBP2", "TNFAIP6",  "VCAN", "CRTAC1" ,"FN1", "TNFRSF11B", "COMP", 
                         "MGP", "CYTL1", "CLU", "COL1A2", "COL1A1", "COL6A3", "SOX9", "RUNX2", "APOE", "APOC1", "FTL", "FBLN1", "SERPINF1"]]

Unnamed: 0_level_0,Contractile_SMC,Transitional-ECM-SMC,Fibromyocyte,Fibrochondrocyte
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MYH11,0.578203,0.221283,0.228699,0.0
CNN1,0.842983,0.08444,0.0,0.0
ACTA2,0.370171,0.226971,0.237008,0.0
LMOD1,0.613843,0.111618,0.0989,0.0
MYOCD,0.894696,0.0,0.0,0.0
RGS5,0.57759,0.20572,0.0,0.0
TNFRSF11B,0.0,0.901235,0.867405,0.0
KRT17,0.0,0.638465,0.912309,0.134249
IGFBP2,0.194294,0.611731,0.507328,0.0
TNFAIP6,0.0,0.234511,0.548953,0.842139


In [36]:
# Save the results (still indexed by gene symbol), then preview them.
eso.save_as_csv(file_prefix="smc_peri_fibro_prelim_annotations_expression_specificity_gene_symbols", path="/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/LDSC_Peri_SMC_Fibro/CELLEX_files/")

# The preview has to be the cell's last expression to be rendered; placed
# before the save call, as it was originally, it displayed nothing.
eso.results["esmu"].head()

In [37]:
# Convert the gene symbols of the ESmu results to human Ensembl gene IDs
# (CELLECT requires Ensembl IDs). Genes without a mapping are dropped.
# NOTE(review): the following cells read eso.results["esmu"] rather than
# `ensembl_ids`, which suggests the mapping modifies the DataFrame in place and
# that `ensembl_ids` may simply be None - confirm against the CELLEX docs.
# If so, re-running this cell would attempt to map already-converted IDs.
ensembl_ids = cellex.utils.mapping.human_symbol_to_human_ens(
    eso.results["esmu"],
    drop_unmapped=True,
    verbose=True,
)

Mapping: human gene symbols --> human ensembl gene id's ...
4.87 pct of genes are unmapped ...
Removed 633 unmapped genes ...


In [39]:
# Save the expression specificity results under the Ensembl-ID file prefix,
# then preview them.
eso.save_as_csv(file_prefix="smc_peri_fibro_prelim_annotations_expression_specificity_Ensembl_IDs", path="/project/cphg-millerlab/Jose/human_scRNA_meta_analysis/rds_objects/integration_rds_objects/rPCA/alsaigh_pan_wirka_hu_int/CELLECT_inputs/LDSC_Peri_SMC_Fibro/CELLEX_files/")

# The preview has to be the cell's last expression to be rendered; placed
# before the save call, as it was originally, it displayed nothing.
eso.results["esmu"].head()