### Import libraries

In [1]:
library(CMSclassifier)
library(CMScaller)
library(limma)
library(hdf5r)

Loading required package: randomForest

randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.



### Define a function to read in .h5ad data

In [6]:
# Read the expression matrix from an .h5ad file (the AnnData format written
# by scanpy) using hdf5r.
#
# Args:
#   h5ad_in: Path to the .h5ad file.
#
# Returns:
#   The contents of the "X" dataset, with row names taken from var/_index
#   (genes) and column names taken from obs/_index (cells).
prepare_data <- function(h5ad_in) {
    # Open read-only: nothing is written here, and hdf5r's default mode ("a")
    # opens the file for read/write and creates it if the path does not exist.
    h5 <- h5file(h5ad_in, mode = "r")
    # Release the file handle and everything opened from it, even on error.
    on.exit(h5$close_all(), add = TRUE)

    # NOTE(review): the `[, ]` read assumes X is stored as a dense 2-D
    # dataset -- confirm for files where scanpy saved X as a sparse group.
    data_x <- h5[["X"]][, ]

    # Depending on how the file was written, these labels are sometimes stored
    # under "Cells" / "Genes" instead of "_index"; replace the key as needed.
    data_obs <- h5[["obs"]][["_index"]][]
    data_var <- h5[["var"]][["_index"]][]

    rownames(data_x) <- data_var
    colnames(data_x) <- data_obs
    data_x
}

### Read in data and replace gene symbols with Entrez IDs

In [10]:
# Load the expression matrix and translate its row names from gene symbols
# to Entrez IDs.
data <- replaceGeneId(
    prepare_data("../Broad_Data/colon10x_Epi_Tumor_refiltered_no_normal.h5ad"),
    id.in = "symbol",
    id.out = "entrez"
)

18259/43078 rownames [NA.number] (no valid translation)

0/43078 rownames [id.number] (translation gives duplicates)



### Prepare data and run CMS classification using single-sample predictor

In [13]:
# Keep only the rows whose Entrez ID is one of the genes used by the
# single-sample predictor (SSP) model.
subset_data <- data[rownames(data) %in% listModelGenes("SSP"), , drop = FALSE]

In [15]:
# Normalize values according to the CMSclassifier and CMScaller protocol:
# log2-transform with a 0.25 offset, then quantile-normalize.
subset_data_normalized <- local({
    log_expression <- log2(subset_data + 0.25)
    limma::normalizeQuantiles(log_expression)
})

In [17]:
# Run the single-sample predictor and keep the third element of the result,
# which holds the per-sample SSP scores.
SScms <- local({
    ssp_result <- CMSclassifier::classifyCMS(
        subset_data_normalized,
        method = "SSP"
    )
    ssp_result[[3]]
})

In [20]:
# Preview the first six rows of the SSP score table.
head(SScms, n = 6L)

Unnamed: 0_level_0,SSP.min.corToCMS1,SSP.min.corToCMS2,SSP.min.corToCMS3,SSP.min.corToCMS4,SSP.median.corToCMS1,SSP.median.corToCMS2,SSP.median.corToCMS3,SSP.median.corToCMS4,SSP.max.corToCMS1,SSP.max.corToCMS2,SSP.max.corToCMS3,SSP.max.corToCMS4,SSP.nearestCMS,SSP.predictedCMS
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
GATGTAGTTTAAACGCCT-3,0.008044251,-0.11792792,0.3153605,-0.2583123,0.07656679,-0.112010782,0.3925024,-0.206413,0.11566034,-0.0572519,0.4414273,-0.1710869,CMS3,CMS3
AACCTGACTTGCATAT-3,-0.070948107,0.01869697,0.1926658,-0.2172093,-0.03659633,0.03639819,0.2569164,-0.1986609,0.01296089,0.06887815,0.2635608,-0.12127642,CMS3,CMS3
CCAACCGTAAACTCGA-3,0.019962228,-0.07161556,0.2041007,-0.2093783,0.07103005,-0.066670551,0.2583983,-0.1668311,0.11157239,-0.03878854,0.3144507,-0.14138274,CMS3,CMS3
TGAACTAGCCAGTCCAAAG-3,0.022274911,-0.09121392,0.2063515,-0.1911201,0.07326977,-0.080381143,0.2554334,-0.1314142,0.12295705,-0.05250844,0.2650384,-0.08964501,CMS3,CMS3
TGAGGGAAATCATTGGGCC-3,-0.063886517,-0.01213462,0.2251656,-0.2164686,-0.01340275,-0.004580135,0.2695236,-0.175315,0.03763653,0.03211592,0.2920198,-0.11777471,CMS3,CMS3
TCCAGGGAACTAGCCA-3,0.003532066,-0.09111382,0.2516898,-0.2183364,0.05583372,-0.082175614,0.3054318,-0.1554695,0.11482162,-0.05067024,0.3282224,-0.09646975,CMS3,CMS3


In [19]:
# Save the SSP scores to CSV; the median correlation columns are typically
# the ones used.
write.csv(x = SScms, file = "SScms.csv")