# Table of predictor frequency in each cluster

## AML HM 450 predictor frequencies

File location: `/oak/stanford/groups/andrewg/users/szmamie/repos/MethylationPrediction/data/results/LAML/vbsr-hm27-M-data/`

In [1]:
library(stringr)
source('helperFunctions.R')


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: sp
Loading required package: maps
Loading required package: shapefiles
Loading required package: foreign

Attaching package: ‘shapefiles’

The following objects are masked from ‘package:foreign’:

    read.dbf, write.dbf



In [2]:
base.dir <- "../data/results/LAML/vbsr-hm27-M-data/"
load(paste0(base.dir, 'cluster1.RData'))

The predictors are in coef data frame, aggregate all the predictor in a single dataframe and save it in the data/results folder

In [3]:
coef
class(coef)

gene,coef
BBS7,-0.3374807
IRAK3,-0.1493213
KLF11,-0.1770719
MGC12982,0.2388038
NRSN2,-0.1253448
SLC38A2,0.4207201


In [4]:
str_match('cluster1.RData', 'cluster([0-9]*)\\.RData')[2]

In [17]:
predictors <- data.frame(gene=character(), coef=numeric(), ID=character(), stringsAsFactors=FALSE)
for (f in list.files(base.dir, 'cluster[0-9]*\\.RData')) {
    load(paste0(base.dir, f))
    idx <- str_match(f, 'cluster([0-9]*)\\.RData')[2]
    coef$cluster <- idx
    predictors <- rbind(predictors, coef)
}

In [18]:
dim(predictors)
length(unique(predictors$gene))
head(predictors)

gene,coef,cluster
BBS7,-0.3374807,1
IRAK3,-0.1493213,1
KLF11,-0.1770719,1
MGC12982,0.2388038,1
NRSN2,-0.1253448,1
SLC38A2,0.4207201,1


In [19]:
freq.table <- data.frame(sort(table(predictors$gene), decreasing=T))
annot.freq.table <- MergeHGNCDescription(predictors)

“Column `gene`/`hgnc_symbol` joining factor and character vector, coercing into character vector”

In [20]:
head(annot.freq.table)

gene,coef,cluster,description
BBS7,-0.3374807,1,Bardet-Biedl syndrome 7 [Source:HGNC Symbol;Acc:HGNC:18758]
IRAK3,-0.1493213,1,interleukin 1 receptor associated kinase 3 [Source:HGNC Symbol;Acc:HGNC:17020]
KLF11,-0.1770719,1,Kruppel like factor 11 [Source:HGNC Symbol;Acc:HGNC:11811]
MGC12982,0.2388038,1,
NRSN2,-0.1253448,1,neurensin 2 [Source:HGNC Symbol;Acc:HGNC:16229]
SLC38A2,0.4207201,1,solute carrier family 38 member 2 [Source:HGNC Symbol;Acc:HGNC:13448]


In [21]:
head(freq.table)

Var1,Freq
TNFSF12,5
C7orf46,5
SH3PXD2A,4
IRAK3,3
ZNF518B,3
ADPRH,3


In [27]:
annot.freq.table <- annot.freq.table %>%
    dplyr::select(gene, description) %>%
    dplyr::distinct() %>%
    dplyr::left_join(freq.table, by=c('gene'='Var1')) %>%
    arrange(desc(Freq))
annot.freq.table

“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

gene,description,Freq
TNFSF12,TNF superfamily member 12 [Source:HGNC Symbol;Acc:HGNC:11927],5
C7orf46,,5
SH3PXD2A,SH3 and PX domains 2A [Source:HGNC Symbol;Acc:HGNC:23664],4
IRAK3,interleukin 1 receptor associated kinase 3 [Source:HGNC Symbol;Acc:HGNC:17020],3
ZNF518B,zinc finger protein 518B [Source:HGNC Symbol;Acc:HGNC:29365],3
ADPRH,ADP-ribosylarginine hydrolase [Source:HGNC Symbol;Acc:HGNC:269],3
KLF11,Kruppel like factor 11 [Source:HGNC Symbol;Acc:HGNC:11811],2
MGC12982,,2
MANEAL,mannosidase endo-alpha like [Source:HGNC Symbol;Acc:HGNC:26452],2
RPP25,ribonuclease P and MRP subunit p25 [Source:HGNC Symbol;Acc:HGNC:30361],2


In [29]:
write.table(annot.freq.table, paste0(base.dir, 'predictors.summary.csv'), quote=F, sep=',', row.names=F, col.names=T)

# now construct it as a function and run for all folders

In [4]:
WriteFreqTable <- function(base.dirs) {
    for (base.dir in base.dirs) {
        print(base.dir)
        predictors <- data.frame(gene=character(), coef=numeric(), ID=character(), stringsAsFactors=FALSE)
        for (f in list.files(base.dir, 'cluster[0-9]*\\.RData')) {
            # print(paste0('Loading ', base.dir, f))
            load(paste0(base.dir, f))
            if(dim(coef)[1]==0) next 
            idx <- str_match(f, 'cluster([0-9]*)\\.RData')[2]
            coef$cluster <- idx
            predictors <- rbind(predictors, coef)
        }
        print(paste0('There are in total ', dim(predictors)[1], ' predictors from ', length(unique(predictors$cluster)), 
                     ' clusters. The number of unqiue genes are ', length(unique(predictors$gene)), '.'))
        freq.table <- data.frame(sort(table(predictors$gene), decreasing=T))
        annot.freq.table <- MergeHGNCDescription(predictors) 
        annot.freq.table <- annot.freq.table %>%
            dplyr::select(gene, description) %>%
            dplyr::distinct() %>%
            dplyr::left_join(freq.table, by=c('gene'='Var1')) %>%
            arrange(desc(Freq))
        write.table(annot.freq.table, paste0(base.dir, 'predictors.summary.csv'), quote=F, sep='\t', row.names=F, col.names=T)
        print(paste0("wrote results to ", base.dir, 'predictors.summary.csv.'))
    }
}

In [5]:
base.dirs <- paste0('../data/results/', c('LAML/vbsr-hm27-B-data/',  'LAML/vbsr-hm27-M-data/',  
                                          'LAML/vbsr-hm450-B-data/', 'LAML/vbsr-hm450-M-data/',
                                         'HNSC/vbsr-hm450-B-data/', 'HNSC/vbsr-hm450-M-data/',
                                         'SKCM/vbsr-hm450-B-data/', 'SKCM/vbsr-hm450-M-data/'))
WriteFreqTable(base.dirs)

[1] "../data/results/LAML/vbsr-hm27-B-data/"
[1] "There are in total 81 predictors from 18 clusters. The number of unqiue genes are 57."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/LAML/vbsr-hm27-B-data/predictors.summary.csv."
[1] "../data/results/LAML/vbsr-hm27-M-data/"
[1] "There are in total 75 predictors from 18 clusters. The number of unqiue genes are 53."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/LAML/vbsr-hm27-M-data/predictors.summary.csv."
[1] "../data/results/LAML/vbsr-hm450-B-data/"
[1] "There are in total 85 predictors from 20 clusters. The number of unqiue genes are 57."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/LAML/vbsr-hm450-B-data/predictors.summary.csv."
[1] "../data/results/LAML/vbsr-hm450-M-data/"
[1] "There are in total 92 predictors from 20 clusters. The number of unqiue genes are 56."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/LAML/vbsr-hm450-M-data/predictors.summary.csv."
[1] "../data/results/HNSC/vbsr-hm450-B-data/"
[1] "There are in total 234 predictors from 20 clusters. The number of unqiue genes are 158."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/HNSC/vbsr-hm450-B-data/predictors.summary.csv."
[1] "../data/results/HNSC/vbsr-hm450-M-data/"
[1] "There are in total 196 predictors from 20 clusters. The number of unqiue genes are 134."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/HNSC/vbsr-hm450-M-data/predictors.summary.csv."
[1] "../data/results/SKCM/vbsr-hm450-B-data/"
[1] "There are in total 183 predictors from 20 clusters. The number of unqiue genes are 110."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/SKCM/vbsr-hm450-B-data/predictors.summary.csv."
[1] "../data/results/SKCM/vbsr-hm450-M-data/"
[1] "There are in total 159 predictors from 18 clusters. The number of unqiue genes are 105."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/SKCM/vbsr-hm450-M-data/predictors.summary.csv."


## Ward.D2 with SE > 0.5 filter on HM27 LAML Figueroa

In [2]:
predictors <- read.csv('../data/LAML//processed/predictors.tsv', sep='\t', stringsAsFactor=F, header=F)

In [7]:
# get GO annotation
res <- queryMany(as.character(predictors), 
                 scopes='symbol', fields=c('go'), species='human',
                 returnall=T)

Finished


In [8]:
terms <- c()
for (i in seq(length(res$response$query))) {
  if (is.na(res$response$notfound[i])) {
    query <- res$response$query[i]
    termConcat <- paste(as.character(res$response$go.BP[[i]]$term), collapse=', ')
    terms <- rbind(terms, c(query, termConcat))
  }
}

In [10]:
terms

0,1
FOXD2,"regulation of transcription by RNA polymerase II, transcription by RNA polymerase II, anatomical structure morphogenesis, cell differentiation, positive regulation of transcription by RNA polymerase II"
KLF11,"regulation of transcription involved in G1/S transition of mitotic cell cycle, negative regulation of transcription by RNA polymerase II, regulation of transcription by RNA polymerase II, transcription by RNA polymerase II, apoptotic process, negative regulation of cell proliferation, positive regulation of apoptotic process, cellular response to peptide"
PPARA,"negative regulation of transcription by RNA polymerase II, negative regulation of transcription by RNA polymerase II, response to hypoxia, transcription initiation from RNA polymerase II promoter, lipid metabolic process, fatty acid metabolic process, fatty acid metabolic process, multicellular organism development, heart development, epidermis development, hormone-mediated signaling pathway, negative regulation of macrophage derived foam cell differentiation, negative regulation of macrophage derived foam cell differentiation, negative regulation of receptor biosynthetic process, negative regulation of cholesterol storage, negative regulation of cholesterol storage, negative regulation of sequestering of triglyceride, fatty acid transport, regulation of lipid metabolic process, regulation of lipid metabolic process, cell differentiation, intracellular receptor signaling pathway, response to nutrient levels, positive regulation of fatty acid beta-oxidation, negative regulation of protein binding, negative regulation of appetite, response to insulin, circadian regulation of gene expression, response to lipid, behavioral response to nicotine, wound healing, lipoprotein metabolic process, regulation of circadian rhythm, steroid hormone mediated signaling pathway, positive regulation of gluconeogenesis, negative regulation of blood pressure, negative regulation of glycolytic process, negative regulation of transcription, DNA-templated, positive regulation of transcription, DNA-templated, positive regulation of fatty acid metabolic process, positive regulation of transcription by RNA polymerase II, positive regulation of transcription by RNA polymerase II, positive regulation of fatty acid oxidation, negative regulation of inflammatory response, negative regulation of inflammatory response, enamel mineralization, cellular response to lipid, regulation of glycolytic process by positive regulation of transcription from RNA polymerase II promoter, regulation of cellular ketone metabolic process by positive regulation of transcription from RNA polymerase II promoter, regulation of lipid transport by positive regulation of transcription from RNA polymerase II promoter, negative regulation of neuron death, negative regulation of pri-miRNA transcription by RNA polymerase II, negative regulation of leukocyte cell-cell adhesion, negative regulation of transcription regulatory region DNA binding"
RB1,"G1/S transition of mitotic cell cycle, negative regulation of transcription by RNA polymerase II, regulation of cell growth, tissue homeostasis, chromatin remodeling, transcription, DNA-templated, regulation of transcription, DNA-templated, negative regulation of protein kinase activity, cell cycle arrest, mitotic cell cycle checkpoint, Ras protein signal transduction, regulation of mitotic cell cycle, regulation of mitotic cell cycle, negative regulation of gene expression, viral process, cell differentiation, neuron differentiation, androgen receptor signaling pathway, sister chromatid biorientation, neuron projection development, neuron projection development, maintenance of mitotic sister chromatid cohesion, glial cell apoptotic process, skeletal muscle cell differentiation, neuron maturation, enucleate erythrocyte differentiation, negative regulation of DNA binding transcription factor activity, regulation of lipid kinase activity, myoblast differentiation, positive regulation of macrophage differentiation, positive regulation of mitotic metaphase/anaphase transition, negative regulation of smoothened signaling pathway, negative regulation of transcription, DNA-templated, negative regulation of transcription, DNA-templated, positive regulation of transcription, DNA-templated, positive regulation of transcription by RNA polymerase II, digestive tract development, cell morphogenesis involved in neuron differentiation, cell morphogenesis involved in neuron differentiation, negative regulation of epithelial cell proliferation, striated muscle cell differentiation, cell division, neuron apoptotic process, protein localization to chromosome, centromeric region, cellular response to xenobiotic stimulus, regulation of cohesin loading, negative regulation of transcription involved in G1/S transition of mitotic cell cycle, regulation of centromere complex assembly, hepatocyte apoptotic process, negative regulation of G1/S transition of mitotic cell cycle, negative regulation of G1/S transition of mitotic cell cycle, positive regulation of transcription regulatory region DNA binding"
SMARCA1,"DNA strand renaturation, chromatin remodeling, transcription, DNA-templated, regulation of transcription by RNA polymerase II, brain development, neuron differentiation, ATP-dependent chromatin remodeling, positive regulation of transcription, DNA-templated"
ZBTB38,"regulation of DNA replication, transcription, DNA-templated, regulation of transcription, DNA-templated, cellular response to DNA damage stimulus, negative regulation of transcription, DNA-templated, positive regulation of transcription by RNA polymerase II"
ZC3H7A,"posttranscriptional regulation of gene expression, production of miRNAs involved in gene silencing by miRNA"
ZNF14,"transcription, DNA-templated, regulation of transcription by RNA polymerase II"
ZNF502,"transcription, DNA-templated, regulation of transcription by RNA polymerase II, positive regulation by host of viral release from host cell, positive regulation by host of viral process"
CBX1,"negative regulation of transcription, DNA-templated"
