# Table of predictor frequency in each cluster

## AML HM 450 predictor frequencies

File location: `/oak/stanford/groups/andrewg/users/szmamie/repos/MethylationPrediction/data/results/LAML/vbsr-hm27-M-data/`

In [1]:
library(stringr)
source('helperFunctions.R')


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: sp
Loading required package: maps
Loading required package: shapefiles
Loading required package: foreign

Attaching package: ‘shapefiles’

The following objects are masked from ‘package:foreign’:

    read.dbf, write.dbf



In [2]:
base.dir <- "../data/results/LAML/vbsr-hm27-M-data/"
load(paste0(base.dir, 'cluster1.RData'))

The predictors are in coef data frame, aggregate all the predictor in a single dataframe and save it in the data/results folder

In [3]:
coef
class(coef)

gene,coef
BBS7,-0.3374807
IRAK3,-0.1493213
KLF11,-0.1770719
MGC12982,0.2388038
NRSN2,-0.1253448
SLC38A2,0.4207201


In [4]:
str_match('cluster1.RData', 'cluster([0-9]*)\\.RData')[2]

In [17]:
predictors <- data.frame(gene=character(), coef=numeric(), ID=character(), stringsAsFactors=FALSE)
for (f in list.files(base.dir, 'cluster[0-9]*\\.RData')) {
    load(paste0(base.dir, f))
    idx <- str_match(f, 'cluster([0-9]*)\\.RData')[2]
    coef$cluster <- idx
    predictors <- rbind(predictors, coef)
}

In [18]:
dim(predictors)
length(unique(predictors$gene))
head(predictors)

gene,coef,cluster
BBS7,-0.3374807,1
IRAK3,-0.1493213,1
KLF11,-0.1770719,1
MGC12982,0.2388038,1
NRSN2,-0.1253448,1
SLC38A2,0.4207201,1


In [19]:
freq.table <- data.frame(sort(table(predictors$gene), decreasing=T))
annot.freq.table <- MergeHGNCDescription(predictors)

“Column `gene`/`hgnc_symbol` joining factor and character vector, coercing into character vector”

In [20]:
head(annot.freq.table)

gene,coef,cluster,description
BBS7,-0.3374807,1,Bardet-Biedl syndrome 7 [Source:HGNC Symbol;Acc:HGNC:18758]
IRAK3,-0.1493213,1,interleukin 1 receptor associated kinase 3 [Source:HGNC Symbol;Acc:HGNC:17020]
KLF11,-0.1770719,1,Kruppel like factor 11 [Source:HGNC Symbol;Acc:HGNC:11811]
MGC12982,0.2388038,1,
NRSN2,-0.1253448,1,neurensin 2 [Source:HGNC Symbol;Acc:HGNC:16229]
SLC38A2,0.4207201,1,solute carrier family 38 member 2 [Source:HGNC Symbol;Acc:HGNC:13448]


In [21]:
head(freq.table)

Var1,Freq
TNFSF12,5
C7orf46,5
SH3PXD2A,4
IRAK3,3
ZNF518B,3
ADPRH,3


In [27]:
annot.freq.table <- annot.freq.table %>%
    dplyr::select(gene, description) %>%
    dplyr::distinct() %>%
    dplyr::left_join(freq.table, by=c('gene'='Var1')) %>%
    arrange(desc(Freq))
annot.freq.table

“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

gene,description,Freq
TNFSF12,TNF superfamily member 12 [Source:HGNC Symbol;Acc:HGNC:11927],5
C7orf46,,5
SH3PXD2A,SH3 and PX domains 2A [Source:HGNC Symbol;Acc:HGNC:23664],4
IRAK3,interleukin 1 receptor associated kinase 3 [Source:HGNC Symbol;Acc:HGNC:17020],3
ZNF518B,zinc finger protein 518B [Source:HGNC Symbol;Acc:HGNC:29365],3
ADPRH,ADP-ribosylarginine hydrolase [Source:HGNC Symbol;Acc:HGNC:269],3
KLF11,Kruppel like factor 11 [Source:HGNC Symbol;Acc:HGNC:11811],2
MGC12982,,2
MANEAL,mannosidase endo-alpha like [Source:HGNC Symbol;Acc:HGNC:26452],2
RPP25,ribonuclease P and MRP subunit p25 [Source:HGNC Symbol;Acc:HGNC:30361],2


In [29]:
write.table(annot.freq.table, paste0(base.dir, 'predictors.summary.csv'), quote=F, sep=',', row.names=F, col.names=T)

# now construct it as a function and run for all folders

In [44]:
WriteFreqTable <- function(base.dirs) {
    for (base.dir in base.dirs) {
        print(base.dir)
        predictors <- data.frame(gene=character(), coef=numeric(), ID=character(), stringsAsFactors=FALSE)
        for (f in list.files(base.dir, 'cluster[0-9]*\\.RData')) {
            # print(paste0('Loading ', base.dir, f))
            load(paste0(base.dir, f))
            if(dim(coef)[1]==0) next 
            idx <- str_match(f, 'cluster([0-9]*)\\.RData')[2]
            coef$cluster <- idx
            predictors <- rbind(predictors, coef)
        }
        print(paste0('There are in total ', dim(predictors)[1], ' predictors from ', length(unique(predictors$cluster)), 
                     ' clusters. The number of unqiue genes are ', length(unique(predictors$gene)), '.'))
        freq.table <- data.frame(sort(table(predictors$gene), decreasing=T))
        annot.freq.table <- MergeHGNCDescription(predictors) 
        annot.freq.table <- annot.freq.table %>%
            dplyr::select(gene, description) %>%
            dplyr::distinct() %>%
            dplyr::left_join(freq.table, by=c('gene'='Var1')) %>%
            arrange(desc(Freq))
        write.table(annot.freq.table, paste0(base.dir, 'predictors.summary.csv'), quote=F, sep=',', row.names=F, col.names=T)
        print(paste0("wrote results to ", base.dir, 'predictors.summary.csv.'))
    }
}

In [45]:
base.dirs <- paste0('../data/results/', c('LAML/vbsr-hm27-B-data/',  'LAML/vbsr-hm27-M-data/',  
                                          'LAML/vbsr-hm450-B-data/', 'LAML/vbsr-hm450-M-data/',
                                         'HNSC/vbsr-hm450-B-data/', 'HNSC/vbsr-hm450-M-data/',
                                         'SKCM/vbsr-hm450-B-data/', 'SKCM/vbsr-hm450-M-data/'))
WriteFreqTable(base.dirs)

[1] "../data/results/LAML/vbsr-hm27-B-data/"
[1] "There are in total 81 predictors from 18 clusters. The number of unqiue genes are 57."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/LAML/vbsr-hm27-B-data/predictors.summary.csv."
[1] "../data/results/LAML/vbsr-hm27-M-data/"
[1] "There are in total 75 predictors from 18 clusters. The number of unqiue genes are 53."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/LAML/vbsr-hm27-M-data/predictors.summary.csv."
[1] "../data/results/LAML/vbsr-hm450-B-data/"
[1] "There are in total 85 predictors from 20 clusters. The number of unqiue genes are 57."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/LAML/vbsr-hm450-B-data/predictors.summary.csv."
[1] "../data/results/LAML/vbsr-hm450-M-data/"
[1] "There are in total 92 predictors from 20 clusters. The number of unqiue genes are 56."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/LAML/vbsr-hm450-M-data/predictors.summary.csv."
[1] "../data/results/HNSC/vbsr-hm450-B-data/"
[1] "There are in total 234 predictors from 20 clusters. The number of unqiue genes are 158."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/HNSC/vbsr-hm450-B-data/predictors.summary.csv."
[1] "../data/results/HNSC/vbsr-hm450-M-data/"
[1] "There are in total 196 predictors from 20 clusters. The number of unqiue genes are 134."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/HNSC/vbsr-hm450-M-data/predictors.summary.csv."
[1] "../data/results/SKCM/vbsr-hm450-B-data/"
[1] "There are in total 183 predictors from 20 clusters. The number of unqiue genes are 110."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/SKCM/vbsr-hm450-B-data/predictors.summary.csv."
[1] "../data/results/SKCM/vbsr-hm450-M-data/"
[1] "There are in total 159 predictors from 18 clusters. The number of unqiue genes are 105."


“Column `gene`/`Var1` joining character vector and factor, coercing into character vector”

[1] "wrote results to ../data/results/SKCM/vbsr-hm450-M-data/predictors.summary.csv."
