# Figueroa data set analysis

In [1]:
suppressMessages({
    library(hashmap)
    library(readr)
})

In [2]:
# Gene expression on 524 AML cases
GEP.path <- '../../data/Figueroa/rawdata/GSE14468.HGU133Plus2_EntrezCDF.MAS5.pcl'
GEP <- read_tsv(GEP.path)
# Preview the first two rows of the expression table.
head(GEP, 2)

Parsed with column specification:
cols(
  .default = col_double(),
  Name = col_character(),
  Description = col_character()
)
See spec(...) for full column specifications.


Name,Description,GSM158711,GSM158712,GSM158713,GSM158714,GSM158715,GSM158716,GSM158717,GSM158718,⋯,GSM361376,GSM361377,GSM361378,GSM361379,GSM361380,GSM361381,GSM361382,GSM361383,GSM361384,GSM361385
10000_at,"AKT3 - ""v-akt murine thymoma viral oncogene homolog 3 (protein kinase B, gamma)""",74.3174,27.66245,67.34756,102.0721,101.6374,31.43173,74.90709,68.8868,⋯,120.7066,84.64849,126.016,139.1024,43.22202,236.6838,210.127,220.1624,104.8744,147.4783
10001_at,MED6 - mediator complex subunit 6,660.5697,536.1886,449.10047,314.0691,856.6095,708.81936,433.50528,528.4353,⋯,410.5169,486.99356,625.7184,547.8272,587.26037,549.8494,398.5402,457.6389,643.2808,384.2906


In [12]:
# Report the number of rows and columns of the expression table.
dim(GEP)

In [3]:
# Sample sheet for the expression series: one row per sample, with its
# accession and title among other descriptive columns.
patients.GEP.path <- '../../data/Figueroa/rawdata/GSE14468-samples.tsv'
patients.GEP <- read_tsv(patients.GEP.path)
# Preview the first two samples.
head(patients.GEP, 2)

Parsed with column specification:
cols(
  Accession = col_character(),
  Title = col_character(),
  `Sample Type` = col_character(),
  Taxonomy = col_character(),
  Channels = col_integer(),
  Platform = col_character(),
  Series = col_character(),
  `Supplementary Types` = col_character(),
  `Supplementary Links` = col_character(),
  `SRA Accession` = col_character(),
  Contact = col_character(),
  `Release Date` = col_character()
)


Accession,Title,Sample Type,Taxonomy,Channels,Platform,Series,Supplementary Types,Supplementary Links,SRA Accession,Contact,Release Date
GSM158711,AML 2199,RNA,Homo sapiens,1,GPL570,GSE6891;GSE14468,CEL,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM158nnn/GSM158711/suppl,,Roel Verhaak,"Mar 12, 2008"
GSM158712,AML 2200,RNA,Homo sapiens,1,GPL570,GSE6891;GSE14468,CEL,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM158nnn/GSM158712/suppl,,Roel Verhaak,"Mar 12, 2008"


In [4]:
# methylation of 344 AML cases and 8 controls
methylation.path <- '../../data/Figueroa/rawdata/GSE18700.GPL6604.matrix.txt'
methylation <- read_tsv(methylation.path)
# Preview the first two rows of the methylation table.
head(methylation, 2)

Parsed with column specification:
cols(
  .default = col_double(),
  Gene = col_character(),
  Name = col_character()
)
See spec(...) for full column specifications.


Gene,Name,GSM435212,GSM435213,GSM435214,GSM435215,GSM435216,GSM435217,GSM435218,GSM435219,⋯,GSM465071,GSM465072,GSM465073,GSM465074,GSM465075,GSM465076,GSM465077,GSM465078,GSM465079,GSM465080
MSPI0406S00402708,NOS3 - Nitric oxide synthase 3 (endothelial cell),-1.543586,-1.99278,-2.618254,-1.973418,-1.69127,-1.5271,-2.603303,-1.961328,⋯,-0.9308402,-0.3907199,1.470515,-0.1123418,-0.7597443,-0.3106185,-0.04199065,-1.358948,0.08236676,-0.4086701
MSPI0406S00519671,KIAA1279 - KIAA1279,2.798643,2.445831,2.363704,2.490647,3.304769,3.47255,2.680607,2.948648,⋯,2.6655187,1.8954379,2.289277,1.7177463,1.9399757,3.0420744,2.66983281,1.304129,0.86456765,3.1523175


In [11]:
# Report the number of rows and columns of the methylation table.
dim(methylation)

In [5]:
# Sample sheet for the methylation series: one row per sample, with its
# accession and title among other descriptive columns.
patients.methylation.path <- '../../data/Figueroa/rawdata/GSE18700-samples.tsv'
patients.methylation <- read_tsv(patients.methylation.path)
# Preview the first two samples.
head(patients.methylation, 2)

Parsed with column specification:
cols(
  Accession = col_character(),
  Title = col_character(),
  `Sample Type` = col_character(),
  Taxonomy = col_character(),
  Channels = col_integer(),
  Platform = col_character(),
  Series = col_character(),
  `Supplementary Types` = col_character(),
  `Supplementary Links` = col_character(),
  `SRA Accession` = col_character(),
  Contact = col_character(),
  `Release Date` = col_character()
)


Accession,Title,Sample Type,Taxonomy,Channels,Platform,Series,Supplementary Types,Supplementary Links,SRA Accession,Contact,Release Date
GSM464737,322,genomic,Homo sapiens,2,GPL6604,GSE18700,PAIR,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM464nnn/GSM464737/suppl,,Maria Eugenia Figueroa,"Jan 12, 2010"
GSM464738,1174,genomic,Homo sapiens,2,GPL6604,GSE18700,PAIR,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM464nnn/GSM464738/suppl,,Maria Eugenia Figueroa,"Jan 12, 2010"


## Mapping from GEP (GSE14468) samples to methylation (GSE18700) samples

In [6]:
# Remove the leading "AML" tag from sample titles, e.g. "AML 2199" -> "2199",
# so that GEP titles can be compared with the bare titles used in the
# methylation sample sheet (e.g. "322").
#
# x: character vector of sample titles.
# Returns a character vector of the same length; titles that do not start
# with "AML" followed by whitespace are returned unchanged.
#
# The pattern is anchored at the start of the string: a title that merely
# contains "AML" somewhere else must not lose its first characters.
preprocessTitle <- function(x) {
    sub('^AML\\s+', '', x)
}
# Element-wise wrapper around preprocessTitle.
PreprocessTitle <- Vectorize(preprocessTitle)

# Normalised GEP titles, computed once and reused for both the lookup table
# and the intersection.
GEP.titles <- PreprocessTitle(patients.GEP$Title)

# Title -> accession lookup tables, one per data set.
patients.titleGEP <- hashmap(GEP.titles, patients.GEP$Accession)
patients.titleMethylation <- hashmap(patients.methylation$Title, patients.methylation$Accession)

# Titles present in both sample sheets.
intersectTitles <- intersect(GEP.titles, patients.methylation$Title)

length(intersectTitles) # get 344 AML cases as in Figueroa et al. 2009

In [7]:
# For every shared title, fetch the accession from each data set and pair
# them up: key = GEP accession, value = methylation accession.
matched.GEP <- patients.titleGEP[[intersectTitles]]
matched.methylation <- patients.titleMethylation[[intersectTitles]]
GEP2Methylation <- hashmap(matched.GEP, matched.methylation)
GEP2Methylation

## (character) => (character)
## [GSM158788] => [GSM464759]
## [GSM158752] => [GSM464806]
## [GSM361343] => [GSM464920]
## [GSM159056] => [GSM464938]
## [GSM361373] => [GSM465020]
## [GSM159071] => [GSM464957]
##       [...] => [...]      

## Subset the methylation and GEP data sets to the overlapping patients

In [8]:
# Translate every GEP column name through the GEP -> methylation accession
# map. The filtering below relies on names without a match (including the two
# annotation columns) coming back as NA.
GEP.names <- GEP2Methylation[[colnames(GEP)]]

# Matched methylation accessions, in GEP column order.
GEP.names.filtered <- na.omit(GEP.names)

# Flag the columns to drop; the first two (annotation) columns are always
# kept. FALSE is spelled out because F is an ordinary, reassignable variable.
omit.idx <- is.na(GEP.names)
omit.idx[c(1, 2)] <- FALSE

GEP.filtered <- GEP[, !omit.idx]

# Relabel the retained sample columns with their methylation accessions so
# both tables share column names; the annotation columns keep their names.
colnames(GEP.filtered) <- c(colnames(GEP)[c(1, 2)], GEP.names.filtered)

head(GEP.filtered, n=2)
dim(GEP.filtered)

Name,Description,GSM464771,GSM464772,GSM464773,GSM464774,GSM464775,GSM464776,GSM464777,GSM464778,⋯,GSM465020,GSM465024,GSM465026,GSM465030,GSM465060,GSM465061,GSM465062,GSM465063,GSM465064,GSM465065
10000_at,"AKT3 - ""v-akt murine thymoma viral oncogene homolog 3 (protein kinase B, gamma)""",74.3174,27.66245,102.0721,101.6374,31.43173,74.90709,68.8868,114.4036,⋯,106.8033,207.4528,120.7066,84.64849,139.1024,43.22202,236.6838,210.127,220.1624,147.4783
10001_at,MED6 - mediator complex subunit 6,660.5697,536.1886,314.0691,856.6095,708.81936,433.50528,528.4353,448.2595,⋯,406.9814,586.143,410.5169,486.99356,547.8272,587.26037,549.8494,398.5402,457.6389,384.2906


In [9]:
# Positions of the columns of the global `methylation` table whose names
# appear in x.
#
# x: vector of column names to look for.
# Returns an integer vector of column positions in ascending order; it is
# empty when none of the names is present.
findColumnIdx <- function(x) {
    column.names <- colnames(methylation)
    is.requested <- is.element(column.names, x)
    seq_along(column.names)[is.requested]
}
# Element-wise wrapper around findColumnIdx (definition retained).
FindColumnIdx <- Vectorize(findColumnIdx)

# Locate each retained GEP sample column in the methylation table, preserving
# the GEP column order. match() yields NA for an accession that is absent, so
# na.omit() can actually drop it; the per-element which() lookup produced a
# zero-length result instead, which made the numeric coercion fail.
sample.ids <- colnames(GEP.filtered)[-c(1, 2)]
sample.idx <- match(sample.ids, colnames(methylation))

# Columns 1 and 2 are the annotation columns and are always kept.
column.idx <- c(1, 2, na.omit(sample.idx))
methylation.filtered <- methylation[, column.idx]
head(methylation.filtered, n=2)
dim(methylation.filtered)

Gene,Name,GSM464771,GSM464772,GSM464773,GSM464774,GSM464775,GSM464776,GSM464777,GSM464778,⋯,GSM465020,GSM465024,GSM465026,GSM465030,GSM465060,GSM465061,GSM465062,GSM465063,GSM465064,GSM465065
MSPI0406S00402708,NOS3 - Nitric oxide synthase 3 (endothelial cell),-1.314768,-0.562936,-1.704894,0.3056685,-1.469609,1.524694,0.02468921,-0.02486799,⋯,-1.160726,-0.3174757,-1.294438,-0.338884,-1.118004,0.8936787,-1.384295,-0.6716157,0.5345794,0.08037697
MSPI0406S00519671,KIAA1279 - KIAA1279,2.850796,1.068262,1.846918,1.9675782,1.36142,2.623354,2.13396343,4.11175224,⋯,2.837818,1.8297567,3.449042,3.672064,2.315018,2.4546376,1.421856,2.5848064,1.4915783,1.15726762


In [10]:
# Save both matched tables as tab-separated text with a header row, without
# row names and without quoting. TRUE/FALSE are spelled out because T and F
# are ordinary variables that can be reassigned.
write.table(methylation.filtered, file='../../data/Figueroa/matcheddata/methylation-filtered.tsv', sep='\t', quote=FALSE,
            col.names=TRUE, row.names=FALSE)

write.table(GEP.filtered, file='../../data/Figueroa/matcheddata/GEP-filtered.tsv', sep='\t', quote=FALSE,
            col.names=TRUE, row.names=FALSE)