# Differential Gene Expression Analysis

## Import Z scores and Format

In [1]:
### Read in z_scores and format ###
# Input is a tab-separated table with one row per gene: column 1 holds the gene
# symbol, column 2 a second identifier, and the remaining columns hold one
# z-score per sample.
z_scores <- read.csv(
  "data_RNA_Seq_mRNA_median_all_sample_Zscores.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)

# Keep the two identifier columns so gene symbols can be re-attached later.
map <- z_scores[, c(1, 2)]

# Sample (column) names; negative indexing stays correct even if the table
# has no sample columns, unlike 3:length(...).
inst_names <- colnames(z_scores)[-c(1, 2)]

# Transpose only the sample columns. Transposing a data frame that still
# contains the character gene-symbol column would coerce every number to text
# via format() (limited significant digits) before converting back.
expr <- as.matrix(z_scores[, -c(1, 2), drop = FALSE])
storage.mode(expr) <- "double"

# Rows are now samples and columns are genes.
z_scores <- as.data.frame(t(expr))
names(z_scores) <- map[, 1]

# Trim each sample name to characters 11 .. (n - 3).
# NOTE(review): presumably strips a 10-character study prefix and a
# 3-character sample suffix, leaving the patient ID - confirm against the file.
name_fix <- substr(inst_names, 11, nchar(inst_names) - 3)

row.names(z_scores) <- name_fix


In [2]:
# Preview the top-left 5 x 5 corner of the sample-by-gene table.
z_scores[seq_len(5), seq_len(5)]


Unnamed: 0,A1BG,A1BG-AS,A1CF,A2LD1,A2M
PAAPFA,-1.3428,-1.6542,-0.9681,-1.5144,0.939
PACLJN,-0.5008,1.2776,-1.1983,1.2333,0.2163
PACPJG,-0.2098,-1.6542,-1.1983,0.305,-0.4804
PACRYY,-1.3428,0.3708,-1.1983,-1.5144,-0.2457
PACRZM,0.5693,-0.5274,-0.5943,0.5219,-0.5965


## Import Clinical data and Format

In [3]:
# Read the clinical table; the first 4 lines of the file are skipped so that
# the next line is used as the header.
patient <- read.csv(
  "data_clinical_patient.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE, skip = 4
)

# First column holds the patient identifiers.
p_nam <- patient[, 1]

# Drop the first 10 characters of each identifier (vectorised; no loop needed).
# NOTE(review): presumably removes a study prefix so the IDs match the row
# names of z_scores - confirm against the file.
p_nam_fix <- substr(p_nam, 11, nchar(p_nam))

# Remove the identifier column; drop = FALSE keeps a data frame even when only
# one clinical column remains.
patient <- patient[, -1, drop = FALSE]

row.names(patient) <- p_nam_fix

In [4]:
# Show the first six rows of the clinical table.
head(patient, n = 6L)

Unnamed: 0,PROTOCOL,ICDO,SNOMED,ICDO_SNOMED_DESCRIPTION,AGE_IN_DAYS,AGE,YEAR_OF_DIAGNOSIS,INSS_STAGE,TUMOR_SAMPLE_HISTOLOGY,DIAGNOSIS,...,FIRST_EVENT,OS_STATUS,OS_DAYS,OS_MONTHS,LAST_FOLLOWUP_YEAR,PERCENTAGE_NECROSIS,PERCENT_TUMOR_VS_STROMA,PERCENT_TUMOR_CELLS_RELAPSE,RELAPSE_PERCENT_NECROSIS,RELAPSE_PERCENT_STROMA
PALPKZ,ANBL00B1,C38.2,,Posterior mediastinum,1157,4,2002,Stage 4,Unfavorable,Neuroblastoma,...,Progression,1:DECEASED,2103,70,2008,,,,,
PASFWL,"ANBL00B1, AEPI07N1",C64.9,,"Kidney, NOS Renal, NOS Kidney parenchyma",184,1,2008,Stage 2b,Favorable,Neuroblastoma,...,Censored,0:LIVING,2449,81,2015,,,,,
PAURYJ,ANBL00B1,C64.9,,"Kidney, NOS Renal, NOS Kidney parenchyma",10,1,2012,Stage 4s,Favorable,Neuroblastoma,...,Censored,0:LIVING,1294,43,2015,,,,,
PARVHG,"ANBL00B1, ANBL0532",C74.9,,"Adrenal gland, NOS Suprarenal gland Adrenal, NOS",730,2,2008,Stage 4,Unknown,Neuroblastoma,...,Censored,0:LIVING,1913,63,2013,,,,,
PALNMX,"ANBL00B1, A3973",C76.2,999.0,"Abdomen, NOS Abdominal wall, NOS Intra-abdominal site, NOS",1607,5,2002,Stage 4,Unfavorable,Neuroblastoma,...,Censored,0:LIVING,4679,154,2015,,,,,
PAMZSH,"ANBL00B1, P9641",C76.1,,"Thorax, NOS Axilla, NOS Chest, NOS Chest wall, NOS Intrathoracic site, NOS Thoracic wall, NOS Infraclavicular region, NOS Scapular region, NOS",315,1,2004,Stage 1,Favorable,Neuroblastoma,...,Censored,0:LIVING,3334,110,2013,,,,,


## Isolate Status and Merge

In [5]:
# One-column table of overall-survival status, keyed by patient ID.
status <- data.frame(STATUS = patient[, "OS_STATUS"])
rownames(status) <- p_nam_fix

# Join status and expression on their row names. merge() keeps only IDs present
# in both tables and stores the key in a "Row.names" column.
mg <- merge(status, z_scores, by = "row.names")

In [6]:
# Preview the top-left 5 x 5 corner of the merged table.
mg[seq_len(5), seq_len(5)]

Row.names,STATUS,A1BG,A1BG-AS,A1CF
PAAPFA,1:DECEASED,-1.3428,-1.6542,-0.9681
PACLJN,0:LIVING,-0.5008,1.2776,-1.1983
PACPJG,,-0.2098,-1.6542,-1.1983
PACRYY,0:LIVING,-1.3428,0.3708,-1.1983
PACRZM,1:DECEASED,0.5693,-0.5274,-0.5943


## Find DEG between status groups

In [7]:
# Split the merged table by survival status. which() drops NA comparisons, so
# a missing STATUS cannot introduce all-NA rows into either group.
srv <- mg[which(mg$STATUS == "0:LIVING"), ]
dec <- mg[which(mg$STATUS == "1:DECEASED"), ]

# Compute the mean of every attribute column in a data frame.
#
# Columns named "Row.names" and "STATUS" are treated as metadata and skipped;
# every other column is averaged with mean().
#
# Args:
#   dat:   data frame whose remaining columns hold the values to average.
#   na.rm: passed to mean(); FALSE (the default) keeps the original behaviour,
#          where any NA in a column gives an NA mean for that column.
#
# Returns:
#   A one-row data frame with one column per attribute, named as in `dat`.
get_mean <- function(dat, na.rm = FALSE) {
  all_cols <- colnames(dat)
  atribs <- all_cols[!all_cols %in% c("Row.names", "STATUS")]

  # vapply builds the whole result at once instead of growing it with cbind()
  # on every iteration.
  means <- vapply(
    atribs,
    function(atrib) mean(dat[, atrib], na.rm = na.rm),
    numeric(1)
  )

  ret <- as.data.frame(matrix(means, nrow = 1))
  # Set names explicitly so symbols such as "A1BG-AS" are kept verbatim.
  colnames(ret) <- atribs
  ret
}

# Per-gene mean z-score within each survival group.
srv_mean <- get_mean(dat = srv)
dec_mean <- get_mean(dat = dec)



Note: Certain known prognostic markers do not show much of a difference in expression in this dataset. 

In [8]:
# Print the group means for MYCN.
cat(
  "Survived MYCN: ", srv_mean[, "MYCN"], "\n",
  "Deceased MYCN: ", dec_mean[, "MYCN"]
)

Survived MYCN:  -1.158347 
 Deceased MYCN:  -1.047521

In [9]:
# Print the group means for TP53.
cat(
  "Survived TP53: ", srv_mean[, "TP53"], "\n",
  "Deceased TP53: ", dec_mean[, "TP53"]
)

Survived TP53:  -1.185329 
 Deceased TP53:  -1.011371

In [10]:
# Print the group means for NTRK1.
cat(
  "Survived NTRK1: ", srv_mean[, "NTRK1"], "\n",
  "Deceased NTRK1: ", dec_mean[, "NTRK1"]
)

Survived NTRK1:  -0.5203187 
 Deceased NTRK1:  -1.209449

In [11]:
# Print the group means for NTRK2.
cat(
  "Survived NTRK2: ", srv_mean[, "NTRK2"], "\n",
  "Deceased NTRK2: ", dec_mean[, "NTRK2"]
)

Survived NTRK2:  -0.9338617 
 Deceased NTRK2:  -0.8330129

In [12]:
## Select differentially expressed genes: keep the genes whose absolute
## difference in mean between the two groups exceeds `thresh`.
## TODO: `thresh` is a fixed cutoff, not a significance test; a value that
## accounts for multiple testing is still needed.
##
## Args:
##   d1, d2: one-row data frames of per-gene means with identical columns
##           (as returned by get_mean).
##   thresh: minimum absolute difference for a gene to be kept.
##
## Returns:
##   A data frame holding the absolute differences of the selected genes only.
##   It is always a data frame, even when zero or one gene passes, so
##   colnames() on the result stays valid.
get_deg <- function(d1, d2, thresh = 0.7) {
  ab <- abs(d1 - d2)

  # isTRUE() treats an NA difference as "not selected" instead of letting an
  # NA index reach the column subset, which would raise an error.
  sig <- vapply(
    ab,
    function(imp) isTRUE(all(imp > thresh)),
    logical(1)
  )

  # drop = FALSE prevents a single selected gene from collapsing to a bare
  # number and losing its column name.
  ab[, sig, drop = FALSE]
}

# Genes whose group means differ by more than the default threshold.
deg <- get_deg(d1 = srv_mean, d2 = dec_mean)

# List of differentially expressed genes


In [13]:
# Number of genes selected as differentially expressed (column count of deg).
length(colnames(deg))

When comparing this list to the ML outputs from survival classification, many of these genes also appeared in that output. However, none of these genes appear to be associated with neuroblastoma in the literature, which raises the question: do we need a different dataset?

# Perform dimensionality reduction using list of differentially expressed genes

In [14]:
# Survival label for every merged sample, in the same row order as mg.
STATUS <- mg$STATUS

# Keep only the differentially expressed genes plus the label.
# drop = FALSE keeps a data frame (and the gene's column name) even when only
# a single gene was selected.
out <- cbind(mg[, colnames(deg), drop = FALSE], STATUS)

# Remove samples without a usable status. The explicit NA check stops a missing
# STATUS from producing all-NA rows in the subset.
has_status <- !is.na(out$STATUS) & out$STATUS != ""
out <- out[has_status, ]
head(out)

Unnamed: 0,ADCY1,AHI1,ANKFN1,ARHGAP23,ARHGEF10L,ATXN1,BBS9,C17orf107,C7orf41,CLASP1,...,SYNRG,TIA1,TOM1L2,TRIM2,TRIM36,TRNAU1AP,UBE2D4,WSB1,ZNF608,STATUS
1,-1.9437,-0.6755,-1.9429,-1.716,-1.7751,-1.823,-1.9252,-1.5952,-1.7048,-1.6067,...,-1.9606,-2.1007,-1.7533,-1.8533,-2.1603,-1.9675,-1.7114,-1.4356,-2.0498,1:DECEASED
2,0.5537,-0.1667,0.0884,2.4192,1.2292,1.6252,-0.1094,0.998,1.0244,1.7758,...,1.5877,-2.1007,0.4086,1.7161,1.6862,-1.5245,-0.4018,-1.8105,1.1887,0:LIVING
4,0.3179,-0.2939,-1.0527,1.1932,-0.5461,-0.2243,1.4394,0.1907,1.0255,0.4239,...,0.8831,0.8409,-0.415,0.4772,-0.0438,1.0537,1.3997,0.1328,1.5408,0:LIVING
5,-1.5081,-1.8912,-0.0361,-0.2276,0.1814,1.2144,-1.9252,-1.5952,-0.7407,-1.6067,...,-1.9606,-1.0548,-1.1232,-0.2224,0.2868,-1.758,-1.7114,-1.8105,0.0924,1:DECEASED
6,-0.9632,0.4154,-1.9348,-1.716,-1.7751,1.3672,-0.0428,-1.5952,-1.7048,-0.2657,...,0.481,-0.1638,-1.7533,-1.8533,-0.1351,1.3582,-0.8373,1.3671,-2.0498,1:DECEASED
7,-1.9437,-1.8912,-1.9429,-0.8877,-1.7751,-1.823,-1.9252,0.5022,-1.7048,-1.6067,...,-1.9606,-2.1007,-1.7533,-1.8533,-2.1603,-1.9675,-1.7114,-1.8105,-2.0498,1:DECEASED


In [15]:
# Save the reduced table (selected genes plus STATUS) for downstream use.
write.csv(x = out, file = "deg_dimred.csv")