# Differential Gene Expression Analysis

## Import Z scores and Format

In [4]:
### Read in z_scores and format ###
# Load the tab-separated z-score table. As used below, column 1 supplies the
# gene labels, column 2 is discarded, and every remaining column is treated as
# one sample.
raw_scores <- read.csv(
  "data_RNA_Seq_mRNA_median_all_sample_Zscores.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)

# Keep the two leading identifier columns; map[, 1] labels the genes below.
map <- raw_scores[, c(1, 2)]

# Everything after the two identifier columns is a sample column.
# Negative indexing (rather than 3:length(...)) stays correct even when the
# file has no sample columns.
sample_cols <- raw_scores[, -(1:2), drop = FALSE]
inst_names <- colnames(sample_cols)

# Build the samples x genes table directly from the numeric columns: each
# sample column becomes one row. This avoids transposing a data frame that
# still contains the character gene-label column, which would turn every
# number into text (via format()) before it is parsed back with as.numeric.
z_scores <- as.data.frame(do.call(rbind, lapply(sample_cols, as.numeric)))
names(z_scores) <- map[, 1]

# Row labels: drop the first 10 and the last 3 characters of each sample
# column name. substr() and nchar() are vectorized, so no loop is needed.
name_fix <- substr(inst_names, 11, nchar(inst_names) - 3)

row.names(z_scores) <- name_fix


In [5]:
# Preview the first five rows and first five columns of the z-score table.
z_scores[seq_len(5), seq_len(5)]


Unnamed: 0,A1BG,A1BG-AS,A1CF,A2LD1,A2M
PAAPFA,-1.3428,-1.6542,-0.9681,-1.5144,0.939
PACLJN,-0.5008,1.2776,-1.1983,1.2333,0.2163
PACPJG,-0.2098,-1.6542,-1.1983,0.305,-0.4804
PACRYY,-1.3428,0.3708,-1.1983,-1.5144,-0.2457
PACRZM,0.5693,-0.5274,-0.5943,0.5219,-0.5965


## Import Clinical data and Format

In [6]:
# Load the tab-separated clinical table; the first four lines of the file are
# skipped before the header row is read. read.csv() already returns a data
# frame, so no extra as.data.frame() wrapper is needed.
patient <- read.csv(
  "data_clinical_patient.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE, skip = 4
)

# The first column is used as the source of the row labels.
p_nam <- patient[, 1]

# Drop the first 10 characters of every label. substr() and nchar() are
# vectorized, so the whole column is processed in one call.
p_nam_fix <- substr(p_nam, 11, nchar(p_nam))

# Remove the label column. drop = FALSE keeps `patient` a data frame even if
# only one column remains, and -1 avoids the 2:ncol() pitfall on narrow input.
patient <- patient[, -1, drop = FALSE]

row.names(patient) <- p_nam_fix

In [7]:
# Show the first six rows of the clinical table.
head(patient, n = 6)

Unnamed: 0,PROTOCOL,ICDO,SNOMED,ICDO_SNOMED_DESCRIPTION,AGE_IN_DAYS,AGE,YEAR_OF_DIAGNOSIS,INSS_STAGE,TUMOR_SAMPLE_HISTOLOGY,DIAGNOSIS,...,FIRST_EVENT,OS_STATUS,OS_DAYS,OS_MONTHS,LAST_FOLLOWUP_YEAR,PERCENTAGE_NECROSIS,PERCENT_TUMOR_VS_STROMA,PERCENT_TUMOR_CELLS_RELAPSE,RELAPSE_PERCENT_NECROSIS,RELAPSE_PERCENT_STROMA
PALPKZ,ANBL00B1,C38.2,,Posterior mediastinum,1157,4,2002,Stage 4,Unfavorable,Neuroblastoma,...,Progression,1:DECEASED,2103,70,2008,,,,,
PASFWL,"ANBL00B1, AEPI07N1",C64.9,,"Kidney, NOS Renal, NOS Kidney parenchyma",184,1,2008,Stage 2b,Favorable,Neuroblastoma,...,Censored,0:LIVING,2449,81,2015,,,,,
PAURYJ,ANBL00B1,C64.9,,"Kidney, NOS Renal, NOS Kidney parenchyma",10,1,2012,Stage 4s,Favorable,Neuroblastoma,...,Censored,0:LIVING,1294,43,2015,,,,,
PARVHG,"ANBL00B1, ANBL0532",C74.9,,"Adrenal gland, NOS Suprarenal gland Adrenal, NOS",730,2,2008,Stage 4,Unknown,Neuroblastoma,...,Censored,0:LIVING,1913,63,2013,,,,,
PALNMX,"ANBL00B1, A3973",C76.2,999.0,"Abdomen, NOS Abdominal wall, NOS Intra-abdominal site, NOS",1607,5,2002,Stage 4,Unfavorable,Neuroblastoma,...,Censored,0:LIVING,4679,154,2015,,,,,
PAMZSH,"ANBL00B1, P9641",C76.1,,"Thorax, NOS Axilla, NOS Chest, NOS Chest wall, NOS Intrathoracic site, NOS Thoracic wall, NOS Infraclavicular region, NOS Scapular region, NOS",315,1,2004,Stage 1,Favorable,Neuroblastoma,...,Censored,0:LIVING,3334,110,2013,,,,,


## Isolate Status and Merge

In [8]:
# One-column table holding OS_STATUS under the name STATUS, labelled with the
# same shortened identifiers that label the rows of `patient`.
status <- data.frame(STATUS = patient[, "OS_STATUS"], row.names = p_nam_fix)

# Join status and z-scores on their row names. merge() keeps only labels
# present in both tables and stores the key in a "Row.names" column.
mg <- merge(status, z_scores, by = "row.names")

In [9]:
# Preview the first five rows and first five columns of the merged table.
mg[seq_len(5), seq_len(5)]

Row.names,STATUS,A1BG,A1BG-AS,A1CF
PAAPFA,1:DECEASED,-1.3428,-1.6542,-0.9681
PACLJN,0:LIVING,-0.5008,1.2776,-1.1983
PACPJG,,-0.2098,-1.6542,-1.1983
PACRYY,0:LIVING,-1.3428,0.3708,-1.1983
PACRZM,1:DECEASED,0.5693,-0.5274,-0.5943


## Find DEG between status groups

In [10]:
# Split the merged table by survival status. `%in%` is used instead of `==`
# because `==` yields NA for a missing STATUS, and an NA row index makes `[`
# return an all-NA row that would then be carried into the group.
srv <- mg[mg$STATUS %in% "0:LIVING", ]
dec <- mg[mg$STATUS %in% "1:DECEASED", ]

# Compute the mean of every column of `dat` except the "Row.names" and
# "STATUS" bookkeeping columns.
#
# Args:
#   dat:   data frame; every column other than "Row.names"/"STATUS" is averaged.
#   na.rm: passed on to mean(). FALSE (the default) keeps the previous
#          behaviour, where a single NA in a column makes that column's mean
#          NA; TRUE averages the non-missing values only.
#
# Returns:
#   A one-row data frame with one column per averaged attribute, named after
#   the corresponding column of `dat`.
get_mean <- function(dat, na.rm = FALSE) {
  atribs <- colnames(dat)[!(colnames(dat) %in% c("Row.names", "STATUS"))]

  # One mean per attribute, computed in a single pass instead of growing the
  # result with cbind() on every iteration.
  means <- vapply(
    atribs,
    function(atrib) mean(dat[, atrib], na.rm = na.rm),
    numeric(1)
  )

  ret <- as.data.frame(matrix(means, nrow = 1))
  colnames(ret) <- atribs
  ret
}

# Per-attribute mean for each status group.
srv_mean <- get_mean(dat = srv)
dec_mean <- get_mean(dat = dec)



Note: Certain known prognostic markers do not show a difference in expression in this dataset. 

In [None]:
## Calculate the absolute difference in mean between both groups and keep the
## attributes whose difference exceeds a magnitude cutoff. This is a plain
## threshold on the size of the difference, not a statistical test.
##
## Args:
##   d1, d2: one-row data frames of per-attribute means with identical columns
##           (the shape returned by get_mean).
##   thresh: an attribute is kept when |d1 - d2| is strictly greater than this.
##
## Returns:
##   A data frame holding the absolute differences of the kept attributes.
##   It stays a data frame even when zero or one attribute passes, so
##   colnames() on the result always works.
get_deg <- function(d1, d2, thresh = 0.5) {
  ab <- abs(d1 - d2)

  # Iterate over columns directly; apply() would first coerce the data frame
  # to a matrix.
  sig <- vapply(ab, function(imp) all(imp > thresh), logical(1))

  # A missing difference cannot be called differential; an NA in a logical
  # column index would otherwise raise "undefined columns selected".
  sig[is.na(sig)] <- FALSE

  # drop = FALSE prevents a single surviving column from collapsing to a
  # bare, unnamed vector.
  ab[, unname(sig), drop = FALSE]
}

# Attributes whose group means differ by more than the default threshold.
deg <- get_deg(d1 = srv_mean, d2 = dec_mean)

# List of differentially expressed genes

Note: Some of the most important prognostic markers (e.g., MYC) are absent from this list.

In [13]:
# Print the column names of `deg`.
colnames(deg)