# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value.

$$  {Enrichment} = \bar{X_1}-\bar{X_2}$$

where group1 is SCN Expression values and group2 are Whole Brain Expression values 
(SCN mean - WB mean) **(Logged values, so minus gives ratio)**

$$  {Pooled\ Standard\  Deviantion} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [27]:
import pandas as pd # Dataframes and File IO
import numpy as np # numerical calculations


In [28]:
prefix = 'RNA_Seq_'   # define a prefix to add to column names (making indexing easier later)

In [29]:
df=pd.read_table('../AltAnalyze_output/DATASET-RNASeq.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.tail()

Unnamed: 0_level_0,Definition,Symbol,Transcript_cluster_ids,Constitutive_exons_used,Constitutive_IDs_used,Putative microRNA binding sites,Select Cellular Compartments,Select Protein Classes,Chromosome,Strand,...,zt3_3.junctions.bed,avg-SCN,log_fold-SCN_vs_WB,fold-SCN_vs_WB,rawp-SCN_vs_WB,adjp-SCN_vs_WB,ANOVA-rawp,ANOVA-adjp,largest fold,maximum sample read count
Ensembl_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000028980,hexose-6-phosphate dehydrogenase (glucose 1-de...,H6pd,,ENSMUSE00000443217|ENSMUSE00000797477|ENSMUSE0...,ENSMUSG00000028980:E6.1-E8.1|ENSMUSG0000002898...,"mmu-let-7a(miRanda), mmu-let-7b(miRanda), mmu-...",,protein_coding,chr4,-,...,0.430413,0.408706,Insufficient Expression,Insufficient Expression,0.163555,0.135393,0.163971,0.308484,0.083302,104.0
ENSMUSG00000042678,myosin XV [Source:MGI Symbol;Acc:MGI:1261811],Myo15,,AY331132-8|ENSMUSE00000244777|ENSMUSE000002447...,ENSMUSG00000042678:E31.1-E32.1|ENSMUSG00000042...,"mmu-let-7a(miRanda), mmu-let-7b(miRanda), mmu-...",,protein_coding,chr11,+,...,0.189131,0.069396,Insufficient Expression,Insufficient Expression,0.444801,0.267044,0.418682,0.536443,0.029154,32.0
ENSMUSG00000094460,"predicted gene, 21560 [Source:MGI Symbol;Acc:M...",Gm21560,,ENSMUSE00001017596|ENSMUSE00001050200|ENSMUSE0...,ENSMUSG00000094460:E1.2-E2.1|ENSMUSG0000009446...,,,protein_coding,chr14,-,...,1.153791,0.691661,Insufficient Expression,Insufficient Expression,0.240458,0.178517,0.255921,0.408302,0.196272,27.0
ENSMUSG00000094463,predicted gene 8546 [Source:MGI Symbol;Acc:MGI...,Gm8546,,ENSMUSE00000970601|ENSMUSE00001022208,ENSMUSG00000094463:E2.1-E3.1|ENSMUSG0000009446...,,,pseudogene,chr13,-,...,0.520411,0.24443,Insufficient Expression,Insufficient Expression,0.623665,0.331281,0.630674,0.723023,0.042574,7.0
ENSMUSG00000028986,kelch-like 7 [Source:MGI Symbol;Acc:MGI:1196453],Klhl7,,ENSMUSE00000184232|ENSMUSE00000184232|ENSMUSE0...,ENSMUSG00000028986:E1.1-E2.1|ENSMUSG0000002898...,"mmu-miR-1(miRanda), mmu-miR-101a(RNAhybrid|miR...",,protein_coding,chr5,+,...,4.887626,4.962881,0.548488737209,1.46255282885,0.020746,0.027435,0.025533,0.075224,0.548489,4383.0


## Look at column names and then setup filters for grouping columns into SCN and WB groups

In [30]:
df.columns

Index([u'Definition', u'Symbol', u'Transcript_cluster_ids',
       u'Constitutive_exons_used', u'Constitutive_IDs_used',
       u'Putative microRNA binding sites', u'Select Cellular Compartments',
       u'Select Protein Classes', u'Chromosome', u'Strand',
       u'Genomic Gene Corrdinates', u'GO-Biological Process',
       u'GO-Molecular Function', u'GO-Cellular Component', u'WikiPathways',
       u'C3HHeJ.juctions.bed', u'C57BL6NJ.junctions.bed',
       u'GSM752615.juctions.bed', u'SRR306757.juctions.bed',
       u'SRR579545.juctions.bed', u'SRR579546.juctions.bed',
       u'SRR594393.juctions.bed', u'SRR594402.juctions.bed',
       u'SRR594410.juctions.bed', u'SRR636961.juctions.bed', u'avg-WB',
       u'SCN_NT.juctions.bed', u'zt15_1.junctions.bed',
       u'zt15_2.junctions.bed', u'zt15_3.junctions.bed',
       u'zt3_1.junctions.bed', u'zt3_2.junctions.bed', u'zt3_3.junctions.bed',
       u'avg-SCN', u'log_fold-SCN_vs_WB', u'fold-SCN_vs_WB', u'rawp-SCN_vs_WB',
       u'adjp-SCN_vs

In [31]:
# define regular expressions for filters 
scn_filt ='SCN_NT|zt'
wb_filt ='SRR|GSM|C\d'

In [32]:
df_scn=df.filter(regex= scn_filt)
df_scn.shape

(20356, 7)

In [33]:
df_wb=df.filter(regex= wb_filt)
df_wb.shape

(20356, 10)

## Calculations 

In [34]:
SCNcount = df.filter(regex=scn_filt).count(axis=1)

In [35]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=scn_filt).mean(axis=1) - df.filter(regex=wb_filt).mean(axis=1)

In [36]:
df[prefix+'Enrich'].head()

Ensembl_gene
ENSMUSG00000002012    1.980025
ENSMUSG00000002010    0.092466
ENSMUSG00000002017    0.019566
ENSMUSG00000028184   -0.590370
ENSMUSG00000002015    0.824844
Name: RNA_Seq_Enrich, dtype: float64

In [37]:
# Pooled StDev
SCNcount = df.filter(regex=scn_filt).count(axis=1)
WBcount = df.filter(regex=wb_filt).count(axis=1)

StdevSCN = (SCNcount-1) * df.filter(regex=scn_filt).var(axis=1)
StdevWB = (WBcount-1) * df.filter(regex=wb_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevSCN+StdevWB)/(SCNcount+ WBcount-2))

In [38]:
#Cohen's d

df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [39]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Ensembl_gene
ENSMUSG00000002012    6.191708
ENSMUSG00000002010    0.520721
ENSMUSG00000002017    0.081596
ENSMUSG00000028184   -1.794777
ENSMUSG00000002015    4.328033
Name: RNA_Seq_Cohens_d, dtype: float64

In [40]:
#J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(SCNcount+WBcount-1)))                              


In [41]:
#Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [42]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Ensembl_gene
ENSMUSG00000002012    5.901472
ENSMUSG00000002010    0.496312
ENSMUSG00000002017    0.077771
ENSMUSG00000028184   -1.710647
ENSMUSG00000002015    4.125157
Name: RNA_Seq_Hedges_g, dtype: float64

In [43]:
#Var_d
SCNcount = df.filter(regex=scn_filt).count(axis=1)
WBcount = df.filter(regex=wb_filt).count(axis=1)

Ftop1 = SCNcount + WBcount
Ftop2 = SCNcount * WBcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(SCNcount + WBcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [53]:
# Check output
df[prefix+'Cohens_d'].head()

Ensembl_gene
ENSMUSG00000028753    14.169314
ENSMUSG00000029193    12.669593
ENSMUSG00000005447    12.157235
ENSMUSG00000020895    11.290300
ENSMUSG00000006930    11.204732
Name: RNA_Seq_Cohens_d, dtype: float64

In [45]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [46]:
#SEg

df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [47]:
df.sort_values(by=prefix+'Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Definition,Symbol,Transcript_cluster_ids,Constitutive_exons_used,Constitutive_IDs_used,Putative microRNA binding sites,Select Cellular Compartments,Select Protein Classes,Chromosome,Strand,...,largest fold,maximum sample read count,RNA_Seq_Enrich,RNA_Seq_poolStDev,RNA_Seq_Cohens_d,RNA_Seq_J,RNA_Seq_Hedges_g,RNA_Seq_Var_d,RNA_Seq_Var_g,RNA_Seq_SEg
Ensembl_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000028753,von Willebrand factor A domain containing 5B1 ...,Vwa5b1,,ENSMUSE00000284979|ENSMUSE00000882199|ENSMUSE0...,ENSMUSG00000028753:E12.2-E13.1|ENSMUSG00000028...,"mmu-miR-124(miRanda), mmu-miR-290-5p(miRanda),...",extracellular,protein_coding,chr4,-,...,2.553127,1176.0,2.553127,0.180187,14.169314,0.953125,13.505128,6.147842,5.584990,2.363258
ENSMUSG00000029193,cholecystokinin A receptor [Source:MGI Symbol;...,Cckar,,ENSMUSE00000186771|ENSMUSE00001063695|ENSMUSE0...,ENSMUSG00000029193:E1.2-E2.1_53706453|ENSMUSG0...,"mmu-miR-10a(miRanda), mmu-miR-10b(miRanda), mm...",transmembrane,GPCR(Gq)|protein_coding,chr5,-,...,1.513818,81.0,1.513818,0.119484,12.669593,0.953125,12.075706,4.963992,4.509525,2.123564
ENSMUSG00000005447,"platelet-activating factor acetylhydrolase, is...",Pafah1b3,,ENSMUSE00000199497|ENSMUSE00000834052|ENSMUSE0...,ENSMUSG00000005447:E3.7-E4.1|ENSMUSG0000000544...,"mmu-miR-202(RNAhybrid), mmu-miR-381(miRanda), ...",,protein_coding,chr7,-,...,2.293477,1224.0,2.293477,0.188651,12.157235,0.953125,11.587365,4.589868,4.169653,2.041973
ENSMUSG00000020895,transmembrane protein 107 [Source:MGI Symbol;A...,Tmem107,,ENSMUSE00000739777|ENSMUSE00000841222|ENSMUSE0...,ENSMUSG00000020895:E1.7-E4.4|ENSMUSG0000002089...,"mmu-miR-133a(miRanda), mmu-miR-133b(miRanda), ...",transmembrane,protein_coding,chr11,+,...,1.518120,459.0,1.518120,0.134462,11.290300,0.953125,10.761068,3.992001,3.626522,1.904343
ENSMUSG00000006930,huntingtin-associated protein 1 [Source:MGI Sy...,Hap1,,ENSMUSE00000112621|ENSMUSE00000112621|ENSMUSE0...,ENSMUSG00000006930:E12.4-E12.6_100348839|ENSMU...,"mmu-miR-103(miRanda), mmu-miR-107(miRanda), mm...",,protein_coding,chr11,-,...,2.769519,21620.0,2.769519,0.247174,11.204732,0.953125,10.679511,3.935387,3.575092,1.890791
ENSMUSG00000019027,"dynein, axonemal, heavy chain 1 [Source:MGI Sy...",Dnahc1,,ENSMUSE00000121739|ENSMUSE00000121742|ENSMUSE0...,ENSMUSG00000019027:E75.1-I75.1_31262014|ENSMUS...,"mmu-miR-143(miRanda), mmu-miR-146a(miRanda), m...",,protein_coding,chr14,-,...,2.151916,2544.0,2.151916,0.192770,11.163138,0.953125,10.639866,3.908023,3.550233,1.884206
ENSMUSG00000022842,endothelin converting enzyme 2 [Source:MGI Sym...,Ece2,,AK035991-1|BC145126-17|ENSMUSE00000837329|ENSM...,ENSMUSG00000022842:E6.1_20631140-E6.2_20631214...,"mmu-let-7b(RNAhybrid|miRanda), mmu-let-7c(RNAh...",transmembrane,protein_coding,chr16,+,...,1.828516,3137.0,1.828516,0.165306,11.061410,0.953125,10.542906,3.841527,3.489825,1.868107
ENSMUSG00000031374,zinc finger protein 92 [Source:MGI Symbol;Acc:...,Zfp92,,ENSMUSE00000208714|ENSMUSE00000208708|ENSMUSE0...,ENSMUSG00000031374:E7.1_73427717-E7.1_73428382...,"mmu-miR-103(miRanda), mmu-miR-106a(miRanda), m...",nucleus,transcription regulator|protein_coding,chrX,+,...,1.350022,217.0,1.350022,0.123397,10.940516,0.953125,10.427679,3.763295,3.418755,1.848988
ENSMUSG00000031340,"gamma-aminobutyric acid (GABA) A receptor, sub...",Gabre,,AF189263-2|ENSMUSE00001310494|ENSMUSE000013097...,ENSMUSG00000031340:E4.1-E4.3|ENSMUSG0000003134...,"mmu-miR-139-5p(miRanda), mmu-miR-140(miRanda),...",,protein_coding,chrX,-,...,2.737916,790.0,2.737916,0.254071,10.776190,0.953125,10.271056,3.658336,3.323405,1.823021
ENSMUSG00000023964,calcitonin receptor [Source:MGI Symbol;Acc:MGI...,Calcr,,ENSMUSE00000136684|ENSMUSE00000136687|ENSMUSE0...,ENSMUSG00000023964:E1.2-E3.1|ENSMUSG0000002396...,"mmu-let-7d(pictar), mmu-miR-100(miRanda), mmu-...",transmembrane,GPCR(Gs)|protein_coding,chr6,-,...,3.961542,954.0,3.961542,0.377217,10.502014,0.953125,10.009732,3.486748,3.167527,1.779755


In [48]:
df.columns

Index([u'Definition', u'Symbol', u'Transcript_cluster_ids',
       u'Constitutive_exons_used', u'Constitutive_IDs_used',
       u'Putative microRNA binding sites', u'Select Cellular Compartments',
       u'Select Protein Classes', u'Chromosome', u'Strand',
       u'Genomic Gene Corrdinates', u'GO-Biological Process',
       u'GO-Molecular Function', u'GO-Cellular Component', u'WikiPathways',
       u'C3HHeJ.juctions.bed', u'C57BL6NJ.junctions.bed',
       u'GSM752615.juctions.bed', u'SRR306757.juctions.bed',
       u'SRR579545.juctions.bed', u'SRR579546.juctions.bed',
       u'SRR594393.juctions.bed', u'SRR594402.juctions.bed',
       u'SRR594410.juctions.bed', u'SRR636961.juctions.bed', u'avg-WB',
       u'SCN_NT.juctions.bed', u'zt15_1.junctions.bed',
       u'zt15_2.junctions.bed', u'zt15_3.junctions.bed',
       u'zt3_1.junctions.bed', u'zt3_2.junctions.bed', u'zt3_3.junctions.bed',
       u'avg-SCN', u'log_fold-SCN_vs_WB', u'fold-SCN_vs_WB', u'rawp-SCN_vs_WB',
       u'adjp-SCN_vs

### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average SCN and WB expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [49]:
# df3 = df.loc[:,[u'Symbol',u'avg-WB', u'avg-SCN', u'log_fold-SCN_vs_WB']]
# df3.columns =[u'Symbol',prefix+'avg-WB', prefix+'avg-SCN', prefix+'log_fold-SCN_vs_WB']
# df3.to_csv('input_files/RNA_Seq_Expression.csv')

In [50]:
df4 = df.loc[:,[u'Symbol',u'RNA_Seq_Enrich',u'RNA_Seq_Hedges_g', u'RNA_Seq_Var_g',u'RNA_Seq_SEg']]
df4.to_csv('input_files/RNA_Seq_forIndexHedges.csv')

In [51]:
df.shape

(20356, 50)

In [52]:
df4 #final check on ouput (average on MGI symbol after indexing)

Unnamed: 0_level_0,Symbol,RNA_Seq_Enrich,RNA_Seq_Hedges_g,RNA_Seq_Var_g,RNA_Seq_SEg
Ensembl_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSMUSG00000028753,Vwa5b1,2.553127,13.505128,5.584990,2.363258
ENSMUSG00000029193,Cckar,1.513818,12.075706,4.509525,2.123564
ENSMUSG00000005447,Pafah1b3,2.293477,11.587365,4.169653,2.041973
ENSMUSG00000020895,Tmem107,1.518120,10.761068,3.626522,1.904343
ENSMUSG00000006930,Hap1,2.769519,10.679511,3.575092,1.890791
ENSMUSG00000019027,Dnahc1,2.151916,10.639866,3.550233,1.884206
ENSMUSG00000022842,Ece2,1.828516,10.542906,3.489825,1.868107
ENSMUSG00000031374,Zfp92,1.350022,10.427679,3.418755,1.848988
ENSMUSG00000031340,Gabre,2.737916,10.271056,3.323405,1.823021
ENSMUSG00000023964,Calcr,3.961542,10.009732,3.167527,1.779755
