# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value.

$$  {Enrichment} = \bar{X_1}-\bar{X_2}$$

where group1 is SCN Expression values and group2 are Whole Brain Expression values 
(SCN mean - WB mean) **(Logged values, so minus gives ratio)**

$$  {Pooled\ Standard\  Deviantion} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [1]:
import pandas as pd # Dataframes and file IO
import numpy as np # numerical calculations

In [2]:
prefix = 'MoGene_'   # define a prefix to add to column names (making indexing easier later)

In [3]:
df=pd.read_table('../AltAnalyze_output/DATASET-MoGene.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.tail()

Unnamed: 0_level_0,Definition,Symbol,Transcript_cluster_ids,Constitutive_exons_used,Constitutive_IDs_used,Putative microRNA binding sites,Select Cellular Compartments,Select Protein Classes,Chromosome,Strand,...,WT_ZT63.CEL,WT_ZT64.CEL,avg-SCN,log_fold-SCN_vs_WB,fold-SCN_vs_WB,rawp-SCN_vs_WB,adjp-SCN_vs_WB,ANOVA-rawp,ANOVA-adjp,largest fold
Ensembl_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000042677,zinc finger CCCH type containing 12A [Source:M...,Zc3h12a,10516266,ENSMUSE00000524960|ENSMUSE00000736465,10516271,"mmu-miR-122(miRanda), mmu-miR-125a-5p(miRanda)...",nucleus,protein_coding,chr4,-,...,6.50487,6.98522,6.94932,Insufficient Expression,Insufficient Expression,0.07561425,0.021478,0.091104,0.124418,0.252459
ENSMUSG00000036957,leucine rich repeat and fibronectin type III d...,Lrfn3,10561911,BC066999-1|ENSMUSE00000442373|BC066999-2|ENSMU...,10561917|10561915|10561914|10561913,,transmembrane,protein_coding,chr7,-,...,7.68184,7.594073,7.671976,Insufficient Expression,Insufficient Expression,0.1973856,0.05019,0.189326,0.237432,0.114218
ENSMUSG00000036959,BCL6 co-repressor-like 1 [Source:MGI Symbol;Ac...,Bcorl1,10599514,ENSMUSE00000701318|ENSMUSE00000718262|ENSMUSE0...,10599515|10599519|10599520|10599522,"mmu-let-7b(RNAhybrid|miRanda), mmu-let-7c(RNAh...",nucleus,transcription regulator|protein_coding,chrX,+,...,7.789207,7.653052,7.980466,Insufficient Expression,Insufficient Expression,0.03341995,0.010388,0.035782,0.054409,0.22059
ENSMUSG00000036958,cystatin 11 [Source:MGI Symbol;Acc:MGI:1925490],Cst11,10488404,ENSMUSE00000246608|ENSMUSE00000332591|ENSMUSE0...,10488408|10488407|10488406|10488405,"mmu-miR-425(miRanda), mmu-miR-489(miRanda)",extracellular,protein_coding,chr2,-,...,6.155,6.298147,6.098615,Insufficient Expression,Insufficient Expression,5.387074e-05,3.6e-05,9e-05,0.000291,0.481079
ENSMUSG00000042678,myosin XV [Source:MGI Symbol;Acc:MGI:1261811],Myo15,10376615,ENSMUSE00000244478|ENSMUSE00000244487|ENSMUSE0...,10376619|10376620|10376622|10376631|10376634|1...,"mmu-let-7a(miRanda), mmu-let-7b(miRanda), mmu-...",,protein_coding,chr11,+,...,8.522415,8.386373,8.241636,Insufficient Expression,Insufficient Expression,9.67243e-07,1e-06,1e-06,8e-06,0.530067


## Look at column names and then setup filters for grouping columns into SCN and WB groups

In [4]:
df.columns

Index([u'Definition', u'Symbol', u'Transcript_cluster_ids',
       u'Constitutive_exons_used', u'Constitutive_IDs_used',
       u'Putative microRNA binding sites', u'Select Cellular Compartments',
       u'Select Protein Classes', u'Chromosome', u'Strand',
       u'Genomic Gene Corrdinates', u'GO-Biological Process',
       u'GO-Molecular Function', u'GO-Cellular Component', u'WikiPathways',
       u'GSM613009.CEL', u'GSM613010.CEL', u'GSM613011.CEL', u'GSM847026.CEL',
       u'GSM847027.CEL', u'GSM847029.CEL', u'GSM847031.CEL', u'GSM847033.CEL',
       u'GSM849761.CEL', u'GSM849762.CEL', u'GSM933084_E0460.CEL',
       u'GSM933085_E0461.CEL', u'avg-WB', u'WT_ZT111.CEL', u'WT_ZT112.CEL',
       u'WT_ZT113.CEL', u'WT_ZT114.CEL', u'WT_ZT131.CEL', u'WT_ZT132.CEL',
       u'WT_ZT133.CEL', u'WT_ZT134.CEL', u'WT_ZT61.CEL', u'WT_ZT62.CEL',
       u'WT_ZT63.CEL', u'WT_ZT64.CEL', u'avg-SCN', u'log_fold-SCN_vs_WB',
       u'fold-SCN_vs_WB', u'rawp-SCN_vs_WB', u'adjp-SCN_vs_WB', u'ANOVA-rawp',
   

In [5]:
# define regular expressions for filters and check the filters give the correct groups
scn_filt ='ZT'
wb_filt ='Brain|GSM'

In [6]:
df_scn=df.filter(regex= scn_filt)
df_scn.shape

(24451, 12)

In [7]:
df_wb=df.filter(regex= wb_filt)
df_wb.shape

(24451, 12)

## Calculations 

In [8]:
SCNcount = df.filter(regex=scn_filt).count(axis=1)

In [9]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=scn_filt).mean(axis=1) - df.filter(regex=wb_filt).mean(axis=1)

In [10]:
df[prefix+'Enrich'].head()

Ensembl_gene
ENSMUSG00000028180    0.872461
ENSMUSG00000028182   -0.141201
ENSMUSG00000028185   -0.455638
ENSMUSG00000028184   -0.551870
ENSMUSG00000028187   -0.254581
Name: MoGene_Enrich, dtype: float64

In [11]:
# Pooled StDev
SCNcount = df.filter(regex=scn_filt).count(axis=1)
WBcount = df.filter(regex=wb_filt).count(axis=1)

StdevSCN = (SCNcount-1) * df.filter(regex=scn_filt).var(axis=1)
StdevWB = (WBcount-1) * df.filter(regex=wb_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevSCN+StdevWB)/(SCNcount+ WBcount-2))

In [12]:
#Cohen's d

df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [13]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Ensembl_gene
ENSMUSG00000028180    1.876754
ENSMUSG00000028182   -0.512476
ENSMUSG00000028185   -1.736353
ENSMUSG00000028184   -2.923199
ENSMUSG00000028187   -0.735477
Name: MoGene_Cohens_d, dtype: float64

In [14]:
#J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(SCNcount+WBcount-1)))                              


In [15]:
#Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [16]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Ensembl_gene
ENSMUSG00000028180    1.815556
ENSMUSG00000028182   -0.495764
ENSMUSG00000028185   -1.679733
ENSMUSG00000028184   -2.827878
ENSMUSG00000028187   -0.711494
Name: MoGene_Hedges_g, dtype: float64

In [17]:
#Var_d
SCNcount = df.filter(regex=scn_filt).count(axis=1)
WBcount = df.filter(regex=wb_filt).count(axis=1)

Ftop1 = SCNcount + WBcount
Ftop2 = SCNcount * WBcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(SCNcount + WBcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [18]:
# check output
df[prefix+'Var_d'].head()

Ensembl_gene
ENSMUSG00000028180    0.240046
ENSMUSG00000028182    0.172138
ENSMUSG00000028185    0.229478
ENSMUSG00000028184    0.344689
ENSMUSG00000028187    0.177936
Name: MoGene_Var_d, dtype: float64

In [19]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [20]:
#SEg

df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [21]:
df.sort_values(by='MoGene_Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Definition,Symbol,Transcript_cluster_ids,Constitutive_exons_used,Constitutive_IDs_used,Putative microRNA binding sites,Select Cellular Compartments,Select Protein Classes,Chromosome,Strand,...,ANOVA-adjp,largest fold,MoGene_Enrich,MoGene_poolStDev,MoGene_Cohens_d,MoGene_J,MoGene_Hedges_g,MoGene_Var_d,MoGene_Var_g,MoGene_SEg
Ensembl_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000026787,glutamic acid decarboxylase 2 [Source:MGI Symb...,Gad2,10469672,ENSMUSE00000162776|ENSMUSE00000828009|ENSMUSE0...,10469676|10469677|10469678|10469679|10469680,"mmu-let-7b(RNAhybrid|miRanda), mmu-miR-101a(RN...",,protein_coding,chr2,+,...,1.346322e-18,1.736559,1.736559,0.098425,17.643421,0.967391,17.068092,6.651882,6.225136,2.495022
ENSMUSG00000006930,huntingtin-associated protein 1 [Source:MGI Sy...,Hap1,10391084,AK138436-11|ENSMUSE00000661332|ENSMUSE00000731...,10391098|10391097|10391096|10391095|10391094|1...,"mmu-miR-103(miRanda), mmu-miR-107(miRanda), mm...",,protein_coding,chr11,-,...,6.471473e-17,2.555813,2.555813,0.185197,13.800478,0.967391,13.350462,4.134441,3.869200,1.967028
ENSMUSG00000004317,chloride channel 5 [Source:MGI Symbol;Acc:MGI:...,Clcn5,10603289,ENSMUSE00000206809|ENSMUSE00000788912|ENSMUSE0...,10603300|10603299|10603298|10603297|10603296|1...,"mmu-let-7(TargetScan), mmu-miR-103a(TargetScan...",transmembrane,protein_coding,chrX,-,...,1.196476e-16,2.343954,2.343954,0.176797,13.257848,0.967391,12.825527,3.828553,3.582936,1.892864
ENSMUSG00000066357,WD repeat domain 6 [Source:MGI Symbol;Acc:MGI:...,Wdr6,10596931,ENSMUSE00000221376|ENSMUSE00000240629|ENSMUSE0...,10596939|10596938|10596937|10596936|10596935|1...,"mmu-miR-1(miRanda|pictar), mmu-miR-133a(miRand...",,protein_coding,chr9,-,...,1.423527e-16,1.900593,1.900593,0.145224,13.087346,0.967391,12.660585,3.734971,3.495358,1.869588
ENSMUSG00000021340,glycosylphosphatidylinositol specific phosphol...,Gpld1,10404218,AF050666-6|ENSMUSE00001207748|ENSMUSE000012824...,10404226|10404227|10404229|10404230|10404231|1...,"mmu-miR-145(miRanda), mmu-miR-15a(miRanda), mm...",extracellular,protein_coding,chr13,+,...,2.302986e-16,1.432149,1.432149,0.112397,12.741891,0.967391,12.326395,3.549079,3.321391,1.822468
ENSMUSG00000036699,"zinc finger, CCHC domain containing 12 [Source...",Zcchc12,10599187,ENSMUSE00000316943|ENSMUSE00000702209|ENSMUSE0...,10599191,"mmu-miR-103(miRanda), mmu-miR-107(miRanda), mm...",,transcription regulator|protein_coding,chrX,+,...,3.842080e-16,2.894750,2.894750,0.234465,12.346208,0.967391,11.943615,3.342268,3.127848,1.768572
ENSMUSG00000060177,kallikrein 1-related peptidase b22 [Source:MGI...,Klk1b22,10552594,ENSMUSE00000508132|ENSMUSE00000813835|ENSMUSE0...,10552595|10552596|10552597|10552598,,,protein_coding,chr7,+,...,5.860782e-16,3.310158,3.310158,0.275258,12.025660,0.967391,11.633519,3.179510,2.975532,1.724973
ENSMUSG00000031255,synaptotagmin-like 4 [Source:MGI Symbol;Acc:MG...,Sytl4,10606619,ENSMUSE00000207625|ENSMUSE00000207636|ENSMUSE0...,10606637|10606636|10606634|10606633|10606631|1...,"mmu-let-7a(miRanda), mmu-let-7c(miRanda), mmu-...",nucleus,protein_coding,chrX,-,...,6.002993e-16,2.743443,2.743443,0.229075,11.976199,0.967391,11.585671,3.154778,2.952386,1.718251
ENSMUSG00000057895,zinc finger protein 105 [Source:MGI Symbol;Acc...,Zfp105,10590489,ENSMUSE00000220628|ENSMUSE00000352918,10590491|10590492,"mmu-miR-135a(miRanda), mmu-miR-135b(miRanda), ...",,protein_coding,chr9,+,...,6.653046e-16,1.498041,1.498041,0.126696,11.823947,0.967391,11.438384,3.079286,2.881737,1.697568
ENSMUSG00000019986,Abelson helper integration site 1 [Source:MGI ...,Ahi1,10362005,AK005991-4|ENSMUSE00000615721|ENSMUSE000003797...,10362007|10362008|10362009|10362010|10362011|1...,"mmu-miR-1192(miRanda), mmu-miR-122(miRanda), m...",,protein_coding,chr10,+,...,7.334229e-16,2.206488,2.206488,0.187910,11.742252,0.967391,11.359352,3.039177,2.844201,1.686476


In [22]:
df.columns

Index([u'Definition', u'Symbol', u'Transcript_cluster_ids',
       u'Constitutive_exons_used', u'Constitutive_IDs_used',
       u'Putative microRNA binding sites', u'Select Cellular Compartments',
       u'Select Protein Classes', u'Chromosome', u'Strand',
       u'Genomic Gene Corrdinates', u'GO-Biological Process',
       u'GO-Molecular Function', u'GO-Cellular Component', u'WikiPathways',
       u'GSM613009.CEL', u'GSM613010.CEL', u'GSM613011.CEL', u'GSM847026.CEL',
       u'GSM847027.CEL', u'GSM847029.CEL', u'GSM847031.CEL', u'GSM847033.CEL',
       u'GSM849761.CEL', u'GSM849762.CEL', u'GSM933084_E0460.CEL',
       u'GSM933085_E0461.CEL', u'avg-WB', u'WT_ZT111.CEL', u'WT_ZT112.CEL',
       u'WT_ZT113.CEL', u'WT_ZT114.CEL', u'WT_ZT131.CEL', u'WT_ZT132.CEL',
       u'WT_ZT133.CEL', u'WT_ZT134.CEL', u'WT_ZT61.CEL', u'WT_ZT62.CEL',
       u'WT_ZT63.CEL', u'WT_ZT64.CEL', u'avg-SCN', u'log_fold-SCN_vs_WB',
       u'fold-SCN_vs_WB', u'rawp-SCN_vs_WB', u'adjp-SCN_vs_WB', u'ANOVA-rawp',
   

### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average SCN and WB expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [23]:
# df3 = df.loc[:,[u'Symbol',u'avg-WB', u'avg-SCN', u'log_fold-SCN_vs_WB']]
# df3.columns =[u'Symbol',prefix+'avg-WB', prefix+'avg-SCN', prefix+'log_fold-SCN_vs_WB']
# df3.to_csv('input_files/MoGene_Expression.csv')

In [24]:
df4 = df.loc[:,[u'Symbol',u'MoGene_Enrich',u'MoGene_Hedges_g', u'MoGene_Var_g',u'MoGene_SEg']]
df4.to_csv('input_files/MoGene_forIndexHedges.csv')

In [25]:
df4 #.shape

Unnamed: 0_level_0,Symbol,MoGene_Enrich,MoGene_Hedges_g,MoGene_Var_g,MoGene_SEg
Ensembl_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSMUSG00000026787,Gad2,1.736559,17.068092,6.225136,2.495022
ENSMUSG00000006930,Hap1,2.555813,13.350462,3.869200,1.967028
ENSMUSG00000004317,Clcn5,2.343954,12.825527,3.582936,1.892864
ENSMUSG00000066357,Wdr6,1.900593,12.660585,3.495358,1.869588
ENSMUSG00000021340,Gpld1,1.432149,12.326395,3.321391,1.822468
ENSMUSG00000036699,Zcchc12,2.894750,11.943615,3.127848,1.768572
ENSMUSG00000060177,Klk1b22,3.310158,11.633519,2.975532,1.724973
ENSMUSG00000031255,Sytl4,2.743443,11.585671,2.952386,1.718251
ENSMUSG00000057895,Zfp105,1.498041,11.438384,2.881737,1.697568
ENSMUSG00000019986,Ahi1,2.206488,11.359352,2.844201,1.686476
