# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value

$$  {Enrichment} = \bar{X_1}-\bar{X_2}$$

where group1 is SCN Expression values and group2 are Whole Brain Expression values 
(SCN mean - WB mean) **(Logged values, so minus gives ratio)**

$$  {Pooled\ Standard\  Deviantion} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [7]:
import pandas as pd # Dataframes and good-glue
import numpy as np # numerical calculations


In [8]:
prefix = '430V2_'   # define a prefix to add to column names (making indexing easier later)

In [9]:
df=pd.read_table('../AltAnalyze_output/DATASET-430v2.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.head()


Unnamed: 0_level_0,Symbol,Definition,Ensembl_id,Entrez_id,Unigene_id,GO-Process,GO-Function,GO-Component,Pathway_info,Putative microRNA binding sites,...,GSM414572.CEL,GSM707557.CEL,avg-SCN,log_fold-SCN_vs_WB,fold-SCN_vs_WB,rawp-SCN_vs_WB,adjp-SCN_vs_WB,ANOVA-rawp,ANOVA-adjp,largest fold
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1427138_at,Ccdc88c,coiled-coil domain containing 88C,ENSMUSG00000021182,68339,,protein destabilization // protein homooligome...,protein binding // PDZ domain binding // prote...,insoluble fraction,,"mmu-let-7c(RNAhybrid|miRanda), mmu-let-7d(RNAh...",...,6.6755,6.92059,6.787867,-0.198398,-1.147423,0.047875,0.165634,0.030113,0.124554,0.198398
1448093_s_at,C77405,expressed sequence C77405,,98108,,,,,,,...,3.86671,3.79172,3.84788,-0.118651,-1.085719,0.229335,0.439689,0.185091,0.387711,0.118651
1425600_a_at,Plcb1,"phospholipase C, beta 1",ENSMUSG00000051177,18795,,signal transduction // lipid catabolic process...,calcium ion binding // hydrolase activity // p...,cytoplasm,Alzheimers Disease:WP2075(WikiPathways) // Che...,"mmu-miR-103(miRanda), mmu-miR-106ab(TargetScan...",...,7.3107,6.46585,7.021803,-0.064888,-1.046004,0.916641,0.957518,0.923744,0.961256,0.064888
1457168_at,AI225934,expressed sequence AI225934,,98884,,,,,,,...,5.6138,5.44322,5.554147,0.029068,1.020353,0.756483,0.867974,0.725764,0.847279,0.029068
1450135_at,Fzd3,frizzled homolog 3 (Drosophila),ENSMUSG00000007989,14365,,neural tube closure // multicellular organisma...,non-G-protein coupled 7TM receptor activity //...,apical part of cell // integral to membrane //...,ESC Pluripotency Pathways:WP339(WikiPathways) ...,"mmu-let-7(TargetScan), mmu-miR-105(TargetScan)...",...,5.71821,5.29435,5.568443,0.36405,1.287034,0.008122,0.049437,0.008269,0.05244,0.36405


In [10]:
# remove cross-hybridising probes
df =df[~df.index.str.contains('_x_|_s_')]    #   important reverse selector ~ 
df.shape

(40569, 37)

## Look at column names and then setup filters for grouping columns into SCN and WB groups

In [11]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM252077.CEL', 'GSM252078.CEL',
       'GSM252079.CEL', 'GSM511616.CEL', 'GSM511617.CEL', 'GSM511618.CEL',
       'GSM511619.CEL', 'GSM511620.CEL', 'GSM511621.CEL', 'GSM511622.CEL',
       'GSM511623.CEL', 'GSM511624.CEL', 'GSM511625.CEL', 'avg-WB',
       'GSM414571.CEL', 'GSM414572.CEL', 'GSM707557.CEL', 'avg-SCN',
       'log_fold-SCN_vs_WB', 'fold-SCN_vs_WB', 'rawp-SCN_vs_WB',
       'adjp-SCN_vs_WB', 'ANOVA-rawp', 'ANOVA-adjp', 'largest fold'],
      dtype='object')

In [12]:
# define regular expressions for filters 
scn_filt ='414|707'
wb_filt ='252|511'

In [13]:
df_scn=df.filter(regex= scn_filt)
df_scn.shape

(40569, 3)

In [14]:
df_wb=df.filter(regex= wb_filt)
df_wb.shape

(40569, 13)

## Calculations 

In [15]:
SCNcount = df.filter(regex=scn_filt).count(axis=1)

In [16]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=scn_filt).mean(axis=1) - df.filter(regex=wb_filt).mean(axis=1)

In [17]:
df[prefix+'Enrich'].head()

Probesets
1427138_at     -0.198398
1425600_a_at   -0.064888
1457168_at      0.029068
1450135_at      0.364050
1424014_at     -0.200386
Name: 430V2_Enrich, dtype: float64

In [18]:
# Pooled StDev
SCNcount = df.filter(regex=scn_filt).count(axis=1)
WBcount = df.filter(regex=wb_filt).count(axis=1)

StdevSCN = (SCNcount-1) * df.filter(regex=scn_filt).var(axis=1)
StdevWB = (WBcount-1) * df.filter(regex=wb_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevSCN+StdevWB)/(SCNcount+ WBcount-2))

In [19]:
#Cohen's d

df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [20]:
df[prefix+'Cohens_d'] .head()

Probesets
1427138_at     -1.545517
1425600_a_at   -0.062423
1457168_at      0.229233
1450135_at      1.968094
1424014_at     -1.021103
Name: 430V2_Cohens_d, dtype: float64

In [21]:
#J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(SCNcount+WBcount-1)))                              


In [22]:
#Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [23]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Probesets
1427138_at     -1.468241
1425600_a_at   -0.059302
1457168_at      0.217771
1450135_at      1.869689
1424014_at     -0.970048
Name: 430V2_Hedges_g, dtype: float64

In [24]:
#Var_d
SCNcount = df.filter(regex=scn_filt).count(axis=1)
WBcount = df.filter(regex=wb_filt).count(axis=1)

Ftop1 = SCNcount + WBcount
Ftop2 = SCNcount * WBcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(SCNcount + WBcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [25]:
#check
df[prefix+'Var_d'].head()

Probesets
1427138_at      0.484901
1425600_a_at    0.410378
1457168_at      0.411899
1450135_at      0.531300
1424014_at      0.442839
Name: 430V2_Var_d, dtype: float64

In [26]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [27]:
#SEg

df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [28]:
df.sort_values(by=prefix+'Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Symbol,Definition,Ensembl_id,Entrez_id,Unigene_id,GO-Process,GO-Function,GO-Component,Pathway_info,Putative microRNA binding sites,...,ANOVA-adjp,largest fold,430V2_Enrich,430V2_poolStDev,430V2_Cohens_d,430V2_J,430V2_Hedges_g,430V2_Var_d,430V2_Var_g,430V2_SEg
Probesets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1418603_at,Avpr1a,arginine vasopressin receptor 1A,ENSMUSG00000020123,54140,,G-protein coupled receptor protein signaling p...,peptide hormone binding // vasopressin recepto...,plasma membrane // integral to membrane // mem...,"GPCRs, Class A Rhodopsin-like:WP189(WikiPathwa...","mmu-let-7d(miRanda), mmu-miR-10a(miRanda), mmu...",...,8.849224e-10,2.410178,2.410178,0.131454,18.334705,0.95,17.417969,10.915300,9.851058,3.138640
1457265_at,Sfrs17b,"splicing factor, arginine/serine-rich 17b",ENSMUSG00000059708,338351,,RNA splicing // mRNA processing,nucleic acid binding // RNA binding,spliceosome // nucleus // nuclear speck,mRNA processing:WP310(WikiPathways),"mmu-miR-1192(miRanda), mmu-miR-124(miRanda), m...",...,1.472018e-09,1.660074,1.660074,0.097228,17.074045,0.95,16.220343,9.520351,8.592116,2.931231
1457396_at,LOC100045013,similar to ventral anterior homeobox-containin...,,100045013,,,,,,,...,2.525826e-09,2.974166,2.974166,0.185510,16.032354,0.95,15.230736,8.442643,7.619485,2.760341
1460006_at,Zfhx3,zinc finger homeobox 3 [Source:MGI Symbol;Acc:...,ENSMUSG00000038872,11906,,transcription // negative regulation of transc...,nucleic acid binding // transcription factor a...,mitochondrion // transcription factor complex ...,,"mmu-miR-101(TargetScan), mmu-miR-101ab(TargetS...",...,2.895053e-09,3.627305,3.627305,0.230275,15.752039,0.95,14.964437,8.164217,7.368206,2.714444
1442809_at,Scn9a,"sodium channel, voltage-gated, type IX, alpha ...",ENSMUSG00000075316,20274,,sodium ion transport // inflammatory response ...,voltage-gated sodium channel activity // sodiu...,voltage-gated sodium channel complex // integr...,,"mmu-miR-101a(miRanda), mmu-miR-101b(miRanda), ...",...,6.753430e-09,2.650461,2.650461,0.182478,14.524863,0.95,13.798620,7.003120,6.320316,2.514024
1419408_at,Six6,sine oculis-related homeobox 6 homolog (Drosop...,ENSMUSG00000021099,20476,,multicellular organismal development // regula...,transcription factor activity // DNA binding /...,nucleus,,"mmu-miR-103(miRanda), mmu-miR-107(miRanda), mm...",...,7.033843e-09,3.926071,3.926071,0.274588,14.298066,0.95,13.583163,6.798841,6.135954,2.477086
1454731_at,Myo10,myosin X [Source:MGI Symbol;Acc:MGI:107716],ENSMUSG00000022272,17909,,signal transduction,motor activity // nucleotide binding // bindin...,cytoskeleton // myosin complex,,"mmu-let-7g(RNAhybrid|miRanda), mmu-miR-101a(RN...",...,4.070012e-08,1.318415,1.318415,0.108384,12.164345,0.95,11.556128,5.034360,4.543510,2.131551
1423726_at,Vat1,vesicle amine transport protein 1 homolog (T c...,ENSMUSG00000034993,26949,,metabolic process // oxidation reduction,catalytic activity // binding // oxidoreductas...,,,"mmu-let-7a(miRanda), mmu-let-7f(miRanda), mmu-...",...,4.483654e-08,3.317615,3.317615,0.277102,11.972554,0.95,11.373927,4.889696,4.412950,2.100702
1434073_at,Gprasp2,G protein-coupled receptor associated sorting ...,ENSMUSG00000072966,245607,,,protein binding // G-protein-coupled receptor ...,cytoplasm // nucleus,,"mmu-miR-1271(TargetScan), mmu-miR-138(miRanda)...",...,6.232190e-08,1.675347,1.675347,0.144421,11.600451,0.95,11.020429,4.615584,4.165564,2.040971
1438729_at,Sox1,SRY-box containing gene 1 [Source:MGI Symbol;A...,ENSMUSG00000096014,20664,,forebrain neuron development // lens morphogen...,DNA binding // sequence-specific DNA binding,nucleus,Wnt Signaling Pathway NetPath:WP539(WikiPathways),"mmu-miR-139(pictar), mmu-miR-139-5p(TargetScan...",...,6.700320e-08,2.585644,2.585644,0.225324,11.475223,0.95,10.901462,4.525280,4.084065,2.020907


In [29]:
df.columns

Index(['Symbol', 'Definition', 'Ensembl_id', 'Entrez_id', 'Unigene_id',
       'GO-Process', 'GO-Function', 'GO-Component', 'Pathway_info',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'GSM252077.CEL', 'GSM252078.CEL',
       'GSM252079.CEL', 'GSM511616.CEL', 'GSM511617.CEL', 'GSM511618.CEL',
       'GSM511619.CEL', 'GSM511620.CEL', 'GSM511621.CEL', 'GSM511622.CEL',
       'GSM511623.CEL', 'GSM511624.CEL', 'GSM511625.CEL', 'avg-WB',
       'GSM414571.CEL', 'GSM414572.CEL', 'GSM707557.CEL', 'avg-SCN',
       'log_fold-SCN_vs_WB', 'fold-SCN_vs_WB', 'rawp-SCN_vs_WB',
       'adjp-SCN_vs_WB', 'ANOVA-rawp', 'ANOVA-adjp', 'largest fold',
       '430V2_Enrich', '430V2_poolStDev', '430V2_Cohens_d', '430V2_J',
       '430V2_Hedges_g', '430V2_Var_d', '430V2_Var_g', '430V2_SEg'],
      dtype='object')

In [30]:
dfX=pd.read_table('../BioMart_Ensmbl_index/mart_export72_430v2430Av2.txt',index_col=[2])

dfX.pop('Affy mouse430a 2 probeset') # remove 430Av2 IDs
dfX.head() 

Unnamed: 0_level_0,Ensembl Gene ID,Description,MGI symbol
Affy mouse430 2 probeset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1417126_a_at,ENSMUSG00000039221,ribosomal protein L22 like 1 [Source:MGI Symbo...,Rpl22l1
,ENSMUSG00000095611,predicted gene 10597 [Source:MGI Symbol;Acc:MG...,Gm10597
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
1417730_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1
1458054_at,ENSMUSG00000061731,exostoses (multiple) 1 [Source:MGI Symbol;Acc:...,Ext1


In [31]:
df_Join = df.join(dfX, how='left', sort=True)
df_FINAL1 = df_Join.groupby('MGI symbol').mean()
df_FINAL1[df_FINAL1.index.duplicated()==True]  #Check that no duplicate entries for symbol exist

Unnamed: 0_level_0,GSM252077.CEL,GSM252078.CEL,GSM252079.CEL,GSM511616.CEL,GSM511617.CEL,GSM511618.CEL,GSM511619.CEL,GSM511620.CEL,GSM511621.CEL,GSM511622.CEL,...,ANOVA-adjp,largest fold,430V2_Enrich,430V2_poolStDev,430V2_Cohens_d,430V2_J,430V2_Hedges_g,430V2_Var_d,430V2_Var_g,430V2_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [32]:
# df_FINAL2 = df_Join.groupby('Ensembl Gene ID').mean()   # If you wanted indexing by ENSMBL GID
# df_FINAL2[df_FINAL2.index.duplicated()==True]

### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average SCN and WB expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [33]:
# df3 = df_FINAL1.loc[:,[u'avg-WB', u'avg-SCN', u'log_fold-SCN_vs_WB']]
# df3.columns =[prefix+'avg-WB', prefix+'avg-SCN', prefix+'log_fold-SCN_vs_WB']
# df3.to_csv('input_files/430V2_SymbolExpression_forIndex.csv')

In [34]:
df4 = df_FINAL1.loc[:,[u'430V2_Enrich',u'430V2_Hedges_g', u'430V2_Var_g', u'430V2_SEg']]
df4.to_csv('input_files/430V2_SymbolforIndexHedges.csv')

In [35]:
df4

Unnamed: 0_level_0,430V2_Enrich,430V2_Hedges_g,430V2_Var_g,430V2_SEg
MGI symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610005C13Rik,-0.065548,-0.308451,0.373230,0.610925
0610008F07Rik,-0.029420,-0.177580,0.371242,0.609296
0610009B14Rik,-0.326035,-1.337117,0.426128,0.652785
0610009B22Rik,0.392765,1.473526,0.438109,0.661898
0610009D07Rik,0.052437,0.135410,0.370857,0.608980
0610009L18Rik,-0.192890,-0.611839,0.381955,0.618025
0610009O20Rik,-0.573090,-1.797072,0.471177,0.686424
0610010F05Rik,0.094879,0.093928,0.373384,0.611052
0610010K14Rik,0.180634,0.287394,0.372838,0.610604
0610011F06Rik,0.664020,2.707147,0.602898,0.775627
