# Calculations of the Effect Size (ES) for each microarray study 

###  Using Hedges' g value, an adjusted Cohen's d  value.

$$  {Enrichment} = \bar{X_1}-\bar{X_2}$$

where group1 is SCN Expression values and group2 are Whole Brain Expression values 
(SCN mean - WB mean) **(Logged values, so minus gives ratio)**

$$  {Pooled\ Standard\  Deviantion} = \sqrt\frac{(n_1-1)S_1^2 +(n_2-1)S_2^2}{(n_2 +n_2) -2}  $$  

$$  {Cohen's\ d\ value} = \frac{Enrichment}{Pooled\ Standard\ Deviation} $$

$$  {Correction\ Factor (J\ Factor)} = 1- \frac{3}{4df-1} $$

$$  {Hedges'\ g\ value} = Cohen's\ d\ \text{x}\ J\ $$

$$  {Variance\ in\ d (V_d)} = \frac{n_1- +n_2}{n_1 n_2} + \frac{d^2}{2(n_1 +n_2)}  $$

$$  {Variance\ in\ g (V_g)} = J^2\  \text{x}\ V_d  $$

$$  {Standard\ Error\ in\ g (SE_g)} = \sqrt{V_g}  $$

## Setup working environment and import data

In [4]:
import pandas as pd # Dataframes and good-glue
import numpy as np # numerical calculations


In [5]:
prefix = 'MoEx_'   # define a prefix to add to column names (making indexing easier later)

In [6]:
df=pd.read_table('../AltAnalyze_output/DATASET-MoEx.txt', delimiter='\t',  index_col=0) #,nrows=500)  
df.tail()

Unnamed: 0_level_0,Definition,Symbol,Transcript_cluster_ids,Constitutive_exons_used,Constitutive_IDs_used,Putative microRNA binding sites,Select Cellular Compartments,Select Protein Classes,Chromosome,Strand,...,Opn4 250608 sham120-1 SCN WT (75).CEL,Opn4 250608 sham120-2 SCN WT (76).CEL,avg-SCN,log_fold-SCN_vs_WB,fold-SCN_vs_WB,rawp-SCN_vs_WB,adjp-SCN_vs_WB,ANOVA-rawp,ANOVA-adjp,largest fold
Ensembl_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000036955,RIKEN cDNA 2510003E04 gene [Source:MGI Symbol;...,2510003E04Rik,6774403,AK158730-1|ENSMUSE00000457012|ENSMUSE000008492...,4417814|5161702|4912355|4604127|5147940,"mmu-let-7e(RNAhybrid|miRanda), mmu-let-7g(RNAh...",,protein_coding,chr10,-,...,8.950672,9.066384,9.082246,Insufficient Expression,Insufficient Expression,0.8679972,0.2433386,0.851619,0.8881596,0.01311
ENSMUSG00000036957,leucine rich repeat and fibronectin type III d...,Lrfn3,6966274,BC066999-1|ENSMUSE00000442373|BC066999-2|ENSMU...,5614448|4737557|4730684|4972050|4382311|437589...,,transmembrane,protein_coding,chr7,-,...,5.337939,5.203753,5.272452,Insufficient Expression,Insufficient Expression,0.9386859,0.2580327,0.93218,0.9501286,0.006308
ENSMUSG00000036959,BCL6 co-repressor-like 1 [Source:MGI Symbol;Ac...,Bcorl1,7011040,ENSMUSE00000701318|ENSMUSE00000718262|ENSMUSE0...,5151160|4418096|4950131|4625808,"mmu-let-7b(RNAhybrid|miRanda), mmu-let-7c(RNAh...",nucleus,transcription regulator|protein_coding,chrX,+,...,5.245743,4.759582,5.102172,Insufficient Expression,Insufficient Expression,3.777439e-05,4.450841e-05,6.635677e-06,3.819534e-05,0.35492
ENSMUSG00000036958,cystatin 11 [Source:MGI Symbol;Acc:MGI:1925490],Cst11,6891902,ENSMUSE00000246608|ENSMUSE00000332591|ENSMUSE0...,5566771|4728631|5168106|5376143|4348970,"mmu-miR-425(miRanda), mmu-miR-489(miRanda)",extracellular,protein_coding,chr2,-,...,2.064786,2.104134,2.102115,Insufficient Expression,Insufficient Expression,0.06190996,0.02761328,0.05581913,0.09372553,0.197918
ENSMUSG00000042678,myosin XV [Source:MGI Symbol;Acc:MGI:1261811],Myo15,6781442,ENSMUSE00000244378|ENSMUSE00000244487|ENSMUSE0...,5342919|5120038|4477176|5266444|4506593|531601...,"mmu-let-7a(miRanda), mmu-let-7b(miRanda), mmu-...",,protein_coding,chr11,+,...,4.346678,4.131162,4.337207,Insufficient Expression,Insufficient Expression,3.176423e-08,9.587477e-08,1.545248e-08,2.066276e-07,0.623558


## Look at column names and then setup filters for grouping columns into SCN and WB groups

In [7]:
df.columns

Index(['Definition', 'Symbol', 'Transcript_cluster_ids',
       'Constitutive_exons_used', 'Constitutive_IDs_used',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'Chromosome', 'Strand',
       'Genomic Gene Corrdinates', 'GO-Biological Process',
       'GO-Molecular Function', 'GO-Cellular Component', 'WikiPathways',
       'GSM674605.CEL', 'GSM674606.CEL', 'GSM674607.CEL', 'GSM674608.CEL',
       'GSM674609.CEL', 'GSM674610.CEL', 'GSM674611.CEL', 'GSM674612.CEL',
       'GSM674613.CEL', 'GSM674614.CEL', 'GSM674615.CEL', 'GSM674616.CEL',
       'GSM674617.CEL', 'GSM674618.CEL', 'GSM674619.CEL', 'GSM674620.CEL',
       'GSM674621.CEL', 'GSM674622.CEL', 'GSM674623.CEL', 'GSM674624.CEL',
       'GSM674625.CEL', 'GSM674626.CEL', 'avg-WB',
       'Opn4 010508 sham30-1 SCN WT (46).CEL',
       'Opn4 010508 sham30-2 SCN WT (47).CEL',
       'Opn4 010508 sham60-1 SCN WT (55).CEL',
       'Opn4 010508 sham60-2 SCN WT (56).CEL',
      

In [8]:
# define regular expressions for filters 
scn_filt =' SCN '
wb_filt ='brain|GSM'

In [9]:
df_scn=df.filter(regex= scn_filt)
df_scn.shape

(28268, 6)

In [10]:
df_wb=df.filter(regex= wb_filt)
df_wb.shape

(28268, 22)

## Calculations 

In [11]:
SCNcount = df.filter(regex=scn_filt).count(axis=1)

In [12]:
# Enrichment

df[prefix+'Enrich'] = df.filter(regex=scn_filt).mean(axis=1) - df.filter(regex=wb_filt).mean(axis=1)

In [13]:
df[prefix+'Enrich'].head()

Ensembl_gene
ENSMUSG00000028180   -0.253207
ENSMUSG00000028182    0.018364
ENSMUSG00000028185    0.514565
ENSMUSG00000028184   -0.109753
ENSMUSG00000028187    0.033873
Name: MoEx_Enrich, dtype: float64

In [14]:
# Pooled StDev
SCNcount = df.filter(regex=scn_filt).count(axis=1)
WBcount = df.filter(regex=wb_filt).count(axis=1)

StdevSCN = (SCNcount-1) * df.filter(regex=scn_filt).var(axis=1)
StdevWB = (WBcount-1) * df.filter(regex=wb_filt).var(axis=1)

df[prefix+'poolStDev'] = np.sqrt((StdevSCN+StdevWB)/(SCNcount+ WBcount-2))

In [15]:
#Cohen's d

df[prefix+'Cohens_d'] = df[prefix+'Enrich'] / df[prefix+'poolStDev']

In [16]:
#df[prefix+'poolStDev'].head()
df[prefix+'Cohens_d'] .head()

Ensembl_gene
ENSMUSG00000028180   -1.051078
ENSMUSG00000028182    0.065600
ENSMUSG00000028185    0.813775
ENSMUSG00000028184   -0.670467
ENSMUSG00000028187    0.102296
Name: MoEx_Cohens_d, dtype: float64

In [17]:
#J value (Correction factor)

df[prefix+'J'] = 1-(3/(4*(SCNcount+WBcount-1)))                              


In [18]:
#Hedge's g

df[prefix+'Hedges_g'] = df[prefix+'Cohens_d'] * df[prefix+'J']

In [19]:
#df[prefix+'J'].head()
df[prefix+'Hedges_g'] .head()

Ensembl_gene
ENSMUSG00000028180   -1.021882
ENSMUSG00000028182    0.063777
ENSMUSG00000028185    0.791170
ENSMUSG00000028184   -0.651843
ENSMUSG00000028187    0.099454
Name: MoEx_Hedges_g, dtype: float64

In [20]:
#Var_d
SCNcount = df.filter(regex=scn_filt).count(axis=1)
WBcount = df.filter(regex=wb_filt).count(axis=1)

Ftop1 = SCNcount + WBcount
Ftop2 = SCNcount * WBcount
Fbottom1 = np.square(df[prefix+'Cohens_d']) 
Fbottom2 =  2*(SCNcount + WBcount)


df[prefix+'Var_d'] = (Ftop1/Ftop2) + (Fbottom1 /Fbottom2)

In [21]:
#Check output
df[prefix+'Var_d'].head()

Ensembl_gene
ENSMUSG00000028180    0.231849
ENSMUSG00000028182    0.212198
ENSMUSG00000028185    0.223947
ENSMUSG00000028184    0.220148
ENSMUSG00000028187    0.212308
Name: MoEx_Var_d, dtype: float64

In [22]:
df[prefix+'Var_g'] = df[prefix+'Var_d'] * np.square(df[prefix+'J'])

In [23]:
#SEg

df[prefix+'SEg'] = np.sqrt(df[prefix+'Var_g'])

In [24]:
df.sort_values(by='MoEx_Hedges_g', ascending=False, inplace=True)
df

Unnamed: 0_level_0,Definition,Symbol,Transcript_cluster_ids,Constitutive_exons_used,Constitutive_IDs_used,Putative microRNA binding sites,Select Cellular Compartments,Select Protein Classes,Chromosome,Strand,...,ANOVA-adjp,largest fold,MoEx_Enrich,MoEx_poolStDev,MoEx_Cohens_d,MoEx_J,MoEx_Hedges_g,MoEx_Var_d,MoEx_Var_g,MoEx_SEg
Ensembl_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000031255,synaptotagmin-like 4 [Source:MGI Symbol;Acc:MG...,Sytl4,7019492,ENSMUSE00000207625|ENSMUSE00000207636|ENSMUSE0...,5491588|4321195|5587738|5113599|4854071|508628...,"mmu-let-7a(miRanda), mmu-let-7c(miRanda), mmu-...",nucleus,protein_coding,chrX,-,...,2.742530e-19,3.683219,3.683219,0.251899,14.621815,0.972222,14.215653,4.029933,3.809157,1.951706
ENSMUSG00000019913,single-minded homolog 1 (Drosophila) [Source:M...,Sim1,6767762,ENSMUSE00000099061|ENSMUSE00000099064|ENSMUSE0...,5292792|5469531|5453453|5162343|5525538|549808...,"mmu-miR-100(miRanda), mmu-miR-101a(miRanda), m...",nucleus,transcription regulator|protein_coding,chr10,+,...,1.191824e-18,3.006581,3.006581,0.219965,13.668451,0.972222,13.288772,3.548310,3.353919,1.831371
ENSMUSG00000047507,BAI1-associated protein 3 [Source:MGI Symbol;A...,Baiap3,6854415,AK122358-19|ENSMUSE00001205525|ENSMUSE00001294...,4933467|5194883|5298554|4884036|4574366|560514...,"mmu-miR-190(miRanda), mmu-miR-190b(miRanda), m...",,protein_coding,chr17,-,...,1.502219e-18,2.661205,2.661205,0.197380,13.482657,0.972222,13.108139,3.458229,3.268774,1.807975
ENSMUSG00000032511,"sodium channel, voltage-gated, type V, alpha [...",Scn5a,6999438,AJ271477-16|ENSMUSE00000220333|ENSMUSE00000220...,4449545|4416706|5249296|5419217|5558498|497443...,"mmu-let-7(TargetScan), mmu-let-7a(TargetScan|m...",,protein_coding,chr9,-,...,2.974163e-18,2.080635,2.080635,0.159325,13.059036,0.972222,12.696285,3.257450,3.078994,1.754706
ENSMUSG00000006930,huntingtin-associated protein 1 [Source:MGI Sy...,Hap1,6791418,AK138436-11|ENSMUSE00000661332|ENSMUSE00000731...,4606594|4618758|4397638|5189366|5210825|505982...,"mmu-miR-103(miRanda), mmu-miR-107(miRanda), mm...",,protein_coding,chr11,-,...,1.067776e-17,1.876495,1.876495,0.152255,12.324713,0.972222,11.982360,2.924595,2.764375,1.662641
ENSMUSG00000027301,oxytocin [Source:MGI Symbol;Acc:MGI:97453],Oxt,6881205,ENSMUSE00000168184|ENSMUSE00000168186|ENSMUSE0...,4755071|5406618|4710608|4437925,"mmu-miR-34a(miRanda), mmu-miR-34b-5p(miRanda),...",extracellular,protein_coding,chr2,+,...,4.754955e-17,4.968081,4.968081,0.433811,11.452182,0.972222,11.134065,2.554129,2.414204,1.553771
ENSMUSG00000031212,G protein-coupled receptor 15-like [Source:MGI...,Pgr15l,7012716|7012717,ENSMUSE00000207324|ENSMUSE00000207327|ENSMUSE0...,5233209|5428128|4450935|5224094|4981560|530039...,"mmu-miR-106a(miRanda), mmu-miR-106b(miRanda), ...",,protein_coding,chrX,+,...,4.754955e-17,2.556678,2.556678,0.223288,11.450132,0.972222,11.132072,2.553291,2.413412,1.553516
ENSMUSG00000058838,"ribosomal protein S27A, pseudogene 2 [Source:M...",Rps27a-ps2,6990910,ENSMUSE00000892103,4407762,,,processed_pseudogene,chr9,+,...,8.382876e-17,1.760517,1.760517,0.158484,11.108473,0.972222,10.799904,2.415660,2.283320,1.511066
ENSMUSG00000037727,arginine vasopressin [Source:MGI Symbol;Acc:MG...,Avp,6890916,ENSMUSE00000259526|ENSMUSE00000259535|ENSMUSE0...,4654007|4766734|5279140|5089352|4855987,"mmu-miR-370(RNAhybrid|miRanda), mmu-miR-485-5p...",extracellular,protein_coding,chr2,-,...,8.382876e-17,3.255509,3.255509,0.293305,11.099385,0.972222,10.791069,2.412056,2.279914,1.509938
ENSMUSG00000056972,"melanoma antigen, family L, 2 [Source:MGI Symb...",Magel2,6961012,AK082944-1|AK086725-1|ENSMUSE00000497291,4440363|4966051|5122481|4934466|5487707|559658...,"mmu-miR-101a(miRanda), mmu-miR-101b(miRanda), ...",,protein_coding,chr7,+,...,3.195919e-16,2.113392,2.113392,0.202412,10.441037,0.972222,10.151008,2.158822,2.040553,1.428479


In [25]:
df.columns

Index(['Definition', 'Symbol', 'Transcript_cluster_ids',
       'Constitutive_exons_used', 'Constitutive_IDs_used',
       'Putative microRNA binding sites', 'Select Cellular Compartments',
       'Select Protein Classes', 'Chromosome', 'Strand',
       'Genomic Gene Corrdinates', 'GO-Biological Process',
       'GO-Molecular Function', 'GO-Cellular Component', 'WikiPathways',
       'GSM674605.CEL', 'GSM674606.CEL', 'GSM674607.CEL', 'GSM674608.CEL',
       'GSM674609.CEL', 'GSM674610.CEL', 'GSM674611.CEL', 'GSM674612.CEL',
       'GSM674613.CEL', 'GSM674614.CEL', 'GSM674615.CEL', 'GSM674616.CEL',
       'GSM674617.CEL', 'GSM674618.CEL', 'GSM674619.CEL', 'GSM674620.CEL',
       'GSM674621.CEL', 'GSM674622.CEL', 'GSM674623.CEL', 'GSM674624.CEL',
       'GSM674625.CEL', 'GSM674626.CEL', 'avg-WB',
       'Opn4 010508 sham30-1 SCN WT (46).CEL',
       'Opn4 010508 sham30-2 SCN WT (47).CEL',
       'Opn4 010508 sham60-1 SCN WT (55).CEL',
       'Opn4 010508 sham60-2 SCN WT (56).CEL',
      

### Columns from the list above can then easily be picked to produce files for use later. Examples below given:
 #### df3 = average SCN and WB expression for the platform and the log-fold changes
 #### df4 = Hedges g  values and associated variance for Meta-analysis (after indexing)

In [26]:
# df3 = df.loc[:,[u'Symbol',u'avg-WB', u'avg-SCN', u'log_fold-SCN_vs_WB']]
# df3.columns =[u'Symbol',prefix+'avg-WB', prefix+'avg-SCN', prefix+'log_fold-SCN_vs_WB']
# df3.to_csv('input_files/MoEx_Expression.csv')

In [27]:
df4 = df.loc[:,[u'Symbol',u'MoEx_Enrich',u'MoEx_Hedges_g', u'MoEx_Var_g',u'MoEx_SEg']]
df4.to_csv('input_files/MoEx_forIndexHedges.csv')

In [28]:
df.shape

(28268, 60)