In [1]:
from scipy.stats import bartlett
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scipy.stats as stats
import numpy as np

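## The Bartlett, ANOVA and Tukey HSD tests below use arcsine square-root transformed data.
## The "Tms vs MPV" one-sample t-tests use the untransformed values, except in the cells that pass the Trans_* lists.
## The methylation levels are taken from the Excel files: CG/CHG/CHH_anova_new.xlsx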

## CG context; TE; upstream

In [16]:
# CG methylation levels, TE upstream region: two values per species.
CG_Tdu_TE_upstream = [0.902809441, 0.890915138]
CG_Tpr_TE_upstream = [0.83815226, 0.854924864]
CG_Tms_TE_upstream = [0.868784358, 0.862389196]

# Arcsine square-root transform of each species' values, kept as plain lists
# so they can be concatenated with `+` in later cells.
(Trans_CG_Tdu_TE_upstream,
 Trans_CG_Tpr_TE_upstream,
 Trans_CG_Tms_TE_upstream) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (CG_Tdu_TE_upstream, CG_Tpr_TE_upstream, CG_Tms_TE_upstream)
)

### Variance homogeneity test (Bartlett)

In [17]:
# Bartlett's test on the three transformed groups (CG, TE upstream).
bartlett_result = bartlett(
    Trans_CG_Tdu_TE_upstream,
    Trans_CG_Tpr_TE_upstream,
    Trans_CG_Tms_TE_upstream,
)
statistic, p_value = bartlett_result.statistic, bartlett_result.pvalue
print(statistic, p_value)

0.5164909223145695 0.7724056132601254


### ANOVA and post hoc Tukey

In [18]:
# Long-format table: one row per transformed value, labelled by species.
CG_TE_upstream = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CG_Tdu_TE_upstream,
            *Trans_CG_Tpr_TE_upstream,
            *Trans_CG_Tms_TE_upstream,
        ],
    }
)

# OLS fit of methylation on species, summarised as an ANOVA table.
model = ols('methylation ~ species', data=CG_TE_upstream).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD pairwise comparison between the species groups.
posthoc = pairwise_tukeyhsd(CG_TE_upstream['methylation'], CG_TE_upstream['species'])

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq          F    PR(>F)
species   2.0  0.005865  0.002933  17.382726  0.022389
Residual  3.0  0.000506  0.000169        NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
   Tdu    Tms  -0.0485 0.0665 -0.1028  0.0057  False
   Tdu    Tpr  -0.0756 0.0205 -0.1299 -0.0213   True
   Tms    Tpr   -0.027 0.2408 -0.0813  0.0272  False
----------------------------------------------------


### Tms vs MPV (mean of the pooled Tdu and Tpr values)

In [19]:
# Reference value: mean of the pooled Tdu and Tpr values (list `+` concatenates).
# Untransformed values are used on both sides of this test.
mpv = np.mean(CG_Tdu_TE_upstream + CG_Tpr_TE_upstream)
t_statistic, p_value = stats.ttest_1samp(CG_Tms_TE_upstream, mpv)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -1.9119605570586264
P-Value: 0.3067846368250601


## CG context; TE; body

In [2]:
# CG methylation levels, TE body: two values per species.
CG_Tdu_TE_body = [0.929022168, 0.927088216]
CG_Tpr_TE_body = [0.880771565, 0.888415578]
CG_Tms_TE_body = [0.903621586, 0.903985638]

# Arcsine square-root transform of each species' values, kept as plain lists
# so they can be concatenated with `+` in later cells.
(Trans_CG_Tdu_TE_body,
 Trans_CG_Tpr_TE_body,
 Trans_CG_Tms_TE_body) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (CG_Tdu_TE_body, CG_Tpr_TE_body, CG_Tms_TE_body)
)

In [3]:
# Bartlett's test on the three transformed groups (CG, TE body).
bartlett_result = bartlett(
    Trans_CG_Tdu_TE_body,
    Trans_CG_Tpr_TE_body,
    Trans_CG_Tms_TE_body,
)
statistic, p_value = bartlett_result.statistic, bartlett_result.pvalue
print(statistic, p_value)

3.630634599369503 0.16278624816951856


In [4]:
# Long-format table: one row per transformed value, labelled by species.
CG_TE_body = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CG_Tdu_TE_body,
            *Trans_CG_Tpr_TE_body,
            *Trans_CG_Tms_TE_body,
        ],
    }
)

# OLS fit of methylation on species, summarised as an ANOVA table.
model = ols('methylation ~ species', data=CG_TE_body).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD pairwise comparison between the species groups.
posthoc = pairwise_tukeyhsd(CG_TE_body['methylation'], CG_TE_body['species'])

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq          F    PR(>F)
species   2.0  0.005682  0.002841  108.21758  0.001599
Residual  3.0  0.000079  0.000026        NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
   Tdu    Tms  -0.0438 0.0069 -0.0652 -0.0224   True
   Tdu    Tpr   -0.075 0.0014 -0.0964 -0.0536   True
   Tms    Tpr  -0.0312 0.0181 -0.0526 -0.0098   True
----------------------------------------------------


In [5]:
# Reference value: mean of the pooled Tdu and Tpr values (list `+` concatenates).
# Untransformed values are used on both sides of this test.
mpv = np.mean(CG_Tdu_TE_body + CG_Tpr_TE_body)
t_statistic, p_value = stats.ttest_1samp(CG_Tms_TE_body, mpv)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -13.84840489820144
P-Value: 0.04589096764788375


In [6]:
# Same comparison as the two-sided test on the untransformed values, but
# one-sided: the alternative is that the Tms mean is below the reference value.
mpv = np.mean(CG_Tdu_TE_body + CG_Tpr_TE_body)
t_statistic, p_value = stats.ttest_1samp(CG_Tms_TE_body, mpv, alternative='less')

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -13.84840489820144
P-Value: 0.022945483823941876


In [24]:
# Transformed variant: the reference value is the arcsine square-root of the
# mean of the pooled raw Tdu and Tpr values (averaging happens before the
# transform), compared against the transformed Tms values.
mpv_transformed = np.arcsin(np.sqrt(np.mean(CG_Tdu_TE_body + CG_Tpr_TE_body)))
t_statistic, p_value = stats.ttest_1samp(Trans_CG_Tms_TE_body, mpv_transformed)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -13.93017907824194
P-Value: 0.04562249849873204


## CG context; TE; downstream

In [25]:
# CG methylation levels, TE downstream region: two values per species.
CG_Tdu_TE_down = [0.904779773, 0.893568579]
CG_Tpr_TE_down = [0.842762567, 0.859031414]
CG_Tms_TE_down = [0.871698715, 0.86621498]

# Arcsine square-root transform of each species' values, kept as plain lists
# so they can be concatenated with `+` in later cells.
(Trans_CG_Tdu_TE_down,
 Trans_CG_Tpr_TE_down,
 Trans_CG_Tms_TE_down) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (CG_Tdu_TE_down, CG_Tpr_TE_down, CG_Tms_TE_down)
)

In [26]:
# Bartlett's test on the three transformed groups (CG, TE downstream).
bartlett_result = bartlett(
    Trans_CG_Tdu_TE_down,
    Trans_CG_Tpr_TE_down,
    Trans_CG_Tms_TE_down,
)
statistic, p_value = bartlett_result.statistic, bartlett_result.pvalue
print(statistic, p_value)

0.6429103237890191 0.7250931411013195


In [27]:
# Long-format table: one row per transformed value, labelled by species.
CG_TE_down = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CG_Tdu_TE_down,
            *Trans_CG_Tpr_TE_down,
            *Trans_CG_Tms_TE_down,
        ],
    }
)

# OLS fit of methylation on species, summarised as an ANOVA table.
model = ols('methylation ~ species', data=CG_TE_down).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD pairwise comparison between the species groups.
posthoc = pairwise_tukeyhsd(CG_TE_down['methylation'], CG_TE_down['species'])

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq          F    PR(>F)
species   2.0  0.005527  0.002764  17.737971  0.021772
Residual  3.0  0.000467  0.000156        NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
   Tdu    Tms  -0.0474 0.0639 -0.0995  0.0048  False
   Tdu    Tpr  -0.0733   0.02 -0.1255 -0.0211   True
   Tms    Tpr  -0.0259 0.2419 -0.0781  0.0262  False
----------------------------------------------------


In [28]:
# Reference value: mean of the pooled Tdu and Tpr values (list `+` concatenates).
# Untransformed values are used on both sides of this test.
mpv = np.mean(CG_Tdu_TE_down + CG_Tpr_TE_down)
t_statistic, p_value = stats.ttest_1samp(CG_Tms_TE_down, mpv)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -2.217005653993114
P-Value: 0.2697575125921614


## CHG; TE; upstream

In [2]:
# CHG methylation levels, TE upstream region: two values per species.
CHG_Tdu_TE_upstream = [0.744685764, 0.730132785]
CHG_Tpr_TE_upstream = [0.667256525, 0.687964717]
CHG_Tms_TE_upstream = [0.689551242, 0.681802283]

# Arcsine square-root transform of each species' values, kept as plain lists
# so they can be concatenated with `+` in later cells.
(Trans_CHG_Tdu_TE_upstream,
 Trans_CHG_Tpr_TE_upstream,
 Trans_CHG_Tms_TE_upstream) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (CHG_Tdu_TE_upstream, CHG_Tpr_TE_upstream, CHG_Tms_TE_upstream)
)

In [3]:
# Bartlett's test on the three transformed groups (CHG, TE upstream).
bartlett_result = bartlett(
    Trans_CHG_Tdu_TE_upstream,
    Trans_CHG_Tpr_TE_upstream,
    Trans_CHG_Tms_TE_upstream,
)
statistic, p_value = bartlett_result.statistic, bartlett_result.pvalue
print(statistic, p_value)

0.5759502081667643 0.7497802584734805


In [4]:
# Long-format table: one row per transformed value, labelled by species.
CHG_TE_upstream = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHG_Tdu_TE_upstream,
            *Trans_CHG_Tpr_TE_upstream,
            *Trans_CHG_Tms_TE_upstream,
        ],
    }
)

# OLS fit of methylation on species, summarised as an ANOVA table.
model = ols('methylation ~ species', data=CHG_TE_upstream).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD pairwise comparison between the species groups.
posthoc = pairwise_tukeyhsd(CHG_TE_upstream['methylation'], CHG_TE_upstream['species'])

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq          F    PR(>F)
species   2.0  0.005115  0.002557  18.398332  0.020697
Residual  3.0  0.000417  0.000139        NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
   Tdu    Tms  -0.0572 0.0338 -0.1064 -0.0079   True
   Tdu    Tpr  -0.0658  0.023 -0.1151 -0.0165   True
   Tms    Tpr  -0.0086 0.7645 -0.0579  0.0406  False
----------------------------------------------------


### Tms vs MPV

In [5]:
# Reference value: mean of the pooled Tdu and Tpr values (list `+` concatenates).
# Untransformed values are used on both sides of this test.
mpv = np.mean(CHG_Tdu_TE_upstream + CHG_Tpr_TE_upstream)
t_statistic, p_value = stats.ttest_1samp(CHG_Tms_TE_upstream, mpv)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -5.635127311939618
P-Value: 0.11180946626957758


### Tms vs MPV; transformed data

In [6]:
# Transformed variant: the reference value is the arcsine square-root of the
# mean of the pooled raw Tdu and Tpr values (averaging happens before the
# transform), compared against the transformed Tms values.
mpv_transformed = np.arcsin(np.sqrt(np.mean(CHG_Tdu_TE_upstream + CHG_Tpr_TE_upstream)))
t_statistic, p_value = stats.ttest_1samp(Trans_CHG_Tms_TE_upstream, mpv_transformed)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -5.6895494139606315
P-Value: 0.11076153052837805


## CHG; TE; body

In [7]:
# CHG methylation levels, TE body: two values per species.
CHG_Tdu_TE_body = [0.792491292, 0.788490789]
CHG_Tpr_TE_body = [0.73317691, 0.744328893]
CHG_Tms_TE_body = [0.748785519, 0.747089245]

# Arcsine square-root transform of each species' values, kept as plain lists
# so they can be concatenated with `+` in later cells.
(Trans_CHG_Tdu_TE_body,
 Trans_CHG_Tpr_TE_body,
 Trans_CHG_Tms_TE_body) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (CHG_Tdu_TE_body, CHG_Tpr_TE_body, CHG_Tms_TE_body)
)

In [8]:
# Bartlett's test on the three transformed groups (CHG, TE body).
bartlett_result = bartlett(
    Trans_CHG_Tdu_TE_body,
    Trans_CHG_Tpr_TE_body,
    Trans_CHG_Tms_TE_body,
)
statistic, p_value = bartlett_result.statistic, bartlett_result.pvalue
print(statistic, p_value)

1.9557459205342056 0.37611025008276483


In [9]:
# Long-format table: one row per transformed value, labelled by species.
CHG_TE_body = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHG_Tdu_TE_body,
            *Trans_CHG_Tpr_TE_body,
            *Trans_CHG_Tms_TE_body,
        ],
    }
)

# OLS fit of methylation on species, summarised as an ANOVA table.
model = ols('methylation ~ species', data=CHG_TE_body).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD pairwise comparison between the species groups.
posthoc = pairwise_tukeyhsd(CHG_TE_body['methylation'], CHG_TE_body['species'])

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq          F    PR(>F)
species   2.0  0.004261  0.002131  67.604378  0.003198
Residual  3.0  0.000095  0.000032        NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
   Tdu    Tms  -0.0506 0.0059  -0.074 -0.0271   True
   Tdu    Tpr   -0.061 0.0034 -0.0845 -0.0376   True
   Tms    Tpr  -0.0105  0.291  -0.034   0.013  False
----------------------------------------------------


### Tms vs MPV

In [10]:
# Reference value: mean of the pooled Tdu and Tpr values (list `+` concatenates).
# Untransformed values are used on both sides of this test.
mpv = np.mean(CHG_Tdu_TE_body + CHG_Tpr_TE_body)
t_statistic, p_value = stats.ttest_1samp(CHG_Tms_TE_body, mpv)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -19.6720447286224
P-Value: 0.032333815686959304


In [11]:
# Same comparison as the two-sided test on the untransformed values, but
# one-sided: the alternative is that the Tms mean is below the reference value.
mpv = np.mean(CHG_Tdu_TE_body + CHG_Tpr_TE_body)
t_statistic, p_value = stats.ttest_1samp(CHG_Tms_TE_body, mpv, alternative='less')

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -19.6720447286224
P-Value: 0.016166907843479652


### Tms vs MPV; transformed data

In [12]:
# Transformed variant: the reference value is the arcsine square-root of the
# mean of the pooled raw Tdu and Tpr values (averaging happens before the
# transform), compared against the transformed Tms values.
mpv_transformed = np.arcsin(np.sqrt(np.mean(CHG_Tdu_TE_body + CHG_Tpr_TE_body)))
t_statistic, p_value = stats.ttest_1samp(Trans_CHG_Tms_TE_body, mpv_transformed)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -19.89725226904349
P-Value: 0.03196846302872938


## CHG; TE; downstream

In [3]:
# CHG methylation levels, TE downstream region: two values per species.
CHG_Tdu_TE_down = [0.747819285, 0.734214965]
CHG_Tpr_TE_down = [0.670819452, 0.690928467]
CHG_Tms_TE_down = [0.692812895, 0.685654846]

# Arcsine square-root transform of each species' values, kept as plain lists
# so they can be concatenated with `+` in later cells.
(Trans_CHG_Tdu_TE_down,
 Trans_CHG_Tpr_TE_down,
 Trans_CHG_Tms_TE_down) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (CHG_Tdu_TE_down, CHG_Tpr_TE_down, CHG_Tms_TE_down)
)

In [4]:
# Bartlett's test on the three transformed groups (CHG, TE downstream).
bartlett_result = bartlett(
    Trans_CHG_Tdu_TE_down,
    Trans_CHG_Tpr_TE_down,
    Trans_CHG_Tms_TE_down,
)
statistic, p_value = bartlett_result.statistic, bartlett_result.pvalue
print(statistic, p_value)

0.62973785114316 0.7298845371979639


In [5]:
# Long-format table: one row per transformed value, labelled by species.
CHG_TE_down = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHG_Tdu_TE_down,
            *Trans_CHG_Tpr_TE_down,
            *Trans_CHG_Tms_TE_down,
        ],
    }
)

# OLS fit of methylation on species, summarised as an ANOVA table.
model = ols('methylation ~ species', data=CHG_TE_down).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD pairwise comparison between the species groups.
posthoc = pairwise_tukeyhsd(CHG_TE_down['methylation'], CHG_TE_down['species'])

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq          F    PR(>F)
species   2.0  0.005193  0.002597  20.330606  0.018011
Residual  3.0  0.000383  0.000128        NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
   Tdu    Tms  -0.0574 0.0297 -0.1047 -0.0102   True
   Tdu    Tpr  -0.0664   0.02 -0.1136 -0.0192   True
   Tms    Tpr   -0.009 0.7325 -0.0562  0.0383  False
----------------------------------------------------


In [6]:
# Reference value: mean of the pooled Tdu and Tpr values (list `+` concatenates).
# Untransformed values are used on both sides of this test.
mpv = np.mean(CHG_Tdu_TE_down + CHG_Tpr_TE_down)
t_statistic, p_value = stats.ttest_1samp(CHG_Tms_TE_down, mpv)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -6.066365779278698
P-Value: 0.10400718715325138


## CHH; TE; upstream

In [7]:
# CHH methylation levels, TE upstream region: two values per species.
CHH_Tdu_TE_upstream = [0.108408375, 0.125128307]
CHH_Tpr_TE_upstream = [0.099338136, 0.107693327]
CHH_Tms_TE_upstream = [0.113732161, 0.109468584]

# Arcsine square-root transform of each species' values, kept as plain lists
# so they can be concatenated with `+` in later cells.
(Trans_CHH_Tdu_TE_upstream,
 Trans_CHH_Tpr_TE_upstream,
 Trans_CHH_Tms_TE_upstream) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (CHH_Tdu_TE_upstream, CHH_Tpr_TE_upstream, CHH_Tms_TE_upstream)
)

In [8]:
# Bartlett's test on the three transformed groups (CHH, TE upstream).
bartlett_result = bartlett(
    Trans_CHH_Tdu_TE_upstream,
    Trans_CHH_Tpr_TE_upstream,
    Trans_CHH_Tms_TE_upstream,
)
statistic, p_value = bartlett_result.statistic, bartlett_result.pvalue
print(statistic, p_value)

1.0871777549731945 0.5806605887160073


In [9]:
# Long-format table: one row per transformed value, labelled by species.
CHH_TE_upstream = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHH_Tdu_TE_upstream,
            *Trans_CHH_Tpr_TE_upstream,
            *Trans_CHH_Tms_TE_upstream,
        ],
    }
)

# OLS fit of methylation on species, summarised as an ANOVA table.
model = ols('methylation ~ species', data=CHH_TE_upstream).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD pairwise comparison between the species groups.
posthoc = pairwise_tukeyhsd(CHH_TE_upstream['methylation'], CHH_TE_upstream['species'])

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq         F    PR(>F)
species   2.0  0.000451  0.000226  1.483755  0.356445
Residual  3.0  0.000456  0.000152       NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   Tdu    Tms  -0.0079 0.8087 -0.0595 0.0436  False
   Tdu    Tpr   -0.021 0.3372 -0.0726 0.0305  False
   Tms    Tpr  -0.0131 0.5951 -0.0646 0.0384  False
---------------------------------------------------


## CHH; TE; body

In [10]:
# CHH methylation levels, TE body: two values per species.
CHH_Tdu_TE_body = [0.100231736, 0.120534747]
CHH_Tpr_TE_body = [0.096859226, 0.102202639]
CHH_Tms_TE_body = [0.109912719, 0.107765289]

# Arcsine square-root transform of each species' values, kept as plain lists
# so they can be concatenated with `+` in later cells.
(Trans_CHH_Tdu_TE_body,
 Trans_CHH_Tpr_TE_body,
 Trans_CHH_Tms_TE_body) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (CHH_Tdu_TE_body, CHH_Tpr_TE_body, CHH_Tms_TE_body)
)

In [11]:
# Bartlett's test on the three transformed groups (CHH, TE body).
bartlett_result = bartlett(
    Trans_CHH_Tdu_TE_body,
    Trans_CHH_Tpr_TE_body,
    Trans_CHH_Tms_TE_body,
)
statistic, p_value = bartlett_result.statistic, bartlett_result.pvalue
print(statistic, p_value)

2.781304893859587 0.2489128492164053


In [12]:
# Long-format table: one row per transformed value, labelled by species.
CHH_TE_body = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHH_Tdu_TE_body,
            *Trans_CHH_Tpr_TE_body,
            *Trans_CHH_Tms_TE_body,
        ],
    }
)

# OLS fit of methylation on species, summarised as an ANOVA table.
model = ols('methylation ~ species', data=CHH_TE_body).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD pairwise comparison between the species groups.
posthoc = pairwise_tukeyhsd(CHH_TE_body['methylation'], CHH_TE_body['species'])

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq         F    PR(>F)
species   2.0  0.000360  0.000180  0.945827  0.480284
Residual  3.0  0.000572  0.000191       NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   Tdu    Tms  -0.0021 0.9868 -0.0598 0.0555  False
   Tdu    Tpr  -0.0174 0.5022 -0.0751 0.0403  False
   Tms    Tpr  -0.0153 0.5739 -0.0729 0.0424  False
---------------------------------------------------


## CHH; TE; downstream

In [13]:
# CHH methylation levels, TE downstream region: two values per species.
CHH_Tdu_TE_down = [0.109116204, 0.125886844]
CHH_Tpr_TE_down = [0.099315734, 0.107640439]
CHH_Tms_TE_down = [0.11406288, 0.109846765]

# Arcsine square-root transform of each species' values, kept as plain lists
# so they can be concatenated with `+` in later cells.
(Trans_CHH_Tdu_TE_down,
 Trans_CHH_Tpr_TE_down,
 Trans_CHH_Tms_TE_down) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (CHH_Tdu_TE_down, CHH_Tpr_TE_down, CHH_Tms_TE_down)
)

In [14]:
# Bartlett's test on the three transformed groups (CHH, TE downstream).
bartlett_result = bartlett(
    Trans_CHH_Tdu_TE_down,
    Trans_CHH_Tpr_TE_down,
    Trans_CHH_Tms_TE_down,
)
statistic, p_value = bartlett_result.statistic, bartlett_result.pvalue
print(statistic, p_value)

1.104385125454822 0.5756861975121152


In [15]:
# Long-format table: one row per transformed value, labelled by species.
CHH_TE_down = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHH_Tdu_TE_down,
            *Trans_CHH_Tpr_TE_down,
            *Trans_CHH_Tms_TE_down,
        ],
    }
)

# OLS fit of methylation on species, summarised as an ANOVA table.
model = ols('methylation ~ species', data=CHH_TE_down).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD pairwise comparison between the species groups.
posthoc = pairwise_tukeyhsd(CHH_TE_down['methylation'], CHH_TE_down['species'])

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq         F  PR(>F)
species   2.0  0.000503  0.000252  1.658995  0.3272
Residual  3.0  0.000455  0.000152       NaN     NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   Tdu    Tms  -0.0085 0.7848   -0.06  0.043  False
   Tdu    Tpr  -0.0222 0.3082 -0.0737 0.0292  False
   Tms    Tpr  -0.0137   0.57 -0.0652 0.0378  False
---------------------------------------------------
