In [1]:
from scipy.stats import bartlett
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scipy.stats as stats
import numpy as np

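## The variance-homogeneity tests, ANOVAs and Tukey post hoc tests below use arcsine square-root transformed data;
## the one-sample t-tests against the parental mean (MPV) are run on the untransformed values.
## The methylation levels are from Excel files: CG/CHG/CHH_anova_new_LINE_DNA.xlsx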

## CG context; LINE; upstream

In [2]:
# Untransformed methylation levels (CG context, LINE, upstream): two values per species.
CG_Tdu_LINE_upstream = [0.873092354, 0.842691017]
CG_Tpr_LINE_upstream = [0.76849903, 0.80350084]
CG_Tms_LINE_upstream = [0.815490141, 0.798978264]

# Arcsine square-root transform of each list, converted back to plain Python lists.
(
    Trans_CG_Tdu_LINE_upstream,
    Trans_CG_Tpr_LINE_upstream,
    Trans_CG_Tms_LINE_upstream,
) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (
        CG_Tdu_LINE_upstream,
        CG_Tpr_LINE_upstream,
        CG_Tms_LINE_upstream,
    )
)

### Variance homogeneity test (Bartlett)

In [3]:
# Bartlett's test for equal variances across the three transformed samples.
statistic, p_value = bartlett(
    Trans_CG_Tdu_LINE_upstream,
    Trans_CG_Tpr_LINE_upstream,
    Trans_CG_Tms_LINE_upstream,
)
print(statistic, p_value)

0.39078127926689576 0.8225132890006389


### ANOVA and post hoc Tukey

In [4]:
# Long-format table: one row per value, with its species label.
CG_LINE_upstream = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CG_Tdu_LINE_upstream,
            *Trans_CG_Tpr_LINE_upstream,
            *Trans_CG_Tms_LINE_upstream,
        ],
    }
)

# ANOVA table from an OLS fit of methylation on species.
model = ols(formula='methylation ~ species', data=CG_LINE_upstream).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD comparison of every pair of species.
posthoc = pairwise_tukeyhsd(
    endog=CG_LINE_upstream['methylation'],
    groups=CG_LINE_upstream['species'],
)

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq         F    PR(>F)
species   2.0  0.009539  0.004770  6.875721  0.075789
Residual  3.0  0.002081  0.000694       NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   Tdu    Tms  -0.0684 0.1551 -0.1785 0.0416  False
   Tdu    Tpr  -0.0946 0.0734 -0.2046 0.0155  False
   Tms    Tpr  -0.0262 0.6295 -0.1362 0.0839  False
---------------------------------------------------


## CG context; LINE; body

In [6]:
# Untransformed methylation levels (CG context, LINE, body): two values per species.
CG_Tdu_LINE_body = [0.945239867, 0.944587146]
CG_Tpr_LINE_body = [0.886832521, 0.896157275]
CG_Tms_LINE_body = [0.922242588, 0.922356429]

# Arcsine square-root transform of each list, converted back to plain Python lists.
(
    Trans_CG_Tdu_LINE_body,
    Trans_CG_Tpr_LINE_body,
    Trans_CG_Tms_LINE_body,
) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (
        CG_Tdu_LINE_body,
        CG_Tpr_LINE_body,
        CG_Tms_LINE_body,
    )
)

In [7]:
# Bartlett's test for equal variances across the three transformed samples.
statistic, p_value = bartlett(
    Trans_CG_Tdu_LINE_body,
    Trans_CG_Tpr_LINE_body,
    Trans_CG_Tms_LINE_body,
)
print(statistic, p_value)

6.883573161609177 0.032007450319256614


In [8]:
# Long-format table: one row per value, with its species label.
CG_LINE_body = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CG_Tdu_LINE_body,
            *Trans_CG_Tpr_LINE_body,
            *Trans_CG_Tms_LINE_body,
        ],
    }
)

# ANOVA table from an OLS fit of methylation on species.
# NOTE(review): the Bartlett test recorded for this region reports p ~ 0.032, so the
# equal-variance assumption behind this ANOVA / Tukey HSD may not hold -- confirm.
model = ols(formula='methylation ~ species', data=CG_LINE_body).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD comparison of every pair of species.
posthoc = pairwise_tukeyhsd(
    endog=CG_LINE_body['methylation'],
    groups=CG_LINE_body['species'],
)

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq           F    PR(>F)
species   2.0  0.009757  0.004879  129.004549  0.001232
Residual  3.0  0.000113  0.000038         NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
   Tdu    Tms  -0.0456 0.0104 -0.0713 -0.0199   True
   Tdu    Tpr  -0.0987 0.0011 -0.1244  -0.073   True
   Tms    Tpr  -0.0531 0.0067 -0.0788 -0.0274   True
----------------------------------------------------


In [9]:
# One-sample t-test (two-sided by default) of the Tms values against the mean of the
# pooled Tdu and Tpr values.
# NOTE(review): this uses the untransformed lists, whereas the ANOVA for this region is
# fit on the arcsine square-root values -- confirm the raw scale is intended here.
t_statistic, p_value = stats.ttest_1samp(
    a=CG_Tms_LINE_body,
    popmean=np.mean(CG_Tdu_LINE_body + CG_Tpr_LINE_body),
)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: 71.94782635429877
P-Value: 0.008847783387926291


In [12]:
# One-sided one-sample t-test: are the Tms values greater than the mean of the pooled
# Tdu and Tpr values?
# NOTE(review): this uses the untransformed lists, whereas the ANOVA for this region is
# fit on the arcsine square-root values -- confirm the raw scale is intended here.
t_statistic, p_value = stats.ttest_1samp(
    a=CG_Tms_LINE_body,
    popmean=np.mean(CG_Tdu_LINE_body + CG_Tpr_LINE_body),
    alternative='greater',
)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: 71.94782635429877
P-Value: 0.004423891693963146


### The result above means that the T. miscellus LINE body methylation level is significantly higher than the mid-parent value (MPV)

## CG context; LINE; downstream

In [14]:
# Untransformed methylation levels (CG context, LINE, downstream): two values per species.
CG_Tdu_LINE_down = [0.874587176, 0.849780009]
CG_Tpr_LINE_down = [0.785396684, 0.813311397]
CG_Tms_LINE_down = [0.823882735, 0.81169017]

# Arcsine square-root transform of each list, converted back to plain Python lists.
(
    Trans_CG_Tdu_LINE_down,
    Trans_CG_Tpr_LINE_down,
    Trans_CG_Tms_LINE_down,
) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (
        CG_Tdu_LINE_down,
        CG_Tpr_LINE_down,
        CG_Tms_LINE_down,
    )
)

In [15]:
# Bartlett's test for equal variances across the three transformed samples.
statistic, p_value = bartlett(
    Trans_CG_Tdu_LINE_down,
    Trans_CG_Tpr_LINE_down,
    Trans_CG_Tms_LINE_down,
)
print(statistic, p_value)

0.4744904835351841 0.7887978181239406


In [16]:
# Long-format table: one row per value, with its species label.
CG_LINE_down = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CG_Tdu_LINE_down,
            *Trans_CG_Tpr_LINE_down,
            *Trans_CG_Tms_LINE_down,
        ],
    }
)

# ANOVA table from an OLS fit of methylation on species.
model = ols(formula='methylation ~ species', data=CG_LINE_down).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD comparison of every pair of species.
posthoc = pairwise_tukeyhsd(
    endog=CG_LINE_down['methylation'],
    groups=CG_LINE_down['species'],
)

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq         F    PR(>F)
species   2.0  0.007569  0.003784  8.218982  0.060633
Residual  3.0  0.001381  0.000460       NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   Tdu    Tms   -0.061 0.1275 -0.1506 0.0287  False
   Tdu    Tpr  -0.0842 0.0587 -0.1739 0.0054  False
   Tms    Tpr  -0.0233 0.5845 -0.1129 0.0664  False
---------------------------------------------------


## CHG; LINE; upstream

In [17]:
# Untransformed methylation levels (CHG context, LINE, upstream): two values per species.
CHG_Tdu_LINE_upstream = [0.687069399, 0.661017079]
CHG_Tpr_LINE_upstream = [0.576862138, 0.61572338]
CHG_Tms_LINE_upstream = [0.61609, 0.598357536]

# Arcsine square-root transform of each list, converted back to plain Python lists.
(
    Trans_CHG_Tdu_LINE_upstream,
    Trans_CHG_Tpr_LINE_upstream,
    Trans_CHG_Tms_LINE_upstream,
) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (
        CHG_Tdu_LINE_upstream,
        CHG_Tpr_LINE_upstream,
        CHG_Tms_LINE_upstream,
    )
)

In [18]:
# Bartlett's test for equal variances across the three transformed samples.
statistic, p_value = bartlett(
    Trans_CHG_Tdu_LINE_upstream,
    Trans_CHG_Tpr_LINE_upstream,
    Trans_CHG_Tms_LINE_upstream,
)
print(statistic, p_value)

0.3940910128580167 0.8211532647096631


In [19]:
# Long-format table: one row per value, with its species label.
CHG_LINE_upstream = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHG_Tdu_LINE_upstream,
            *Trans_CHG_Tpr_LINE_upstream,
            *Trans_CHG_Tms_LINE_upstream,
        ],
    }
)

# ANOVA table from an OLS fit of methylation on species.
model = ols(formula='methylation ~ species', data=CHG_LINE_upstream).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD comparison of every pair of species.
posthoc = pairwise_tukeyhsd(
    endog=CHG_LINE_upstream['methylation'],
    groups=CHG_LINE_upstream['species'],
)

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq        F    PR(>F)
species   2.0  0.007685  0.003843  8.63009  0.056979
Residual  3.0  0.001336  0.000445      NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   Tdu    Tms  -0.0698 0.0898 -0.1579 0.0184  False
   Tdu    Tpr  -0.0809 0.0624  -0.169 0.0073  False
   Tms    Tpr  -0.0111 0.8647 -0.0993 0.0771  False
---------------------------------------------------


## CHG; LINE; body

In [20]:
# Untransformed methylation levels (CHG context, LINE, body): two values per species.
CHG_Tdu_LINE_body = [0.753312247, 0.753715445]
CHG_Tpr_LINE_body = [0.673849959, 0.684216014]
CHG_Tms_LINE_body = [0.703201399, 0.702283531]

# Arcsine square-root transform of each list, converted back to plain Python lists.
(
    Trans_CHG_Tdu_LINE_body,
    Trans_CHG_Tpr_LINE_body,
    Trans_CHG_Tms_LINE_body,
) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (
        CHG_Tdu_LINE_body,
        CHG_Tpr_LINE_body,
        CHG_Tms_LINE_body,
    )
)

In [21]:
# Bartlett's test for equal variances across the three transformed samples.
statistic, p_value = bartlett(
    Trans_CHG_Tdu_LINE_body,
    Trans_CHG_Tpr_LINE_body,
    Trans_CHG_Tms_LINE_body,
)
print(statistic, p_value)

5.4511120742201555 0.06550976672770856


In [22]:
# Long-format table: one row per value, with its species label.
CHG_LINE_body = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHG_Tdu_LINE_body,
            *Trans_CHG_Tpr_LINE_body,
            *Trans_CHG_Tms_LINE_body,
        ],
    }
)

# ANOVA table from an OLS fit of methylation on species.
model = ols(formula='methylation ~ species', data=CHG_LINE_body).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD comparison of every pair of species.
posthoc = pairwise_tukeyhsd(
    endog=CHG_LINE_body['methylation'],
    groups=CHG_LINE_body['species'],
)

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq           F    PR(>F)
species   2.0  0.007179  0.003589  172.991905  0.000797
Residual  3.0  0.000062  0.000021         NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
   Tdu    Tms  -0.0571 0.0022 -0.0761 -0.0381   True
   Tdu    Tpr  -0.0828 0.0007 -0.1018 -0.0637   True
   Tms    Tpr  -0.0256 0.0225 -0.0447 -0.0066   True
----------------------------------------------------


### Tms vs MPV

In [23]:
# One-sample t-test (two-sided by default) of the Tms values against the mean of the
# pooled Tdu and Tpr values.
# NOTE(review): this uses the untransformed lists, whereas the ANOVA for this region is
# fit on the arcsine square-root values -- confirm the raw scale is intended here.
t_statistic, p_value = stats.ttest_1samp(
    a=CHG_Tms_LINE_body,
    popmean=np.mean(CHG_Tdu_LINE_body + CHG_Tpr_LINE_body),
)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -29.4834360714161
P-Value: 0.021584181015125502


In [24]:
# One-sided one-sample t-test: are the Tms values less than the mean of the pooled
# Tdu and Tpr values?
# NOTE(review): this uses the untransformed lists, whereas the ANOVA for this region is
# fit on the arcsine square-root values -- confirm the raw scale is intended here.
t_statistic, p_value = stats.ttest_1samp(
    a=CHG_Tms_LINE_body,
    popmean=np.mean(CHG_Tdu_LINE_body + CHG_Tpr_LINE_body),
    alternative='less',
)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -29.4834360714161
P-Value: 0.010792090507562751


## CHG; LINE; downstream

In [25]:
# Untransformed methylation levels (CHG context, LINE, downstream): two values per species.
CHG_Tdu_LINE_down = [0.691416488, 0.663892062]
CHG_Tpr_LINE_down = [0.581703838, 0.61426825]
CHG_Tms_LINE_down = [0.616740981, 0.601547589]

# Arcsine square-root transform of each list, converted back to plain Python lists.
(
    Trans_CHG_Tdu_LINE_down,
    Trans_CHG_Tpr_LINE_down,
    Trans_CHG_Tms_LINE_down,
) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (
        CHG_Tdu_LINE_down,
        CHG_Tpr_LINE_down,
        CHG_Tms_LINE_down,
    )
)

In [26]:
# Bartlett's test for equal variances across the three transformed samples.
statistic, p_value = bartlett(
    Trans_CHG_Tdu_LINE_down,
    Trans_CHG_Tpr_LINE_down,
    Trans_CHG_Tms_LINE_down,
)
print(statistic, p_value)

0.3796677383637922 0.8270965287532629


In [27]:
# Long-format table: one row per value, with its species label.
CHG_LINE_down = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHG_Tdu_LINE_down,
            *Trans_CHG_Tpr_LINE_down,
            *Trans_CHG_Tms_LINE_down,
        ],
    }
)

# ANOVA table from an OLS fit of methylation on species.
model = ols(formula='methylation ~ species', data=CHG_LINE_down).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD comparison of every pair of species.
posthoc = pairwise_tukeyhsd(
    endog=CHG_LINE_down['methylation'],
    groups=CHG_LINE_down['species'],
)

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq          F    PR(>F)
species   2.0  0.008105  0.004053  10.987047  0.041634
Residual  3.0  0.001107  0.000369        NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
   Tdu    Tms  -0.0717 0.0667 -0.1519  0.0086  False
   Tdu    Tpr   -0.083 0.0458 -0.1633 -0.0028   True
   Tms    Tpr  -0.0114 0.8341 -0.0916  0.0689  False
----------------------------------------------------


In [28]:
# One-sample t-test (two-sided by default) of the Tms values against the mean of the
# pooled Tdu and Tpr values.
# NOTE(review): this uses the untransformed lists, whereas the ANOVA for this region is
# fit on the arcsine square-root values -- confirm the raw scale is intended here.
t_statistic, p_value = stats.ttest_1samp(
    a=CHG_Tms_LINE_down,
    popmean=np.mean(CHG_Tdu_LINE_down + CHG_Tpr_LINE_down),
)

print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Statistic: -3.774782418567247
P-Value: 0.16486362497058382


## CHH; LINE; upstream

In [29]:
# Untransformed methylation levels (CHH context, LINE, upstream): two values per species.
CHH_Tdu_LINE_upstream = [0.133116838, 0.144265973]
CHH_Tpr_LINE_upstream = [0.114324337, 0.131024774]
CHH_Tms_LINE_upstream = [0.133233226, 0.124663596]

# Arcsine square-root transform of each list, converted back to plain Python lists.
(
    Trans_CHH_Tdu_LINE_upstream,
    Trans_CHH_Tpr_LINE_upstream,
    Trans_CHH_Tms_LINE_upstream,
) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (
        CHH_Tdu_LINE_upstream,
        CHH_Tpr_LINE_upstream,
        CHH_Tms_LINE_upstream,
    )
)

In [30]:
# Bartlett's test for equal variances across the three transformed samples.
statistic, p_value = bartlett(
    Trans_CHH_Tdu_LINE_upstream,
    Trans_CHH_Tpr_LINE_upstream,
    Trans_CHH_Tms_LINE_upstream,
)
print(statistic, p_value)

0.34869605661800745 0.8400045014341114


In [31]:
# Long-format table: one row per value, with its species label.
CHH_LINE_upstream = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHH_Tdu_LINE_upstream,
            *Trans_CHH_Tpr_LINE_upstream,
            *Trans_CHH_Tms_LINE_upstream,
        ],
    }
)

# ANOVA table from an OLS fit of methylation on species.
model = ols(formula='methylation ~ species', data=CHH_LINE_upstream).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD comparison of every pair of species.
posthoc = pairwise_tukeyhsd(
    endog=CHH_LINE_upstream['methylation'],
    groups=CHH_LINE_upstream['species'],
)

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq         F    PR(>F)
species   2.0  0.000578  0.000289  1.617108  0.333817
Residual  3.0  0.000536  0.000179       NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   Tdu    Tms  -0.0143 0.5918 -0.0702 0.0416  False
   Tdu    Tpr  -0.0239 0.3133 -0.0798  0.032  False
   Tms    Tpr  -0.0096  0.771 -0.0655 0.0463  False
---------------------------------------------------


## CHH; LINE; body

In [32]:
# Untransformed methylation levels (CHH context, LINE, body): two values per species.
CHH_Tdu_LINE_body = [0.128488793, 0.157003068]
CHH_Tpr_LINE_body = [0.124702754, 0.135907464]
CHH_Tms_LINE_body = [0.143681781, 0.139397546]

# Arcsine square-root transform of each list, converted back to plain Python lists.
(
    Trans_CHH_Tdu_LINE_body,
    Trans_CHH_Tpr_LINE_body,
    Trans_CHH_Tms_LINE_body,
) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (
        CHH_Tdu_LINE_body,
        CHH_Tpr_LINE_body,
        CHH_Tms_LINE_body,
    )
)

In [33]:
# Bartlett's test for equal variances across the three transformed samples.
statistic, p_value = bartlett(
    Trans_CHH_Tdu_LINE_body,
    Trans_CHH_Tpr_LINE_body,
    Trans_CHH_Tms_LINE_body,
)
print(statistic, p_value)

1.9405980500157702 0.3789696997412878


In [34]:
# Long-format table: one row per value, with its species label.
CHH_LINE_body = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHH_Tdu_LINE_body,
            *Trans_CHH_Tpr_LINE_body,
            *Trans_CHH_Tms_LINE_body,
        ],
    }
)

# ANOVA table from an OLS fit of methylation on species.
model = ols(formula='methylation ~ species', data=CHH_LINE_body).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD comparison of every pair of species.
posthoc = pairwise_tukeyhsd(
    endog=CHH_LINE_body['methylation'],
    groups=CHH_LINE_body['species'],
)

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq         F    PR(>F)
species   2.0  0.000392  0.000196  0.594532  0.606047
Residual  3.0  0.000990  0.000330       NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   Tdu    Tms  -0.0013 0.9971 -0.0772 0.0746  False
   Tdu    Tpr  -0.0178 0.6368 -0.0937 0.0581  False
   Tms    Tpr  -0.0165 0.6738 -0.0924 0.0594  False
---------------------------------------------------


## CHH; LINE; downstream

In [35]:
# Untransformed methylation levels (CHH context, LINE, downstream): two values per species.
CHH_Tdu_LINE_down = [0.138814965, 0.151124717]
CHH_Tpr_LINE_down = [0.114100494, 0.128921491]
CHH_Tms_LINE_down = [0.134881675, 0.126055793]

# Arcsine square-root transform of each list, converted back to plain Python lists.
(
    Trans_CHH_Tdu_LINE_down,
    Trans_CHH_Tpr_LINE_down,
    Trans_CHH_Tms_LINE_down,
) = (
    np.arcsin(np.sqrt(raw_values)).tolist()
    for raw_values in (
        CHH_Tdu_LINE_down,
        CHH_Tpr_LINE_down,
        CHH_Tms_LINE_down,
    )
)

In [36]:
# Bartlett's test for equal variances across the three transformed samples.
statistic, p_value = bartlett(
    Trans_CHH_Tdu_LINE_down,
    Trans_CHH_Tpr_LINE_down,
    Trans_CHH_Tms_LINE_down,
)
print(statistic, p_value)

0.20207518823840714 0.903899050958438


In [37]:
# Long-format table: one row per value, with its species label.
CHH_LINE_down = pd.DataFrame(
    {
        'species': ['Tdu'] * 2 + ['Tpr'] * 2 + ['Tms'] * 2,
        'methylation': [
            *Trans_CHH_Tdu_LINE_down,
            *Trans_CHH_Tpr_LINE_down,
            *Trans_CHH_Tms_LINE_down,
        ],
    }
)

# ANOVA table from an OLS fit of methylation on species.
model = ols(formula='methylation ~ species', data=CHH_LINE_down).fit()
anova_table = sm.stats.anova_lm(model)

# Tukey HSD comparison of every pair of species.
posthoc = pairwise_tukeyhsd(
    endog=CHH_LINE_down['methylation'],
    groups=CHH_LINE_down['species'],
)

print("ANOVA results:\n", anova_table)
print("\nPost hoc test results:\n", posthoc)

ANOVA results:
            df    sum_sq   mean_sq         F    PR(>F)
species   2.0  0.001217  0.000608  3.678401  0.155899
Residual  3.0  0.000496  0.000165       NaN       NaN

Post hoc test results:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   Tdu    Tms   -0.021 0.3597 -0.0748 0.0327  False
   Tdu    Tpr  -0.0346 0.1437 -0.0884 0.0191  False
   Tms    Tpr  -0.0136 0.5973 -0.0673 0.0401  False
---------------------------------------------------
