In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Embed fonts as TrueType (Type 42) so text in exported PDFs stays editable in vector editors.
plt.rcParams['pdf.fonttype']=42

In [None]:
# Load the CeD diagnostic-code counts table; the cells that follow merge it on
# `person_id` and read its `count` column.
diag_code=pd.read_csv('~/Downloads/CeD_diagnostic_counts.csv')

In [None]:
# HLA table, read from a mounted SMB share (gvfs path).
file_path2 = '/run/user/220224/gvfs/smb-share:server=129.112.149.250,share=kong%20lab/AoU_CeD/AoU HLA/202409/20241002_HLA.csv'
HLA = pd.read_csv(file_path2)

# Left join: every HLA row is kept; rows without a match in diag_code
# receive NaN in the columns that come from diag_code.
diag_hla = pd.merge(HLA, diag_code, how='left', on='person_id')

# person_id column of the EHR patient list, used later as a membership filter.
EHR = pd.read_csv('~/Downloads/EHR_patients.csv')['person_id']

In [None]:
# Keep only rows whose person_id appears in the EHR list and whose CeD column equals 2.
# NOTE(review): CeD == 2 presumably marks confirmed cases -- confirm against the HLA file's codebook.
# .copy() makes the subset an independent frame, so the column assignment below
# is not a write into a slice of the merged table.
diag_hla = diag_hla.loc[diag_hla['person_id'].isin(EHR) & (diag_hla['CeD'] == 2)].copy()

# Rows with no match in the left merge carry NaN in `count`; treat them as zero.
# Assigning the result back (instead of a chained `inplace=True` fill) guarantees
# the frame is actually updated, including under pandas Copy-on-Write.
diag_hla['count'] = diag_hla['count'].fillna(0)

In [None]:
# Give the columns identifier-style names; the model formula further down refers
# to them as `diagnostic_code_count` and `Impact_Label`.
diag_hla = diag_hla.rename(
    columns={'Impact label': 'Impact_Label', 'count': 'diagnostic_code_count'}
)

# Median diagnostic-code count per Impact_Label group. The column is selected
# before aggregating so the median is never attempted on unrelated (possibly
# non-numeric) columns of the merged table.
diag_median = diag_hla.groupby('Impact_Label')['diagnostic_code_count'].median()

In [None]:
# One-way ANOVA of diagnostic-code count across Impact_Label groups:
# fit an OLS model with Impact_Label as the only predictor ...
model = ols(formula='diagnostic_code_count ~ Impact_Label', data=diag_hla).fit()

# ... then build the Type II ANOVA table from the fitted model and show it.
diag_anova_table = sm.stats.anova_lm(model, typ=2)
print(diag_anova_table)

In [None]:
# Prefix each label with a rank so that label-sorted output lists the groups
# from '1:none' up to '4:high' instead of alphabetically.
# The result is assigned back to diag_hla (rather than a chained in-place
# replace on the column) so the relabelling is guaranteed to take effect.
diag_hla = diag_hla.assign(
    Impact_Label=diag_hla['Impact_Label'].replace(
        {'high': '4:high', 'moderate': '3:moderate', 'low': '2:low', 'none': '1:none'}
    )
)

# Post-hoc Tukey HSD test on all pairs of Impact_Label groups
diag_tukey = pairwise_tukeyhsd(
    endog=diag_hla['diagnostic_code_count'],  # dependent variable
    groups=diag_hla['Impact_Label'],          # independent variable (groups)
    alpha=0.05,                               # significance level
)
print(diag_tukey.summary())

# Plot of group means with their simultaneous confidence intervals
fig = diag_tukey.plot_simultaneous(xlabel='CeD diagnostic code count', ylabel='HLA-DQ risk')

# Annotate the figure with the sample size and the ANOVA F statistic / p-value.
# NOTE(review): rounding to 16 decimals prints 0.0 for p-values below ~5e-17.
fig.text(x=0.2, y=0.7, s=f'n= {len(diag_hla)}', fontsize=12)
fig.text(x=0.2, y=0.65, s=f"F= {round(diag_anova_table.loc['Impact_Label', 'F'], 2)}", fontsize=12)
fig.text(x=0.2, y=0.6, s=f"p= {round(diag_anova_table.loc['Impact_Label', 'PR(>F)'], 16)}", fontsize=12)

# Save the plot as a tightly cropped PDF
fig.savefig('tukey_hsd_plot_CeD_code_count.pdf', format='pdf', bbox_inches='tight')


In [None]:
from math import log10, floor

def round_to_1(x, sig=3):
    """Round ``x`` to ``sig`` significant figures.

    Despite the name, the default keeps three significant figures
    (e.g. ``1234.5 -> 1230.0``, ``0.012345 -> 0.0123``).

    Parameters
    ----------
    x : int or float
        Value to round.
    sig : int, default 3
        Number of significant figures to keep; must be at least 1.

    Returns
    -------
    int or float
        ``x`` rounded with the built-in ``round``. Zero, NaN and infinite
        inputs are returned unchanged, because they have no order of
        magnitude to round against.

    Raises
    ------
    ValueError
        If ``sig`` is smaller than 1.
    """
    if sig < 1:
        raise ValueError("sig must be at least 1")
    # log10(0) is undefined and floor() rejects NaN/inf, so pass those through.
    if x == 0 or x != x or abs(x) == float('inf'):
        return x
    # Position of the leading digit, as a power of ten.
    magnitude = int(floor(log10(abs(x))))
    return round(x, (sig - 1) - magnitude)


In [None]:
# summary() returns a statsmodels SimpleTable. Its `.data` attribute exposes the
# table as plain Python rows (header row first), so the frame gets ordinary
# values and string column names rather than table-cell objects.
tukey_rows = diag_tukey.summary().data
tukey_df = pd.DataFrame(tukey_rows[1:], columns=tukey_rows[0])

# Keep the 1-based row labels the table had when the header row was dropped by label.
tukey_df.index = range(1, len(tukey_df) + 1)

# The summary table holds rounded p-values; overwrite them with the unrounded
# adjusted p-values from the result object.
tukey_df['p-adj'] = diag_tukey.pvalues
tukey_df

In [None]:
# Number of rows per Impact_Label, reported in high -> none order.
# An earlier cell may have rewritten the labels to a ranked form such as
# '4:high'; strip any leading "<digits>:" prefix so the counts are found either
# way. reindex(..., fill_value=0) reports a category with no rows as 0 instead
# of raising KeyError.
impact_count = pd.DataFrame(
    diag_hla
    .assign(Impact_Label=diag_hla['Impact_Label'].str.replace(r'^\d+:', '', regex=True))
    .value_counts('Impact_Label')
    .reindex(['high', 'moderate', 'low', 'none'], fill_value=0)
)