In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Font type 42 (TrueType) keeps text as editable text in exported PDF figures.
plt.rcParams.update({'pdf.fonttype': 42})

In [None]:
from scipy.stats import chi2_contingency

# Observed counts: two rows by four columns, echoed as the cell output.
obs = np.array(
    [
        [236, 710, 665, 319],
        [315, 1868, 4431, 2843],
    ]
)
obs

In [None]:
# Read the gliadin measurement export, then list the distinct measurement
# concepts it contains so the filter in the next cell can be checked against them.
Gliadin = pd.read_csv('gliadin_full.csv')
Gliadin['standard_concept_name'].unique()

In [None]:
# Measurement concepts that are treated as gliadin IgA results.
# NOTE(review): the last entry is a "[Presence]" concept, yet it is filtered on
# a numeric value like the "[Units/volume]" ones - confirm this is intended.
GLIADIN_IGA_CONCEPTS = [
    'Gliadin IgA RAST',
    'Gliadin IgA Ab [Units/volume] in Serum',
    'Gliadin peptide IgA Ab [Units/volume] in Serum by Immunoassay',
    'Gliadin IgA Ab [Units/volume] in Serum by Immunoassay',
    'Gliadin peptide IgA Ab [Units/volume] in Serum',
    'Gliadin peptide IgA Ab [Presence] in Serum by Immunoassay',
]

# Keep IgA rows whose numeric value is at most 400 (rows with a missing value
# fail the comparison and are dropped as well).
value_in_range = Gliadin['value_as_number'] <= 400
is_iga_concept = Gliadin['standard_concept_name'].isin(GLIADIN_IGA_CONCEPTS)
filt_Gliadin2 = Gliadin[value_in_range & is_iga_concept]

# Highest retained value per person. The aggregation is named by string:
# passing the builtin `max` makes recent pandas emit a FutureWarning.
filt_group = filt_Gliadin2.groupby(['person_id']).agg({'value_as_number': 'max'})

In [None]:
# Join the per-person peak gliadin value with the HLA table.
file_path2 = '20241002_HLA.csv'
HLA = pd.read_csv(file_path2)
Gliadin_hla = filt_group.merge(HLA, on='person_id')

# Keep people with a measured value and CeD == 2
# (presumably the code for celiac disease cases - confirm against the HLA file).
has_value = Gliadin_hla['value_as_number'].notna()
is_ced = Gliadin_hla['CeD'] == 2

# rename() returns a new frame, so CeD_Gliadin is independent of Gliadin_hla
# and later column assignments do not trigger SettingWithCopyWarning.
CeD_Gliadin = Gliadin_hla[has_value & is_ced].rename(
    columns={'Impact label': 'Impact_Label', 'value_as_number': 'Gliadin_IgA'}
)

# Median gliadin IgA per HLA impact label. Selecting the column first avoids
# aggregating the non-numeric HLA columns, which raises TypeError on pandas >= 2.0.
Gliadin_median = CeD_Gliadin.groupby('Impact_Label')['Gliadin_IgA'].median()

In [None]:
# One row per person who has at least one qualitative result reported as
# High / Positive / Detected (taken from the unfiltered measurement table).
positive_flags = ['High', 'Positive', 'Detected']
positive_cat = Gliadin.loc[Gliadin['value_as_concept_name'].isin(positive_flags)].drop_duplicates('person_id')

# Per-person number of retained IgA measurements and their median value.
count = filt_Gliadin2.value_counts('person_id').to_frame()
filt_med = filt_Gliadin2.groupby(['person_id']).agg({'value_as_number': 'median'})

# Align count / highest / median on person_id and label the columns.
table = pd.concat([count, filt_group, filt_med], axis=1)
table.columns = ['count', 'highest', 'median']

# Mark people with a positive qualitative result; everyone else stays NaN.
has_positive_result = table.index.isin(positive_cat['person_id'])
table.loc[has_positive_result, 'detection'] = 'positive'

# Attach the CeD status and export the summary.
table2 = table.merge(HLA[['person_id', 'CeD']], on='person_id')
table2.to_csv('gliadin_iga_measure_count.csv')

In [None]:
# One-way ANOVA of gliadin IgA on HLA impact label: fit an OLS model with the
# label as the only predictor, then build the Type II ANOVA table from it.
anova_formula = 'Gliadin_IgA ~ Impact_Label'
model = ols(anova_formula, data=CeD_Gliadin).fit()

Gliadin_anova_table = sm.stats.anova_lm(model, typ=2)
print(Gliadin_anova_table)

In [None]:
# Post-hoc Tukey HSD across HLA impact labels.
#
# The numeric prefixes only control the order in which the groups are listed
# and plotted. They are applied to a derived Series rather than written back
# into CeD_Gliadin: later cells look groups up by the plain labels
# ('high', 'moderate', 'low', 'none'), and relabelling the frame in place made
# those cells fail when the notebook was run top to bottom.
ordered_labels = {'high': '4:high', 'moderate': '3:moderate', 'low': '2:low', 'none': '1:none'}
tukey_groups = CeD_Gliadin['Impact_Label'].replace(ordered_labels)

Gliadin_tukey = pairwise_tukeyhsd(
    endog=CeD_Gliadin['Gliadin_IgA'],  # dependent variable
    groups=tukey_groups,               # group label per observation
    alpha=0.05,                        # significance level
)
print(Gliadin_tukey.summary())

# Simultaneous confidence-interval plot, annotated with the sample size and
# the F statistic / p-value from the ANOVA table computed in the previous cell.
fig = Gliadin_tukey.plot_simultaneous(xlabel='Gliadin_IgA (U/mL)', ylabel='HLA-DQ risk')
anova_effect = Gliadin_anova_table.loc['Impact_Label']
fig.text(x=0.2, y=0.7, s=f'n= {len(CeD_Gliadin)}', fontsize=12)
fig.text(x=0.2, y=0.65, s=f"F= {round(anova_effect['F'], 2)}", fontsize=12)
fig.text(x=0.2, y=0.6, s=f"p= {round(anova_effect['PR(>F)'], 12)}", fontsize=12)

# Save the annotated plot as a PDF.
fig.savefig('tukey_hsd_plot_gliadin_iga.pdf', format='pdf', bbox_inches='tight')


In [None]:
from math import log10, floor

def round_to_1(x, sig=3):
    """Round ``x`` to ``sig`` significant figures.

    Despite the name, the default is three significant figures, which is what
    this function has always produced.

    Parameters
    ----------
    x : int or float (plain Python or NumPy scalar)
        Value to round.
    sig : int, default 3
        Number of significant figures to keep.

    Returns
    -------
    Same kind of number as ``x``. Zero, NaN and infinities are returned
    unchanged, because they have no order of magnitude to round against
    (``log10`` would otherwise raise ``ValueError``).
    """
    from math import isfinite

    if x == 0 or not isfinite(x):
        return x
    # Position of the leading digit relative to the decimal point.
    magnitude = int(floor(log10(abs(x))))
    return round(x, (sig - 1) - magnitude)


In [None]:
# Turn the Tukey summary into a DataFrame. The summary is a statsmodels
# SimpleTable; its `.data` attribute holds the raw cell values with the header
# as the first row. Building the frame from `.data` gives plain values and
# string column names instead of table Cell objects.
tukey_rows = Gliadin_tukey.summary().data
tukey_df = pd.DataFrame(
    tukey_rows[1:],
    columns=tukey_rows[0],
    index=range(1, len(tukey_rows)),  # keep the 1-based row labels used before
)

# The summary rounds the adjusted p-values; replace them with the unrounded ones.
tukey_df['p-adj'] = Gliadin_tukey.pvalues
tukey_df

In [None]:
# Number of people per HLA impact label, ordered high -> none.
# The single column is explicitly labelled 0 because later cells read
# `impact_count.loc[<label>, 0]`; since pandas 2.0 the default label of a
# value_counts result is 'count', which would break that lookup.
impact_count = (
    CeD_Gliadin['Impact_Label']
    .value_counts()
    .loc[['high', 'moderate', 'low', 'none']]
    .to_frame(name=0)
)

In [None]:
# Violin plot of gliadin IgA by HLA-DQ risk group, annotated with the ANOVA
# result, each group's median and each group's size.
risk_levels = ['high', 'moderate', 'low', 'none']  # left-to-right order on the x axis
unit = 'U/mL'

plt.figure(figsize=(8, 6))
sns.violinplot(
    data=CeD_Gliadin,
    x='Impact_Label',
    y='Gliadin_IgA',
    hue='Impact_Label',
    order=risk_levels,
)
plt.title('Gliadin-IgA levels by hla risk')
plt.xlabel('HLA-DQ risk')
plt.ylabel(unit)

# ANOVA p-value and F statistic, written at the height of the largest observation.
top_y = CeD_Gliadin.Gliadin_IgA.max()
anova_effect = Gliadin_anova_table.loc['Impact_Label']
plt.text(x=0.3, y=top_y, s='p = ' + round_to_1(anova_effect['PR(>F)']).astype(str))
plt.text(x=2, y=top_y, s='F = ' + round(anova_effect['F'], 2).astype(str))

# Per group: the median just above its own height, and the group size below the axis.
for x_pos, level in enumerate(risk_levels):
    level_median = Gliadin_median[level]
    plt.text(x=x_pos, y=level_median + 1, s='median: ' + round(level_median, 2).astype(str))
    plt.text(x=x_pos, y=-20, s='N = ' + impact_count.loc[level, 0].astype(str))

# Show the plot
plt.show()

In [None]:
# Chi-square tests of HLA impact label against gliadin IgA positivity
# (positive: value > 20, negative: value <= 20).
from scipy.stats import chi2_contingency

# Work on an explicit copy so adding the 'test' column never writes into a
# slice of another frame.
CeD_Gliadin = CeD_Gliadin.copy()
CeD_Gliadin.loc[CeD_Gliadin['Gliadin_IgA'] > 20, 'test'] = 'positive'
CeD_Gliadin.loc[CeD_Gliadin['Gliadin_IgA'] <= 20, 'test'] = 'negative'

# Overall number of positive / negative people.
pos_neg = CeD_Gliadin['test'].value_counts()

# Impact label x test-result contingency table. crosstab replaces the former
# value_counts + pivot chain, which relied on positional pivot arguments and a
# column named 0 - neither exists on pandas >= 2.0.
iga_table = pd.crosstab(CeD_Gliadin['Impact_Label'], CeD_Gliadin['test'])

# Overall test across all impact labels; the same p-value is repeated on every row.
res = chi2_contingency(iga_table)
group_counts = iga_table[['positive', 'negative']].copy()  # snapshot before adding result columns
iga_table['overall pval'] = res[1]

# For each impact label: 2x2 table of that group versus everyone else, with
# its odds ratio and chi-square p-value.
for label, row in group_counts.iterrows():
    pos_in, neg_in = row['positive'], row['negative']
    pos_out = pos_neg['positive'] - pos_in
    neg_out = pos_neg['negative'] - neg_in
    # Named group_vs_rest so the `obs` array defined earlier in the notebook is not overwritten.
    group_vs_rest = np.array([[pos_in, neg_in], [pos_out, neg_out]])
    res2 = chi2_contingency(group_vs_rest)
    iga_table.loc[label, 'OR'] = (pos_in * neg_out) / (neg_in * pos_out)
    iga_table.loc[label, 'pvalue_ind'] = res2[1]

iga_table

In [None]:
# Display iga_table again as this cell's output (same object as the previous cell).
iga_table