In [None]:
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np
import os
import statsmodels.stats.multitest as st

In [None]:
# Load the matched cohort table from the workspace bucket.
# os.environ[...] (rather than os.getenv) raises KeyError immediately when the
# variable is unset, instead of silently building the path "None/data/...".
bucket = os.environ['WORKSPACE_BUCKET']
# The file carries a .csv extension but is tab-separated, hence sep='\t'.
matched = pd.read_csv(f'{bucket}/data/survey/celiac_matched_data_non_genetic.csv', sep='\t')

In [None]:
import pandas
import os

# This query represents dataset "measurements" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v7
# It selects DISTINCT measurement rows (person, concept ids, numeric value) and
# resolves the standard/operator/unit/source/value concept ids to names through
# LEFT JOINs on the concept table. The dataset prefix comes from WORKSPACE_CDR.
# The m_type join is part of the generated query but none of its columns are selected.
dataset_14156130_measurement_sql = """
    SELECT DISTINCT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        m_unit.concept_name as unit_concept_name,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_value.concept_name as value_as_concept_name
    FROM
        `""" + os.environ["WORKSPACE_CDR"] + """.measurement` measurement 

        LEFT JOIN
            `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_standard_concept 
                ON measurement.measurement_concept_id = m_standard_concept.concept_id 
        LEFT JOIN
            `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_type 
                ON measurement.measurement_type_concept_id = m_type.concept_id 
        LEFT JOIN
            `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_operator 
                ON measurement.operator_concept_id = m_operator.concept_id 
        LEFT JOIN
            `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_value 
                ON measurement.value_as_concept_id = m_value.concept_id 
        LEFT JOIN
            `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_unit 
                ON measurement.unit_concept_id = m_unit.concept_id 
        LEFT JOIN
            `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_source_concept 
                ON measurement.measurement_source_concept_id = m_source_concept.concept_id"""

# Run the query in BigQuery (standard SQL dialect). The BigQuery Storage API is
# used only when the BIGQUERY_STORAGE_API_ENABLED variable is present.
measurement_df = pandas.read_gbq(
    dataset_14156130_measurement_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

# Preview the first rows only.
measurement_df.head(5)

In [None]:
import pandas
import os

# This query represents dataset "ced2" for domain "person" and was generated for All of Us Controlled Tier Dataset v7
# It returns the person_id of every person with at least one matching event in
# cb_search_all_events. Two criteria are combined with UNION ALL:
#   1. a non-standard concept 836793 with value_source_concept_id 1384519;
#   2. any standard, selectable concept under the cb_criteria hierarchy rooted at concept 194992.
# NOTE(review): presumably both criteria identify celiac disease (the result is
# stored in ced_person_df) — confirm the concept ids against the cohort definition.
dataset_49731594_person_sql = """
    SELECT
        person.person_id 
    FROM
        `""" + os.environ["WORKSPACE_CDR"] + """.person` person   
    WHERE
        person.PERSON_ID IN (
            SELECT
                distinct person_id  
            FROM
                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
            WHERE
                cb_search_person.person_id IN (
                    SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id,
                            entry_date,
                            concept_id 
                        FROM
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                        WHERE
                            (
                                concept_id IN (836793) 
                                AND is_standard = 0  
                                AND  value_source_concept_id IN (1384519)
                            )) criteria 
                    UNION
                    ALL SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id,
                            entry_date,
                            concept_id 
                        FROM
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                        WHERE
                            (
                                concept_id IN (
                                    SELECT
                                        DISTINCT c.concept_id 
                                    FROM
                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                    JOIN
                                        (
                                            select
                                                cast(cr.id as string) as id 
                                            FROM
                                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                            WHERE
                                                concept_id IN (194992) 
                                                AND full_text LIKE '%_rank1]%'
                                        ) a 
                                            ON (
                                                c.path LIKE CONCAT('%.',
                                            a.id,
                                            '.%') 
                                            OR c.path LIKE CONCAT('%.',
                                            a.id) 
                                            OR c.path LIKE CONCAT(a.id,
                                            '.%') 
                                            OR c.path = a.id) 
                                        WHERE
                                            is_standard = 1 
                                            AND is_selectable = 1
                                        ) 
                                        AND is_standard = 1 
                                )
                            ) criteria 
                        ) )"""

# Run the cohort query in BigQuery (standard SQL dialect). The BigQuery Storage
# API is used only when the BIGQUERY_STORAGE_API_ENABLED variable is present.
ced_person_df = pandas.read_gbq(
    dataset_49731594_person_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

# Preview the first rows only.
ced_person_df.head(5)

In [None]:
# Label every measurement row with its cohort: 'celiac' when the person is in
# ced_person_df, otherwise 'healthy'.
# The column is named 'Celiac' because every later cell (value_counts, groupby,
# pivot, t-test loops) reads measurement rows through a 'Celiac' column; the
# previous name 'CeD' made all of those cells fail with a KeyError.
measurement_df['Celiac'] = 'healthy'
measurement_df.loc[measurement_df['person_id'].isin(ced_person_df['person_id']), 'Celiac'] = 'celiac'

# Inspect which categorical answers occur in the data.
measurement_df['value_as_concept_name'].unique()

In [None]:
# Clean measurement data: some rows have no standard concept, so fall back to
# the source concept for both the display name and the id.
std_name = measurement_df['standard_concept_name']
measurement_df['concept_name'] = std_name.mask(
    std_name == 'No matching concept',
    measurement_df['source_concept_name'],
)

std_id = measurement_df['measurement_concept_id']
measurement_df['concept_id'] = std_id.mask(
    std_id == 0,
    measurement_df['measurement_source_concept_id'],
)

# The four raw columns are now redundant with concept_name / concept_id.
measurement_df.drop(
    columns=[
        'standard_concept_name',
        'measurement_concept_id',
        'measurement_source_concept_id',
        'source_concept_name',
    ],
    inplace=True,
)

In [None]:
# Select quantitative measurements: rows that have a numeric value.
# NOTE(review): 10000000 is excluded as well — presumably a placeholder value
# in the source data; confirm before relying on it.
VALUE_PLACEHOLDER = 10000000
has_value = measurement_df['value_as_number'].notna()
not_placeholder = measurement_df['value_as_number'] != VALUE_PLACEHOLDER
# .copy() gives an independent frame, so later edits of measurement_numeric are
# unambiguous and never act on (or warn about) a slice of measurement_df.
measurement_numeric = measurement_df[has_value & not_placeholder].copy()
measurement_numeric.head()

In [None]:
# Count measurement responses and medians per unit, since units may differ.
# Identical (person, value, unit, measurement) rows are collapsed first.
# The filled unit column is assigned back explicitly: a chained
# `df[col].fillna(..., inplace=True)` only modifies a temporary under pandas
# Copy-on-Write, and rows with a missing unit would then be dropped silently by
# the later groupby on unit_concept_name.
measurement_numeric = (
    measurement_numeric
    .drop_duplicates(['person_id','value_as_number','unit_concept_name','concept_name'])
    .assign(unit_concept_name=lambda frame: frame['unit_concept_name'].fillna('None'))
)

In [None]:
# Number of distinct people per cohort (one row per person_id is kept).
one_row_per_person = measurement_df.drop_duplicates('person_id')
cohort_sizes = one_row_per_person.value_counts('Celiac')
print(cohort_sizes)
#337898 healthy, 2623 celiac patients total

In [None]:
# Median value per (measurement, unit, cohort).
meas_med = (
    measurement_numeric
    .groupby(['concept_name','unit_concept_name','Celiac'])[['value_as_number']]
    .median()
)

meas_med

In [None]:
# Turn the grouping keys back into ordinary columns.
meas_med = meas_med.reset_index()

# Wide table: one row per (measurement, unit) with one median column per cohort.
# NOTE(review): fillna(0) makes "no values in this cohort" indistinguishable
# from a true median of 0 — kept as-is, but worth confirming.
measures = (
    meas_med
    .pivot(columns='Celiac', index=['concept_name','unit_concept_name'], values='value_as_number')
    .fillna(0)
    .reset_index()
    .rename({'celiac':'CeD median value','healthy':'healthy median value'}, axis=1)
)

# One row per (measurement, unit) pair; the t-test results are attached to it later.
# The unit fill is done with assign() rather than a chained in-place fillna,
# which only modifies a temporary under pandas Copy-on-Write.
measure_list = (
    measurement_numeric[['concept_name','unit_concept_name']]
    .drop_duplicates()
    .assign(unit_concept_name=lambda frame: frame['unit_concept_name'].fillna('None'))
)

In [None]:
# t-tests: collect one sub-frame per (measurement, unit) pair.
df_list = []
for _group_key, group_frame in measurement_numeric.groupby(['concept_name','unit_concept_name']):
    df_list.append(group_frame)

In [None]:
# Two-sample t-test (scipy's ttest_ind with default settings) of celiac vs.
# healthy values for every (measurement, unit) group; the statistic and p-value
# are written to the matching row of measure_list.
for test in df_list:
    # Read the group keys positionally instead of calling
    # reset_index(inplace=True): mutating the cached group frames made this
    # cell fail when it was run more than once.
    concept = test['concept_name'].iloc[0]
    unit = test['unit_concept_name'].iloc[0]
    test_ced = test.loc[test['Celiac']=='celiac','value_as_number']
    test_ctrl = test.loc[test['Celiac']=='healthy','value_as_number']
    result = ttest_ind(test_ced, test_ctrl)
    # Build the row selector once and reuse it for both result columns.
    row_mask = (measure_list['concept_name']==concept) & (measure_list['unit_concept_name']==unit)
    measure_list.loc[row_mask,'t'] = result.statistic
    measure_list.loc[row_mask,'pval'] = result.pvalue
    print('t-stat: ',result.statistic,'  p-value: ',result.pvalue)

In [None]:
# Persist the per-measurement t-test table (row index omitted).
measure_list.to_csv(path_or_buf='measures_ttest.csv', index=False)

In [None]:
# Reload the t-test table.
# Missing units are stored as the literal string 'None', which read_csv's
# default NA list converts back to NaN; the later merges on unit_concept_name
# would then lose those rows. Only empty fields (how to_csv writes NaN t/p
# values) are treated as missing here.
measure_list = pd.read_csv('measures_ttest.csv', keep_default_na=False, na_values=[''])

In [None]:
# Display the t-test table.
measure_list

In [None]:
# Persist the per-cohort median table (row index omitted).
measures.to_csv(path_or_buf='measurement_median.csv', index=False)

In [None]:
# Reload the median table.
# Missing units are stored as the literal string 'None', which read_csv's
# default NA list converts back to NaN; the merge with the patient counts on
# unit_concept_name would then lose those rows. Only empty fields are treated
# as missing here.
measures = pd.read_csv('measurement_median.csv', keep_default_na=False, na_values=[''])

In [None]:
#now merge all values together
meas_patient_counts=pd.DataFrame(measurement_numeric[['person_id','Celiac','concept_name','unit_concept_name']].drop_duplicates().value_counts(['Celiac','concept_name','unit_concept_name'])).reset_index().pivot(columns='Celiac',index=['concept_name','unit_concept_name'],values='count').fillna(0)

meas_patient_counts.rename({'celiac':'celiac measure count','healthy':'healthy measure count'},axis=1,inplace=True)
meas_patient_counts.reset_index(inplace=True)

In [None]:
# Join medians with patient counts per (measurement, unit).
# The closing parenthesis of drop_duplicates() was missing, which made this
# cell a SyntaxError.
measure_med2 = pd.merge(measures, meas_patient_counts, on=['concept_name','unit_concept_name']).drop_duplicates()

In [None]:
# Lookup of concept_id per concept_name; the first id seen is kept when a name
# occurs with several ids.
measurement_cids = (
    measurement_df
    .loc[:, ['concept_id','concept_name']]
    .drop_duplicates(subset='concept_name')
)

In [None]:
# Attach the concept id to every measurement row.
measure_med3 = measure_med2.merge(measurement_cids, on='concept_name')

In [None]:
# Write the enriched table (medians, counts, concept ids) to measurement_median.csv.
measure_med3.to_csv(path_or_buf='measurement_median.csv', index=False)

In [None]:
def FDR(sample, pval):
    """Add an FDR-adjusted p-value column ``'Padj'`` to ``sample`` in place.

    ``sample`` is sorted in place by the ``pval`` column (missing values end up
    last). The non-missing p-values are passed to ``st.fdrcorrection`` and the
    adjusted values are stored in ``'Padj'``; rows without a p-value get NaN.

    Parameters
    ----------
    sample : pandas.DataFrame
        Table holding the p-values; modified in place.
    pval : str
        Name of the p-value column.

    Returns
    -------
    None
    """
    # Ascending sort puts NaN p-values at the bottom of the frame.
    sample.sort_values(by=pval, inplace=True)
    valid_pvals = sample[pval].dropna()
    # fdrcorrection returns (reject flags, adjusted p-values); only the latter is kept.
    _, adjusted = st.fdrcorrection(valid_pvals, is_sorted=True)
    # Pad with NaN so the column lines up with the rows lacking a p-value.
    n_missing = len(sample) - len(adjusted)
    sample['Padj'] = np.concatenate([adjusted, np.full(n_missing, np.nan)])

In [None]:
# Adds the 'Padj' column to measure_list (sorted by p-value, modified in place).
FDR(sample=measure_list, pval='pval')

In [None]:
# Combine medians/counts with the t-test statistics per (measurement, unit).
measures_padj = measure_med3.merge(measure_list, on=['concept_name','unit_concept_name'])


In [None]:
# Persist the combined median / t-test / adjusted p-value table.
measures_padj.to_csv(path_or_buf='measurement_med_ttest_FDR.csv', index=False)

In [None]:
# Copy result files to the data/ folder of the workspace bucket.
# NOTE(review): measurement_cat_counts.csv is not written anywhere in the visible
# part of this notebook — confirm it exists before running this cell.
!gsutil cp measurement_median.csv $WORKSPACE_BUCKET/data/
!gsutil cp measurement_cat_counts.csv $WORKSPACE_BUCKET/data/

## Repeat with propensity-matched controls

In [None]:
# Keep only numeric measurements of people present in the matched cohort table.
in_matched_cohort = measurement_numeric['person_id'].isin(matched['person_id'])
measurement_num_matched = measurement_numeric.loc[in_matched_cohort]

In [None]:
# Matched people who have at least one numeric measurement, counted per label.
has_numeric_measurement = matched['person_id'].isin(measurement_num_matched['person_id'])
matched.loc[has_numeric_measurement].value_counts('label')
#15060 control, 2617 cases

In [None]:
# Median value per (measurement, unit, cohort) within the matched cohort.
meas_med_match = (
    measurement_num_matched
    .groupby(['concept_name','unit_concept_name','Celiac'])[['value_as_number']]
    .median()
)

meas_med_match

In [None]:
# Turn the grouping keys back into ordinary columns.
meas_med_match = meas_med_match.reset_index()

# Wide table: one row per (measurement, unit) with one median column per cohort.
# NOTE(review): fillna(0) makes "no values in this cohort" indistinguishable
# from a true median of 0 — kept as-is, but worth confirming.
measures_match = (
    meas_med_match
    .pivot(columns='Celiac', index=['concept_name','unit_concept_name'], values='value_as_number')
    .fillna(0)
    .reset_index()
    .rename({'celiac':'CeD median value','healthy':'healthy median value'}, axis=1)
)

# One row per (measurement, unit) pair of the matched data.
# The frame is called measurement_num_matched; the previous reference to
# `measurement_numeric_match` was an undefined name (NameError).
# assign() replaces the chained in-place fillna, which only modifies a
# temporary under pandas Copy-on-Write.
measure_list_match = (
    measurement_num_matched[['concept_name','unit_concept_name']]
    .drop_duplicates()
    .assign(unit_concept_name=lambda frame: frame['unit_concept_name'].fillna('None'))
)

In [None]:
# One sub-frame per (measurement, unit) pair of the matched data.
# Uses measurement_num_matched: `measurement_numeric_match` was never defined
# and raised a NameError.
df_list_match = [d for _, d in measurement_num_matched.groupby(['concept_name','unit_concept_name'])]

In [None]:
# Two-sample t-test (scipy's ttest_ind with default settings) of celiac vs.
# healthy values for every (measurement, unit) group of the matched data; the
# statistic and p-value are written to the matching row of measure_list_match.
for test in df_list_match:
    # Read the group keys positionally instead of calling
    # reset_index(inplace=True): mutating the cached group frames made this
    # cell fail when it was run more than once.
    concept = test['concept_name'].iloc[0]
    unit = test['unit_concept_name'].iloc[0]
    test_ced = test.loc[test['Celiac']=='celiac','value_as_number']
    test_ctrl = test.loc[test['Celiac']=='healthy','value_as_number']
    result = ttest_ind(test_ced, test_ctrl)
    # Build the row selector once and reuse it for both result columns.
    row_mask = (measure_list_match['concept_name']==concept) & (measure_list_match['unit_concept_name']==unit)
    measure_list_match.loc[row_mask,'t'] = result.statistic
    measure_list_match.loc[row_mask,'pval'] = result.pvalue
    print('t-stat: ',result.statistic,'  p-value: ',result.pvalue)

In [None]:
# Persist the matched-cohort t-test table (row index omitted).
measure_list_match.to_csv(path_or_buf='measures_ttest_matched.csv', index=False)

In [None]:
# Persist the matched-cohort median table (row index omitted).
measures_match.to_csv(path_or_buf='measurement_median_matched.csv', index=False)

In [None]:
# Number of distinct people per cohort who have each (measurement, unit) in the
# matched data.
# Uses measurement_num_matched: `measurement_numeric_match` was never defined
# and raised a NameError.
person_measure_pairs_match = measurement_num_matched[['person_id','Celiac','concept_name','unit_concept_name']].drop_duplicates()
pair_counts_match = person_measure_pairs_match.value_counts(['Celiac','concept_name','unit_concept_name'])

meas_patient_counts_match = (
    pd.DataFrame(pair_counts_match)
    .reset_index()
    .pivot(columns='Celiac', index=['concept_name','unit_concept_name'], values='count')
    .fillna(0)
    .rename({'celiac':'celiac measure count','healthy':'healthy measure count'}, axis=1)
    .reset_index()
)

In [None]:
# Join matched-cohort medians with patient counts per (measurement, unit).
measure_med_match2 = (
    measures_match
    .merge(meas_patient_counts_match, on=['concept_name','unit_concept_name'])
    .drop_duplicates()
)

In [None]:
# Attach the concept id to every matched-cohort measurement row.
measure_med_match3 = measure_med_match2.merge(measurement_cids, on='concept_name')

In [None]:
# Write the enriched matched-cohort table to measurement_median_matched.csv.
measure_med_match3.to_csv(path_or_buf='measurement_median_matched.csv', index=False)

## tTG & gliadin measurements

In [None]:
# Export per-person mean gliadin / transglutaminase measurements as a wide table
# (one row per person, one column per measurement label).
#
# Fixes relative to the previous version of this cell:
#   * 'standard_concept_name' was dropped from measurement_df during clean-up,
#     so the filter now uses the surviving 'concept_name' column;
#   * `matched_AI` and `meas_fin` were undefined names;
#   * the 'meassure' label is built before it is used as a groupby key;
#   * the selection is copied so the assignments below act on an independent frame.
# NOTE(review): `matched` is the only cohort table loaded in this notebook and is
# assumed to be what `matched_AI` referred to — confirm.
is_ttg_or_gliadin = measurement_df['concept_name'].str.contains('liadin|ransglutaminase', na=False)
in_matched_cohort = measurement_df['person_id'].isin(matched['person_id'])
meas_ttg = measurement_df[is_ttg_or_gliadin & in_matched_cohort].copy()

# Rows whose categorical answer is the text '2' get the numeric value 2.
meas_ttg.loc[meas_ttg['value_as_concept_name']=='2','value_as_number']=2

# 'answer' holds the numeric value when present, otherwise the categorical answer
# (unless that is 'No matching concept'). Object dtype lets numbers and text coexist.
use_text_answer = meas_ttg['value_as_number'].isna() & (meas_ttg['value_as_concept_name']!='No matching concept')
meas_ttg['answer'] = meas_ttg['value_as_number'].astype(object).mask(use_text_answer, meas_ttg['value_as_concept_name'])

# Column label: "<measurement>(<unit>)", or just the measurement name when the
# unit is missing or 'No matching concept'.
has_unit = meas_ttg['unit_concept_name'].notna() & (meas_ttg['unit_concept_name']!='No matching concept')
label_with_unit = meas_ttg['concept_name']+'('+meas_ttg['unit_concept_name']+')'
meas_ttg['meassure'] = label_with_unit.where(has_unit, meas_ttg['concept_name'])

# Mean numeric value per person and measurement label.
meas_ttg_agg = meas_ttg.groupby(['person_id','meassure']).agg({'value_as_number':'mean'}).reset_index()

meas_fin2 = meas_ttg_agg.pivot(columns='meassure',index='person_id',values='value_as_number').reset_index()

# Drop measurement columns that contain no numeric value at all.
meas_fin2 = meas_fin2.dropna(axis=1, how='all')

meas_fin2.to_csv('gliadin_ttg.csv',index=False)