In [None]:
import pandas as pd
import os
# Workspace bucket prefix, read from the WORKSPACE_BUCKET environment variable.
# os.getenv returns None when the variable is unset; the phenotype CSV path built
# from `bucket` later in the notebook would then start with the literal 'None/'.
bucket = os.getenv('WORKSPACE_BUCKET')

In [None]:
# Copy every object matching chr6_total* from the workspace bucket into the current
# directory (the shell expands $WORKSPACE_BUCKET). plink reads these files later
# through `--bfile chr6_total`.
!gsutil cp $WORKSPACE_BUCKET/data/hibag_hla/bed_input/chr6_total* .

In [None]:
%%writefile tag_snp.txt


chr6:32445540:G:T
chr6:32634437:G:A
chr6:32638107:C:T
chr6:32690302:T:C
chr6:32691805:A:G
chr6:32713706:T:C

In [None]:
%%bash
## extract 6 tag SNPs to genotype table (ped) format
##   --bfile chr6_total    : prefix of the input fileset copied from the bucket
##   --extract tag_snp.txt : keep only the variant IDs listed in tag_snp.txt
##   --recode --out tag    : write the text-format result as tag.ped / tag.map,
##                           which the next cell reads with pandas
plink \
  --bfile chr6_total \
  --extract tag_snp.txt \
  --recode \
  --out tag

In [None]:
#read in and format tag snp data
# tag.ped has no header: column 1 is the individual ID and the allele columns start
# at column 6, two per variant (A1, A2). A raw-string `\s+` separator avoids the
# invalid-escape warning of '\s' and tolerates runs of spaces/tabs between fields.
data = pd.read_csv('tag.ped', sep=r'\s+', header=None)
snps = pd.read_csv('tag.map', sep=r'\s+', header=None)

# Variant ID (as listed in tag_snp.txt) -> rsID. Mapping by ID instead of by row
# position keeps the labels right even if the .map rows come out in another order.
# NOTE(review): the pairing follows the positional order the original list relied on.
TAG_SNP_RSIDS = {
    'chr6:32445540:G:T': 'rs2395182',
    'chr6:32634437:G:A': 'rs4639334',
    'chr6:32638107:C:T': 'rs2187668',
    'chr6:32690302:T:C': 'rs7775228',
    'chr6:32691805:A:G': 'rs4713586',
    'chr6:32713706:T:C': 'rs7454108',
}
snps['rs#'] = snps[1].map(TAG_SNP_RSIDS)

# Fail loudly if plink did not return exactly the expected tag SNPs; the later
# cells index the genotype columns by rsID and would otherwise break obscurely.
absent_ids = sorted(set(TAG_SNP_RSIDS) - set(snps[1]))
if absent_ids or snps['rs#'].isna().any():
    raise ValueError(
        f'tag.map does not match the expected tag SNPs; missing: {absent_ids}, '
        f'unexpected: {snps.loc[snps["rs#"].isna(), 1].tolist()}'
    )

# Build one rename mapping: column 1 -> person_id, then an A1/A2 pair per SNP in
# the order the variants appear in tag.map (which is the column order of tag.ped).
column_names = {1: 'person_id'}
for i, rs in enumerate(snps['rs#']):
    column_names[2*i + 6] = f'{rs} A1'
    column_names[2*i + 7] = f'{rs} A2'
data = data.rename(columns=column_names)

# preview only; the full table has one row per person
data.head()

In [None]:
#based on known tag-SNP associations, search which DQ alleles per person
# DQ alleles tagged by a single SNP: (count column, rsID, tagging allele).
single_tag_rules = [
    ('DQ2.5', 'rs2187668', 'T'),
    ('DQ7', 'rs4639334', 'A'),
    ('DQ8', 'rs7454108', 'C'),
]
for dq_col, rs, allele in single_tag_rules:
    first_hit = data[f'{rs} A1'] == allele
    second_hit = data[f'{rs} A2'] == allele
    data.loc[first_hit | second_hit, dq_col] = 1   # tagging allele on at least one copy
    data.loc[first_hit & second_hit, dq_col] = 2   # tagging allele on both copies

# DQ2.2 and DQ4 are each tagged by three SNPs; they share the first two tagging
# alleles and differ only in the allele required at rs4713586.
for dq_col, rs4713586_allele in [('DQ2.2', 'A'), ('DQ4', 'G')]:
    carrier = pd.Series(True, index=data.index)
    homozygous = pd.Series(True, index=data.index)
    for rs, allele in [('rs2395182', 'T'), ('rs7775228', 'C'), ('rs4713586', rs4713586_allele)]:
        first_hit = data[f'{rs} A1'] == allele
        second_hit = data[f'{rs} A2'] == allele
        carrier &= (first_hit | second_hit)
        homozygous &= (first_hit & second_hit)
    data.loc[carrier, dq_col] = 1      # every tag SNP carries its allele at least once
    data.loc[homozygous, dq_col] = 2   # every tag SNP is homozygous for its allele

# rows never matched above are left as NaN in the new columns -> count of 0
data.fillna(0, inplace=True)

In [None]:
# Save the full per-person table (genotype columns plus the DQ allele counts).
# The DataFrame index is written as the first column because index=False is not passed.
data.to_csv('tag_DQ_cts.csv')

In [None]:
#calculate # alleles to check
# Row-wise sums of the inferred allele counts, without and with DQ4.
risk_allele_cols = ['DQ2.5', 'DQ7', 'DQ8', 'DQ2.2']
data['total'] = data[risk_allele_cols].sum(axis=1)
data['total with DQ4'] = data[risk_allele_cols + ['DQ4']].sum(axis=1)

# Keep only the ID, the per-allele counts and the two totals, in this order.
kept_cols = ['person_id', 'DQ2.5', 'DQ7', 'DQ8', 'DQ2.2', 'DQ4', 'total', 'total with DQ4']
data = data[kept_cols]

In [None]:
#format for merging
# Wrap every column except person_id as 'tag <name> alleles'; the DQ4 total gets
# its own wording ('tag total alleles with DQ4').
merge_names = {col: f'tag {col} alleles' for col in data.columns if col != 'person_id'}
merge_names['total with DQ4'] = 'tag total alleles with DQ4'
data = data.rename(columns=merge_names)
data.columns

In [None]:
#checkpoint
# Persist the renamed allele-count table; index=False leaves the DataFrame index out of the file.
data.to_csv('tag_snp.csv',index=False)

In [None]:
#other demographics
# Load the phenotype table from the bucket and attach its demographic/label
# columns to the allele counts. The default (inner) merge keeps only person_ids
# present in both tables; remaining NaNs are replaced with 0.
demographic_cols = ['person_id', 'race', 'sex_at_birth', 'ethnicity', 'label']
samples = pd.read_csv(f'{bucket}/data/gwas_v2/phenotypes/celiac_matched_data_v2.csv')
data2 = data.merge(samples[demographic_cols], on='person_id').fillna(0)

In [None]:
#now assign DQ genotype based on inferred allele count
# After the rename/merge cells the per-allele counts live in columns named
# 'tag <allele> alleles' (e.g. 'tag DQ2.5 alleles'), not in bare 'DQ2.5' columns,
# so the column names are derived from the allele labels here.
DQ_ALLELES = ['DQ2.5', 'DQ2.2', 'DQ7', 'DQ8']


def count_col(allele):
    """Return the name of the data2 column holding the inferred copy count of `allele`."""
    return f'tag {allele} alleles'


# Default: no tagged risk allele on either chromosome.
data2['haplotype'] = 'X/X'

# Later assignments overwrite earlier ones, so the order matters:
# single copy -> one copy each of two alleles -> two copies -> ambiguous.
for allele in DQ_ALLELES:
    data2.loc[data2[count_col(allele)] == 1, 'haplotype'] = f'{allele}/X'

for pos, first in enumerate(DQ_ALLELES):
    for second in DQ_ALLELES[pos + 1:]:
        one_of_each = (data2[count_col(first)] == 1) & (data2[count_col(second)] == 1)
        # Every pair is labelled from its own alleles; this includes DQ7/DQ8,
        # which was previously mislabelled 'DQ2.2/DQ8' by a copy-paste slip.
        data2.loc[one_of_each, 'haplotype'] = f'{first}/{second}'

for allele in DQ_ALLELES:
    data2.loc[data2[count_col(allele)] == 2, 'haplotype'] = f'{allele}/{allele}'

# More than two inferred risk alleles cannot be resolved into a diploid genotype.
data2.loc[data2['tag total alleles'] > 2, 'haplotype'] = '?'

# Flag people with a missing call ('0' allele code) at any of the checked tag SNPs.
# The genotype columns were dropped before the merge, so they are read back from
# the full per-person table saved earlier as 'tag_DQ_cts.csv'.
# NOTE(review): only these four SNPs were checked originally; rs2395182 and
# rs4639334 are not included - confirm whether that is intentional.
MISSING_CHECK_COLS = ['rs4713586 A2', 'rs2187668 A2', 'rs7775228 A2', 'rs7454108 A2']
genotypes = pd.read_csv(
    'tag_DQ_cts.csv',
    usecols=['person_id'] + MISSING_CHECK_COLS,
    dtype={col: str for col in MISSING_CHECK_COLS},  # keep '0' comparable as text
)
has_missing_call = (genotypes[MISSING_CHECK_COLS] == '0').any(axis=1)
missing_ids = genotypes.loc[has_missing_call, 'person_id']
data2.loc[data2['person_id'].isin(missing_ids), 'missing geno'] = 'yes'

In [None]:
# Count rows per (haplotype, label) pair, spread the labels into columns, and
# order the haplotypes by their count in the column for label 1.
haplotype_label_counts = data2.value_counts(['haplotype', 'label']).reset_index()
dq_tab = haplotype_label_counts.pivot(index='haplotype', columns='label', values='count')
dq_tab = dq_tab.sort_values(1)

In [None]:
# Write the per-person haplotype table (without the index) and the haplotype-by-label
# count table (with its 'haplotype' index) to the working directory.
data2.to_csv('dq_haplotypes_tag.csv',index=False)
dq_tab.to_csv('dq_table_tag.csv')

In [None]:
# Upload both result files to data/hla_compare/ in the workspace bucket
# (the shell expands $WORKSPACE_BUCKET).
!gsutil cp dq_haplotypes_tag.csv $WORKSPACE_BUCKET/data/hla_compare/
!gsutil cp dq_table_tag.csv $WORKSPACE_BUCKET/data/hla_compare/
