In [None]:
import pandas as pd
import os
# Workspace bucket URI taken from the environment; None when the variable is unset.
bucket = os.environ.get('WORKSPACE_BUCKET')

In [None]:
import hail as hl
from hail.plot import show
from bokeh.plotting import output_file, save
import bokeh.io
from bokeh.io import *
from bokeh.resources import INLINE
# Direct Bokeh output to the notebook, passing the INLINE resources object imported above.
bokeh.io.output_notebook(INLINE) 
# IPython magic selecting the inline matplotlib backend for this kernel.
%matplotlib inline
import json
import numpy as np
import re
from datetime import datetime
import os

In [None]:
# Start (or reuse) the Hail session with GRCh38 as the default reference genome.
hl.init(
    idempotent=True,
    default_reference='GRCh38',
)


In [None]:
#218 SNPs from catalog
!gsutil cp $WORKSPACE_BUCKET/data/gwas_v4/gwas-association-downloaded_2024-01-10-EFO_0001060.tsv .
gwas_meta = pd.read_csv('gwas-association-downloaded_2024-01-10-EFO_0001060.tsv',sep='\t')
#find unique SNPs
gwas_meta=gwas_meta[~pd.isna(gwas['CHR_POS'])].drop_duplicates(['CHR_ID','CHR_POS'])

In [None]:
# Build a 'chr<contig>:<position>' key used for merging later.
# Both columns are converted to text explicitly. pandas loads a numeric column
# that contains NaNs as float (presumably the case for CHR_POS, since NaNs are
# filtered out of it when the catalog is loaded -- confirm), which would give
# keys such as 'chr6:12345.0', and 'chr' + <float column> fails outright.
# A trailing '.0' is stripped so the key has the plain 'chr6:12345' form.
chrom_text = (
    gwas_meta['CHR_ID']
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
    .where(gwas_meta['CHR_ID'].notna())  # a missing contig still yields a missing locus
)
pos_text = gwas_meta['CHR_POS'].astype(str).str.replace(r'\.0$', '', regex=True)
gwas_meta['locus'] = 'chr' + chrom_text + ':' + pos_text

In [None]:
# Catalog rows with a non-missing SNPS value, de-duplicated on locus, saved to the bucket.
has_snp_value = gwas_meta['SNPS'].notna()
filtered = gwas_meta.loc[has_snp_value].drop_duplicates(subset=['locus'])

filtered.to_csv(
    f'{bucket}/data/gwas_v4/previous_snps_reoccurring.tsv',
    sep='\t',
    index=False,
)

In [None]:
# Reload the saved SNP table from the bucket (tab-separated).
filtered = pd.read_csv(
    f'{bucket}/data/gwas_v4/previous_snps_reoccurring.tsv',
    sep='\t',
)

In [None]:
# Open the VDS whose location is given by the WGS_VDS_PATH environment variable.
vds_path = os.environ.get('WGS_VDS_PATH')
vds = hl.vds.read_vds(vds_path)

In [None]:
# Restrict the VDS to a +/-2 bp window around each SNP to make processing more efficient.
#
# Column names: this cell originally read 'CHR'/'BP'. The `filtered` table that
# the cells above write and reload is derived from the GWAS Catalog export,
# whose columns (as used above) are 'CHR_ID'/'CHR_POS'. Prefer 'CHR'/'BP' when
# they exist, otherwise fall back to the catalog names.
# NOTE(review): confirm which schema `filtered` is expected to have here.
chrom_col = 'CHR' if 'CHR' in filtered.columns else 'CHR_ID'
pos_col = 'BP' if 'BP' in filtered.columns else 'CHR_POS'

# A float contig/position would render as '6.0' / '12343.0', which is not a
# valid interval string; normalise both before formatting.
chrom = filtered[chrom_col].astype(str).str.replace(r'\.0$', '', regex=True)
pos = filtered[pos_col].astype(int)
loci = 'chr' + chrom + ':' + (pos - 2).astype(str) + '-' + (pos + 2).astype(str)

meta = hl.vds.filter_intervals(
    vds,
    [hl.parse_locus_interval(x, reference_genome='GRCh38')
     for x in loci])

In [None]:
# Run hl.vds.split_multi on the interval-filtered VDS.
# Note: this rebinds `meta` to the result of a call on `meta`, so re-running
# this cell alone applies the split to already-split data.
meta = hl.vds.split_multi(meta)

In [None]:
# Convert the filtered, split VDS into a dense MatrixTable (nothing is written to disk here).
meta_mt = hl.vds.to_dense_mt(meta)

In [None]:
# Phenotype table used to label CeD and non-CeD samples.
# Keyed by person_id, which is forced to string; other column types are imputed.
pheno = hl.import_table(
    f'{bucket}/data/gwas_v4/pheno_hail_final.tsv',
    key='person_id',
    impute=True,
    types={'person_id': hl.tstr},
)


In [None]:
# Keep only samples that have a phenotype row, then attach that row as `pheno`.
samples_with_pheno = meta_mt.semi_join_cols(pheno)
meta_vars = samples_with_pheno.annotate_cols(pheno=pheno[samples_with_pheno.s])

In [None]:
# Per-variant genotype counts, split by phenotype label, for the meta-analysis.
# Produces six row fields named '<genotype>_count_<group>' in the order
# ced (label == 1) first, then control (label == 0).
genotype_classes = {
    'wild_type': meta_vars.GT.is_hom_ref(),
    'heterozygous': meta_vars.GT.is_het(),
    'homozygous': meta_vars.GT.is_hom_var(),
}
label_groups = {
    'ced': meta_vars.pheno.label == 1,
    'control': meta_vars.pheno.label == 0,
}
count_fields = {
    f'{genotype}_count_{group}': hl.agg.count_where((has_genotype) & (in_group))
    for group, in_group in label_groups.items()
    for genotype, has_genotype in genotype_classes.items()
}
meta_vars = meta_vars.annotate_rows(**count_fields)
meta_vars.describe()

In [None]:
# Export the six per-variant count fields (plus the row key) as a TSV in the bucket.
exported_fields = [
    meta_vars.wild_type_count_ced,
    meta_vars.heterozygous_count_ced,
    meta_vars.homozygous_count_ced,
    meta_vars.wild_type_count_control,
    meta_vars.heterozygous_count_control,
    meta_vars.homozygous_count_control,
]
(
    meta_vars
    .select_rows(*exported_fields)
    .rows()
    .export(f'{bucket}/data/gwas_v4/gwas_v4_genotypes_10804_samples.tsv')
)

In [None]:
# Load the exported genotype counts back into pandas.
previous = pd.read_csv(
    f'{bucket}/data/gwas_v4/gwas_v4_genotypes_10804_samples.tsv',
    sep='\t',
)

In [None]:
# Merge GWAS results with the GWAS Catalog SNPs on `locus`.
table = pd.read_csv(f'{bucket}/data/gwas_v4/gwas_vars.tsv', sep='\t')

# NOTE(review): an 'alleles_x' column implies `filtered` came out of an earlier
# merge that is not visible in this notebook -- confirm where it is produced.
filtered = filtered.rename(columns={'alleles_x': 'alleles'})

# Rewrite the allele list text: single quotes become double quotes and the
# space after the separating comma is removed.
filtered['alleles'] = filtered['alleles'].str.replace("'", '"').str.replace(', ', ',')

# Fixed: the original call ended with `on='locus'])`; the stray `]` was a SyntaxError.
filt2 = pd.merge(table, filtered, on='locus')

In [None]:
# Clean QC fields for clarity: split the serialized `variant_qc` text on commas
# into one column per value, name the columns after the quoted keys found in a
# template row, then strip keys/punctuation and convert everything to float.
# (`re` is already imported in the import cell at the top of the notebook.)
qcdf = filt2['variant_qc'].str.split(',', expand=True)

# Column names are read from the second row, as before; when the frame has a
# single row, fall back to that row instead of failing on a missing label.
template_row = qcdf.iloc[min(1, len(qcdf) - 1)]
cols = {}
for i in qcdf.columns:
    title = re.findall(r'"[a-zA-Z_]+"', template_row.loc[i])
    # With two quoted keys in one piece (outer key, then inner key), the second
    # one names the column; with a single key, that key does. Pieces without a
    # key keep their positional label and are named in the explicit rename below.
    if len(title) > 1:
        cols[i] = title[1].replace('"', '')
    elif len(title) > 0:
        cols[i] = title[0].replace('"', '')

qcdf.rename(cols, axis=1, inplace=True)
qcdf.rename({'mean':'gq mean', 'stdev':'gq stdev', 'min':'gq min', 'max':'gq max','AC': 'AC_ref', 'AF': 'AF_ref', 'homozygote_count':
       'homozygote_count_ref',5:'AC_alt',7:'AF_alt',10:'homozygote_count_alt'},axis=1,inplace=True)
for i in qcdf.columns:
    qcdf[i] = (
        qcdf[i]
        .str.replace(r'"[a-zA-Z_]+"', '', regex=True)
        # Raw string: the non-raw '[\[\]{}:]' relied on invalid escape sequences.
        .str.replace(r'[\[\]{}:]', '', regex=True)
        .astype(float)
    )
qcdf.columns

In [None]:
#merge qc with these
filt_final=pd.concat([filt2,qcdf],axis=1)

filt_final.to_csv('gwas_snps_from_previous.csv',index=False)
!gsutil cp gwas_snps_from_previous.csv {bucket}/data/gwas_v4/

In [None]:
# Reload the combined catalog + QC table from the bucket.
filt_final = pd.read_csv(
    f'{bucket}/data/gwas_v4/gwas_snps_from_previous.csv'
)

In [None]:
# Merge the new genotype counts with the catalog + QC table on `locus`.
# The drop is rebound rather than done in place, and a missing column is
# tolerated, so re-running this cell no longer raises KeyError on 'alleles_y'.
filt_final = filt_final.drop(columns='alleles_y', errors='ignore')
test = pd.merge(previous, filt_final, on=['locus'])


In [None]:
# Final merged table, written locally as input for the chi-square calculation.
test.to_csv(
    'filtered_gt_counts.csv',
    index=False,
)