This notebook extracts the variants with GWAS p-value < 10^-5 and exports them in PLINK format for linkage disequilibrium (LD) calculation.

In [None]:
import hail as hl
from hail.plot import show
from bokeh.plotting import output_file, save
import bokeh.io
from bokeh.io import *
from bokeh.resources import INLINE
bokeh.io.output_notebook(INLINE) 
%matplotlib inline
import json
import numpy as np
import re
from datetime import datetime
import os

In [None]:
import pandas as pd
from tqdm import tqdm

import os
# Base path of the workspace bucket, read from the WORKSPACE_BUCKET
# environment variable (None when the variable is not set).
bucket = os.environ.get("WORKSPACE_BUCKET")

In [None]:
# Start Hail with GRCh38 as the default reference genome; idempotent=True
# lets this cell be re-run without failing on an already-initialised session.
hl.init(
    idempotent=True,
    default_reference='GRCh38',
)


In [None]:
# Path of the whole-genome VDS, read from the WGS_VDS_PATH environment
# variable (None when the variable is not set).
mt_vds_path = os.environ.get("WGS_VDS_PATH")

# Open the dataset as a Hail VariantDataset.
vds = hl.vds.read_vds(mt_vds_path)

In [None]:
# Load the GWAS result table from the workspace bucket.
# CHR_ID is read explicitly as str: if pandas infers it as an integer column
# (all-numeric chromosome labels) or as a mixed int/str column, the
# 'chr' + ... concatenation below raises a TypeError.
gwas_res = pd.read_csv(
    f'{bucket}/data/gwas_v4/gwas_logp5.csv',
    dtype={'CHR_ID': str},
)

# One interval string per row, formatted "chr<CHR_ID>:<CHR_POS-1>-<CHR_POS+1>",
# i.e. a small window around each reported position.
gwas_res['locus'] = (
    'chr' + gwas_res['CHR_ID']
    + ':' + (gwas_res['CHR_POS'] - 1).astype(str)
    + '-' + (gwas_res['CHR_POS'] + 1).astype(str)
)

In [None]:
# Restrict the VDS to the windows built in gwas_res['locus']; each string is
# parsed into a GRCh38 locus interval first.
gwas = hl.vds.filter_intervals(
    vds,
    [
        hl.parse_locus_interval(region, reference_genome='GRCh38')
        for region in gwas_res['locus']
    ],
)

In [None]:
# Phenotype table keyed by person_id; person_id is forced to a string type
# while the remaining column types are imputed.
pheno = hl.import_table(
    f'{bucket}/data/gwas_v4/pheno_hail.tsv',
    key="person_id",
    impute=True,
    types={'person_id': hl.tstr},
)

# Keep only the samples listed in the phenotype table (remove_dead_alleles is
# enabled), then split multi-allelic sites.
gwas = hl.vds.split_multi(
    hl.vds.filter_samples(
        gwas,
        pheno,
        keep=True,
        remove_dead_alleles=True,
    )
)

# Densify the VDS into a MatrixTable and write it out in PLINK format, using
# the column field `s` as the individual ID.
mt = hl.vds.to_dense_mt(gwas)
hl.export_plink(mt, f'{bucket}/data/gwas_v4/LD_plink', ind_id=mt.s)