## Data preparation

In [None]:
import pandas as pd
# Read the February 2022 cohort workbook; the bare name on the last line
# makes the DataFrame the cell's displayed output.
data = pd.read_excel(io='COHORT_February2022_latest.xlsx')
data

In [None]:
import os, fnmatch
def find(pattern, path):
    """Return the first file under ``path`` whose name matches ``pattern``.

    Parameters
    ----------
    pattern : str
        ``fnmatch``-style glob, matched against each bare file name.
    path : str
        Root directory, walked recursively with ``os.walk``.

    Returns
    -------
    str
        The joined path of the first match in walk order, or the literal
        string ``'error'`` when nothing matches or the search raises an
        ordinary exception (the sentinel existing callers already expect).
    """
    try:
        for root, _dirs, files in os.walk(path):
            for name in files:
                if fnmatch.fnmatch(name, pattern):
                    # Stop at the first hit rather than collecting every match.
                    return os.path.join(root, name)
    except Exception:
        # Deliberately narrower than a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still propagate out of a long walk.
        return 'error'
    # Walk finished without a match.
    return 'error'

## Exome analysis

In [None]:
import seaborn as sns
import hail as hl
import os

# Local Hail session: the scratch directory and the PySpark submit flags are
# exported to the environment before hl.init() is called.
tmp = "/mnt/grid/janowitz/home/skleeman/tmp2"
os.environ.update({
    "SPARK_LOCAL_DIRS": tmp,
    "PYSPARK_SUBMIT_ARGS": "--driver-memory 200g --executor-memory 2g pyspark-shell",
})

hl.init(
    default_reference='GRCh38',
    master='local[16]',
    min_block_size=128,
    local_tmpdir=tmp,
    tmp_dir=tmp,
)


Cluster config

In [None]:
import hail as hl
import os
import pandas as pd
import subprocess
from gnomad.utils.liftover import *
from gnomad.utils.annotations import *
from gnomad.sample_qc.pipeline import *
from gnomad.sample_qc.ancestry import *
import glob


# Memory / CPU settings for the larger node. The scratch directory and the
# PySpark submit flags are exported before hl.init() is called.
tmp = "/mnt/grid/janowitz/rdata_norepl/tmp"

os.environ.update({
    "SPARK_LOCAL_DIRS": tmp,
    # Adjacent literals are concatenated into one space-separated flag string.
    "PYSPARK_SUBMIT_ARGS": (
        "--conf spark.network.timeout=15m "
        "--conf spark.executor.heartbeatInterval=10m "
        "--conf spark.memory.fraction=1.0 "
        "--driver-memory 3000G "
        "--executor-memory 10G "
        "pyspark-shell"
    ),
})

hl.init(
    default_reference='GRCh38',
    master='local[96]',
    min_block_size=128,
    local_tmpdir=tmp,
    tmp_dir=tmp,
)

## Exome pipeline

### Sex QC

In [None]:
import pandas as pd

# Sex QC input: import only the VCFs matched by the chr[X-Y] glob, split
# multi-allelic sites, and drop variants located in the X pseudo-autosomal region.
mt = hl.import_vcf('/mnt/grid/janowitz/rdata_norepl/pan_immuno/pan_immuno_germline_hail.chr[X-Y].vcf.gz', force_bgz=True)
mt = hl.split_multi_hts(mt, permit_shuffle=True)
mt = mt.filter_rows(mt.locus.in_x_par(), keep=False)

#Sample QC
# Per-sample QC metrics are stored in the `sample_qc` column field; they are
# not referenced again in this cell.
mt = hl.sample_qc(mt, name='sample_qc')
# Phenotype sheet. The sample key `s` is built as
# "<accession>_<SUBJECT_ID>_normal" and used below to index `pheno` by mt.s.
data = pd.read_excel('~/COHORT_Feb2020_strandupdate_rmdup.xlsx')
data['s'] = data.accession + '_' + data.SUBJECT_ID + '_normal'
data = data[['s', 'age','sex','dcb','os_days','os_stat','pfs_days','pfs_stat']]
# Recode 'M' -> 0 and 'F' -> 1. DataFrame.replace applies to every selected
# column, not only `sex`.
data = data.replace({'M': 0, 'F': 1})
pheno = hl.Table.from_pandas(data, key="s") 

mt = mt.filter_cols(hl.is_defined(pheno[mt.s])) #Remove samples not included in updated cohort
mt = mt.annotate_cols(age = pheno[mt.s].age, sex = pheno[mt.s].sex, dcb = pheno[mt.s].dcb)
# sex == 1 corresponds to 'F' after the recode above.
mt = mt.annotate_cols(is_female = hl.if_else(mt.sex==1, True, False))

# Write the PLINK fileset consumed by the `plink --check-sex` cell that follows.
hl.export_plink(mt, '/mnt/grid/janowitz/rdata_norepl/pan_immuno/plink_unfilteredXY',
                ind_id = mt.s,
                is_female = mt.is_female)

In [None]:
%%bash

# Stop at the first failing command (including a failed cd) so that PLINK
# never writes its X.* / Y.* outputs into an unintended working directory and
# a failed X run is not masked by a successful Y run.
set -euo pipefail

work_dir=/mnt/grid/janowitz/rdata_norepl/pan_immuno
plink=~/applications/plink1.9/plink

# --out X / --out Y are relative, so outputs land in work_dir.
cd "$work_dir"

# X-chromosome F-statistic check with explicit thresholds 0.30 / 0.60.
"$plink" --bfile "$work_dir/plink_unfilteredXY" --check-sex 0.30 0.60 --out X
# Y-chromosome based check (produces the YCOUNT column read by the next cell).
"$plink" --bfile "$work_dir/plink_unfilteredXY" --check-sex y-only --out Y

In [None]:
import seaborn as sns
import pandas as pd

# Read the two PLINK .sexcheck reports. The separator is a regex, written as
# a raw string so that "\s" is not treated as an (invalid) string escape.
X = pd.read_csv('/mnt/grid/janowitz/rdata_norepl/pan_immuno/X.sexcheck', sep=r'\s+')
Y = pd.read_csv('/mnt/grid/janowitz/rdata_norepl/pan_immuno/Y.sexcheck', sep=r'\s+')

# Inner join on sample identifiers; columns present in both reports get the
# pandas default _x / _y suffixes (e.g. PEDSEX_x used further down).
# NOTE(review): `all` shadows the Python builtin of the same name; the name is
# kept because later cells reference it.
all = X.merge(Y, on=['FID','IID'])

def myfunc(F, YCOUNT):
    """Derive a sex call from one sample's ``F`` and ``YCOUNT`` values.

    Returns ``"female"`` when F < 0.5 and YCOUNT < 10000; ``"male"`` when
    either (F >= 0.5 and YCOUNT >= 10000) or (F >= 0 and YCOUNT > 20000);
    otherwise ``"undetermined"``.
    """
    if F < 0.5 and YCOUNT < 10000:
        return "female"
    # Two routes to a male call: high F with a substantial Y count, or a
    # very high Y count with any non-negative F.
    if (F >= 0.5 and YCOUNT >= 10000) or (F >= 0 and YCOUNT > 20000):
        return "male"
    return "undetermined"
        
# Row-wise sex call computed from each sample's F and YCOUNT columns.
all['status'] = all.apply(lambda row: myfunc(row['F'], row['YCOUNT']), axis='columns')

def discordant(PEDSEX, status):
    """Compare a recorded sex code with the derived sex call.

    Returns ``"correct"`` when PEDSEX is 1 and ``status`` is ``"male"``, or
    PEDSEX is 2 and ``status`` is ``"female"``; every other combination
    (including an ``"undetermined"`` status) yields ``"incorrect"``.
    """
    male_match = PEDSEX == 1 and status == "male"
    female_match = PEDSEX == 2 and status == "female"
    return "correct" if (male_match or female_match) else "incorrect"
        
# Compare the recorded sex (PEDSEX column from the X report) with the derived call.
all['outcome'] = all.apply(lambda row: discordant(row['PEDSEX_x'], row['status']), axis='columns')


# F against YCOUNT, coloured by the derived sex call.
sns.scatterplot(x="F", y="YCOUNT", hue="status", data=all)

In [None]:
# Show the samples whose recorded sex disagrees with (or could not be confirmed by) the derived call.
all.loc[all['outcome'] == "incorrect"]

### Variant QC

In [None]:
# Hard-filter pipeline: import every per-chromosome VCF, split multi-allelic
# sites, filter genotype entries, then variants, then samples, and write the
# result to IO_filtered.mt.
mt = hl.import_vcf('/mnt/grid/janowitz/rdata_norepl/pan_immuno/pan_immuno_germline_hail.chr*.vcf.gz', force_bgz=True)
mt = hl.split_multi_hts(mt, permit_shuffle=True)

# Entry-level filters: keep calls whose FT is "PASS" or missing, then require
# DP >= 7 for SNPs and DP >= 10 for indels. Entries at sites that are neither
# a SNP nor an indel satisfy neither branch and are dropped.
mt = mt.filter_entries((mt.FT == "PASS") | (hl.is_missing(mt.FT))) #Remove filtered calls (Strelka)
mt = mt.filter_entries(((mt.DP >= 7) & hl.is_snp(mt.alleles[0], mt.alleles[1])) | ((mt.DP >= 10) & hl.is_indel(mt.alleles[0], mt.alleles[1])))
# Allele balance: alt-allele depth divided by the summed allelic depths.
mt = mt.annotate_entries(AB = mt.AD[1] / hl.sum(mt.AD))
# adj_count = number of calls per variant that are either hom-var, or het-ref
# with AB >= 0.15 (SNP) / AB >= 0.20 (indel).
mt = mt.annotate_rows(adj_count=hl.agg.count_where(((hl.is_snp(mt.alleles[0], mt.alleles[1])) & (mt.AB >= 0.15) & (mt.GT.is_het_ref())) |
                                                   ((hl.is_indel(mt.alleles[0], mt.alleles[1])) & (mt.AB >= 0.20) & (mt.GT.is_het_ref())) | 
                                                   mt.GT.is_hom_var()))

# Variant filters: call rate > 0.80, HWE p-value > 1e-15, and adj_count >= 1.
# NOTE(review): this comment previously said "call rate <90%" while the code
# uses 0.80 — confirm which threshold is intended.
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.call_rate > 0.80)
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-15)
mt = mt.filter_rows(mt.adj_count >= 1) #At least one hom-var call or AB-passing het call per variant
print(mt.count(), flush=True)

# Sample filter: keep samples with call rate > 0.80.
# NOTE(review): this comment previously said "<90%" while the code uses 0.80 — confirm.
mt = hl.sample_qc(mt, name='sample_qc')
mt = mt.filter_cols(mt.sample_qc.call_rate > 0.80)
print(mt.count(), flush=True)

# Inbreeding-coefficient outlier filter (two-sided): keep samples whose
# f_stat lies strictly within mean +/- 3 standard deviations.
# NOTE(review): variant_qc.AF used here was computed above, before the
# sample call-rate filter removed columns — confirm this is acceptable.
mt = mt.annotate_cols(IB = hl.agg.inbreeding(mt.GT, mt.variant_qc.AF[1]))
summ_stats = mt.aggregate_cols(hl.agg.stats(mt.IB.f_stat))
threshold_low = summ_stats['mean'] - (3* summ_stats['stdev'])
threshold_high = summ_stats['mean'] + (3* summ_stats['stdev'])
mt = mt.filter_cols(mt.IB.f_stat > threshold_low)
mt = mt.filter_cols(mt.IB.f_stat < threshold_high)

mt.write('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/IO_filtered.mt', overwrite=True) #Hard filters

### Sample QC

In [None]:
# Build the high-quality QC matrix table used for ancestry and relatedness:
# start from the hard-filtered data, keep sex-concordant samples, drop the
# listed LD regions, then apply get_qc_mt.
mt = hl.read_matrix_table('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/IO_filtered.mt')

#Remove samples with sex mismatch
# `all` is the pandas DataFrame built in the Sex QC section; that cell must
# have been run in this kernel before this one.
df = all[all.outcome == "correct"]
sex = hl.Table.from_pandas(df, key="IID") 
mt = mt.filter_cols(hl.is_defined(sex[mt.s]))

#Exclude LD intervals from plinkQC package
# NOTE(review): the original comment also said LD pruning is done in PLINK
# because of a Hail bug, but get_qc_mt below is called with ld_r2=0.1 —
# confirm which statement is current.
intervals = hl.import_bed('/mnt/grid/janowitz/home/skleeman/ukbiobank/cancergwas/remove_ld_grch38.bed',
                         reference_genome='GRCh38')

# keep=False removes every variant whose locus falls inside a listed interval.
qc_mt = mt.filter_rows(hl.is_defined(intervals[mt.locus]),keep=False)

#gnomad default filtering, including LD pruning
qc_mt = get_qc_mt(
    qc_mt,
    min_af=0.001,
    min_inbreeding_coeff_threshold=-0.025,
    ld_r2=0.1,
    apply_hard_filters = False,
    min_callrate=0.99,
    filter_lcr=False,
    filter_decoy=False,
    filter_segdup=False
)

print(qc_mt.count(), flush=True)
# checkpoint writes the table to disk and returns it; later cells re-read this path.
qc_mt = qc_mt.checkpoint('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/IO_filtered_qc.mt', overwrite=True)

In [None]:
# Display the dimensions of the checkpointed QC matrix table as the cell output.
qc_mt.count()

In [None]:
from gnomad.utils.liftover import *
from gnomad.utils.annotations import *
from gnomad.sample_qc.pipeline import *
from gnomad.sample_qc.ancestry import *

# Ancestry inference against a labelled reference panel, followed by
# relatedness inference within the cohort. Outputs:
#   panIO_ancestry_calls.ht  - per-sample population predictions
#   related_remove_panIO.ht  - samples to drop for relatedness
qc_mt = hl.read_matrix_table('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/IO_filtered_qc.mt')

#Import 1000G/HGDP reference
this_ref = hl.read_matrix_table('/mnt/grid/janowitz/home/references/1k_hgdp/ref_gnomadfilters.mt')
related_samples_to_remove_ref = hl.read_table("/mnt/grid/janowitz/home/references/1k_hgdp/related_remove_ref.ht")



#Merge cohorts
# The commented-out lines below restrict each dataset to the sites present in
# the other and checkpoint the results. They are disabled; the checkpointed
# tables are read back from disk instead. Re-enable them to regenerate the
# checkpoints (e.g. after qc_mt changes).
#panIO_in_ref = qc_mt.filter_rows(hl.is_defined(this_ref.rows()[qc_mt.row_key]))
#print('sites in ref and PanIO data, inds in PanIO: ' + str(panIO_in_ref.count()))

#ref_in_panIO = this_ref.filter_rows(hl.is_defined(qc_mt.rows()[this_ref.row_key]))
#print('sites in ref and PanIO data, inds in ref: ' + str(ref_in_panIO.count()))

#panIO_in_ref = panIO_in_ref.checkpoint('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/panIO_in_ref.mt', overwrite=True)
#ref_in_panIO = ref_in_panIO.checkpoint('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/ref_in_panIO.mt', overwrite=True)

panIO_in_ref = hl.read_matrix_table('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/panIO_in_ref.mt')
ref_in_panIO = hl.read_matrix_table('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/ref_in_panIO.mt')


#Ancestry PCA
#--> Reference, label with inferred populations, exclude relateds
# 10 PCs on the reference samples; only the scores and loadings are kept.
_, scores_pca_ref, loadings_pca_ref = run_pca_with_relateds(ref_in_panIO, related_samples_to_remove_ref, 
                                                               n_pcs=10, autosomes_only=True)

#--> Project to PanIO

scores_pca_panIO = pc_project(mt = panIO_in_ref, loadings_ht = loadings_pca_ref)

#Train RF classifier
# Stack reference and cohort scores into one table. Only rows whose key is
# found in this_ref.cols() receive a training label in the next step.
merge = scores_pca_ref.union(scores_pca_panIO)

merge = merge.annotate(
    training_pop=this_ref.cols()[merge.key].labeled_subpop)

# Map each labelled sub-population to its super-population via the recode sheet.
# `pd` is not imported in this cell; it relies on an earlier cell's import.
recode = pd.read_excel('/mnt/grid/janowitz/home/references/1k_hgdp/recode.xlsx')
recode_ht = hl.Table.from_pandas(recode, key='labeled_subpop')

merge = merge.annotate(
    training_pop=recode_ht[merge.training_pop].superpop)

# Population assignment on the PC scores with a fixed seed; min_prob=0.50 and
# missing_label='Other' are passed through to assign_population_pcs.
predictions_ref, classifer_rf_ref = assign_population_pcs(merge, pc_cols = merge.scores, known_col = 'training_pop', seed=501, min_prob = 0.50, missing_label='Other')

panIO_predictions = predictions_ref.semi_join(scores_pca_panIO) #Subset PanIO samples

panIO_predictions.write("/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/panIO_ancestry_calls.ht", overwrite=True)

# Pairwise relatedness on the QC genotypes (second positional argument 0.01,
# 10 PCs, pairs with kinship below 0.05 are not reported).
relatedness_ht = hl.pc_relate(qc_mt.GT, 0.01, k=10, min_kinship=0.05, block_size=512)

# Third argument False: return the samples to REMOVE (those outside the
# maximal independent set) rather than the samples to keep.
related_samples_to_remove = hl.maximal_independent_set(relatedness_ht.i, relatedness_ht.j, False)
print(related_samples_to_remove.count())

related_samples_to_remove.write("/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/related_remove_panIO.ht",overwrite=True)

### Refine ancestry (EUR)

There is evidence of a few outliers (about 4) in the EUR subset. A sample is treated as an outlier when its absolute Z-score is 5 or more on either PC1 or PC2; outliers are relabelled from EUR to "Other".

In [None]:
import pandas as pd
import pandas as pd
from gnomad.sample_qc.pipeline import *
from gnomad.sample_qc.ancestry import *


# Refine the EUR ancestry calls: run a 2-component PCA within the samples
# called EUR, flag samples with |Z| >= 5 on PC1 or PC2, relabel those samples
# as "Other", and write the updated calls to panIO_ancestry_calls_update.ht.
mt = hl.read_matrix_table('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/IO_filtered_qc.mt')

# NOTE(review): the related-sample list is loaded but never applied in this
# cell, so the PCA below runs on all EUR samples (relateds included) and no
# projection step follows — confirm whether relateds were meant to be excluded.
related_samples_to_remove = hl.read_table("/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/related_remove_panIO.ht")

#Add ancestry data
panIO_predictions = hl.read_table("/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/panIO_ancestry_calls.ht")
mt = mt.annotate_cols(pop = panIO_predictions[mt.s].pop)

# Restrict to samples whose assigned population is EUR.
mt_filter = mt.filter_cols(mt.pop=='EUR')
print(mt_filter.count(), flush=True)

# PCA on the EUR subset; only the per-sample scores are kept.
_, pcs, _ = hl.hwe_normalized_pca(mt_filter.GT, k=2)

# Unpack the `scores` array into scalar PC1 / PC2 fields.
scores_pca_ref = pcs.transmute(**{f'PC{i}': pcs.scores[i - 1] for i in range(1, 3)})

# Mean and standard deviation of both PCs in a single aggregation pass
# (previously four separate passes over the scores table).
pc_stats = scores_pca_ref.aggregate(hl.struct(
    PC1=hl.agg.stats(scores_pca_ref.PC1),
    PC2=hl.agg.stats(scores_pca_ref.PC2),
))

scores_pca_ref = scores_pca_ref.annotate(
    PC1_Z = (scores_pca_ref.PC1 - pc_stats.PC1.mean) / pc_stats.PC1.stdev,
    PC2_Z = (scores_pca_ref.PC2 - pc_stats.PC2.mean) / pc_stats.PC2.stdev,
)

# keep=False drops the inliers, so only the outliers remain in this table.
scores_pca_ref = scores_pca_ref.filter((hl.abs(scores_pca_ref.PC1_Z) <5) & (hl.abs(scores_pca_ref.PC2_Z) <5), keep=False)

# Samples present in the outlier table are relabelled "Other"; all other
# samples keep their existing population call.
panIO_predictions = panIO_predictions.annotate(pop = hl.case()
                                              .when(hl.is_defined(scores_pca_ref[panIO_predictions.s]), "Other")
                                              .default(panIO_predictions.pop))

panIO_predictions_pd = panIO_predictions.to_pandas()
panIO_predictions_pd = panIO_predictions_pd[["s", "pop"]]
# Printed explicitly: as a bare mid-cell expression the counts were computed
# and then silently discarded.
print(panIO_predictions_pd['pop'].value_counts(), flush=True)

panIO_predictions.write("/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/panIO_ancestry_calls_update.ht", overwrite=True)


### Prepare for imputation

In [None]:
# Imputation input: autosomal variants from the hard-filtered data, limited
# to samples that have an ancestry call and are not on the relatedness
# removal list.
mt = hl.read_matrix_table('/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/IO_filtered.mt')
mt = mt.filter_rows(mt.locus.in_autosome())

panIO_predictions = hl.read_table("/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/panIO_ancestry_calls_update.ht")
related_samples_to_remove = hl.read_table("/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/related_remove_panIO.ht")

# An EUR-only restriction of panIO_predictions is deliberately not applied
# here: every sample with an ancestry call is retained, whatever its population.
has_ancestry_call = hl.is_defined(panIO_predictions[mt.s])
is_related = hl.is_defined(related_samples_to_remove[mt.col_key])
mt = mt.filter_cols(has_ancestry_call & ~is_related)

mt.count()

In [None]:
# Write the `mt` prepared in the previous cell as an uncompressed VCF
# (the output path ends in .vcf) for upload to the imputation server.
hl.export_vcf(mt, '/mnt/grid/janowitz/rdata_norepl/pan_immuno/new_impute/exome_variants_forimputation_allancestry.vcf')

### Imputed cohort

Imputation was performed with the TOPMed Imputation Server; imputed variants are then filtered to keep those with imputation R2 ≥ 0.6.

In [None]:
# Load every per-chromosome imputed dosage VCF, keep variants whose INFO/R2
# is at least 0.6, and persist the result as a Hail matrix table.
imputed_vcf_glob = '/mnt/grid/janowitz/rdata_norepl/pan_immuno/imputed/chr*.dose.vcf.gz'
mt = hl.import_vcf(imputed_vcf_glob, force_bgz=True)

passes_r2 = mt.info.R2 >= 0.6
mt = mt.filter_rows(passes_r2)

mt.write(
    '/mnt/grid/janowitz/rdata_norepl/pan_immuno/panIO_dna_imputed_info0.6.mt',
    overwrite=True,
)

### Export cleaned GRCh37 data

Filter to only SNPs in PGS

In [None]:
#Filter summary stats

from gnomad.utils.liftover import *
from gnomad.utils.annotations import *

import pandas as pd

# Export the imputed, EUR-only, unrelated cohort as a PLINK fileset after
# liftover, and write the subset of the summary statistics whose variants
# are present in the lifted data.
mt = hl.read_matrix_table('/mnt/grid/janowitz/rdata_norepl/pan_immuno/panIO_dna_imputed_info0.6.mt')

panIO_predictions = hl.read_table("/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/panIO_ancestry_calls_update.ht")
panIO_predictions = panIO_predictions.filter(panIO_predictions.pop=="EUR")
related_samples_to_remove = hl.read_table("/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/related_remove_panIO.ht")

mt = mt.filter_cols(hl.is_defined(panIO_predictions[mt.s])) #Only EUR
mt = mt.filter_cols(~hl.is_defined(related_samples_to_remove[mt.col_key])) #Remove related

#Add phenotype data
# Same phenotype preparation as in the Sex QC cell: build the sample key `s`,
# select the phenotype columns, and recode 'M' -> 0 / 'F' -> 1 (the replace
# applies to every selected column).
data = pd.read_excel('~/COHORT_Feb2020_strandupdate_rmdup.xlsx')
data['s'] = data.accession + '_' + data.SUBJECT_ID + '_normal'
data = data[['s', 'age','sex','dcb','os_days','os_stat','pfs_days','pfs_stat']]
data = data.replace({'M': 0, 'F': 1})
pheno = hl.Table.from_pandas(data, key="s") 

mt = mt.annotate_cols(age = pheno[mt.s].age, sex = pheno[mt.s].sex, dcb = pheno[mt.s].dcb,
                     os_days = pheno[mt.s].os_days, os_stat = pheno[mt.s].os_stat)
# sex == 1 corresponds to 'F' after the recode above.
mt = mt.annotate_cols(is_female = hl.if_else(mt.sex==1, True, False))

# Summary statistics keyed on GRCh37 coordinates; alleles are ordered [A2, A1].
# NOTE(review): this presumes A2 is the reference allele and A1 the
# alternate — confirm against the summary-statistics file.
stats = hl.import_table('/mnt/grid/ukbiobank/data/Application58510/skleeman/gwas_cystatinc/PRS/EUR/summ_SEM_cystatin_vaf_effectflip.tsv', impute=True)
stats = stats.annotate(locus=hl.locus(hl.str(stats.CHR), hl.int32(stats.BP),reference_genome='GRCh37'))
stats = stats.annotate(alleles=[stats.A2, stats.A1])

# Lift the genotype data over so its loci can be joined with the GRCh37
# loci built above (the export path below is suffixed "grch37").
mt = default_lift_data(mt)

# Keep only summary-statistic rows whose (locus, alleles) exists in the lifted data.
stats = stats.filter(hl.is_defined(mt.rows()[stats.locus, stats.alleles]))
print(stats.count())
# Drop the helper join fields before exporting.
stats = stats.drop('locus', 'alleles')

stats.export('/mnt/grid/ukbiobank/data/Application58510/skleeman/gwas_cystatinc/PRS/EUR/summ_SEM_cystatin_vaf_effectflip_exome.tsv')


# Variant IDs are written as "<contig>:<position>".
hl.export_plink(mt, '/mnt/grid/janowitz/rdata_norepl/pan_immuno/hail/imputed_exome_gwas_panIO_grch37',
                ind_id = mt.s,
                is_female = mt.is_female,
                varid = hl.delimit([mt.locus.contig, hl.str(mt.locus.position)], ':'))