In [None]:
import polars as pl
import pandas as pd
import os, sys
from tqdm import tqdm

from grelu.data.preprocess import filter_blacklist
from grelu.data.utils import get_chromosomes

## Paths

In [None]:
# Root directory of the benchmark GWAS results; per-trait fine-mapping files
# are looked up under its 'Complete' subdirectory further down.
gwas_dir = '/gstore/data/humgenet/projects/statgen/GWAS/Benchmark_GWAS/'

# Space-separated table read below as (trait_ID, study, trait_name).
trait_file = os.path.join(gwas_dir, 'disease_list.txt')

# Tab-separated gnomAD variant table (no header row).
gnomad_file = '/data/gnomAD/gnomad-regulatory-variants.tsv'

# Decima data matrix; not referenced by the cells visible in this notebook.
matrix_file = '/gstore/data/resbioai/grelu/decima/20240823/data.h5ad'

## Load GWAS SNPs

In [None]:
# The trait list has no header row and uses a single space as delimiter,
# so column names are supplied explicitly.
trait_columns = ['trait_ID', 'study', 'trait_name']
traits = pl.read_csv(
    trait_file,
    separator=' ',
    has_header=False,
    new_columns=trait_columns,
)
traits.head(2)

In [None]:
# Load one SuSiE fine-mapping table per (trait, study) row and stack them.
# Only a subset of the file's columns is read (selected by position) and
# renamed to the names below.
susie_column_idx = [0, 1, 2, 3, 4, 6, 9, 10]
susie_column_names = ['chrom', 'rsid', 'pos', 'ref', 'alt', 'MAF', 'p', 'PIP']

# Collect per-trait frames in their own list so `snps` is only ever a DataFrame.
snp_tables = []

# iter_rows() is a generator without a length, so tqdm needs an explicit
# total to draw a progress bar / ETA instead of a bare counter.
for trait_id, study, trait_name, *_ in tqdm(traits.iter_rows(), total=len(traits)):
    susie_file = os.path.join(gwas_dir, 'Complete', trait_id, f'{study}.susie.gwfinemap.b38.gz')
    df = pl.read_csv(
        susie_file,
        separator='\t',
        columns=susie_column_idx,
        new_columns=susie_column_names,
    )
    # Tag every SNP with the trait/study it came from (constant per file).
    df = df.with_columns([
        pl.lit(trait_id).alias('trait_ID'),
        pl.lit(study).alias('study'),
        pl.lit(trait_name).alias('trait_name'),
    ])
    snp_tables.append(df)

snps = pl.concat(snp_tables)

## Load gnomAD regulatory variants within 100 kb of Decima gene TSSs

In [None]:
%%time
gnomad = pl.read_csv(gnomad_file, has_header=False, separator='\t', columns=[0, 1, 2, 3, 4, 5, 8],
    new_columns=['chrom', 'pos', 'rsid', 'ref', 'alt', 'af', 'vep']).unique()
print(len(gnomad))
gnomad.head(3)

## Subset GWAS variants with high PIP (> 0.9)

In [None]:
# Row count before the PIP filter.
print(snps.height)

# Keep only rows whose PIP is above 0.9.
high_pip = pl.col("PIP") > 0.9
snps = snps.filter(high_pip)

# Row count after the PIP filter.
print(snps.height)

## Select regulatory variants <100kb from TSS

In [None]:
%%time
snps = snps.join(gnomad, on=['chrom', 'pos', 'rsid', 'ref', 'alt'], how = 'inner')
snps = snps.with_columns(maf = snps['af'].apply(lambda x: 1-x if x > .5 else x))
snps = snps.drop(columns=['af'])
print(len(snps))

## Filter chromosomes

In [None]:
# Restrict to the chromosome names returned for the 'autosomesXY' set.
allowed_chroms = get_chromosomes('autosomesXY')
snps = snps.filter(pl.col("chrom").is_in(allowed_chroms))
snps.height

## Filter SNVs with clear alleles

In [None]:
# Keep single-nucleotide variants only: both alleles must be exactly one of A/C/G/T.
unambiguous_bases = ["A", "C", "G", "T"]
snps = snps.filter(
    pl.col("ref").is_in(unambiguous_bases) & pl.col("alt").is_in(unambiguous_bases)
)
snps.height

## Filter blacklist

In [None]:
# From here on `snps` is a pandas DataFrame (filter_blacklist is called on a
# pandas frame with chrom/start/end columns placed first).
snps = snps.to_pandas()

# NOTE(review): `pos` is joined against gnomAD positions above, which are
# presumably 1-based (VCF convention), while BED-style intervals are 0-based
# half-open. The single base at 1-based `pos` is therefore [pos - 1, pos),
# not [pos, pos + 1). Confirm the coordinate convention filter_blacklist expects.
snps['start'] = snps['pos'] - 1
snps['end'] = snps['pos']

# Move the interval columns to the front, keeping all other columns in order.
snps = snps[['chrom', 'start', 'end'] + [x for x in snps.columns if x not in ['chrom', 'start', 'end']]]
snps = filter_blacklist(snps, 'hg38')

# The interval columns were only needed for the blacklist overlap.
snps = snps.drop(columns=['start', 'end'])
len(snps)

## Keep variants with a significant p-value (p < 1e-6)

In [None]:
# Boolean mask of rows with p below 1e-6; copy so later column
# assignments do not write into a view of the unfiltered frame.
is_significant = snps['p'] < 1e-6
snps = snps.loc[is_significant].copy()
snps.shape[0]

## Assign variant ID

In [None]:
# Build the variant ID as "<chrom>_<pos>_<ref>_<alt>" from the string form of each column.
variant_id = snps['chrom'].astype(str)
for id_col in ('pos', 'ref', 'alt'):
    variant_id = variant_id + '_' + snps[id_col].astype(str)
snps['variant'] = variant_id

## Get unique variants

In [None]:
# Collapse the per-trait rows to one row per variant, keeping the smallest
# p-value observed for that variant across traits as `min_p`.
# dropna=False: pandas groupby otherwise silently discards every row whose
# key columns contain a missing value (e.g. a variant with no `vep` entry).
variant_keys = ['chrom', 'pos', 'ref', 'alt', 'variant', 'rsid', 'vep', 'maf']
pos_variants = (
    snps.groupby(variant_keys, dropna=False)
    .p.min()
    .reset_index(name='min_p')
)

In [None]:
# Number of unique variants retained.
pos_variants.shape[0]

## Save

In [None]:
# Directory that receives the two CSV outputs written at the end of the notebook.
out_dir = '/gstore/data/resbioai/grelu/decima/20240823/gwas_44traits/positive_variants'

In [None]:
# pos_file: one row per (variant, trait); pos_variants_file: one row per unique variant.
pos_file, pos_variants_file = [
    os.path.join(out_dir, csv_name)
    for csv_name in ('positive_variants_and_traits.csv', 'positive_variants.csv')
]

In [None]:
# Make sure the output directory exists so the run does not fail at the final step.
os.makedirs(out_dir, exist_ok=True)

# Per-trait rows first, then the de-duplicated variant table.
snps.to_csv(pos_file, index=False)
pos_variants.to_csv(pos_variants_file, index=False)