In [1]:
import glob
import itertools as it
import numpy as np
import os
import pandas as pd
import pybedtools as pbt
import statsmodels
import subprocess
import sys
import time

sys.path.append(os.path.dirname(os.getcwd()))
import UKBB_traits

sys.path.append("/dors/capra_lab/users/fongsl/tools/py_/")
import config_readwrite as crw
import fet
import split_filename

sys.path.append("/dors/capra_lab/users/fongsl/tools/genome/")
import chr_functions

In [2]:
# Project config file; crw.read_config's result is unpacked into the config
# object and the config file name.
name = "/data/hodges_lab/ATAC-STARR_B-cells/bin_human-evolution/config"
config, configfile_name = crw.read_config(name)

# Inputs: background to shuffle in, and the directory of region BED files.
BKGD = config["BKGD"]["no_hla"]
REGIONS_PATH = config["CIS_TRANS"]["regions_dir"]

# UKBB locations: results directory and the directory of per-trait BED files.
RE = config["UKBB"]["results"]
PATH = config["UKBB"]["path"]

# Enrichment scripts: FC_PY is run with `python`, FC_SLURM is submitted with `sbatch`.
FC_PY = config["VAR_ENRICHMENT"]["bin"]
FC_SLURM = config["VAR_ENRICHMENT"]["bin_slurm"]

OUTFILE = config["UKBB"]["peaks_fold_change1k"]

# Selects which per-trait BED file name is used when the trait list is built:
# "..._uniq.bed" unless HLA is False, in which case "..._uniq_nohla.bed".
HLA = True

# functions to intersect bed files 

In [3]:
def calculate_fc(py, bed, gwas, bkgd, outfile):
    """Run the fold-change script in a local subprocess and report its exit status.

    The command executed is
    ``python {py} {bed} {gwas} {bkgd} -o {outfile} -i 1000 -n 10``;
    the ``-i 1000 -n 10`` options are fixed here and their meaning is defined
    by the script itself (not visible in this notebook).

    Parameters
    ----------
    py : str
        Path of the Python script to run.
    bed : str
        First positional argument passed to the script (region BED file).
    gwas : str
        Second positional argument passed to the script (trait BED file).
    bkgd : str
        Third positional argument passed to the script (background file).
    outfile : str
        Value passed to the script's ``-o`` option.

    Returns
    -------
    int
        Exit status of the command as returned by ``subprocess.call``.
    """
    cmd = f'python {py} {bed} {gwas} {bkgd} -o {outfile} -i 1000 -n 10'
    print("\n\nrunning", bed)
    print(cmd)

    # The command goes through the shell with unquoted arguments, so paths
    # must not contain whitespace or shell metacharacters.
    returncode = subprocess.call(cmd, shell=True)

    # Surface failures instead of silently discarding the exit status.
    if returncode != 0:
        print(f"WARNING: exit status {returncode} from: {cmd}")

    return returncode

In [4]:
def calculate_slurm(slurm, bed, gwas, bkgd, outfile, nthreads):
    """Submit the enrichment job with ``sbatch`` and report the submission status.

    The command executed is
    ``sbatch {slurm} {bed} {gwas} {bkgd} {outfile} {nthreads}``.

    Parameters
    ----------
    slurm : str
        Path of the SLURM batch script to submit.
    bed : str
        First argument forwarded to the batch script (region BED file).
    gwas : str
        Second argument forwarded to the batch script (trait BED file).
    bkgd : str
        Third argument forwarded to the batch script (background file).
    outfile : str
        Fourth argument forwarded to the batch script (results file).
    nthreads : int or str
        Fifth argument forwarded to the batch script.

    Returns
    -------
    int
        Exit status of the ``sbatch`` command as returned by
        ``subprocess.call``. This reflects the submission only, not the job
        that eventually runs.
    """
    cmd = f'sbatch {slurm} {bed} {gwas} {bkgd} {outfile} {nthreads}'
    print("\n\nrunning slurm", bed)

    # The command goes through the shell with unquoted arguments, so paths
    # must not contain whitespace or shell metacharacters.
    returncode = subprocess.call(cmd, shell=True)

    # Surface a rejected submission instead of silently discarding the status.
    if returncode != 0:
        print(f"WARNING: exit status {returncode} from: {cmd}")

    return returncode

# launch GWAS enrichment

## get GWAS traits

In [5]:
trait_dict = UKBB_traits.trait_all_mapping()

# Every trait shares one file-name suffix, chosen once from the HLA flag:
# only an explicit `HLA is False` selects the "_nohla" variant.
bed_suffix = "clean_LD_exp_uniq_nohla.bed" if HLA is False else "clean_LD_exp_uniq.bed"

# One BED path per trait key, in the mapping's iteration order.
traits = [os.path.join(PATH, f"{key}{bed_suffix}") for key in trait_dict]

# show the first path as a sanity check
traits[0]

'/data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/gwas/ukbb/categorical-20002-both_sexes-1381clean_LD_exp_uniq.bed'

In [6]:
# Display the full mapping returned by UKBB_traits.trait_all_mapping()
# (trait file-name key -> short trait label) for reference.
UKBB_traits.trait_all_mapping()

{'categorical-20002-both_sexes-1381': 'SLE',
 'categorical-20002-both_sexes-1464': 'RA',
 'categorical-20002-both_sexes-1452': 'ECZEMA',
 'categorical-20002-both_sexes-1111': 'ASTHMA',
 'categorical-20002-both_sexes-1463': 'UC',
 'phecode-202.2-both_sexes': 'NHL',
 'continuous-30000-both_sexes-irnt': 'WBC_COUNT',
 'categorical-20002-both_sexes-1387': 'HAY_FEVER_ALLERGIC_RHINITIS',
 'continuous-30200-both_sexes-irnt': 'NEUTROPHIL_PERCENTAGE',
 'continuous-845-both_sexes': 'AGE_COMPLETE_FULL_TIME_EDU',
 'phecode-290.11-both_sexes': 'ALZ',
 'categorical-22506-both_sexes-111': 'TOBACCO_SMOKING_MOST_ALL_DAY',
 'continuous-30120-both_sexes-irnt': 'LYMPHOCYTE_COUNT',
 'continuous-30080-both_sexes-irnt': 'PLATELET_COUNT',
 'continuous-30100-both_sexes-irnt': 'MEAN_PLATELET_VOLUME',
 'phecode-174-both_sexes': 'BREAST_CANCER',
 'icd10-K50-both_sexes': 'CROHNS',
 'phecode-335-both_sexes': 'MS',
 'phecode-939-both_sexes': 'ATOPIC_DERMATITIS',
 'phecode-557.1-both_sexes': 'CELIAC_DISEASE',
 'phecod

## get file names to intersect 

In [7]:
## region BED file names to test for enrichment, listed in run order
test_filenames = [
    # regulatory-mode sets plus the conserved-active set
    "cis_only.bed",
    "cis+trans.bed",
    "trans_only.bed",
    "conserved_active.regions.bed",
    # HH-active_MM-inactive sets
    "HH-active_MM-inactive_cis-only.bed",
    "HH-active_MM-inactive_cis+trans.bed",
    "HH-active_MM-inactive_trans-only.bed",
    # MM-active_HH-inactive sets
    "MM-active_HH-inactive_cis-only.bed",
    "MM-active_HH-inactive_cis+trans.bed",
    "MM-active_HH-inactive_trans-only.bed",
]

## run filename x trait intersection 

In [8]:
def check_already_run(outf, sid):
    """Decide whether the analysis identified by `sid` still needs to be run.

    Parameters
    ----------
    outf : str
        Path of a tab-separated results file with a "sid" column.
    sid : str
        Identifier to look for in the "sid" column of `outf`.

    Returns
    -------
    bool
        True when the analysis should be run: `outf` does not exist, exists
        but holds no data, or does not list `sid`. False when `sid` is already
        present in `outf`.

    Raises
    ------
    KeyError
        If `outf` has data but no "sid" column.
    """
    # No results file yet: nothing has been run for this trait.
    if not os.path.exists(outf):
        return True

    try:
        previous = pd.read_csv(outf, sep='\t')
    except pd.errors.EmptyDataError:
        # The file exists but is empty (e.g. created, never written to),
        # so no analysis has been recorded in it.
        return True

    return sid not in set(previous["sid"])

In [17]:
# Submit one SLURM enrichment job per (trait, region file) pair, skipping
# pairs already recorded in that trait's results file.
combos = it.product(traits, test_filenames)

N_THREADS = 10  # forwarded to the SLURM script as its last argument
BED_EXT = ".bed"

val = 0  # number of jobs submitted by this cell
for trait, f in combos:

    # Remove the extension as a suffix. str.strip(".bed") would instead strip
    # any of the characters ".", "b", "e", "d" from both ends of the name,
    # corrupting names that start or end with those letters.
    stem = f[:-len(BED_EXT)] if f.endswith(BED_EXT) else f
    f = "peaks-" + stem + "_nohla.bed"

    # one results file per trait
    path, filename, sid = split_filename.split_filename(trait)
    outfile = os.path.join(RE, (sid.split("_exp.bed")[0] + "_nohla.tsv"))

    full_bed = os.path.join(REGIONS_PATH, f)

    # The region file name without ".bed" is the id looked up in the
    # "sid" column of the results file.
    RUN = check_already_run(outfile, f.split(".bed")[0])

    if RUN is True:
        print("\nRUNNING", RUN, full_bed, trait, "\n\n")

        calculate_slurm(FC_SLURM, full_bed, trait, BKGD, outfile, N_THREADS)

        val += 1  # count runs

    # Optional throttle (disabled): sleep six minutes after every 50 submissions.
    #if val >0 and val%50 ==0:
    #    time.sleep(60*6)

    # Local (non-SLURM) alternative (disabled):
    #calculate_fc(FC_PY, full_bed, trait, BKGD, outfile)
print(val)


RUNNING True /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-cis_only_nohla.bed /data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/gwas/ukbb/categorical-20002-both_sexes-1381clean_LD_exp_uniq.bed 




running slurm /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-cis_only_nohla.bed

RUNNING True /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-cis+trans_nohla.bed /data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/gwas/ukbb/categorical-20002-both_sexes-1381clean_LD_exp_uniq.bed 




running slurm /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-cis+trans_nohla.bed

RUNNING True /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-trans_only_nohla.bed /data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/gwas/ukbb/categorical-20002-both_sexes-1381clean_LD_exp_uniq.bed 




running slurm /data/hodges_lab/ATAC-STARR_B-cells/re


RUNNING True /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-conserved_active.regions_nohla.bed /data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/gwas/ukbb/categorical-20002-both_sexes-1452clean_LD_exp_uniq.bed 




running slurm /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-conserved_active.regions_nohla.bed

RUNNING True /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-HH-active_MM-inactive_cis-only_nohla.bed /data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/gwas/ukbb/categorical-20002-both_sexes-1452clean_LD_exp_uniq.bed 




running slurm /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-HH-active_MM-inactive_cis-only_nohla.bed

RUNNING True /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-HH-active_MM-inactive_cis+trans_nohla.bed /data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/gwas/ukbb/categorical-20002


RUNNING True /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-MM-active_HH-inactive_trans-only_nohla.bed /data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/gwas/ukbb/categorical-20002-both_sexes-1463clean_LD_exp_uniq.bed 




running slurm /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/regions/peaks-MM-active_HH-inactive_trans-only_nohla.bed
46


In [None]:

# Manually resubmit selected (trait, region file) pairs.
# Keys are trait BED paths; values are the region file names to rerun.
rerun = {'/data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/gwas/categorical-20002-both_sexes-1387clean_LD_exp_uniq.bed':
         [
            "cis+trans.bed", 
            #"MM-active_HH-inactive_cis+trans.bed",
            #"HH-active_MM-inactive_trans-only.bed", 
            #"MM-active_HH-inactive_cis-only.bed",
            #"MM-active_HH-inactive_trans-only.bed"
                     ]
}

N_THREADS = 10  # forwarded to the SLURM script as its last argument

for trait, fs in rerun.items():
    for f in fs:
        # NOTE(review): unlike the main launch loop, the region name is used
        # as-is here (no "peaks-" prefix / "_nohla" suffix) and the results
        # file ends in ".tsv" rather than "_nohla.tsv" -- confirm this is intended.
        full_bed = os.path.join(REGIONS_PATH, f)

        # write a file for each trait
        path, filename, sid = split_filename.split_filename(trait)

        outfile = os.path.join(RE, (sid.split("_exp.bed")[0]+".tsv"))

        # Count lines with a context manager so the file handle is closed
        # promptly instead of being left for garbage collection.
        with open(trait) as trait_handle:
            n_lines = sum(1 for _ in trait_handle)
        print(f"n lines for {sid}", n_lines)

        calculate_slurm(FC_SLURM, full_bed, trait, BKGD, outfile, N_THREADS)

    #calculate_fc(FC_PY, full_bed, trait, BKGD, outfile)
    