In [1]:
"""
compare footprints
"""

'\ncompare footprints\n'

In [2]:
from functools import partial
import glob
from multiprocessing import Pool
import numpy as np
import os
import pandas as pd
import pybedtools as pbt
import re
import subprocess
import sys

sys.path.append("/dors/capra_lab/users/fongsl/tools/py_/")
sys.path.append("/dors/capra_lab/users/fongsl/tools/genome/")
import config_readwrite as crw
import chr_functions
import split_filename

In [3]:
# Load the project config file; read_config's two return values are unpacked
# as (config, configfile_name).
name = "/data/hodges_lab/ATAC-STARR_B-cells/bin_human-evolution/config"
config, configfile_name = crw.read_config(name)

# Values looked up from the config by section and key.
SHUF = config["SHUFFLES"]["shuf-all"]
REGIONS = config["CIS_TRANS"]["regions"]
# ANNOT is the annotation table that get_FP_df passes to add_annotations.
ANNOT = config["CIS_TRANS"]["regions_annotations"]

ID_TAG = config["TF_FOOTPRINTING_JASPAR"]["ID_TAG"]

# PATH is the directory load_liftedover_fp reads from; RE is the directory the
# differential-footprint table is written to.
PATH = config["TF_FOOTPRINTING"]["PATH"]
RE = config["TF_FOOTPRINTING"]["results"]

RNA = config["RNASEQ"]["dif_exp"]
FPKM = config["RNASEQ"]["GM12878_FPKM"]

# NOTE(review): split_filename is a project helper; presumably it returns
# (directory, file name, base name) for REGIONS -- confirm against the helper.
path, region_file, region = split_filename.split_filename(REGIONS)

0. String-split to get the TF name.
1. Intersect regions with footprint files.
2. Create a matrix keyed by region_id (should be 17605 regions x 693 archetypes).

# functions

## load human footprint matrix and raw data

In [4]:
def load_data(cell_line, raw):
    """
    Load TF footprint data for one cell line.

    File locations are read from the notebook-level `config` object, section
    "TF_FOOTPRINTING_JASPAR_{cell_line}", keys "matrix" and "FP".

    Parameters
    ----------
    cell_line : str
        Tag used to build the config section name (e.g. "GM12878").
    raw : bool
        If exactly True, load the cleaned per-footprint table; for any other
        value, load the footprint matrix.

    Returns
    -------
    pandas.DataFrame
        raw is True: the de-duplicated rows of "<FP without .bed>_clean.bed"
        with a "len" column (end_tf - start_tf) added and any "index" /
        "level_0" bookkeeping columns removed.
        Otherwise: the matrix table with missing values replaced by 0.
    """
    section = config[f"TF_FOOTPRINTING_JASPAR_{cell_line}"]

    MATRIX = section["matrix"]
    FP = section["FP"]

    if raw is True:
        # Remove the ".bed" extension explicitly. str.strip(".bed") treats its
        # argument as a character set and would also eat trailing/leading
        # 'b', 'e', 'd' and '.' characters that belong to the file name.
        suffix = ".bed"
        stem = FP[:-len(suffix)] if FP.endswith(suffix) else FP
        clean_fp = stem + "_clean.bed"
        print(clean_fp)

        df = pd.read_csv(clean_fp, sep='\t')

        # Discard the old row index instead of materialising it as a column.
        df = df.drop_duplicates().reset_index(drop=True)
        df["len"] = df["end_tf"] - df["start_tf"]

        # The file may carry a saved "index" column; drop it when present
        # without failing on files that do not have one.
        df = df.drop(columns=["level_0", "index"], errors="ignore")
    else:
        df = pd.read_csv(MATRIX, sep='\t').fillna(0)

    return df

In [5]:
# Cell-line tag; load_data uses it to pick the
# "TF_FOOTPRINTING_JASPAR_GM12878" config section.
CL = "GM12878"
# Disabled: load the raw per-footprint table instead of the matrix.
#RAW = True
#hu_raw = load_data(CL, RAW)

# With RAW = False, load_data returns the footprint matrix with NaN filled by 0.
RAW = False
hu = load_data(CL, RAW)

/data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/TF_footprint/GM12878_1.8-JASPAR_0.05-pval_morethan5bp_clean.bed


## describe the rhesus footprints

In [8]:
def load_liftedover_fp(path):
    """
    Read the lifted-over LCL8664 footprint BED file found in `path`.

    The file "LCL8664_1.liftOver.to.Hg38.bed" is read as a headerless,
    tab-separated table with five columns.

    Parameters
    ----------
    path : str
        Directory containing the liftOver BED file.

    Returns
    -------
    pandas.DataFrame
        Columns: "#chr_tf_lifted", "start_tf_lifted", "end_tf_lifted",
        "tfid2", "region_id".
    """
    column_names = [
        "#chr_tf_lifted",
        "start_tf_lifted",
        "end_tf_lifted",
        "tfid2",
        "region_id",
    ]
    bed_file = os.path.join(path, "LCL8664_1.liftOver.to.Hg38.bed")
    return pd.read_csv(bed_file, sep='\t', header=None, names=column_names)

In [9]:
# Disabled: load the raw LCL8664 per-footprint table instead of the matrix.
#CL, RAW = "LCL8664", True
#rh_raw = load_data(CL, RAW)

# LCL8664 footprint matrix (passed to get_FP_df as the "rhesus" data further
# down), plus the footprint coordinates lifted over to hg38.
CL, RAW = "LCL8664", False
rh = load_data(CL, RAW)
lifted = load_liftedover_fp(PATH) # 288802/290592 TF FP liftover

# Disabled: attach the lifted-over coordinates to the raw rhesus table.
#rh_raw = pd.merge(rh_raw, lifted) # merge raw rhe FP data w/ liftover coordinates to hg38

/data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/TF_footprint/LCL8664_1.8-JASPAR_0.05-pval_morethan5bp_clean.bed


# Find the TF you're looking for 

In [36]:
def find_tf_name(tfname, df):
    """
    List the column names of `df` that contain `tfname` as a substring.

    Each match is also printed together with its column position.

    Parameters
    ----------
    tfname : str
        Substring to look for (e.g. "ETS1").
    df : pandas.DataFrame or iterable of str
        Object whose iteration yields the names to search; for a DataFrame
        these are its column labels.

    Returns
    -------
    list of str
        Matching names, in their original order.
    """
    matches = []
    for position, column in enumerate(list(df)):
        if tfname not in column:
            continue
        print(position, column)
        matches.append(column)
    return matches

# Get the human and rhesus ETS1 dataframes

In [24]:
def get_FP_df(tf, df, species):
    """
    Build the annotated per-region footprint table for one TF and species.

    Takes the "region_id" and `tf` columns of `df`, renames the TF column to
    "{tf}-{species}_N_FP" so the species is recorded in the column name, and
    joins the result onto the region annotations with
    add_annotations(ANNOT, ...).

    The TF values are passed through unchanged; capping values above 1 to 1
    was considered at one point but is not applied.

    Parameters
    ----------
    tf : str
        Name of the TF column in `df`.
    df : pandas.DataFrame
        Footprint matrix containing "region_id" and the `tf` column.
    species : str
        Label embedded in the renamed column (e.g. "human", "rhesus").

    Returns
    -------
    pandas.DataFrame
        Output of add_annotations for the selected, renamed columns.
    """
    labelled_col = f"{tf}-{species}_N_FP"
    counts = df[["region_id", tf]].rename(columns={tf: labelled_col})
    return add_annotations(ANNOT, counts)


In [25]:
def add_annotations(annot_file, df):
    """
    Left-join `df` onto the region annotation table.

    The annotation file is read as a tab-separated table and reduced to the
    coordinate, region_id and activity-category columns listed below. `df` is
    merged on whatever columns the two tables share, keeping every annotation
    row; values missing after the merge (regions absent from `df`) become 0.

    Parameters
    ----------
    annot_file : str
        Path to the tab-separated annotation table.
    df : pandas.DataFrame
        Table to attach; must share at least one column with the annotations.

    Returns
    -------
    pandas.DataFrame
        Annotation columns followed by the remaining columns of `df`.
    """
    coordinate_cols = ['#chr', 'start', 'end', 'region_id']
    category_cols = [
        'conserved_active.regions',
        'trans_only', 'cis_only', 'cis+trans',
    ]
    human_active_cols = [
        'HH-active_MM-inactive_cis-only',
        'HH-active_MM-inactive_trans-only',
        'HH-active_MM-inactive_cis+trans',
    ]
    macaque_active_cols = [
        'MM-active_HH-inactive_cis-only',
        'MM-active_HH-inactive_trans-only',
        'MM-active_HH-inactive_cis+trans',
    ]
    keep = coordinate_cols + category_cols + human_active_cols + macaque_active_cols

    annotations = pd.read_csv(annot_file, sep='\t')

    # Left merge keeps regions with no footprint; their NaNs are set to 0.
    return pd.merge(annotations[keep], df, how="left").fillna(0)

In [29]:
def dif_fp(tf, hutf, rhtf):
    """
    Combine human and rhesus footprint tables and compute their difference.

    `hutf` and `rhtf` are left-merged (human on the left) on the coordinate,
    region_id and activity-category columns; missing values are set to 0.
    A "{tf}_difFP" column is added as
    "{tf}-human_N_FP" minus "{tf}-rhesus_N_FP".

    Parameters
    ----------
    tf : str
        TF name used as the prefix of the footprint columns.
    hutf : pandas.DataFrame
        Annotated table holding the "{tf}-human_N_FP" column.
    rhtf : pandas.DataFrame
        Annotated table holding the "{tf}-rhesus_N_FP" column.

    Returns
    -------
    pandas.DataFrame
        Coordinates and region_id, then the human, rhesus and difference
        columns, then the activity-category columns.
    """
    coordinate_cols = ['#chr', 'start', 'end', 'region_id']
    category_cols = [
        'conserved_active.regions',
        'trans_only', 'cis_only', 'cis+trans',
        'HH-active_MM-inactive_cis-only',
        'HH-active_MM-inactive_trans-only',
        'HH-active_MM-inactive_cis+trans',
        'MM-active_HH-inactive_cis-only',
        'MM-active_HH-inactive_trans-only',
        'MM-active_HH-inactive_cis+trans',
    ]

    human_col = f"{tf}-human_N_FP"
    rhesus_col = f"{tf}-rhesus_N_FP"
    diff_col = f"{tf}_difFP"

    # Join on every shared annotation column so none of them is duplicated.
    combined = hutf.merge(rhtf, on=coordinate_cols + category_cols, how="left")
    combined = combined.fillna(0)

    combined[diff_col] = combined[human_col] - combined[rhesus_col]

    # Put the footprint columns right after the region identifiers.
    final_order = coordinate_cols + [human_col, rhesus_col, diff_col] + category_cols
    return combined[final_order]

In [38]:
# Column names in the human matrix that contain "ETS1"; find_tf_name also
# prints each match with its column position.
TFNAMES = find_tf_name("ETS1", hu)
# Use the first match as the TF column for both species.
TF = TFNAMES[0]


# get the human ETS1 FP data
hu_ets1 = get_FP_df(TF, hu, "human")

# get the rhesus ETS1 FP data
rhe_ets1 = get_FP_df(TF, rh, "rhesus")

124 ETS1_MA0098.3


In [39]:
# Combine the human and rhesus tables and add the "{TF}_difFP" column.
merged = dif_fp(TF, hu_ets1, rhe_ets1)
print(merged.shape)
# Display the regions with a non-zero human value for this TF.
merged.loc[merged[f"{TF}-human_N_FP"]>0]

(17604, 17)


Unnamed: 0,#chr,start,end,region_id,ETS1_MA0098.3-human_N_FP,ETS1_MA0098.3-rhesus_N_FP,ETS1_MA0098.3_difFP,conserved_active.regions,trans_only,cis_only,cis+trans,HH-active_MM-inactive_cis-only,HH-active_MM-inactive_trans-only,HH-active_MM-inactive_cis+trans,MM-active_HH-inactive_cis-only,MM-active_HH-inactive_trans-only,MM-active_HH-inactive_cis+trans
5,chr1,1375433,1375553,chr1:1375433-1375553,1.0,0.0,1.0,0.0,0,0,1,0.0,0.0,0.0,0.0,0.0,1.0
9,chr1,1658947,1659167,chr1:1658947-1659167,2.0,2.0,0.0,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
10,chr1,1659227,1659317,chr1:1659227-1659317,1.0,0.0,1.0,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
17,chr1,2195083,2195323,chr1:2195083-2195323,1.0,1.0,0.0,0.0,1,0,0,0.0,0.0,0.0,0.0,1.0,0.0
24,chr1,6026008,6026158,chr1:6026008-6026158,1.0,0.0,1.0,0.0,0,1,0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17476,chr9,129824009,129824299,chr9:129824009-129824299,1.0,1.0,0.0,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
17497,chr9,131125554,131125784,chr9:131125554-131125784,1.0,1.0,0.0,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
17499,chr9,131256280,131256790,chr9:131256280-131256790,1.0,0.0,1.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
17539,chr9,133133960,133134070,chr9:133133960-133134070,1.0,1.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Write the merged table as a tab-separated file in the results directory (RE),
# then display the output path.
out = os.path.join(RE, f"{TF}-differential_footprints.tsv")
merged.to_csv(out, sep='\t', index=False)
out

'/data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/TF_footprinting/ETS1_MA0098.3-differential_footprints.tsv'

In [42]:
# Show the directory the notebook is running from.
os.getcwd()

'/gpfs52/data/hodges_lab/ATAC-STARR_B-cells/bin_human-evolution/TF_FP'