In [13]:
input_folder = "/data/home/natant/Negatives/Data/Encode690/filtered_hg38_101bp"
metadata_file = "/data/home/natant/Negatives/Data/Encode690/filtered_hg38_101bp/metadata.csv"

In [2]:
import pandas as pd

metadata_df = pd.read_csv(metadata_file)
metadata_df

Unnamed: 0,project,lab,composite,dataType,view,cell,treatment,antibody,control,dataVersion,...,type,md5sum,size,filename,antibody_base,cell_base,maxatac_cell,df_cell,maxatac_tf,df_tf
0,wgEncode,Broad,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,GM12878,,CTCF,std,ENCODE Mar 2012 Freeze,...,narrowPeak,fef0af7fe1e724159e665085da53efb1,982K,wgEncodeAwgTfbsBroadGm12878CtcfUniPk.narrowPea...,ctcf,gm12878,GM12878,GM12878,CTCF,CTCF
1,wgEncode,Broad,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,HepG2,,CTCF,std,ENCODE Mar 2012 Freeze,...,narrowPeak,6f8ab32864e515e0362494a3a8df2b8e,892K,wgEncodeAwgTfbsBroadHepg2CtcfUniPk.narrowPeak.gz,ctcf,hepg2,HepG2,HepG2,CTCF,CTCF
2,wgEncode,Broad,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,K562,,CTCF,std,ENCODE Mar 2012 Freeze,...,narrowPeak,919d1986bad2d03b4beb4a40a12f5bb2,1.2M,wgEncodeAwgTfbsBroadK562CtcfUniPk.narrowPeak.gz,ctcf,k562,K562,K562,CTCF,CTCF
3,wgEncode,HudsonAlpha,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,A549,EtOH_0.02pct,ATF3,,ENCODE Mar 2012 Freeze,...,narrowPeak,04b555c96fac6ff5cd3ade02be4c9b66,168K,wgEncodeAwgTfbsHaibA549Atf3V0422111Etoh02UniPk...,atf3,a549,A549,A549,ATF3,ATF3
4,wgEncode,HudsonAlpha,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,A549,DEX_100nM,CREB1_(SC-240),,ENCODE Mar 2012 Freeze,...,narrowPeak,4e04e30603901f81c80fb99c0394d11f,378K,wgEncodeAwgTfbsHaibA549Creb1sc240V0416102Dex10...,creb1,a549,A549,A549,CREB1,CREB1_(SC-240)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,wgEncode,UW,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,HCT-116,,CTCF,std,ENCODE Mar 2012 Freeze,...,narrowPeak,5353919b8dec58898d42391ed013ed3b,1006K,wgEncodeAwgTfbsUwHct116CtcfUniPk.narrowPeak.gz,ctcf,hct116,HCT116,HCT-116,CTCF,CTCF
143,wgEncode,UW,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,HEK293,,CTCF,std,ENCODE Mar 2012 Freeze,...,narrowPeak,85054a5025553ab029c7eb16c94dcbc4,1.1M,wgEncodeAwgTfbsUwHek293CtcfUniPk.narrowPeak.gz,ctcf,hek293,HEK293,HEK293,CTCF,CTCF
144,wgEncode,UW,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,HepG2,,CTCF,std,ENCODE Mar 2012 Freeze,...,narrowPeak,3fdf2bfeeb9bf469510d615efec560ae,905K,wgEncodeAwgTfbsUwHepg2CtcfUniPk.narrowPeak.gz,ctcf,hepg2,HepG2,HepG2,CTCF,CTCF
145,wgEncode,UW,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,K562,,CTCF,std,ENCODE Mar 2012 Freeze,...,narrowPeak,365af2fdea48c046dafad14b73f69b5d,850K,wgEncodeAwgTfbsUwK562CtcfUniPk.narrowPeak.gz,ctcf,k562,K562,K562,CTCF,CTCF


In [17]:
import os
import pybedtools
from collections import defaultdict


# # Group files by cell type
cell_groups = metadata_df.groupby('cell')['filename'].apply(list).to_dict()

# Function to load narrowPeak files and convert to BedTool objects
def load_peaks(file_list, input_folder):
    bedtools_list = []
    for file in file_list:
        file_path = os.path.join(input_folder, file.rstrip('.gz'))
        bedtools_list.append(pybedtools.BedTool(file_path))
    return bedtools_list

# Function to merge peaks and annotate with TFs
def merge_peaks_with_tfs(bedtools_list, tf_list):
    merged_peaks = bedtools_list[0].cat(*bedtools_list[1:], postmerge=False).sort().merge(c=4, o='distinct')
    annotated_peaks = []
    for peak in merged_peaks:
        overlapping_tfs = set()
        for bedtool, tf in zip(bedtools_list, tf_list):
            if bedtool.any_hits(peak):
                overlapping_tfs.add(tf)
        peak.fields.append(','.join(overlapping_tfs))
        annotated_peaks.append(peak)
    return pybedtools.BedTool(annotated_peaks)

# Process each cell type group
merged_results = defaultdict(list)
for cell, files in cell_groups.items():
    bedtools_list = load_peaks(files, input_folder)
    tf_list = metadata_df[metadata_df['filename'].isin(files)]['antibody_base'].tolist()
    merged_peaks = merge_peaks_with_tfs(bedtools_list, tf_list)
    merged_results[cell] = merged_peaks

# Convert merged results to DataFrame
merged_peaks_list = []
for cell, peaks in merged_results.items():
    for peak in peaks:
        merged_peaks_list.append({
            'cell': cell,
            'chrom': peak.chrom,
            'start': peak.start,
            'end': peak.end,
            'tfs': peak.fields[-1]
        })

merged_peaks_df = pd.DataFrame(merged_peaks_list)
merged_peaks_df

KeyboardInterrupt: 

In [None]:
# import os
# import pybedtools
# from collections import defaultdict


# # Group files by cell type
# cell_groups = metadata_df.groupby('cell')['filename'].apply(list).to_dict()

# # Function to load narrowPeak files and convert to BedTool objects
# def load_peaks(file_list, input_folder):
#     bedtools_list = []
#     for file in file_list:
#         file_path = os.path.join(input_folder, file.rstrip('.gz'))
#         bedtools_list.append(pybedtools.BedTool(file_path))
#     return bedtools_list

# # Function to calculate overlaps and unique peaks
# def calculate_overlaps(bedtools_list):
#     if len(bedtools_list) < 2:
#         return 0, 0, 0  # Not enough files to compare

#     # Calculate intersections
#     intersection = bedtools_list[0]
#     for bedtool in bedtools_list[1:]:
#         intersection = intersection.intersect(bedtool, u=True)

#     # Calculate unique peaks
#     unique_peaks = sum(len(bedtool) for bedtool in bedtools_list) - len(intersection)

#     # Calculate percentage of overlapping peaks
#     total_peaks = sum(len(bedtool) for bedtool in bedtools_list)
#     overlap_percentage = (len(intersection) / total_peaks) * 100

#     return overlap_percentage, unique_peaks, len(intersection)

# # Process each cell type group
# results = defaultdict(dict)
# for cell, files in cell_groups.items():
#     bedtools_list = load_peaks(files, input_folder)
#     overlap_percentage, unique_peaks, len_intersection = calculate_overlaps(bedtools_list)
#     results[cell]['overlap_percentage'] = overlap_percentage
#     results[cell]['unique_peaks'] = unique_peaks
#     results[cell]['overlapping_peaks'] = len_intersection

# # Calculate the number of TFs and list them for each cell type
# for cell, files in cell_groups.items():
#     tf_set = set(metadata_df[metadata_df['cell'] == cell]['antibody_base'])
#     results[cell]['num_tfs'] = len(tf_set)
#     results[cell]['tfs'] = list(tf_set)
#     # results[cell]['total_peaks'] = sum(len(pybedtools.BedTool(os.path.join(input_folder, file.rstrip('.gz')))) for file in files)
#     # results[cell]['overlapping_peaks'] = len(pybedtools.BedTool(os.path.join(input_folder, files[0].rstrip('.gz'))).intersect(bedtools_list[0], u=True))

# results_df = pd.DataFrame.from_dict(results, orient='index')
# results_df


# WRONG!