Peaks
Hi-C; CTCF
Hi-C; rad21
ChIA-PET; CTCF
ChIA-PET; rad21

In [60]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [61]:
# Load the data into a DataFrame
CTCF_df = pd.read_csv("ENCFF356LIU_CTCF_ChIP.bed", sep='\t', header=None).iloc[:,:3]

Rad21_df = pd.read_csv("ENCFF834GOT_Rad21_ChIP.bed", sep='\t', header=None).iloc[:,:3]

ChIA_PET_loops_df = pd.read_csv("ChIA_PET_loops.bedpe", sep='\t', header=None).iloc[:,:6]

Hi_C_loops_df = pd.read_csv("Hi_C_loops.txt", sep='\t', header=None, skiprows=2 ).iloc[:,:6]
# Adding "chr" to the numbers in the 0th and 3rd columns
Hi_C_loops_df[0] = 'chr' + Hi_C_loops_df[0].astype(str)
Hi_C_loops_df[3] = 'chr' + Hi_C_loops_df[3].astype(str)

# Print shapes of the DataFrames
print("Shape of CTCF_df:", CTCF_df.shape)
print("Shape of Rad21_df:", Rad21_df.shape)
print("Shape of ChIA_PET_loops_df:", ChIA_PET_loops_df.shape)
print("Shape of Hi_C_loops_df:", Hi_C_loops_df.shape)

Shape of CTCF_df: (40949, 3)
Shape of Rad21_df: (34623, 3)
Shape of ChIA_PET_loops_df: (5815083, 6)
Shape of Hi_C_loops_df: (15233, 6)


In [62]:
# Function to check if regions overlap
def regions_overlap(region1_start, region1_end, region2_start, region2_end):
    return np.maximum(region1_start, region2_start) <= np.minimum(region1_end, region2_end)

# Function to find overlapping rows between two DataFrames
def find_overlapping_rows(df1, df2):
    overlapping_rows = []
    total_iterations = len(df1) * len(df2)
    
    # Initialize tqdm for progress bar
    progress_bar = tqdm(total=total_iterations, desc="Finding overlapping rows", position=0)
    
    for index1, row1 in df1.iterrows():
        for index2, row2 in df2.iterrows():
            # Check if regions overlap
            if row1[0] == row2[0] and (regions_overlap(row1[1], row1[2], row2[1], row2[2]) or
                                        regions_overlap(row1[4], row1[5], row2[1], row2[2])):
                overlapping_rows.append(index1)
                break  # Move to the next row in df1
            # Update progress bar
            progress_bar.update(1)
    # Close tqdm
    progress_bar.close()
    
    return df1.loc[overlapping_rows]

# Function to find overlapping loops between two DataFrames
def find_overlapping_loops(df1, df2):
    overlapping_loops = []
    total_iterations = len(df1) * len(df2)
    
    # Initialize tqdm for progress bar
    progress_bar = tqdm(total=total_iterations, desc="Finding overlapping loops", position=0)
    
    for index1, row1 in df1.iterrows():
        for index2, row2 in df2.iterrows():
            # Check if regions overlap and chromosomes are the same
            if row1[0] == row2[0] and (regions_overlap(row1[1], row1[2], row2[1], row2[2]) or
                                        regions_overlap(row1[4], row1[5], row2[1], row2[2])):
                overlapping_loops.append((index1, index2))
            # Update progress bar
            progress_bar.update(1)
    
    # Close tqdm
    progress_bar.close()
    
    # Create a DataFrame from the list of overlapping loops
    overlapping_loops_df = pd.DataFrame(overlapping_loops, columns=['df1_index', 'df2_index'])
    return overlapping_loops_df

In [63]:
# Find overlapping rows
overlapping_rows_df_Hi_C_CTCF = find_overlapping_rows(Hi_C_loops_df, CTCF_df)
overlapping_rows_df_Hi_C_Rad21 = find_overlapping_rows(Hi_C_loops_df, Rad21_df)
overlapping_rows_df_ChIA_PET_CTCF = find_overlapping_rows(ChIA_PET_loops_df, CTCF_df)
overlapping_rows_df_ChIA_PET_Rad21 = find_overlapping_rows(ChIA_PET_loops_df, Rad21_df)

# Save overlapping rows DataFrames to CSV
overlapping_rows_df_Hi_C_CTCF.to_csv("out/overlapping_rows_df_Hi_C_CTCF.csv", index=False)
overlapping_rows_df_Hi_C_Rad21.to_csv("out/overlapping_rows_df_Hi_C_Rad21.csv", index=False)
overlapping_rows_df_ChIA_PET_CTCF.to_csv("out/overlapping_rows_df_ChIA_PET_CTCF.csv", index=False)
overlapping_rows_df_ChIA_PET_Rad21.to_csv("out/overlapping_rows_df_ChIA_PET_Rad21.csv", index=False)

Finding overlapping rows:  15%|▏| 90957699/623776117 [5:34:36<345:37:22, 428.23i

KeyboardInterrupt: 

In [None]:
# Find common loops
common_loops_Hi_C_CTCF__Hi_C_Rad21 = find_overlapping_loops( overlapping_rows_df_Hi_C_CTCF, overlapping_rows_df_Hi_C_Rad21)
common_loops_Hi_C_CTCF__ChIA_PET_CTCF = find_overlapping_loops( overlapping_rows_df_Hi_C_CTCF, overlapping_rows_df_ChIA_PET_CTCF)
common_loops_Hi_C_CTCF__ChIA_PET_Rad21 = find_overlapping_loops( overlapping_rows_df_Hi_C_CTCF, overlapping_rows_df_ChIA_PET_Rad21)
common_loops_Hi_C_Rad21__ChIA_PET_CTCF = find_overlapping_loops( overlapping_rows_df_Hi_C_Rad21, overlapping_rows_df_ChIA_PET_CTCF)
common_loops_Hi_C_Rad21__ChIA_PET_Rad21 = find_overlapping_loops( overlapping_rows_df_Hi_C_Rad21, overlapping_rows_df_ChIA_PET_Rad21)
common_loops_ChIA_PET_CTCF__ChIA_PET_Rad21 = find_overlapping_loops( overlapping_rows_df_ChIA_PET_CTCF, overlapping_rows_df_ChIA_PET_Rad21)

# Save common loops DataFrames to CSV
common_loops_Hi_C_CTCF__Hi_C_Rad21.to_csv("out/common_loops_Hi_C_CTCF__Hi_C_Rad21.csv", index=False)
common_loops_Hi_C_CTCF__ChIA_PET_CTCF.to_csv("out/common_loops_Hi_C_CTCF__ChIA_PET_CTCF.csv", index=False)
common_loops_Hi_C_CTCF__ChIA_PET_Rad21.to_csv("out/common_loops_Hi_C_CTCF__ChIA_PET_Rad21.csv", index=False)
common_loops_Hi_C_Rad21__ChIA_PET_CTCF.to_csv("out/common_loops_Hi_C_Rad21__ChIA_PET_CTCF.csv", index=False)
common_loops_Hi_C_Rad21__ChIA_PET_Rad21.to_csv("out/common_loops_Hi_C_Rad21__ChIA_PET_Rad21.csv", index=False)
common_loops_ChIA_PET_CTCF__ChIA_PET_Rad21.to_csv("out/common_loops_ChIA_PET_CTCF__ChIA_PET_Rad21.csv", index=False)