In [1]:
input_folder = "/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38_subset_celltype_merged"
output_folder = "/data/home/natant/Negatives/Data/Encode690/ENOCDE_hg38_subset_celltype_merged_no_CTCF"

In [2]:
import os
import pandas as pd

# Function to process files
def remove_ctcf(input_folder, output_folder):
    for file_name in os.listdir(input_folder):
        if file_name.endswith("_concatenated.narrowPeak"):
            file_path = os.path.join(input_folder, file_name)
            df = pd.read_csv(file_path, sep='\t', header=None)
            
            # Filter out rows where the 4th column is 'CTCF'
            df_filtered = df[df[3] != 'CTCF']
            
            # Save the filtered dataframe to the output folder
            output_file_name = file_name.replace("_concatenated.narrowPeak", "_concatenated_no_CTCF.narrowPeak")
            output_file_path = os.path.join(output_folder, output_file_name)
            df_filtered.to_csv(output_file_path, sep='\t', header=False, index=False)

# Call the function
remove_ctcf(input_folder, output_folder)

: 

: 

In [6]:
# Function to compare file lengths and note down the results
def compare_file_lengths(input_folder, output_folder):
    results = []
    
    for file_name in os.listdir(input_folder):
        if file_name.endswith("_concatenated.narrowPeak"):
            input_file_path = os.path.join(input_folder, file_name)
            output_file_name = file_name.replace("_concatenated.narrowPeak", "_concatenated_no_CTCF.narrowPeak")
            output_file_path = os.path.join(output_folder, output_file_name)
            
            # Read the input and output files
            df_input = pd.read_csv(input_file_path, sep='\t', header=None)
            df_output = pd.read_csv(output_file_path, sep='\t', header=None)
            
            unique_TFs = df_output[3].nunique()
            # Calculate the number of rows and percentage of lines removed
            original_length = len(df_input)
            new_length = len(df_output)
            percentage_removed = ((original_length - new_length) / original_length) * 100
            
            # Extract cell type from file name
            cell_type = file_name.split('_')[0]
            
            # Append the results to the list
            results.append([cell_type, original_length, new_length, percentage_removed, unique_TFs])
    
    # Create a dataframe to display the results
    results_df = pd.DataFrame(results, columns=['Cell Type', 'Original Length', 'New Length', 'Percentage Removed', 'unique_TFs'])
    return results_df

# Call the function and display the results
results_df = compare_file_lengths(input_folder, output_folder)

results_df


Unnamed: 0,Cell Type,Original Length,New Length,Percentage Removed,unique_TFs
0,IMR90,40829,40829,0.0,1
1,MCF-7,346656,26385,92.388708,3
2,HCT-116,87083,36626,57.941274,3
3,HepG2,549774,500944,8.881831,33
4,K562,545857,502453,7.951533,35
5,HEK293,56458,8995,84.067803,1
6,PANC-1,13416,13416,0.0,1
7,GM12878,440936,400703,9.124453,35
8,A549,195653,149763,23.45479,13


: 

: 

In [10]:
import pybedtools
import os
import matplotlib.pyplot as plt

# Get the list of ChIP files in the input folder
chip_files = [f for f in os.listdir(output_folder) if f.endswith('_concatenated_no_CTCF.narrowPeak')]

# Take the first file
first_chip_file = os.path.join(output_folder, chip_files[1])

# Load the file using pybedtools
bed = pybedtools.BedTool(first_chip_file)

# Merge peaks with a maximum of 40 bp overlap allowed before merging
merged_bed = bed.merge(d=-40)

# Calculate the number of original peaks and merged peaks
original_peak_count = len(bed)
merged_peak_count = len(merged_bed)

# Calculate the number of overlapping peaks
overlapping_peak_count = original_peak_count - merged_peak_count

# Calculate the percentage of overlapping peaks
overlapping_peak_percentage = (overlapping_peak_count / original_peak_count) * 100

print(f"Total number of peaks: {original_peak_count}")
print(f"Number of overlapping peaks: {overlapping_peak_count}")
print(f"Percentage of overlapping peaks: {overlapping_peak_percentage:.2f}%")

Total number of peaks: 149763
Number of overlapping peaks: 69257
Percentage of overlapping peaks: 46.24%


: 

: 

In [None]:
from tqdm import tqdm

# Initialize lists to store the results
d_values = list(range(1, 101))
overlapping_percentages = []

def get_unique_tfs_from_file(file_path):
    with open(file_path, 'r') as file:
        tfs = set(line.split()[3] for line in file)
    return len(tfs)

# Initialize a figure for subplots
fig, axes = plt.subplots(len(chip_files), 1, figsize=(10, len(chip_files) * 5), sharex=True)

# Loop through each ChIP file and generate the plot
for i, chip_file in enumerate(tqdm(chip_files, desc="Processing ChIP files")):
    # Load the file using pybedtools
    bed = pybedtools.BedTool(os.path.join(output_folder, chip_file))
    
    # Initialize lists to store the results
    overlapping_percentages = []

    # Loop through each value of d and calculate the overlapping percentage
    for d in d_values:
        merged_bed = bed.merge(d=-d)
        merged_peak_count = len(merged_bed)
        overlapping_peak_count = len(bed) - merged_peak_count
        overlapping_peak_percentage = (overlapping_peak_count / len(bed))
        overlapping_percentages.append(overlapping_peak_percentage)

    # Get the cell type from the file name
    cell_type = chip_file.split("_")[0]
    
    # Get the number of unique TFs for the cell type
    unique_tfs = get_unique_tfs_from_file(os.path.join(output_folder, chip_file))

    # Plot the results
    axes[i].plot(d_values, overlapping_percentages, marker='o')
    axes[i].set_ylabel('Percentage of Overlapping Peaks')
    axes[i].set_title(f'Cell Type: {cell_type} (Unique TFs: {unique_tfs})')
    axes[i].grid(True)
    axes[i].set_ylim(0, max(overlapping_percentages) * 1.1)

# Set common labels
axes[-1].set_xlabel('d value in bed.merge')

plt.tight_layout()
plt.show()


NameError: name 'plt' is not defined

: 

: 