In [1]:
# Import necessary libraries



import time
import sys

In [2]:
#!pip install ipywidgets


In [3]:
#!pip install jupyterlab_widgets


In [4]:
# Set up paths
gene_file = '/logo2/irfan/Reference_like_Data/Ensembl/EnsemblePromotersUp2000_down1000_sorted.bed'
bedgraph_folder = '/logo2/irfan/Bladder2/Bladder_bg'
output_folder = bedgraph_folder+"_promoter"

# Time and execute the processing function
start_time = time.time()

In [5]:
import os
from pybedtools import BedTool
from tqdm.notebook import tqdm
from multiprocessing import Pool, cpu_count

import pandas as pd

def average_bedgraph_for_genes(gene_file, bedgraph_file):
    genes = BedTool(gene_file)
    bedgraph = BedTool(bedgraph_file)

    # Intersect bedgraph with genes
    intersected = genes.intersect(bedgraph, wao=True)
    
    # Process the intersection to handle non-numeric values and exclude no overlaps
    avg_values = {}
    for row in intersected:
        gene_name = row[3]
        overlap_value = row[7]
        if overlap_value == '.':  # No overlap, continue to the next row
            continue
        
        avg_values[gene_name] = avg_values.get(gene_name, [])
        avg_values[gene_name].append(float(overlap_value))

    # Compute the average for each gene
    for gene in avg_values:
        avg_values[gene] = sum(avg_values[gene]) / len(avg_values[gene])

    return avg_values

def process_single_bedgraph(args):
    gene_file, input_filepath, output_filepath = args

    result = average_bedgraph_for_genes(gene_file, input_filepath)

    # Write the results to an output file
    with open(output_filepath, 'w') as out_file:
        for gene, value in result.items():
            out_file.write(f"{gene}\t{value}\n")
    
    return result  # Return the result dictionary for aggregation

def process_all_bedgraphs(gene_file, bedgraph_folder, output_folder):
    # Check if the output folder exists; if it does, exit the code
    if os.path.exists(output_folder):
        print(f"The output folder '{output_folder}' already exists. Exiting.")
        sys.exit(1)
    os.makedirs(output_folder)

    bedgraph_files = [f for f in os.listdir(bedgraph_folder) if f.endswith('.bedgraph')]
    
    args_list = []
    for filename in bedgraph_files:
        input_filepath = os.path.join(bedgraph_folder, filename)
        output_filename = filename.replace('.bedgraph', '_avg.txt')
        output_filepath = os.path.join(output_folder, output_filename)
        args_list.append((gene_file, input_filepath, output_filepath))
    
    # Use multiprocessing to process bedgraph files in parallel
    with Pool(cpu_count() - 1) as pool:  # Use all but one CPU core
        results = pool.map(process_single_bedgraph, args_list)

    # Aggregate the results into a single dataframe
    df = pd.DataFrame({filename: results[idx] for idx, filename in enumerate(bedgraph_files)})
    df = df.fillna('NA')

    return df  # Return the dataframe


In [6]:

allsamples_gene = process_all_bedgraphs(gene_file, bedgraph_folder, output_folder)

elapsed_time = time.time() - start_time
print(f"Processing completed in {elapsed_time:.2f} seconds.")

Processing completed in 194.03 seconds.


In [7]:
print(allsamples_gene.shape)
allsamples_gene.index.name = 'gene'
allsamples_gene.head()

(18995, 136)


Unnamed: 0_level_0,WBC-1321-auto.bedgraph_rolled.bedgraph,WBC-1071-auto.bedgraph_rolled.bedgraph,WBC-1222-auto.bedgraph_rolled.bedgraph,WBC-1050-auto.bedgraph_rolled.bedgraph,WBC-1235-auto.bedgraph,Nu-10-auto.bedgraph_rolled.bedgraph,WBC-1317-auto.bedgraph_rolled.bedgraph,WBC-1197-auto.bedgraph_rolled.bedgraph,Nu-11-auto.bedgraph_rolled.bedgraph,BC001-auto.bedgraph_rolled.bedgraph,...,WBC-1233-auto.bedgraph_rolled.bedgraph,WBC-1322-auto.bedgraph_rolled.bedgraph,WBC-1198-auto.bedgraph_rolled.bedgraph,Nu-30-auto.bedgraph_rolled.bedgraph,WBC-1104-auto.bedgraph_rolled.bedgraph,WBC-1277-auto.bedgraph_rolled.bedgraph,Nu-6-auto.bedgraph_rolled.bedgraph,WBC-1062-auto.bedgraph_rolled.bedgraph,Nu-13-auto.bedgraph_rolled.bedgraph,Nu-37-auto.bedgraph_rolled.bedgraph
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OR4F5,0.5,1.0,0.0,0.3515,0.331,0.602,0.794667,0.69825,0.5835,,...,,0.4995,1.0,0.95,0.754,0.5385,0.704857,0.515667,0.5,0.556667
SAMD11,0.047836,0.061634,0.06761,0.061627,0.06446,0.058204,0.069521,0.068336,0.054956,0.061382,...,0.06028,0.083213,0.066324,0.050677,0.060413,0.072417,0.064129,0.064719,0.0618,0.055538
NOC2L,0.07722,0.092384,0.101955,0.094266,0.089719,0.056849,0.091634,0.086926,0.092954,0.076904,...,0.064299,0.085728,0.090789,0.058649,0.076744,0.106516,0.084423,0.085092,0.078686,0.080631
KLHL17,0.235836,0.253939,0.245386,0.261108,0.244496,0.211639,0.260795,0.249007,0.234775,0.254946,...,0.230148,0.2492,0.242655,0.215565,0.258522,0.275684,0.231753,0.260782,0.206884,0.233631
PLEKHN1,0.328355,0.391304,0.3455,0.449061,0.349535,0.381206,0.353472,0.363931,0.43087,0.354118,...,0.388721,0.406872,0.374251,0.373896,0.386456,0.37539,0.424916,0.375138,0.368599,0.37728


In [8]:
allsamples_gene.to_csv(output_folder+".txt", sep='\t')

In [9]:
print('done')

done
