In [1]:
# Import necessary libraries



import time
import sys

In [2]:
#!pip install ipywidgets


In [3]:
#!pip install jupyterlab_widgets


In [4]:
# Set up paths
gene_file = '/logo2/irfan/Reference_like_Data/Ensembl/EnsemblePromotersUp2000_down1000_sorted.bed'
bedgraph_folder = '/logo2/irfan/GBM_project/gbm_model_data_bg_rolled'
output_folder = bedgraph_folder+"_promoter"

# Time and execute the processing function
start_time = time.time()

In [5]:
import os
from pybedtools import BedTool
from tqdm.notebook import tqdm
from multiprocessing import Pool, cpu_count

import pandas as pd

def average_bedgraph_for_genes(gene_file, bedgraph_file):
    genes = BedTool(gene_file)
    bedgraph = BedTool(bedgraph_file)

    # Intersect bedgraph with genes
    intersected = genes.intersect(bedgraph, wao=True)
    
    # Process the intersection to handle non-numeric values and exclude no overlaps
    avg_values = {}
    for row in intersected:
        gene_name = row[3]
        overlap_value = row[7]
        if overlap_value == '.':  # No overlap, continue to the next row
            continue
        
        avg_values[gene_name] = avg_values.get(gene_name, [])
        avg_values[gene_name].append(float(overlap_value))

    # Compute the average for each gene
    for gene in avg_values:
        avg_values[gene] = sum(avg_values[gene]) / len(avg_values[gene])

    return avg_values

def process_single_bedgraph(args):
    gene_file, input_filepath, output_filepath = args

    result = average_bedgraph_for_genes(gene_file, input_filepath)

    # Write the results to an output file
    with open(output_filepath, 'w') as out_file:
        for gene, value in result.items():
            out_file.write(f"{gene}\t{value}\n")
    
    return result  # Return the result dictionary for aggregation

def process_all_bedgraphs(gene_file, bedgraph_folder, output_folder):
    # Check if the output folder exists; if it does, exit the code
    if os.path.exists(output_folder):
        print(f"The output folder '{output_folder}' already exists. Exiting.")
        sys.exit(1)
    os.makedirs(output_folder)

    bedgraph_files = [f for f in os.listdir(bedgraph_folder) if f.endswith('.bedgraph')]
    
    args_list = []
    for filename in bedgraph_files:
        input_filepath = os.path.join(bedgraph_folder, filename)
        output_filename = filename.replace('.bedgraph', '_avg.txt')
        output_filepath = os.path.join(output_folder, output_filename)
        args_list.append((gene_file, input_filepath, output_filepath))
    
    # Use multiprocessing to process bedgraph files in parallel
    with Pool(cpu_count() - 1) as pool:  # Use all but one CPU core
        results = pool.map(process_single_bedgraph, args_list)

    # Aggregate the results into a single dataframe
    df = pd.DataFrame({filename: results[idx] for idx, filename in enumerate(bedgraph_files)})
    df = df.fillna('NA')

    return df  # Return the dataframe


In [6]:

allsamples_gene = process_all_bedgraphs(gene_file, bedgraph_folder, output_folder)

elapsed_time = time.time() - start_time
print(f"Processing completed in {elapsed_time:.2f} seconds.")

Processing completed in 105.71 seconds.


In [7]:
print(allsamples_gene.shape)
allsamples_gene.index.name = 'gene'
allsamples_gene.head()

(18995, 29)


Unnamed: 0_level_0,NU56-auto.bedgraph_rolled.bedgraph,bH03_B03-auto.bedgraph_rolled.bedgraph,H05_TM__ULPWGS-auto.bedgraph_rolled.bedgraph,95-auto.bedgraph_rolled.bedgraph,NU20-auto.bedgraph_rolled.bedgraph,aH03_B01-auto.bedgraph_rolled.bedgraph,aH01_B01-auto.bedgraph_rolled.bedgraph,H02_TM_ULPWGS-auto.bedgraph_rolled.bedgraph,aH02_B01-auto.bedgraph_rolled.bedgraph,H04_TM_ULPWGS-auto.bedgraph_rolled.bedgraph,...,H04_TM-auto.bedgraph_rolled.bedgraph,aH04_B01-auto.bedgraph_rolled.bedgraph,NU13-auto.bedgraph_rolled.bedgraph,100-auto.bedgraph_rolled.bedgraph,bH02_B03-auto.bedgraph_rolled.bedgraph,bH05_B03-auto.bedgraph_rolled.bedgraph,aH05_B01b-auto.bedgraph_rolled.bedgraph,NU45-auto.bedgraph_rolled.bedgraph,bH04_B03-auto.bedgraph_rolled.bedgraph,H01_TM_ULPWGS-auto.bedgraph_rolled.bedgraph
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OR4F5,0.865,0.3135,1.0,,0.623333,0.75,0.4335,1.0,,1.0,...,0.8505,0.823667,,,,0.5,,0.796,0.645875,1.0
SAMD11,0.061631,0.072191,0.999378,0.079404,0.067093,0.071473,0.056893,0.9862,0.064551,0.999444,...,0.077631,0.058867,0.060964,0.071636,0.066573,0.065739,0.071858,0.066191,0.056947,0.994293
NOC2L,0.072822,0.096906,0.997184,0.111915,0.065292,0.116918,0.090732,0.99981,0.07114,0.994692,...,0.101325,0.088516,0.094417,0.069299,0.072932,0.08571,0.091379,0.08583,0.071132,0.999698
KLHL17,0.222505,0.226339,0.995936,0.258648,0.217695,0.271431,0.241497,0.999803,0.214547,0.995393,...,0.264502,0.223086,0.239254,0.230489,0.223161,0.244658,0.230913,0.23419,0.213391,0.999688
PLEKHN1,0.384817,0.388231,0.99984,0.364528,0.374528,0.355364,0.351152,0.999071,0.348612,0.998622,...,0.5366,0.35182,0.373039,0.355461,0.320592,0.290603,0.316722,0.356075,0.372783,0.998364


In [8]:
allsamples_gene.to_csv(output_folder+".txt", sep='\t')

In [9]:
print('done')

done
