In [None]:
# Matrix Manipulation/Management Libraries
import pandas as pd

# Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Input/Output Libraries
import os

# Bioinformatics Libraries
import pybedtools

# Statistical Tests, Machine Learning, etc. Libraries
import igraph

# Miscellaneous Libraries 
import random
import time

In [None]:
# Function: Process Activity-By-Contact Model Data 

def read_samples(file: str, sample_info: str):
    """Load one headerless, tab-separated 10-column sample file as a region table.

    Parameters
    ----------
    file : str
        Path (or any input accepted by ``pd.read_csv``) of the file to read.
    sample_info : str
        Label written into the ``Sample`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Chrom``, ``Start``, ``End``, ``Gene`` and ``Sample``.
        ``Gene`` holds only the text before the first underscore of the
        original gene field.
    """
    column_names = [
        'Chrom', 'Start', 'End',
        'TSS_Chrom', 'TSS_Start', 'TSS_End',
        'Gene', 'Score', 'Strand_1', 'Strand_2',
    ]
    # Columns that are read but not carried into the result.
    unused_columns = ['TSS_Chrom', 'TSS_Start', 'TSS_End', 'Score', 'Strand_1', 'Strand_2']

    return (
        pd.read_csv(file, sep='\t', header=None, names=column_names)
        .drop(columns=unused_columns)
        .assign(
            # Keep only the part of the gene field that precedes the first '_'.
            Gene=lambda table: table['Gene'].str.split('_').str[0],
            Sample=sample_info,
        )
    )

In [None]:
# Function: Combine Modules (Enhancers) within n BPs, Maintain Column-Specific (Gene, Sample, etc.) Information

def combine_modules(modules, distance: int, column_header: str):
    """Merge regions on the same chromosome that lie within ``distance`` of each other.

    Rows are ordered by chromosome and then by start coordinate. A row is
    folded into the currently open region when it is on the same chromosome
    and its ``Start`` is at most ``distance`` past the region's current end;
    otherwise the open region is emitted and a new one is started.

    Parameters
    ----------
    modules : pandas.DataFrame
        Must contain ``Chrom``, ``Start``, ``End`` and ``column_header`` columns.
    distance : int
        Maximum gap (``Start`` of the next row minus the end of the open
        region) for which two regions are still merged.
    column_header : str
        Column whose distinct values are collected for every merged region.

    Returns
    -------
    pandas.DataFrame
        Columns ``Chrom``, ``Start``, ``End`` and ``column_header``; the last
        column holds a list of the distinct values seen in the merged region.
        An empty input yields an empty frame with these columns.
    """
    output_columns = ["Chrom", "Start", "End", column_header]
    if len(modules) == 0:
        # Nothing to merge; avoid indexing into an empty frame.
        return pd.DataFrame(columns=output_columns)

    # Sorting by chromosome first keeps each chromosome's rows contiguous;
    # sorting by Start alone interleaves chromosomes and fragments regions.
    ordered = modules.sort_values(by=["Chrom", "Start"])
    rows = zip(
        ordered["Chrom"].tolist(),
        ordered["Start"].tolist(),
        ordered["End"].tolist(),
        ordered[column_header].tolist(),
    )

    def _emit(chrom, start, end, labels):
        # key=str gives a reproducible order even if labels mix types (e.g. NaN).
        merged.append([chrom, start, end, sorted(labels, key=str)])

    merged = []
    chromosome, current_start, current_end, first_label = next(rows)
    column_information = {first_label}

    for chrom, start, end, label in rows:
        if chrom == chromosome and start - current_end <= distance:
            # max() prevents an interval nested inside the open region
            # from pulling the region's end backwards.
            current_end = max(current_end, end)
            column_information.add(label)
        else:
            _emit(chromosome, current_start, current_end, column_information)
            chromosome, current_start, current_end = chrom, start, end
            column_information = {label}
    _emit(chromosome, current_start, current_end, column_information)

    return pd.DataFrame(merged, columns=output_columns)

In [None]:
# Load Activity-By-Contact Model Samples

# Directory containing one '<sample name>.bedpe' file per entry of file_list.
file_dir = '/projectsp/f_ak1833_1/Ziyuan/abc_analysis_organized/further_analysis/'
file_list = ['0h_sample', '3h_sample', '6h_sample', '12h_sample', '24h_sample', '48h_sample', '72h_sample']

# Read each sample file, tagging its rows with the sample name.
samples = []
for file_name in file_list:
    file_path = f'{file_dir}{file_name}.bedpe'
    sample = read_samples(file_path, file_name)
    samples += [sample]

# Stack all per-sample tables into one frame (original row indices are kept).
uncollapsed_module_regions = pd.concat(samples)

In [None]:
# Collapse Modules (Enhancers) within 100 BPs

# Collapse the same pooled regions twice with a 100 bp merge distance:
# once collecting the linked genes, once collecting the contributing samples.
module_regions_genes = combine_modules(
    modules=uncollapsed_module_regions, distance=100, column_header="Gene"
)
module_regions_samples = combine_modules(
    modules=uncollapsed_module_regions, distance=100, column_header="Sample"
)

In [None]:
# Combine Module (Enhancers) Sequences, Target Genes, Samples

# Join the per-region sample lists and gene lists on the region coordinates,
# then unpack both list columns so each row holds one gene and one sample.
# Each region therefore yields one row per (gene, sample) combination.
# NOTE(review): genes and samples are collected independently per region, so
# the original per-sample gene links are not preserved here - confirm intended.
module_regions_unlabeled = (
    module_regions_samples
    .merge(module_regions_genes, on=["Chrom", "Start", 'End'])
    .explode('Gene')
    .explode('Sample')
    .drop_duplicates()
)

In [None]:
# Label Modules (Enhancers): one 'Module_<n>' identifier per distinct region

# Work on a copy so adding the 'Module' column does not also modify
# module_regions_unlabeled (plain assignment would only alias the same frame).
modules_labeled = module_regions_unlabeled.copy()

# ngroup() numbers each distinct (Chrom, Start, End) group from 0 in sorted
# key order; add(1) makes the identifiers start at 'Module_1'.
modules_labeled['Module'] = 'Module_' + modules_labeled.groupby(['Chrom', 'Start', 'End']).ngroup().add(1).astype(str)