In [None]:
# Matrix Manipulation and Data Visualization Libraries
import pandas as pd

# Input/Output Libraries
import os

In [None]:
def read_samples(file_path: str, sample_info: str):
    """Load one headerless, tab-separated ".bedpe" file and keep its region/gene columns.

    Parameters
    ----------
    file_path : str
        Path of the ".bedpe" file to read.
    sample_info : str
        Label written to every row of the 'Sample' column.

    Returns
    -------
    pandas.DataFrame
        Table with the columns 'Chromosome', 'Start', 'End', 'Gene', 'Sample'.
    """
    # Column layout assigned to the ten fields of the ".bedpe" file.
    bedpe_columns = ['Chromosome', 'Start', 'End', 'TSS_Chromosome', 'TSS_Start', 'TSS_End', 'Gene', 'Score', 'Strand_1', 'Strand_2']
    unused_columns = ['TSS_Chromosome', 'TSS_Start', 'TSS_End', 'Score', 'Strand_1', 'Strand_2']

    regions = pd.read_csv(file_path, sep = '\t', header = None, names = bedpe_columns).drop(columns = unused_columns)

    # Keep only the text before the first underscore,
    # ex. "PGGHG_genic|chr11:282676-283176" -> "PGGHG".
    gene_symbol = regions['Gene'].str.split('_').str[0]

    return regions.assign(Gene = gene_symbol, Sample = sample_info)

In [None]:
# Each label is both the sample name and the file stem,
# ex. /projectsp/f_ak1833_1/Ziyuan/abc_analysis_organized/further_analysis/0h_sample.bedpe
file_dir = '/projectsp/f_ak1833_1/Ziyuan/abc_analysis_organized/further_analysis/'
file_list = [f'{hours}h_sample' for hours in (0, 3, 6, 12, 24, 48, 72)]

samples = [] # <-- One table per sample, in 'file_list' order

for file_name in file_list:
    file_path = f'{file_dir}{file_name}.bedpe'
    sample = read_samples(file_path, file_name)
    samples.append(sample)

# Stacks the per-sample tables row-wise; overlapping regions are NOT merged here.
module_regions = pd.concat(samples)

In [None]:
def combine_modules(modules, distance: int):
    """Merge regions on the same chromosome that overlap or lie within `distance` of each other.

    Parameters
    ----------
    modules : pandas.DataFrame
        Table with at least the columns 'Chromosome', 'Start', 'End'.
        The input is not modified.
    distance : int
        Maximum gap (next 'Start' minus current merged 'End') at which two
        regions are still merged.

    Returns
    -------
    pandas.DataFrame
        Merged regions with the columns 'Chromosome', 'Start', 'End', ordered
        by chromosome and then by start. Empty if `modules` has no rows.
    """
    columns = ["Chromosome", "Start", "End"]
    if len(modules) == 0:
        return pd.DataFrame(columns = columns)

    # Sort by chromosome first: sorting by 'Start' alone interleaves chromosomes,
    # so neighbouring regions of one chromosome would never be compared.
    ordered = modules.sort_values(by = ["Chromosome", "Start"])
    rows = zip(ordered["Chromosome"], ordered["Start"], ordered["End"])

    merged = []
    chromosome, current_start, current_end = next(rows)
    for row_chromosome, start, end in rows:
        if row_chromosome == chromosome and start - current_end <= distance:
            # Extend, never shrink: a region nested inside the current one
            # must not pull the merged end backwards.
            current_end = max(current_end, end)
        else:
            merged.append([chromosome, current_start, current_end])
            chromosome, current_start, current_end = row_chromosome, start, end
    merged.append([chromosome, current_start, current_end])

    return pd.DataFrame(merged, columns = columns) # <-- Collapsed modules in ".bed" column order

In [None]:
# 'combine_modules' only needs the coordinate columns; regions at most 100 BPs apart are combined.
genomic_cordinates_modules = combine_modules(module_regions[['Chromosome', 'Start', 'End']], 100)

In [None]:
# Writes the merged modules as a headerless, tab-separated '.bed' file,
# the region input for the ATACSeq/H3K27ac bedtools read counting.
genomic_cordinates_modules.to_csv(
    "/home/wbd20/Kreimer_Lab/Network/genomic_cordinates_modules.bed",
    sep = "\t",
    header = False,
    index = False,
)

In [None]:
# Submit through a SLURM job; bedtools multicov uses the BAMs to count reads over each region of the '.bed' file.
# These are shell commands, not Python.
# One loop covers every timepoint so all outputs land in the same Read_Counts directory
# (the 12hr H3K27ac table must not go to a separate sub-directory, it is read back from Read_Counts/).

BED=/home/wbd20/Kreimer_Lab/Network/genomic_cordinates_modules.bed

ATAC_BAM_DIR=/projectsp/f_ak1833_1/Neuro_7TP_ATACseq_OUT
ATAC_OUT_DIR=/home/wbd20/Kreimer_Lab/ATACSeq/Read_Counts

K27AC_BAM_DIR=/projectsp/f_ak1833_1/dualSMADChiPSeq
K27AC_OUT_DIR=/home/wbd20/Kreimer_Lab/H3K27ac/Read_Counts

for t in 0 3 6 12 24 48 72; do
    # ATACSeq: replicate 1 and replicate 2 BAMs -> one count column per BAM
    bedtools multicov \
        -bams "${ATAC_BAM_DIR}/ATACseq_${t}hr_rep1/ATACseq_${t}hr_rep1_shifted.bam" \
              "${ATAC_BAM_DIR}/ATACseq_${t}hr_rep2/ATACseq_${t}hr_rep2_shifted.bam" \
        -bed "${BED}" \
        > "${ATAC_OUT_DIR}/ATACseq_${t}hr_enhancers_combined_PeakCounts.tab"

    # H3K27ac: replicate 1 and replicate 2 BAMs -> one count column per BAM
    bedtools multicov \
        -bams "${K27AC_BAM_DIR}/rep1_OUT/${t}hr_K27ac/${t}hr_K27ac_noblklst.bam" \
              "${K27AC_BAM_DIR}/rep2_OUT/${t}hr_K27ac/${t}hr_K27ac_noblklst.bam" \
        -bed "${BED}" \
        > "${K27AC_OUT_DIR}/H3K27ac_${t}hr_enhancers_combined_PeakCounts.tab"
done


In [None]:
class ReadCounts:
    """Read-count table of one assay at one timepoint, together with its sample metadata.

    On construction the columns of `dataframe` are renamed IN PLACE to
    'CHROM', 'START', 'END' followed by one
    '<timepoint>hrs.<ngs_method>_REPLICANT.<i>' column per replicate
    (i = 1..replicants). `df` and `dataframe` refer to that same object.
    """

    def __init__(self, ngs_method: str, dataframe: pd.DataFrame, replicants: int, timepoint: int, condition: str, batch: str):
        replicate_columns = []
        for replicate in range(1, replicants + 1):
            replicate_columns.append(f'{timepoint}hrs.{ngs_method}_REPLICANT.{replicate}')

        # Renames the caller's DataFrame itself, no copy is made.
        dataframe.columns = ['CHROM', 'START', 'END'] + replicate_columns

        self.ngs_method = ngs_method # <-- ATACSeq, H3K27ac, RNASeq, etc.
        self.replicants = replicants # <-- Number of replicate count columns
        self.timepoint = timepoint # <-- Sample Information, Integer Variable
        self.condition = condition
        self.batch = batch # <-- Relevant to ImpulseDE2
        self.dataframe = dataframe # <-- Sourced from bedtools multicov
        self.df = dataframe # <-- Alias of 'dataframe'

In [None]:
def read_readcounts(file_paths: list[str], ngs_method: list[str], replicants: list[int], timepoints: list[int], conditions: list[str], batches: list[str]):
    """Read one headerless, tab-separated count table per file and wrap each in a ReadCounts object.

    All metadata arguments are parallel to `file_paths`: element i describes
    the table read from file_paths[i].

    Parameters
    ----------
    file_paths : list[str]
        Paths of the count tables to read.
    ngs_method : list[str]
        Assay name per file.
    replicants : list[int]
        Number of replicate columns per file.
    timepoints : list[int]
        Timepoint per file.
    conditions : list[str]
        Condition label per file.
    batches : list[str]
        Batch label per file.

    Returns
    -------
    list[ReadCounts]
        One object per entry of `file_paths`, in the same order.

    Raises
    ------
    ValueError
        If any metadata list has fewer entries than `file_paths`.
    """
    # Fail before any file is read instead of part-way through the loop.
    metadata = {
        "ngs_method": ngs_method,
        "replicants": replicants,
        "timepoints": timepoints,
        "conditions": conditions,
        "batches": batches,
    }
    for name, values in metadata.items():
        if len(values) < len(file_paths):
            raise ValueError(f"'{name}' has {len(values)} entries but {len(file_paths)} file paths were given")

    readcounts = []
    for i, file_path in enumerate(file_paths):
        dataframe = pd.read_csv(file_path, sep = "\t", header = None)
        readcounts.append(ReadCounts(ngs_method[i], dataframe, replicants[i], timepoints[i], conditions[i], batches[i]))

    return readcounts # <-- List of ReadCounts Class Objects

In [None]:
# Sample design shared by both assays: one entry per count table.
timepoints = [0, 3, 6, 12, 24, 48, 72]
replicants = [2] * len(timepoints)
condition = ["case"] * len(timepoints)
batch = ["B_NULL"] * len(timepoints)

# ATACSeq count tables written by bedtools multicov
ngs_method = ["ATACSeq"] * len(timepoints)
file_paths = [f'/home/wbd20/Kreimer_Lab/ATACSeq/Read_Counts/ATACseq_{hour}hr_enhancers_combined_PeakCounts.tab' for hour in timepoints]

ATACSeq_ReadCounts = read_readcounts(file_paths, ngs_method, replicants, timepoints, condition, batch)

# H3K27ac count tables written by bedtools multicov
ngs_method = ["H3K27ac"] * len(timepoints)
file_paths = [f'/home/wbd20/Kreimer_Lab/H3K27ac/Read_Counts/H3K27ac_{hour}hr_enhancers_combined_PeakCounts.tab' for hour in timepoints]

H3K27ac_ReadCounts = read_readcounts(file_paths, ngs_method, replicants, timepoints, condition, batch)


In [None]:
def generate_deseq2_conditions(rc_list: list[ReadCounts], output_dir: str):
    """Write one DESeq2 sample-annotation CSV for every pair of timepoints.

    The entries of `rc_list` are ordered by timepoint and every (earlier, later)
    pair produces a file
    '<t1>hrs_<t2>hrs.<ngs_method>.DESeq2_conditions.csv' in `output_dir`.
    Each file has one row per replicate column of the two entries: the
    replicate column name (under an empty header) and its
    '<timepoint>hrs.<ngs_method>' group under 'timepoints'.

    Parameters
    ----------
    rc_list : list[ReadCounts]
        Read-count objects to pair up.
    output_dir : str
        Directory the CSV files are written to.
    """
    def replicate_rows(rc):
        # One [replicate column name, group label] row per replicate.
        group = f"{rc.timepoint}hrs.{rc.ngs_method}"
        return [[f"{group}_REPLICANT.{replicate}", group] for replicate in range(1, rc.replicants + 1)]

    ordered = sorted(rc_list, key = lambda rc: rc.timepoint)
    for position, earlier in enumerate(ordered):
        for later in ordered[position + 1:]:
            table = pd.DataFrame(replicate_rows(earlier) + replicate_rows(later), columns = ["", "timepoints"])
            out_name = f"{earlier.timepoint}hrs_{later.timepoint}hrs.{earlier.ngs_method}.DESeq2_conditions.csv"
            table.to_csv(os.path.join(output_dir, out_name), index = False)

In [None]:
def generate_deseq2_comparisons(rc_list: list[ReadCounts], output_dir: str):
    """Write one merged count table for every pair of timepoints.

    The entries of `rc_list` are ordered by timepoint; for every
    (earlier, later) pair their tables are merged on 'CHROM', 'START', 'END'
    and saved as '<t1>hrs_<t2>hrs.<ngs_method>.DESeq2_comparisons.csv'
    in `output_dir`.

    Parameters
    ----------
    rc_list : list[ReadCounts]
        Read-count objects to pair up.
    output_dir : str
        Directory the CSV files are written to.
    """
    region_key = ['CHROM', 'START', 'END']
    ordered = sorted(rc_list, key = lambda rc: rc.timepoint)

    for position, earlier in enumerate(ordered):
        for later in ordered[position + 1:]:
            counts = earlier.df.merge(later.df, on = region_key)
            out_name = f"{earlier.timepoint}hrs_{later.timepoint}hrs.{earlier.ngs_method}.DESeq2_comparisons.csv"
            counts.to_csv(os.path.join(output_dir, out_name), index = False)

In [None]:
# DESeq2 needs both inputs per timepoint pair: the sample-annotation table ('conditions')
# and the matching count matrix ('comparisons'). Write both for each assay.
# NOTE(review): the H3K27ac files are written under the ATACSeq directory; the file names
# carry the assay, so nothing is overwritten, but confirm this location is intended.
generate_deseq2_conditions(ATACSeq_ReadCounts,'/home/wbd20/Kreimer_Lab/ATACSeq/DESeq2')
generate_deseq2_comparisons(ATACSeq_ReadCounts,'/home/wbd20/Kreimer_Lab/ATACSeq/DESeq2')
generate_deseq2_conditions(H3K27ac_ReadCounts,'/home/wbd20/Kreimer_Lab/ATACSeq/DESeq2')
generate_deseq2_comparisons(H3K27ac_ReadCounts,'/home/wbd20/Kreimer_Lab/ATACSeq/DESeq2')

In [None]:
def generate_impulsede2_conditions(rc_list: list[ReadCounts], output_dir: str):
    """Write a single ImpulseDE2 sample-annotation CSV covering every entry of `rc_list`.

    The table has one row per replicate column with the headers
    'Sample', 'Condition', 'Time', 'Batch'. 'Time' is the 1-based rank of the
    entry's timepoint among the distinct, sorted timepoints, not the
    timepoint value itself. The file is named
    '<t1>hrs_<t2>hrs_..._<ngs_method>.ImpulseDE2_conditions.csv', using the
    ngs_method of the earliest entry.

    Parameters
    ----------
    rc_list : list[ReadCounts]
        Read-count objects to describe; must not be empty.
    output_dir : str
        Directory the CSV file is written to.
    """
    ordered = sorted(rc_list, key = lambda rc: rc.timepoint)

    # Distinct timepoint -> 1-based position in ascending order.
    time_rank = {}
    for rank, timepoint in enumerate(sorted({rc.timepoint for rc in ordered}), start = 1):
        time_rank[timepoint] = rank

    rows = [
        [f"{rc.timepoint}hrs.{rc.ngs_method}_REPLICANT.{replicate}", rc.condition, time_rank[rc.timepoint], rc.batch]
        for rc in ordered
        for replicate in range(1, rc.replicants + 1)
    ]
    annotation = pd.DataFrame(rows, columns = ["Sample", "Condition", "Time", "Batch"])

    timepoint_labels = [f"{rc.timepoint}hrs" for rc in ordered]
    out_name = '_'.join(timepoint_labels) + f"_{ordered[0].ngs_method}.ImpulseDE2_conditions.csv"
    annotation.to_csv(os.path.join(output_dir, out_name), index = False)

In [None]:
def generate_impulsede2_comparisons(rc_list: list[ReadCounts], output_dir: str):
    """Write a single count table joining every entry of `rc_list`.

    The tables are inner-merged on 'CHROM', 'START', 'END' in ascending
    timepoint order and saved as
    '<t1>hrs_<t2>hrs_..._<ngs_method>.ImpulseDE2_comparisons.csv', using the
    ngs_method of the earliest entry.

    Parameters
    ----------
    rc_list : list[ReadCounts]
        Read-count objects to join; must not be empty.
    output_dir : str
        Directory the CSV file is written to.
    """
    region_key = ['CHROM', 'START', 'END']
    ordered = sorted(rc_list, key = lambda rc: rc.timepoint)

    # Start from the earliest table and join the later ones onto it one by one.
    counts = ordered[0].df
    for later in ordered[1:]:
        counts = counts.merge(later.df, on = region_key, how = 'inner')

    timepoint_labels = [f'{rc.timepoint}hrs' for rc in ordered]
    out_name = '_'.join(timepoint_labels) + f"_{ordered[0].ngs_method}.ImpulseDE2_comparisons.csv"
    counts.to_csv(os.path.join(output_dir, out_name), index = False)

In [None]:
# ImpulseDE2 needs both inputs per assay: the sample-annotation table ('conditions')
# and the joined count matrix ('comparisons'). Write the complete pair for each assay.
# NOTE(review): the H3K27ac files are written under the ATACSeq directory; the file names
# carry the assay, so nothing is overwritten, but confirm this location is intended.
generate_impulsede2_conditions(ATACSeq_ReadCounts,'/home/wbd20/Kreimer_Lab/ATACSeq/ImpulseDE2')
generate_impulsede2_comparisons(ATACSeq_ReadCounts,'/home/wbd20/Kreimer_Lab/ATACSeq/ImpulseDE2')
generate_impulsede2_conditions(H3K27ac_ReadCounts,'/home/wbd20/Kreimer_Lab/ATACSeq/ImpulseDE2')
generate_impulsede2_comparisons(H3K27ac_ReadCounts,'/home/wbd20/Kreimer_Lab/ATACSeq/ImpulseDE2')