In [15]:
import os
import re
import pandas as pd
from glob import glob

In [54]:
def process_bin_gene_counts(bin_dir, intersect_folder, dtype):
    gene_intersect_dir = os.path.join(bin_dir, "bedtools_out", intersect_folder)

    if dtype == 'gene':
        bed_files = glob(os.path.join(gene_intersect_dir, "*_genesIntersect.bed"))
    elif dtype == 'nucleosome':
        bed_files = glob(os.path.join(gene_intersect_dir, "*_nucIntersect.bed"))
    df_list = []
    
    print(f"{gene_intersect_dir}")
    for bed in bed_files:
        print("")
        print(bed)
    for bedfile in bed_files:
        
        if dtype == 'gene':
            
            # Extract sample ID from filename (e.g., "A14891_125_155_genesIntersect.bed" -> "A14891_125_155")
            if dtype == 'gene':
                sample_id = os.path.basename(bedfile).replace("_genesIntersect.bed", "")
            if dtype == 'nucleosome':
                sample_id = os.path.basename(bedfile).replace("_nucIntersect.bed", "")
            
            print(f"Reading {sample_id}")
            df = pd.read_csv(
                bedfile,
                sep='\t',
                header=None,
                names=["chr", "source", "feature", "start", "end", "score", "strand", "phase", "attributes", "count"]
            )
            
            # Extract gene_id and gene_name from attributes
            df["gene_id"] = df["attributes"].apply(
                lambda x: re.search(r"gene:([^;]+)", x).group(1) if re.search(r"gene:([^;]+)", x) else None
            )
            df["gene_name"] = df["attributes"].apply(
                lambda x: re.search(r"Name=([^;]+)", x).group(1) if re.search(r"Name=([^;]+)", x) else None
            )
            
            # Retain only necessary columns
            df_final = df[["gene_id", "gene_name", "chr", "start", "end", "count"]].copy()
            df_final["sample"] = sample_id  # Attach sample info
            
            df_list.append(df_final)
        if dtype == 'nucleosome':
            

    print("Finished reading files")
    print("Combining.....")

    # Combine all data into one DataFrame
    combined = pd.concat(df_list, ignore_index=True)
    
    print("Finished combining")

    # Pivot to wide format **keeping chr, start, end as gene locations**
    wide_counts = combined.pivot_table(
        index=["gene_id", "gene_name", "chr", "start", "end"], 
        columns="sample", 
        values="count",
        fill_value=0
    ).reset_index()

    return wide_counts

In [71]:
import os
import re
import pandas as pd
from glob import glob

def process_bin_counts(bin_dir, intersect_folder, dtype):
    """
    Generates a count matrix for genes or nucleosomes from BEDTools intersect files.
    
    Parameters:
    - bin_dir: Path to bin directory (e.g., "125_155").
    - intersect_folder: Subfolder inside "bedtools_out" for intersect files.
    - dtype: Type of analysis ('gene' or 'nucleosome').
    
    Returns:
    - Pandas DataFrame containing the count matrix.
    """
    intersect_dir = os.path.join(bin_dir, "bedtools_out", intersect_folder)

    # Identify files based on dtype
    if dtype == 'gene':
        bed_files = glob(os.path.join(intersect_dir, "*_genesIntersect.bed"))
    elif dtype == 'nucleosome':
        bed_files = glob(os.path.join(intersect_dir, "*_nucIntersect.bed"))
    else:
        raise ValueError("Invalid dtype. Use 'gene' or 'nucleosome'.")

    df_list = []

    print(f"Processing {dtype} data in: {intersect_dir}")
    
    for bedfile in bed_files:
        # Extract sample ID from filename
        sample_id = os.path.basename(bedfile).replace("_genesIntersect.bed", "").replace("_nucIntersect.bed", "")
        print(f"Reading {sample_id}")

        if dtype == 'gene':
            df = pd.read_csv(
                bedfile,
                sep='\t',
                header=None,
                names=["chr", "source", "feature", "start", "end", "score", "strand", "phase", "attributes", "count"],
                dtype={"attributes": str},  # Force attributes column to be a string
                na_values=[".", "NA"]  # Handle missing values
            )

            # Ensure attributes column is a string before applying regex
            df["attributes"] = df["attributes"].fillna("").astype(str)

            # Extract gene_id and gene_name only if dtype == 'gene'
            df["gene_id"] = df["attributes"].apply(
                lambda x: re.search(r"gene:([^;]+)", x).group(1) if re.search(r"gene:([^;]+)", x) else None
            )
            df["gene_name"] = df["attributes"].apply(
                lambda x: re.search(r"Name=([^;]+)", x).group(1) if re.search(r"Name=([^;]+)", x) else None
            )

            df_final = df[["gene_id", "gene_name", "chr", "start", "end", "count"]].copy()

        elif dtype == 'nucleosome':
            df = pd.read_csv(
                bedfile,
                sep='\t',
                header=None,
                names=["chr", "start", "end", "nuc_id", "peak1", "peak2", "unknown", "count"],
                dtype={"chr": str, "start": int, "end": int, "nuc_id": str, "count": int},  # Ensures correct types
                na_values=[".", "NA"]  # Handle missing values
            )

            # Keep only necessary columns
            df_final = df[["chr", "start", "end", "nuc_id", "count"]].copy()

        df_final["sample"] = sample_id  # Attach sample info
        df_list.append(df_final)

    print("Finished reading files. Combining data...")

    # Combine all data into one DataFrame
    combined = pd.concat(df_list, ignore_index=True)

    # Pivot to wide format for DESeq2 compatibility
    if dtype == 'gene':
        index_cols = ["gene_id", "gene_name", "chr", "start", "end"]
    else:  # nucleosome case
        index_cols = ["chr", "start", "end", "nuc_id"]

    wide_counts = combined.pivot_table(
        index=index_cols,
        columns="sample",
        values="count",
        fill_value=0
    ).reset_index()

    print("Finished creating count matrix.")
    return wide_counts


In [40]:
# Example usage
# bin_dir = "/Users/.../ctDNA_11042024/data/human_binned_sequences/125_155"
# counts_125_155 = process_bin_gene_counts(bin_dir)
# counts_125_155.to_csv("125_155_gene_counts.tsv", sep="\t", index=False)

# If you want to do the loop for each bin
bins = ["125_155","150_180","170_220","220_345","40_150"]
# bins = ["0_345"]
parent = "/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences"
output_dir = "/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/ctMatrices/"
#Which Counts?

In [72]:
# intersect_type = 'gene'
intersect_type = "nucleosome"

if intersect_type == 'gene':
    intersect_folder = "geneIntersect"
    sub_folder = "geneCounts"
    out_name = "_gene_counts.tsv"
    dtype = 'gene'
elif intersect_type == 'nucleosome':
    intersect_folder = "nuclIntersect"
    sub_folder = "nuclCounts"
    out_name = "_nucleosome_counts.tsv"
    dtype = 'nucleosome'
    

print(intersect_folder)
print(dtype)

nuclIntersect
nucleosome


In [73]:
dtype

'nucleosome'

In [74]:
bin_path = os.path.join(parent, "125_155")
a = process_bin_gene_counts(bin_path, intersect_folder, dtype)

/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect

/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14891_125_155_nucIntersect.bed

/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14895_125_155_nucIntersect.bed

/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14899_125_155_nucIntersect.bed

/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14902_125_155_nucIntersect.bed

/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14906_125_155_nucIntersect.bed

/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14903_125_155_nucIntersect.bed

/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out

  df = pd.read_csv(


TypeError: expected string or bytes-like object, got 'float'

In [75]:
a

NameError: name 'a' is not defined

In [46]:

for b in bins:
    bin_path = os.path.join(parent, b)
    mat = process_bin_gene_counts(bin_path, intersect_folder, dtype)
    outname = os.path.join(output_dir, sub_folder, f"{b}{out_name}")
    mat.to_csv(outname, sep="\t", index=False)

/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect
['/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14891_125_155_nucIntersect.bed', '/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14895_125_155_nucIntersect.bed', '/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14899_125_155_nucIntersect.bed', '/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14902_125_155_nucIntersect.bed', '/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14906_125_155_nucIntersect.bed', '/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155/bedtools_out/nuclIntersect/A14903_125_155_nucIntersect.bed', '/Users/janzules/Roselab/ctDNA_11042024/data/human_binned_sequences/125_155

  df = pd.read_csv(


TypeError: expected string or bytes-like object, got 'float'

In [1]:
import os
import re
import pandas as pd
from glob import glob

def process_bin_gene_counts(bin_dir, intersect_folder):
    gene_intersect_dir = os.path.join(bin_dir, "bedtools_out", intersect_folder)
    
    bed_files = glob(os.path.join(gene_intersect_dir, "*_genesIntersect.bed"))
    df_list = []
    
    for bedfile in bed_files:
        # e.g. "A14891_125_155_genesIntersect.bed"
        sample_id = os.path.basename(bedfile).replace("_genesIntersect.bed","")  # "A14891_125_155"
        
        # Read the BED file
        # By default, pandas will treat # lines as comments if you have them. 
        # Adjust header=None if no header line in the file.
        print(f"Reading {sample_id}")
        df = pd.read_csv(
            bedfile,
            sep='\t',
            header=None,
            names=["chr","source","feature","start","end","score","strand","phase","attributes","count"]
        )
        
        # Parse gene_id and gene_name from the 'attributes' column
        # Look for strings like "ID=gene:ENSG00000186092;Name=OR4F5;..."
        # We'll capture "ENSG00000186092" and "OR4F5" with regex
        df["gene_id"] = df["attributes"].apply(
            lambda x: re.search(r"gene:([^;]+)", x).group(1) if re.search(r"gene:([^;]+)", x) else None
        )
        df["gene_name"] = df["attributes"].apply(
            lambda x: re.search(r"Name=([^;]+)", x).group(1) if re.search(r"Name=([^;]+)", x) else None
        )
        
        # Aggregate counts by gene
        df_sum = df.groupby(["gene_id","gene_name"], as_index=False)["count"].sum()
        
        # Attach sample info
        df_sum["sample"] = sample_id
        df_list.append(df_sum)

    print("Finished reading files")
    print("Combining.....")
    # Combine into one long df
    combined = pd.concat(df_list, ignore_index=True)
    print("Finished combining")
    # Pivot to wide format
    wide_counts = combined.pivot_table(
        index=["gene_id","gene_name"], 
        columns="sample", 
        values="count",
        fill_value=0
    ).reset_index()
    
    return wide_counts