In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import uuid  # To generate a unique ID for each file
from tqdm import tqdm

In [2]:
# List of chromosomes
chromosomes = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 
               'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 
               'chr20', 'chr21', 'chr22', 'chrX']

# Initialize an empty list to store each row of data
rows = []

for directory in ["./GM12878", "./K562", "./Hela", "./HAP1"]:
    for file in tqdm(os.listdir(directory)):
        file_path = os.path.join(directory, file)
        
        # Generate a unique ID for each file
        unique_id = str(uuid.uuid4())
        
        # Initialize a dictionary to store data for this file (each key is a column in the DataFrame)
        file_data = {
            "unique_id": unique_id,
            "file_directory": file,
            "home_directory": directory.split('/')[-1],
            "chr_total": 0  # This will store the sum of all chromosome bin sizes
        }
        
        # Process each chromosome in the file
        for chr in chromosomes:
            # Load data and filter for the current chromosome
            data = pd.read_csv(file_path, sep='\t', header=None, names=['chr1', 'pos1', 'chr2', 'pos2', 'interaction'])
            data = data[data['chr1'] == chr]
            
            # Calculate the bin size count for the chromosome if there is data
            if not data.empty:
                min_pos = data['pos1'].min()
                max_pos = data['pos1'].max()
                bin_size_count = (max_pos - min_pos) / 1_000_000  # Convert to Mb
                
                # Store the bin size count for this chromosome
                file_data[chr] = bin_size_count
                file_data["chr_total"] += bin_size_count  # Add to the total count for this file
            else:
                # If no data for this chromosome, store 0
                file_data[chr] = 0
        
        # Append this file's data as a row in the list
        rows.append(file_data)

# Convert the list of dictionaries to a pandas DataFrame
results_df = pd.DataFrame(rows)

  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:00<00:00, 26.17it/s]
100%|██████████| 48/48 [00:01<00:00, 27.10it/s]
100%|██████████| 1622/1622 [01:30<00:00, 17.83it/s]
100%|██████████| 917/917 [00:43<00:00, 20.89it/s]


In [None]:
results_df 

In [12]:
for file in tqdm(os.listdir("./GM12878")):
    file_path = os.path.join("./GM12878", file)
    output = {}
    for chr in chromosomes:
        data = pd.read_csv(file_path, sep='\t', header=None, names=['chr1', 'pos1', 'chr2', 'pos2', 'interaction'])
        data = data[data['chr1'] == chr]
        min_pos = data['pos1'].min()
        output[chr] = min_pos
    print(output)
        

 29%|██▉       | 7/24 [00:00<00:00, 32.93it/s]

{'chr1': 1000000, 'chr2': 0, 'chr3': 2000000, 'chr4': 2000000, 'chr5': 1000000, 'chr6': 0, 'chr7': 0, 'chr8': 0, 'chr9': 1000000, 'chr10': 0, 'chr11': 0, 'chr12': 0, 'chr13': 19000000, 'chr14': 21000000, 'chr15': 25000000, 'chr16': 3000000, 'chr17': 0, 'chr18': 1000000, 'chr19': 0, 'chr20': 0, 'chr21': 17000000, 'chr22': 16000000, 'chrX': 3000000}
{'chr1': 0, 'chr2': 0, 'chr3': 0, 'chr4': 0, 'chr5': 0, 'chr6': 0, 'chr7': 0, 'chr8': 0, 'chr9': 0, 'chr10': 0, 'chr11': 0, 'chr12': 0, 'chr13': 19000000, 'chr14': 20000000, 'chr15': 22000000, 'chr16': 0, 'chr17': 0, 'chr18': 0, 'chr19': 0, 'chr20': 0, 'chr21': 10000000, 'chr22': 17000000, 'chrX': 2000000}
{'chr1': 3000000, 'chr2': 0, 'chr3': 0, 'chr4': 1000000, 'chr5': 0, 'chr6': 0, 'chr7': 0, 'chr8': 0, 'chr9': 0, 'chr10': 0, 'chr11': 0, 'chr12': 0, 'chr13': 22000000, 'chr14': 21000000, 'chr15': 24000000, 'chr16': 0, 'chr17': 0, 'chr18': 0, 'chr19': 0, 'chr20': 1000000, 'chr21': 14000000, 'chr22': 18000000, 'chrX': 3000000}
{'chr1': 1000000

 46%|████▌     | 11/24 [00:00<00:00, 29.79it/s]

{'chr1': 0, 'chr2': 0, 'chr3': 0, 'chr4': 0, 'chr5': 0, 'chr6': 1000000, 'chr7': 0, 'chr8': 0, 'chr9': 0, 'chr10': 0, 'chr11': 0, 'chr12': 0, 'chr13': 19000000, 'chr14': 20000000, 'chr15': 22000000, 'chr16': 0, 'chr17': 0, 'chr18': 0, 'chr19': 0, 'chr20': 0, 'chr21': 16000000, 'chr22': 17000000, 'chrX': 4000000}
{'chr1': 1000000, 'chr2': 0, 'chr3': 0, 'chr4': 0, 'chr5': 2000000, 'chr6': 0, 'chr7': 0, 'chr8': 1000000, 'chr9': 0, 'chr10': 1000000, 'chr11': 4000000, 'chr12': 0, 'chr13': 19000000, 'chr14': 20000000, 'chr15': 22000000, 'chr16': 1000000, 'chr17': 1000000, 'chr18': 0, 'chr19': 2000000, 'chr20': 1000000, 'chr21': 17000000, 'chr22': 17000000, 'chrX': 2000000}
{'chr1': 1000000, 'chr2': 0, 'chr3': 0, 'chr4': 0, 'chr5': 0, 'chr6': 0, 'chr7': 1000000, 'chr8': 1000000, 'chr9': 0, 'chr10': 0, 'chr11': 1000000, 'chr12': 0, 'chr13': 19000000, 'chr14': 19000000, 'chr15': 23000000, 'chr16': 0, 'chr17': 0, 'chr18': 0, 'chr19': 0, 'chr20': 0, 'chr21': 15000000, 'chr22': 18000000, 'chrX': 2

 75%|███████▌  | 18/24 [00:00<00:00, 28.65it/s]

{'chr1': 0, 'chr2': 0, 'chr3': 1000000, 'chr4': 2000000, 'chr5': 0, 'chr6': 0, 'chr7': 0, 'chr8': 0, 'chr9': 0, 'chr10': 0, 'chr11': 1000000, 'chr12': 0, 'chr13': 20000000, 'chr14': 20000000, 'chr15': 20000000, 'chr16': 0, 'chr17': 0, 'chr18': 0, 'chr19': 0, 'chr20': 0, 'chr21': 15000000, 'chr22': 18000000, 'chrX': 3000000}
{'chr1': 0, 'chr2': 0, 'chr3': 0, 'chr4': 0, 'chr5': 0, 'chr6': 0, 'chr7': 0, 'chr8': 0, 'chr9': 0, 'chr10': 0, 'chr11': 0, 'chr12': 0, 'chr13': 19000000, 'chr14': 20000000, 'chr15': 20000000, 'chr16': 0, 'chr17': 0, 'chr18': 0, 'chr19': 0, 'chr20': 0, 'chr21': 10000000, 'chr22': 17000000, 'chrX': 2000000}
{'chr1': 0, 'chr2': 0, 'chr3': 0, 'chr4': 0, 'chr5': 0, 'chr6': 0, 'chr7': 0, 'chr8': 0, 'chr9': 0, 'chr10': 0, 'chr11': 0, 'chr12': 0, 'chr13': 19000000, 'chr14': 20000000, 'chr15': 20000000, 'chr16': 0, 'chr17': 0, 'chr18': 0, 'chr19': 0, 'chr20': 0, 'chr21': 9000000, 'chr22': 18000000, 'chrX': 2000000}
{'chr1': 1000000, 'chr2': 0, 'chr3': 0, 'chr4': 2000000, 'c

100%|██████████| 24/24 [00:00<00:00, 30.08it/s]

{'chr1': 1000000, 'chr2': 0, 'chr3': 0, 'chr4': 0, 'chr5': 1000000, 'chr6': 1000000, 'chr7': 0, 'chr8': 0, 'chr9': 1000000, 'chr10': 0, 'chr11': 0, 'chr12': 0, 'chr13': 20000000, 'chr14': 20000000, 'chr15': 20000000, 'chr16': 1000000, 'chr17': 0, 'chr18': 0, 'chr19': 1000000, 'chr20': 1000000, 'chr21': 16000000, 'chr22': 19000000, 'chrX': 3000000}
{'chr1': 1000000, 'chr2': 0, 'chr3': 0, 'chr4': 0, 'chr5': 1000000, 'chr6': 0, 'chr7': 0, 'chr8': 0, 'chr9': 0, 'chr10': 0, 'chr11': 0, 'chr12': 0, 'chr13': 19000000, 'chr14': 21000000, 'chr15': 23000000, 'chr16': 0, 'chr17': 0, 'chr18': 2000000, 'chr19': 2000000, 'chr20': 0, 'chr21': 14000000, 'chr22': 19000000, 'chrX': 4000000}
{'chr1': 0, 'chr2': 0, 'chr3': 0, 'chr4': 0, 'chr5': 0, 'chr6': 0, 'chr7': 0, 'chr8': 0, 'chr9': 0, 'chr10': 0, 'chr11': 0, 'chr12': 0, 'chr13': 19000000, 'chr14': 20000000, 'chr15': 20000000, 'chr16': 0, 'chr17': 0, 'chr18': 0, 'chr19': 0, 'chr20': 0, 'chr21': 10000000, 'chr22': 17000000, 'chrX': 2000000}
{'chr1': 0




In [None]:
# List of chromosomes
chromosomes = ['chr1']
for directory in ["./GM12878"]:
    for file in tqdm(os.listdir(directory)):
        file_path = os.path.join(directory, file)
        # Process each chromosome in the file
        for chr in chromosomes:
            # Load data and filter for the current chromosome
            data = pd.read_csv(file_path, sep='\t', header=None, names=['chr1', 'pos1', 'chr2', 'pos2', 'interaction'])
            data = data[data['chr1'] == chr]
            
            # Calculate the bin size count for the chromosome if there is data
            if not data.empty:
                min_pos1 = data['pos1'].min()
                max_pos1 = data['pos1'].max()
                min_pos2 = data['pos2'].min()
                max_pos2 = data['pos2'].max()
                print(max_pos1-min_pos1, max_pos2-min_pos2)
            

100%|██████████| 24/24 [00:00<00:00, 311.38it/s]

246000000 243000000
247000000 246000000
246000000 246000000
245000000 245000000
245000000 245000000
248000000 246000000
246000000 246000000
248000000 247000000
247000000 248000000
247000000 245000000
248000000 248000000
247000000 247000000
245000000 248000000
248000000 248000000
249000000 249000000
248000000 247000000
246000000 247000000
247000000 246000000
247000000 248000000
248000000 248000000
247000000 246000000
248000000 247000000
248000000 247000000
247000000 242000000



