Martin Loza

23/12/22

In this workflow I set up the current average matrices so that cooler can convert them into cool and mcool files.

In [10]:
# Init libraries
import os

import cooler
import pandas as pd

# Global variables
# Folder the per-resolution mean-value tables (mean_values_<res>.tsv) are read from
in_dir = "~/Documents/Projects/HK_Interactions/Analysis/2023_12/2023_12_19/Results/"
# Folder the bed/bedpe files produced by this notebook are written to
out_dir = "/Users/martin/Documents/Projects/HK_Interactions/Analysis/2023_12/2023_12_26/Results/"
# Folder holding the reference .mcool files the bin tables are taken from
cool_dir = "/Volumes/MARTIN_LOZA/Projects/HK_Interactions/Data/HiC/SRA/mcool/"
# Date tag for this analysis
date = "231222"

# Local functions



In a previous test I verified that we can create cool files using the `cooler load` command. For this process we need to provide the bin information and the bedpe files with the mean values.
The command needs to be run from the command line, but I will set up the necessary files in this notebook.

In [25]:
# Resolutions (bin sizes) to export. Each one must exist in the reference mcool
# file and have a matching mean_values_<res>.tsv table in in_dir.
resolutions = ['5000','10000','25000','50000','100000','250000','500000','1000000','2500000','5000000']

# Columns kept in each output file, in the order they are written
bins_columns = ['chrom', 'start', 'end']
bedpe_columns = ['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2', 'mean']

# Create the output folders up front: to_csv does not create missing
# directories, so on a fresh results folder the first write would fail.
bins_dir = os.path.join(os.path.expanduser(out_dir), 'bins')
bedpe_dir = os.path.join(os.path.expanduser(out_dir), 'average_bedpe')
os.makedirs(bins_dir, exist_ok=True)
os.makedirs(bedpe_dir, exist_ok=True)

# For each resolution, save the bin information and the average matrix
for res in resolutions:

    # Load the mcool file at this resolution. Any cell type can be used here,
    # since the bin information is the same for all of them.
    cool_data = cooler.Cooler(f"{cool_dir}GM12878-HindIII-allReps-filtered.mcool::resolutions/{res}")
    # Get the full bins table and keep only the genomic coordinates
    bins = cool_data.bins()[:][bins_columns]
    # Save the bins table as a headerless, tab-separated bed file
    bins.to_csv(os.path.join(bins_dir, f'bins_{res}.bed'), sep='\t', index=False, header=False)

    # Load the average matrix table for this resolution
    average_df = pd.read_csv(f'{in_dir}mean_values_{res}.tsv', sep='\t')
    # Keep the two bin coordinates and the mean value, in bedpe column order
    average_df = average_df[bedpe_columns]
    # Save the bedpe file (headerless, tab-separated)
    average_df.to_csv(os.path.join(bedpe_dir, f'mean_values_{res}.bed'), sep='\t', index=False, header=False)

## TESTS

In [2]:
# Resolution used for the tests below
res = 500000
# Read the mean-value table for that resolution and keep only the
# two bin coordinates plus the mean value
average_df = pd.read_csv(f'{in_dir}mean_values_{res}.tsv', sep='\t').loc[
    :, ['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2', "mean"]
]
average_df[:]


Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2,mean
0,chr1,0,500000,chr1,0,500000,-10.000000
1,chr1,0,500000,chr1,500000,1000000,-10.000000
2,chr1,0,500000,chr1,16500000,17000000,-10.000000
3,chr1,0,500000,chr1,21500000,22000000,-10.000000
4,chr1,500000,1000000,chr1,500000,1000000,3.834949
...,...,...,...,...,...,...,...
824749,chrX,155000000,155500000,chrX,155500000,156000000,3.884706
824750,chrX,155000000,155500000,chrX,156000000,156040895,-10.000000
824751,chrX,155500000,156000000,chrX,155500000,156000000,4.411554
824752,chrX,155500000,156000000,chrX,156000000,156040895,-10.000000


In [4]:
# Write the selected columns to out_dir as a headerless, tab-separated file
average_df.to_csv(
    f'{out_dir}mean_values_{res}.bed',
    sep='\t',
    header=False,
    index=False,
)

It looks like I can do this with cooler as follows:
`cooler load -f bg2 hg19_chr_sizes.txt:10000 input.bed test.cool`
I should investigate this approach.

In [None]:
# Run this in a terminal, not in the Python kernel.
# The number after the colon is the bin size and must match the resolution of
# the input pixels (500000 for mean_values_500000.bed).
# NOTE(review): this previously read 5000000, which does not match the 500 kb
# input; several input rows would then fall into the same bin, which is the
# likely cause of the duplicate-pixel problem noted below. Confirm by re-running.
cooler load -f bg2 hg38.chrom.sizes.txt:500000 mean_values_500000.bed test_500K.cool

It looks like there are duplicate pixels... let's verify this

In [6]:
# Work on a copy so average_df itself is left untouched
test_df = average_df.copy()
# Build one key per interaction: chrom1_start1_end1_chrom2_start2_end2
test_df['id'] = test_df['chrom1'].str.cat(
    [
        test_df['start1'].astype(str),
        test_df['end1'].astype(str),
        test_df['chrom2'],
        test_df['start2'].astype(str),
        test_df['end2'].astype(str),
    ],
    sep='_',
)
# Show the rows whose key already appeared earlier in the table
test_df[test_df['id'].duplicated()]

Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2,mean,id


Maybe I need to provide the bed file of the bins used

In [17]:
# Open the reference mcool file at the test resolution
cool_data = cooler.Cooler(
    f"{cool_dir}GM12878-HindIII-allReps-filtered.mcool::resolutions/{res}"
)
# Fetch the whole bins table, keeping only the genomic coordinates
bins = cool_data.bins()[:].loc[:, ['chrom', 'start', 'end']]
# Write the bins to out_dir as a headerless, tab-separated bed file
bins.to_csv(
    f'{out_dir}bins_{res}.bed',
    sep='\t',
    header=False,
    index=False,
)

In [16]:
# Display the bins table (chrom, start, end) for the test resolution
bins[:]

Unnamed: 0,chrom,start,end
0,chr1,0,500000
1,chr1,500000,1000000
2,chr1,1000000,1500000
3,chr1,1500000,2000000
4,chr1,2000000,2500000
...,...,...,...
6068,chrX,154000000,154500000
6069,chrX,154500000,155000000
6070,chrX,155000000,155500000
6071,chrX,155500000,156000000


It looks like we did it... Let's have a look at the cool file that was created.

In [21]:
# Open the cool file produced by the `cooler load` test
test_cool = cooler.Cooler(f'{out_dir}test_500K.cool')
# Fetch every stored pixel, unbalanced, with the bin coordinates joined in
test_matrix = test_cool.matrix(
    balance=False,
    as_pixels=True,
    join=True,
)[:]

In [22]:
# Display the pixels read back from the test cool file
test_matrix[:]

Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2,count
0,chr1,0,500000,chr1,0,500000,-10.000000
1,chr1,0,500000,chr1,500000,1000000,-10.000000
2,chr1,0,500000,chr1,16500000,17000000,-10.000000
3,chr1,0,500000,chr1,21500000,22000000,-10.000000
4,chr1,500000,1000000,chr1,500000,1000000,3.834949
...,...,...,...,...,...,...,...
824749,chrX,155000000,155500000,chrX,155500000,156000000,3.884706
824750,chrX,155000000,155500000,chrX,156000000,156040895,-10.000000
824751,chrX,155500000,156000000,chrX,155500000,156000000,4.411554
824752,chrX,155500000,156000000,chrX,156000000,156040895,-10.000000


In [23]:
# Display the original mean-value table for side-by-side comparison with test_matrix
average_df[:]

Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2,mean
0,chr1,0,500000,chr1,0,500000,-10.000000
1,chr1,0,500000,chr1,500000,1000000,-10.000000
2,chr1,0,500000,chr1,16500000,17000000,-10.000000
3,chr1,0,500000,chr1,21500000,22000000,-10.000000
4,chr1,500000,1000000,chr1,500000,1000000,3.834949
...,...,...,...,...,...,...,...
824749,chrX,155000000,155500000,chrX,155500000,156000000,3.884706
824750,chrX,155000000,155500000,chrX,156000000,156040895,-10.000000
824751,chrX,155500000,156000000,chrX,155500000,156000000,4.411554
824752,chrX,155500000,156000000,chrX,156000000,156040895,-10.000000
