# Analyze randomized graphs

In [34]:
import csv
from scipy.stats import ks_2samp

## Analyze link lists

### Perform Kolmogorov-Smirnov test

In [35]:
def _readLinkLengths(fn):
    """Read a CSV file of links and return the length of every link.

    Each non-empty row is parsed as ``start,end`` (two integers in the first
    two columns); the link length is ``end - start``.

    Parameters
    ----------
    fn : str
        Path to the CSV file (comma-delimited, ``|`` as quote character).

    Returns
    -------
    list of int
        One length per link, in file order.

    Raises
    ------
    ValueError
        If one of the first two fields of a row is not an integer.
    IndexError
        If a non-empty row has fewer than two fields.
    """
    lengths = []
    with open(fn, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='|'):
            if not row:
                # csv.reader yields [] for a blank line (e.g. an extra empty
                # line at the end of the file); there is no link on it.
                continue
            lengths.append(int(row[1]) - int(row[0]))
    return lengths


def getKS(fnOriginal, fnRandomized):
    """Compare link-length distributions of two link lists with a KS test.

    Both files are read as CSV link lists (``start,end`` per row), the length
    of each link is computed, and the two samples of lengths are passed to
    the two-sample Kolmogorov-Smirnov test.

    Parameters
    ----------
    fnOriginal : str
        Path to the CSV file with the original links.
    fnRandomized : str
        Path to the CSV file with the randomized links.

    Returns
    -------
    The result object of ``scipy.stats.ks_2samp`` (``statistic``, ``pvalue``).
    """
    # No explicit sorting: ks_2samp does not require its inputs to be ordered.
    return ks_2samp(_readLinkLengths(fnOriginal), _readLinkLengths(fnRandomized))

## Analyze cool files

In [36]:
import cooler
import pandas as pd
import numpy as np
import h5py
import hicrep
from hicrep.utils import readMcool
from hicrep import hicrepSCC

In [37]:
# Inputs for the cool-file comparison.
# pv and chr must match the values that were used in randomization.
fnOriginal, fnRandomized = "./sampleCool.cool", "./sampleRandomizedCool.cool"
chr = "chr15"  # chromosome whose links are compared
pv = 4         # only pixels with count > pv are treated as links

def getKSCool(fnOriginal, fnRandomized, pv, chr):
    """Compare link-length distributions of two cool files with a KS test.

    A pixel of a cool file is treated as a link when its ``count`` is
    strictly greater than ``pv`` and both of its bins lie on chromosome
    ``chr``. The link length is the distance between the midpoints
    (``(start + end) // 2``) of the two bins. The lengths extracted from
    both files are compared with the two-sample Kolmogorov-Smirnov test.

    Parameters
    ----------
    fnOriginal : str
        Path to the original cool file.
    fnRandomized : str
        Path to the randomized cool file.
    pv : int
        Count threshold; only pixels with ``count > pv`` are used.
    chr : str
        Chromosome name exactly as stored in the cool file
        (e.g. ``"chr15"`` or ``"15"``).

    Returns
    -------
    The result object of ``scipy.stats.ks_2samp`` (``statistic``, ``pvalue``).
    """
    def getLinkLengths(inputPath):
        # Returns the lengths of all links extracted from one cool file.
        cool = cooler.Cooler(inputPath)
        bins = cool.bins()[:]
        pixels = cool.pixels()[:]

        # Per-bin lookup tables, addressed by position (= bin id).
        # Work on arrays instead of one Python dict per pixel: converting the
        # whole pixel table with to_dict('records') is prohibitively slow and
        # memory-hungry on a full dataset.
        chroms = bins["chrom"].astype(str).to_numpy(dtype=object)
        # int64 so that start + end cannot wrap around on narrow bin dtypes.
        mids = ((bins["start"].astype("int64") + bins["end"].astype("int64")) // 2).to_numpy()

        strong = pixels[pixels["count"] > pv]
        bin1 = strong["bin1_id"].to_numpy()
        bin2 = strong["bin2_id"].to_numpy()

        # Keep only pixels whose both ends are on the requested chromosome.
        onChr = (chroms[bin1] == chr) & (chroms[bin2] == chr)
        return mids[bin2[onChr]] - mids[bin1[onChr]]

    lL = getLinkLengths(fnOriginal)
    lR = getLinkLengths(fnRandomized)

    # No explicit sorting: ks_2samp does not require its inputs to be ordered.
    return ks_2samp(lL, lR)

In [38]:

def doHiCRep(fcool1, fcool2, chrs, h=1, dBPMax=50000000, bDownSample=False):
    """Compute HiCRep SCC scores between two cool files for given chromosomes.

    Parameters
    ----------
    fcool1, fcool2 : str
        Paths to the two cool files to compare.
    chrs : sequence of str
        Names of the chromosomes to compute the SCC score for.
    h : int, optional
        Smoothing window half-size passed to ``hicrepSCC`` (default 1).
    dBPMax : int, optional
        Maximal genomic distance to include in the calculation
        (default 50000000).
    bDownSample : bool, optional
        Whether to perform down-sampling. If True, the data set with larger
        contact counts is bootstrapped to the same number of contacts as the
        other one; otherwise the contact matrices are normalized by their
        respective total number of contacts (default False).

    Returns
    -------
    The value returned by ``hicrepSCC`` for the requested chromosomes
    (printed in this notebook as an array with one score per chromosome).
    """
    # -1: use the bin size built into the cool file. The bin size that
    # readMcool returns alongside the cooler object is not needed here.
    cool1, _ = readMcool(fcool1, -1)
    cool2, _ = readMcool(fcool2, -1)

    # Restrict the SCC computation to the requested subset of chromosomes.
    return hicrepSCC(cool1, cool2, h, dBPMax, bDownSample, np.array(chrs, dtype=str))





In [40]:
# --- Example 1: link lists stored as CSV files ---
fnOriginal, fnRandomized = "./sampleOriginalLinks.csv", "./myRandomizedLinks.csv"
print("KS score:", getKS(fnOriginal, fnRandomized))

# --- Example 2: full Schwarzer dataset, analyzing chr15 ---
# chr and pv are the parameters that were used in randomization.
fnOriginal, fnRandomized = "./sampleCool.cool", "./sampleRandomizedCool.cool"
chr = "chr15"
pv = 4
print(getKSCool(fnOriginal, fnRandomized, pv, chr))
print(doHiCRep(fnOriginal, fnRandomized, [chr]))

# --- Example 3: sample cool file for chr15 holding only significant
# (count > 4) links ---
fnOriginal, fnRandomized = "./schwarzerChr15Sample.cool", "./schwarzerChr15Randomized80.cool"
pv = 4
chr = "15"
rezKS = getKSCool(fnOriginal, fnRandomized, pv, chr)
print(rezKS)
rezHiCRep = doHiCRep(fnOriginal, fnRandomized, [chr])
print(rezHiCRep)



KS score: KstestResult(statistic=0.0015917963841088345, pvalue=0.9999991662600691)
KstestResult(statistic=0.010612177638625653, pvalue=0.0021373712325705504)
[0.36412353]
KstestResult(statistic=0.00919890752500796, pvalue=0.012738090332724203)
[0.09935032]
