# This notebook demonstrates the comparison of datasets. The comparison is based on C3 cliques and S(k) supports.

Imports

In [None]:
import csv
import os

import matplotlib.pyplot as plt
from numpy import log2

from universal import UniversalDS, ChrData
from topologicalFeatures import Cliques, Bases, BasesOfBases

Read 3 datasets

In [None]:
# Build the paths with os.path.join instead of hard-coded "\\" separators so the
# notebook also runs on Linux/macOS; on Windows the resulting strings are identical
# to the previous literals.
DATA_DIR = os.path.join("..", "sampleData")

A = UniversalDS(os.path.join(DATA_DIR, "data-pvalue-0.7-fin-min.json"))
B = UniversalDS(os.path.join(DATA_DIR, "data-pvalue-5-fin-min.json"))
C = UniversalDS(os.path.join(DATA_DIR, "data-pvalue-10-fin-min.json"))

# Human-readable dataset names; used in plot legends and in the CSV table header.
A.DS = "Tissue pcHi-C"
B.DS = "Blood cell pcHi-C"
C.DS = "Tissue Hi-C"

Choose a chromosome to analyze. The files in sampleData contain chr6 and chr15.

In [None]:
# Chromosome used by the single-chromosome plotting cells. plot3DS reads this
# global for the axes title, and the ChrData(...) calls below select it.
# The table-generation cell at the end iterates over A.chrs instead.
ch = "chr6"

Utility function to create a plot

In [None]:
def plot3DS(linkPointsA, linkPointsB, linkPointsC, 
            title, ylabel, bins=250):
    """Overlay normalized histograms of loci for the three datasets and save them.

    Relies on the notebook-level globals ``A``, ``B`` and ``C`` (their ``DS``
    attribute is used as the legend label) and ``ch`` (printed and used as the
    axes title).

    Parameters
    ----------
    linkPointsA, linkPointsB, linkPointsC : sequence of numbers
        Loci to histogram for datasets A, B and C respectively.
        NOTE(review): the x axis is labelled 'Loci in Mbp' with a 1e6 tick
        scale, so values are presumably base-pair positions -- confirm.
    title : str
        Figure title. It is also embedded in the output file names
        ``IMG-{title}.png`` and ``IMG-{title}.svg``.
    ylabel : str
        Label of the y axis.
    bins : int, optional
        Number of histogram bins used for each dataset (default 250).

    Side effects
    ------------
    Prints a progress line, writes a PNG and an SVG file to the current
    working directory, and closes the figure afterwards.
    """
    print("start {ch}".format(ch=ch))
    fig, ax = plt.subplots()
    try:
        ax.hist(x=linkPointsA, bins=bins, histtype='step', label=A.DS, fill=True, alpha=.3, density=True, facecolor="red", hatch='-----', linewidth=0.1, edgecolor="black")
        ax.hist(x=linkPointsB, bins=bins, histtype='step', label=B.DS, fill=True, alpha=.3, density=True, facecolor="black", hatch='\\\\\\\\\\', linewidth=0.1, edgecolor="black")
        ax.hist(x=linkPointsC, bins=bins, histtype='step', label=C.DS, fill=True, alpha=.4, density=True, facecolor="blue", hatch='//////', linewidth=0.1, edgecolor="black")
        ax.set_xlim(left=0)
        # Show x tick labels in units of 1e6 to match the 'Mbp' axis label.
        ax.ticklabel_format(axis='x', scilimits=(6,6) )
        ax.set_title("{ch}".format(ch=ch))
        ax.tick_params(labelsize=16)
        ax.legend(loc="upper right")
        ax.set(xlabel='Loci in Mbp')
        ax.set(ylabel=ylabel)
        fig.suptitle(title, size=18)
        fig.set_size_inches(16.,10.)
        fig.savefig('IMG-{title}.png'.format(title=title), dpi=600)
        fig.savefig('IMG-{title}.svg'.format(title=title), dpi=600)
    finally:
        # Close (not merely clear) the figure: plt.clf() leaves the figure
        # registered with pyplot, so repeated calls accumulate open figures.
        plt.close(fig)

Plot link endpoint loci in 3 datasets

In [None]:
# Per-chromosome views of the three datasets; later cells reuse these objects.
aChData = ChrData(A, ch=ch)
bChData = ChrData(B, ch=ch)
cChData = ChrData(C, ch=ch)

# Each link contributes both of its ends to the histogram: first the midpoints
# of all first ends (index 0), then the midpoints of all second ends (index 1).
aLinks = aChData.allLinks
linkPointsA = [aChData.segmentIndToMidpoint[link[end]]
               for end in (0, 1) for link in aLinks]

bLinks = bChData.allLinks
linkPointsB = [bChData.segmentIndToMidpoint[link[end]]
               for end in (0, 1) for link in bLinks]

cLinks = cChData.allLinks
linkPointsC = [cChData.segmentIndToMidpoint[link[end]]
               for end in (0, 1) for link in cLinks]

plot3DS(
    linkPointsA, linkPointsB, linkPointsC,
    title="Link loci in different datasets",
    ylabel='Normalized count of link endpoints',
)

Calculate and plot C3 endpoint loci

In [None]:
# For each dataset: build the cliques (minC3TissueCount=1), take their links,
# and collect the midpoints of both link ends (all index-0 ends first, then
# all index-1 ends).
A3 = Cliques(aChData, minC3TissueCount=1)
aLinks = A3.getLinksList()
linkPointsA = [aChData.segmentIndToMidpoint[link[end]]
               for end in (0, 1) for link in aLinks]

B3 = Cliques(bChData, minC3TissueCount=1)
bLinks = B3.getLinksList()
linkPointsB = [bChData.segmentIndToMidpoint[link[end]]
               for end in (0, 1) for link in bLinks]

C3 = Cliques(cChData, minC3TissueCount=1)
cLinks = C3.getLinksList()
linkPointsC = [cChData.segmentIndToMidpoint[link[end]]
               for end in (0, 1) for link in cLinks]

plot3DS(
    linkPointsA, linkPointsB, linkPointsC,
    title="C3 loci in different datasets",
    ylabel='Normalized count of C3 endpoints',
)

Calculate and plot support S(k) 

In [None]:
# In code, supports are called 'bases' for historical reasons.
bobA = BasesOfBases(aChData)
bobB = BasesOfBases(bChData)
bobC = BasesOfBases(cChData)

# For each dataset the degree passed to reduce() is int(log2(number of links));
# the endpoints of the reduced links are collected as in the previous cells
# (all index-0 ends first, then all index-1 ends).
baseDeg = int(log2(len(bobA.links)))
aLinks = bobA.reduce(baseDeg)
linkPointsA = [aChData.segmentIndToMidpoint[link[end]]
               for end in (0, 1) for link in aLinks]

baseDeg = int(log2(len(bobB.links)))
bLinks = bobB.reduce(baseDeg)
linkPointsB = [bChData.segmentIndToMidpoint[link[end]]
               for end in (0, 1) for link in bLinks]

baseDeg = int(log2(len(bobC.links)))
cLinks = bobC.reduce(baseDeg)
linkPointsC = [cChData.segmentIndToMidpoint[link[end]]
               for end in (0, 1) for link in cLinks]

plot3DS(
    linkPointsA, linkPointsB, linkPointsC,
    title="S(k) loci in different datasets",
    ylabel='Normalized count of S(k) endpoints',
)

Generate a table with the counts of the different elements.
For the 2 chromosomes of the 3 sample datasets this is expected to run for ~1 min.

In [None]:
# Builds one CSV row per chromosome with, for each of the three datasets:
# link count, C3 count (minC3TissueCount=1 and =2), and the number of links
# returned by BasesOfBases.reduce() for degree int(log2(#links)) and for a
# fixed degree. Columns are grouped by element type, then by dataset (A, B, C),
# matching the header order.

# Fixed degree used for the "S(16)" columns.
FIXED_SUPPORT_DEGREE = 16
datasets = [A, B, C]

header = ["Chr"] + [
    f"{obj} in {ds.DS}"
    for obj in ["Links", "C3-deg1", "C3-deg2", "S(log(links))", "S(16)"]
    for ds in datasets
]

L = []
# NOTE(review): chromosomes are taken from A only; this assumes B and C contain
# the same chromosomes -- confirm.
for c in A.chrs:
    print(f"Processing {c}")
    # Build each ChrData once per chromosome instead of once per column group.
    # The plotting cells above already reuse a single ChrData object across
    # Cliques and BasesOfBases, so sharing it here follows the same usage.
    chromDatasets = [ChrData(ds, ch=c) for ds in datasets]

    row = [c]
    # link count
    row.extend(len(chData.allLinks) for chData in chromDatasets)
    # C3 with 1 tissue, then C3 with 2 tissues
    for minTissues in (1, 2):
        for chData in chromDatasets:
            cliques = Cliques(chData, minC3TissueCount=minTissues)
            row.append(len(cliques.allCliques))
    # S(log(links))
    for chData in chromDatasets:
        supports = BasesOfBases(chData)
        degree = int(log2(len(supports.links)))
        row.append(len(supports.reduce(degree)))
    # S(16): a fresh BasesOfBases is built on purpose, as in the original cell,
    # because it is not visible here whether reduce() modifies the object.
    for chData in chromDatasets:
        supports = BasesOfBases(chData)
        row.append(len(supports.reduce(FIXED_SUPPORT_DEGREE)))
    L.append(row)

# newline='' lets the csv module control line endings itself.
with open('elementCountsTable.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(L)


