In [28]:
from universal import UniversalDS, ChrData
from topologicalFeatures import Cliques, Bases, BasesOfBases


In [29]:
# Dataset label -> file stem under ../sampleData. The randomized counterpart
# of every dataset lives in the same directory as "<stem>RND.json".
DATASET_STEMS = {
    "Blood cell pcHi-C": "data-pvalue-5-fin-min",
    "Tissue pcHi-C": "data-pvalue-0.7-fin-min",
    "Tissue Hi-C": "data-pvalue-10-fin-min",
}

# Change this one line to switch the dataset analysed by the whole notebook.
DATASET = "Blood cell pcHi-C"

U = UniversalDS("../sampleData/{stem}.json".format(stem=DATASET_STEMS[DATASET]))
R = UniversalDS("../sampleData/{stem}RND.json".format(stem=DATASET_STEMS[DATASET]))
# The label is reused later in plot titles and to decide the binning mode.
U.DS = DATASET



In [30]:
# Label of the randomized dataset is derived from the label of the original one.
R.DS = f"Randomized {U.DS}"

In [31]:
# Chromosome analysed below; plotLinkLengths also reads this module-level name.
ch = "chr6"
# Per-chromosome views of the original (U) and randomized (R) datasets, built in that order.
uChData, rChData = (ChrData(dataset, ch=ch) for dataset in (U, R))

In [32]:
import matplotlib.pyplot as plt


In [33]:
def plotLinkLengths(uChData, uLinks, rChData, rLinks, title="", rr=0.025e8, rbins=80, binx=0):
    """Plot overlaid step histograms of link lengths for real and randomized data.

    The figure is written to ``IMG-<title>.png``, ``.svg`` and ``.pdf`` in the
    current working directory and is then closed. Nothing is returned.

    Parameters
    ----------
    uChData, rChData
        Objects exposing ``getLinkLength(link)``; used for the original (U)
        and the randomized (R) links respectively.
    uLinks, rLinks
        Lists of links whose absolute lengths are histogrammed.
    title
        Figure suptitle; also embedded in the output file names.
    rr
        Right limit of the x axis. Links longer than ``rr`` are left out of
        the histograms but still count towards the printed min/max.
    rbins
        ``bins`` argument handed to ``Axes.hist`` when ``binx`` is 0.
    binx
        Bin width. If 0, ``rbins`` is used unchanged; otherwise ``rbins`` is
        replaced by explicit edges ``0, binx + 1, 2 * binx + 1, ...``.

    Raises
    ------
    ValueError
        If ``binx`` is negative.

    Notes
    -----
    The chromosome name shown in the axes title and in the "start" message is
    read from the module-level variable ``ch``, not from ``uChData``.
    """
    if binx < 0:
        # A negative step never reaches the maximum length: the edge loop would not end.
        raise ValueError("binx must be >= 0, got {b}".format(b=binx))

    fig, ax = plt.subplots()
    print("start {ch}".format(ch=ch))

    print(len(uLinks), len(rLinks))

    # Compute every length exactly once; all filters and statistics reuse these lists.
    allLengthsU = [abs(uChData.getLinkLength(link)) for link in uLinks]
    allLengthsR = [abs(rChData.getLinkLength(link)) for link in rLinks]
    linkLengthsU = [length for length in allLengthsU if length <= rr]
    linkLengthsR = [length for length in allLengthsR if length <= rr]

    if binx != 0:
        # Edges must cover both datasets, otherwise values past the last edge
        # are silently dropped by the histogram.
        longest = max(allLengthsU + allLengthsR, default=0)
        rbins = [0]
        edge = 0
        # Edges sit at k*binx + 1 (kept from the original implementation;
        # NOTE(review): presumably so that lengths that are exact multiples of
        # binx land in the lower bin - confirm). At least one bin is always
        # produced so that empty or all-zero input can still be drawn.
        while edge < longest or len(rbins) < 2:
            edge += binx
            rbins.append(edge + 1)
    print(rbins)

    print(len(linkLengthsU), len(linkLengthsR))
    # default=None keeps these diagnostics from raising on an empty link list.
    print(max(allLengthsU, default=None), "max original link")
    print(max(allLengthsR, default=None), "max randomized link")
    print(min(allLengthsU, default=None), "min original link")
    print(min(allLengthsR, default=None), "min randomized link")

    # A shared range gives both histograms identical bin edges when rbins is
    # a bin count; it has no effect when rbins is an explicit edge sequence.
    ax.hist(x=linkLengthsU, bins=rbins, range=(0, rr), histtype='step', label='Real link lengths', linewidth=2)
    ax.hist(x=linkLengthsR, bins=rbins, range=(0, rr), histtype='step', label='Randomized link lengths', linewidth=2)
    ax.set_xlim(right=rr, left=0)

    ax.set_title("{ch}".format(ch=ch))
    ax.tick_params(labelsize=24)

    ax.legend(loc="upper right")

    ax.set(xlabel='Link length in Mbp')
    ax.xaxis.label.set_fontsize(28)
    ax.set(ylabel='Link count')
    ax.yaxis.label.set_fontsize(28)
    fig.suptitle(title, size=32)
    fig.set_size_inches(16., 10.)

    fig.savefig('IMG-{title}.png'.format(title=title), dpi=600)
    fig.savefig('IMG-{title}.svg'.format(title=title))
    fig.savefig('IMG-{title}.pdf'.format(title=title), dpi=800)
    # Close (not just clear) the figure: clf() keeps the empty figure alive,
    # which leaks memory and makes the notebook render a blank figure.
    plt.close(fig)

In [34]:
# Real vs randomized length distribution over all links of the chromosome.
# To fix a binning inconsistency, every dataset except "Blood cell pcHi-C"
# is drawn with explicit 25000-wide bins; binx=0 (the function default)
# leaves the default bin count in place.
plotLinkLengths(
    uChData=uChData,
    uLinks=uChData.links,
    rChData=rChData,
    rLinks=rChData.links,
    title=f"Link lengths in {U.DS}",
    rr=0.025e8,
    binx=0 if U.DS == "Blood cell pcHi-C" else 25000,
)

start chr6
47834 47834
80
44560 44162
170572582 max original link
170572582 max randomized link
2452 min original link
1477 min randomized link


<Figure size 1600x1000 with 0 Axes>

In [35]:
# ch = "chr6"
# uuChData = ChrData(U, ch=ch, tissueMask=3)
# rrChData = ChrData(R, ch=ch, tissueMask=3)
# deg = 2
# CU = Cliques(owner=uuChData, minC3TissueCount=deg)
# CR = Cliques(owner=rrChData, minC3TissueCount=deg)

# cuLinks = CU.getLinks()
# crLinks = CR.getLinks()
# iii=9
# plotLinkLengths(uChData=uuChData, uLinks=cuLinks, rChData=rrChData, rLinks=crLinks, title=f"NoTitle-Triangles-{deg}", rr=0.025e8) #plots randomized vs real link lengths of C3 with deg tissues

In [36]:
# Tissue-count threshold passed to Cliques as minC3TissueCount.
deg = 1
# Clique structures for the original and the randomized chromosome data (built in that order).
CU, CR = (Cliques(owner=chData, minC3TissueCount=deg) for chData in (uChData, rChData))
cuLinks, crLinks = CU.getLinksList(), CR.getLinksList()

# Real vs randomized length distribution of the links returned by the cliques.
# Every dataset except "Blood cell pcHi-C" uses explicit 25000-wide bins;
# binx=0 (the function default) keeps the default bin count.
plotLinkLengths(
    uChData=uChData,
    uLinks=cuLinks,
    rChData=rChData,
    rLinks=crLinks,
    title=f"Link lengths of C3 with {deg}+ tissues in {U.DS}",
    rr=0.025e8,
    binx=0 if U.DS == "Blood cell pcHi-C" else 25000,
)

start chr6
173304 111480
80
160079 102028
162832229 max original link
162832229 max randomized link
2452 min original link
1477 min randomized link


<Figure size 1600x1000 with 0 Axes>

In [37]:
# Tissue-count threshold passed to Cliques as minC3TissueCount.
deg = 2
# Clique structures for the original and the randomized chromosome data (built in that order).
CU, CR = (Cliques(owner=chData, minC3TissueCount=deg) for chData in (uChData, rChData))
cuLinks, crLinks = CU.getLinksList(), CR.getLinksList()

# Real vs randomized length distribution of the links returned by the cliques.
# Every dataset except "Blood cell pcHi-C" uses explicit 25000-wide bins;
# binx=0 (the function default) keeps the default bin count.
plotLinkLengths(
    uChData=uChData,
    uLinks=cuLinks,
    rChData=rChData,
    rLinks=crLinks,
    title=f"Link lengths of C3 with {deg}+ tissues in {U.DS}",
    rr=0.025e8,
    binx=0 if U.DS == "Blood cell pcHi-C" else 25000,
)

start chr6
97992 55680
80
93292 52209
162831356 max original link
162805021 max randomized link
3015 min original link
1477 min randomized link


<Figure size 1600x1000 with 0 Axes>

In [38]:
from numpy import log2

# BasesOfBases structures for the original and the randomized chromosome data (built in that order).
bobU, bobR = (BasesOfBases(chData) for chData in (uChData, rChData))

# The reduction degree is derived from the number of links in the original structure.
baseDeg = int(log2(len(bobU.links)))

# The same degree is applied to both structures so the results are comparable.
linksU, linksR = bobU.reduce(deg=baseDeg), bobR.reduce(deg=baseDeg)

# Real vs randomized length distribution of the links returned by reduce().
# Every dataset except "Blood cell pcHi-C" uses explicit 25000-wide bins;
# binx=0 (the function default) keeps the default bin count.
plotLinkLengths(
    uChData=uChData,
    uLinks=linksU,
    rChData=rChData,
    rLinks=linksR,
    title=f"Support S({baseDeg}) in real vs randomized dataset {U.DS}",
    rr=0.025e8,
    binx=0 if U.DS == "Blood cell pcHi-C" else 25000,
)






Bases 15 of bases calculated. 3098 bases found
Bases 15 of bases calculated. 1530 bases found
start chr6
3098 1530
80
2898 1425
65359729 max original link
30698939 max randomized link
2452 min original link
1540 min randomized link


<Figure size 1600x1000 with 0 Axes>

In [39]:
#plotLinkLengths(uChData=uChData, uLinks=cuLinks, rChData=uChData, rLinks=uChData.links, title="TrianglesVsLinks", rr=0.025e8) #plots randomized vs real link lengths of C3 with deg tissues
