In [None]:
import re
from argparse import Namespace
from collections import defaultdict
from functools import lru_cache
from itertools import count, islice

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.pyplot import subplots, figure, style, rc, rc_context, close
from tqdm import tqdm
from venn import venn, pseudovenn

In [None]:
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from sklearn.metrics import silhouette_score
from matplotlib.gridspec import GridSpec
%matplotlib inline

In [None]:
from scipy.cluster.hierarchy import cophenet
from scipy.stats import pearsonr, wilcoxon

In [None]:
from edgecaselib.formats import load_index, load_kmerscan
from edgecaselib.densityplot import interpret_arguments
from edgecaselib.util import natsorted_chromosomes
from pickle import dump, load
from os import path
from tempfile import NamedTemporaryFile
from subprocess import check_output, CalledProcessError
from pysam import AlignmentFile
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
from scipy.ndimage import uniform_filter1d
from matplotlib.patches import FancyArrowPatch, Rectangle
from matplotlib.lines import Line2D

In [None]:
def wilcoxon_dropna(df, a, b):
    """Compare columns `a` and `b` of `df` with a paired Wilcoxon test.

    Rows in which either column is missing are discarded before comparing.

    Returns a tuple `(yes, no, p)`:
      * yes -- number of remaining rows where `a < b`,
      * no  -- number of remaining rows where `a > b`,
      * p   -- second element of the result of `wilcoxon(a, b)`.
    If a `ValueError` is raised along the way, all three values are NaN.
    """
    paired = df[[a, b]].dropna()
    left, right = paired[a], paired[b]
    try:
        n_less = sum(left < right)
        n_greater = sum(left > right)
        pvalue = wilcoxon(left, right)[1]
    except ValueError:
        return np.nan, np.nan, np.nan
    return n_less, n_greater, pvalue

In [None]:
# Filter specification passed to `load_kmerscan` as its third argument when DENSITIES is built.
# NOTE(review): presumably [flags to require, flags to exclude, minimum quality] -- confirm against edgecaselib.
SAMFILTERS = [["is_q", "tract_anchor"], 0, 0]
# Index table returned by `load_index`; later cells query its "rname", "flag", "prime" and "pos" columns.
ecx = load_index("../../hg38ext.fa.ecx")

In [None]:
# Maps every subject ID to the name of the group ("trio") it is assigned to;
# its sorted keys also define the order of the subject columns in the cluster figures.
SUBJECT_TO_TRIO = {
    "HG001": "NA12878",
    "HG002": "AshkenazimTrio", "HG003": "AshkenazimTrio", "HG004": "AshkenazimTrio",
    "HG005": "ChineseTrio", "HG006": "ChineseTrio", "HG007": "ChineseTrio",
}
# Reference sequence names ("rname" values) whose distance matrices are kept in RAW_GLOBAL_LDS.
Q_CHROMS = ["chr7", "chr8", "chr11", "chr12", "14qtel_1-500K_1_12_12_rc", "chr15", "18qtel_1-500K_1_12_12_rc"]
# Number of density columns drawn per read by `plot_densities`
# (shorter sections are zero-padded, longer ones are truncated).
MAXLEN = 1600
# Root directory under which the notebook reads its inputs and writes its outputs.
DATA_DIR = "../../../data/datasets/2021"

In [None]:
def squarify(narrowform):
    """Turn a long-format table of pairwise distances into a square matrix.

    `narrowform` must have the columns "qname1", "qname2" and "relative_ld",
    with at most one row per (qname1, qname2) pair. The pivoted table is
    added to its own transpose, so a pair listed in one orientation only is
    mirrored; pairs that are not listed count as 0.

    Both axes are first expanded to the union of all names. Without this,
    input that lists each pair once and has no self-pairs produces different
    label sets for rows and columns, and adding the transpose then aligns on
    the union and fills nearly the whole result with NaN.

    Returns a DataFrame indexed by name on both axes.
    """
    triu = narrowform.pivot(index="qname1", columns="qname2", values="relative_ld")
    names = triu.index.union(triu.columns)
    triu_fillna = (
        triu.reindex(index=names, columns=names)
        # keep the axis names produced by `pivot`, regardless of how `reindex` treats them
        .rename_axis(index="qname1", columns="qname2")
        .fillna(0)
    )
    return triu_fillna.T + triu_fillna

# Long-format table of pairwise distances with an "rname" column plus the
# columns consumed by `squarify`.
distances_narrowform = pd.read_csv(
    f"{DATA_DIR}/PacBio/haplotypes/levenshtein-q_arm.tsv",
    sep="\t", escapechar="#",
)

# One square distance matrix per reference sequence listed in Q_CHROMS,
# in order of first appearance in the table.
RAW_GLOBAL_LDS = {}
for rname in distances_narrowform["rname"].drop_duplicates():
    if rname not in Q_CHROMS:
        continue
    rows_of_rname = distances_narrowform[distances_narrowform["rname"]==rname]
    RAW_GLOBAL_LDS[rname] = squarify(rows_of_rname.drop(columns="rname"))

In [None]:
KMERSCANNER_PKL = f"{DATA_DIR}/PacBio/kmerscanner-q_arm.pkl"
KMERSCANNER_DAT = f"{DATA_DIR}/PacBio/kmerscanner-q_arm.dat.gz"

# DENSITIES is parsed from the .dat.gz file only when no pickled copy exists;
# the freshly parsed object is then pickled so later runs can load it directly.
# NOTE(review): unpickling executes whatever the file contains -- only use a
# cache file that this notebook produced itself.
if not path.isfile(KMERSCANNER_PKL):
    DENSITIES = load_kmerscan(KMERSCANNER_DAT, True, SAMFILTERS, 10)
    with open(KMERSCANNER_PKL, mode="wb") as pkl:
        dump(DENSITIES, pkl)
else:
    with open(KMERSCANNER_PKL, mode="rb") as pkl:
        DENSITIES = load(pkl)

In [None]:
class GridFig():
    """A figure with a gapless GridSpec, to which axes are added one by one.

    Attributes:
        figure: the matplotlib figure; created without any axes.
        gs: GridSpec with one column per entry of `width_ratios` and one row
            per entry of `height_ratios`, with no spacing between cells.
    """

    def __init__(self, width_ratios, height_ratios, scale=1):
        """Create the figure and its grid.

        The figure size is `(sum(width_ratios), sum(height_ratios))`
        multiplied by `scale`.
        """
        # Create an empty figure directly. Asking `subplots` for a 0x0 grid
        # only to obtain a figure is rejected by matplotlib versions that
        # require positive grid dimensions.
        self.figure = figure(
            figsize=(sum(width_ratios)*scale, sum(height_ratios)*scale),
        )
        self.gs = GridSpec(
            ncols=len(width_ratios), wspace=0, width_ratios=width_ratios,
            nrows=len(height_ratios), hspace=0, height_ratios=height_ratios,
            figure=self.figure,
        )

    def subplot(self, gridspec_slice, aspect="auto", frame=False):
        """Add and return an axes occupying `gridspec_slice` of the grid.

        `frame` controls the axes frame:
          * False (default) -- the frame is switched off entirely;
          * True -- all four spines are left visible;
          * an iterable of spine names ("top", "right", "bottom", "left") --
            only the named spines stay visible.
        """
        ax = self.figure.add_subplot(gridspec_slice, aspect=aspect)
        if frame is False:
            ax.set(frame_on=False)
        elif frame is not True:
            # `frame is True` previously fell through to `set(True)` and crashed
            for spine in {"top", "right", "bottom", "left"} - set(frame):
                ax.spines[spine].set_visible(False)
        return ax

In [None]:
def plot_dendrogram(Z, gf):
    """Draw the dendrogram of linkage matrix `Z` in grid cell [0, 0] of `gf`.

    The tree is drawn left-oriented with thin black links, all ticks and
    axis labels are removed, and the y axis is reversed.
    """
    ax = gf.subplot(gf.gs[0,0])
    thin_lines = {"lines.linewidth": .5}
    with rc_context(thin_lines):
        dendrogram(
            Z, orientation="left", ax=ax,
            link_color_func=lambda link_id: "black",
        )
    y_first, y_second = ax.get_ylim()
    ax.set(
        xticks=[], xlabel=None,
        yticks=[], ylabel=None,
        ylim=(y_second, y_first),
    )

In [None]:
def plot_heatmap(data2d, gf, cmap="gray_r", vmax=.15):
    """Draw `data2d` as a heatmap in grid cell [0, 1] of `gf`.

    Colors span the range from 0 to `vmax`; no colorbar, ticks or axis
    labels are drawn.
    """
    color_options = dict(cmap=cmap, vmin=0, vmax=vmax, cbar=False)
    ax = gf.subplot(gf.gs[0,1])
    sns.heatmap(data2d, ax=ax, **color_options)
    ax.set(xticks=[], yticks=[], xlabel=None, ylabel=None)

In [None]:
def cluster(lds, metric="correlation", method="ward"):
    """Hierarchically cluster the square distance matrix `lds`.

    `lds` is condensed with `squareform` and passed to `linkage` together
    with `metric` and `method`.
    NOTE(review): scipy documents `metric` as ignored when `linkage` is given
    condensed distances, so it likely has no effect here -- confirm.

    Returns `(Z, data2d, dispatcher)`:
      * Z -- the linkage matrix;
      * data2d -- copy of `lds` with rows and columns in dendrogram leaf order;
      * dispatcher -- DataFrame indexed like `data2d` (index named "read")
        with one boolean column per subject, in sorted order. The subject is
        the second ":"-separated field of the read label.
    """
    condensed = squareform(lds)
    Z = linkage(condensed, metric=metric, method=method, optimal_ordering=False)
    leaf_order = dendrogram(Z, no_plot=True)["leaves"]
    data2d = lds.iloc[leaf_order, leaf_order].copy()
    dispatcher = pd.DataFrame(index=data2d.index)
    dispatcher.index.name = "read"
    subject_of_read = dispatcher.index.map(lambda label: label.split(":")[1])
    for subject in sorted(set(subject_of_read)):
        dispatcher[subject] = (subject_of_read==subject)
    return Z, data2d, dispatcher

In [None]:
def get_plottable_density_section(densities, chrom, motif, data2d, ecx):
    """Extract the density values of one motif, ordered like `data2d`.

    Rows of `densities[chrom]` with the requested "motif" are indexed by
    "name" and reindexed to `data2d.index` (reads without such a row become
    all-NaN). The first eight columns are then dropped, and the remaining
    column labels are converted to integers.

    With `motif=None` the "TTAGGG" rows are used, and every value is replaced
    by 1/3 where data is present and 0 where it is null.

    Only columns whose integer label is at or past the anchor are returned.
    The anchor is the "pos" of the first `ecx` row with this `chrom` as
    "rname", flag 0x4000 and prime 3.
    """
    as_background = motif is None
    wanted_motif = "TTAGGG" if as_background else motif
    chrom_densities = densities[chrom]
    motif_rows = chrom_densities[chrom_densities["motif"]==wanted_motif]
    section = motif_rows.set_index("name").reindex(data2d.index).iloc[:,8:].copy()
    if as_background:
        section = section.notnull().astype(int) / 3
    section.columns = section.columns.astype(int)
    is_anchor_row = (
        (ecx["rname"]==chrom) & (ecx["flag"]==0x4000) & (ecx["prime"]==3)
    )
    anchor = ecx.loc[is_anchor_row, "pos"].iloc[0]
    columns_past_anchor = list(section.columns[section.columns>=anchor])
    return section[columns_past_anchor]

In [None]:
def get_absentees(lds, densities, chrom, ecx):
    """Return the labels of `lds.index` that have no "TTAGGG" density data.

    A label counts as absent when every value in its row of the "TTAGGG"
    density section (see `get_plottable_density_section`) is null.
    """
    ttaggg_section = get_plottable_density_section(densities, chrom, "TTAGGG", lds, ecx)
    is_absent = ttaggg_section.isnull().all(axis=1)
    return is_absent.index[is_absent.values]

In [None]:
def section_to_RGB(ps, color, alpha_factor=1.2):
    """Convert a 2D array of values into an image of a single color.

    The first three entries of `color` fill the first three channels of
    every pixel; the fourth channel is `ps * alpha_factor`, capped at 1.

    Returns an array of shape `ps.shape + (4,)`.
    """
    red, green, blue = (np.full_like(ps, color[channel]) for channel in range(3))
    alpha = np.clip(ps*alpha_factor, a_min=None, a_max=1)
    channels_first = np.array([red, green, blue, alpha])
    # move the channel axis from the front to the back
    return np.transpose(channels_first, axes=(1, 2, 0))

In [None]:
def draw_fancy_arrow(
    y, start, end, ax, lw=.25,
    csty="angle3,angleA=45,angleB=-45",
    asty="Simple, tail_width=.25, head_width=2, head_length=3"
):
    """Add a grey arrow from `(start, y)` to `(end, y)` to `ax`.

    `csty` and `asty` are passed on as the connection style and arrow style
    of the patch, `lw` as its line width. The arrow is not clipped to the
    axes area.
    """
    arrow = FancyArrowPatch(
        posA=(start, y), posB=(end, y),
        arrowstyle=asty,
        connectionstyle=csty,
        color="#888", lw=lw, clip_on=False,
    )
    ax.add_patch(arrow)

In [None]:
# Marker color of every subject; subjects of the same trio share a color.
POPULATION_COLORS = {
    "HG001": "black",
    "HG002": "green", "HG003": "green", "HG004": "green",
    "HG005": "steelblue", "HG006": "steelblue", "HG007": "steelblue",
}

def plot_subjects(dispatcher, gf, s=10):
    """Draw one narrow column per subject marking which reads belong to it.

    Subjects are taken from SUBJECT_TO_TRIO in sorted order and occupy grid
    cells [0, 3], [0, 4], ... of `gf`. Each column has a grey vertical guide
    line; if the subject has a column in `dispatcher`, the positions of its
    True rows are marked with three horizontal markers of size `s`.
    The subject ID is the only x tick label, rotated by 90 degrees. Below the
    columns of "HG002" and "HG005", arrows are drawn from the two following
    columns back to it.
    """
    n_reads = len(dispatcher)
    for grid_column, subject in enumerate(sorted(SUBJECT_TO_TRIO), start=3):
        sax = gf.subplot(gf.gs[0,grid_column])
        sax.plot([0, 0], [0, n_reads], lw=.5, color="#888")
        if subject in dispatcher:
            is_from_subject = dispatcher[subject].reset_index(drop=True)
            positions = is_from_subject[is_from_subject].index
            for x in (-.1, 0, .1):
                sax.scatter(x=[x]*len(positions), y=positions, marker="_", s=s, color=POPULATION_COLORS[subject])
        sax.set(
            xticks=[0], xticklabels=[subject+"  "],
            yticks=[], xlabel=None, ylabel=None,
            xlim=(-.5, .5),
            ylim=(n_reads, -1),
        )
        for tick in sax.get_xticklabels():
            tick.set_rotation(90)
        if subject in {"HG002", "HG005"}:
            # arrows start one and two columns to the right (x is in this column's data units)
            for columns_away in (1, 2):
                draw_fancy_arrow(n_reads, columns_away, 0, sax)
        sax.tick_params(axis="both", which="both", length=0)

In [None]:
# RGB color of each density layer, drawn in this order (later layers on top).
# The `None` key is the background layer: `get_plottable_density_section`
# renders it as a constant 1/3 wherever "TTAGGG" data is present.
IMSHOW_PALETTE = {
    None: [.7, .7, .7],
    "TTAGGG": [.0, .4, .1],
    "TGAGGG": [1, 1, 0],
    "TTAGGGG": [.5, .9, 1],
}

def plot_densities(densities, chrom, data2d, ecx, gf, extent, bin_size=100):
    """Draw the motif densities of all reads in the last grid column of `gf`.

    For every entry of IMSHOW_PALETTE the density section (rows ordered like
    `data2d`) is zero-padded or truncated to MAXLEN columns, smoothed with a
    5-column moving average along each row, clipped to [0, 1] and drawn with
    `imshow` over `extent` as a single-color image whose alpha is 1.5 times
    the smoothed value (capped at 1).

    `bin_size` is the number of density columns between consecutive x ticks;
    ticks are labelled 0, 1, 2, ... It was previously accepted but ignored in
    favour of a hard-coded 100; the default reproduces that behavior.
    NOTE(review): the axis label says "Kbp", which presumes that `bin_size`
    columns span one Kbp -- confirm against the density bin width.
    """
    ax = gf.subplot(gf.gs[0,-1])
    # index of the last tick; computed once, and defined even if the palette is empty
    breakat = MAXLEN // bin_size
    for motif, color in IMSHOW_PALETTE.items():
        ps = get_plottable_density_section(densities, chrom, motif, data2d, ecx).values
        if ps.shape[1] < MAXLEN:
            ps = np.pad(ps, ((0, 0), (0, MAXLEN-ps.shape[1])))
        elif ps.shape[1] > MAXLEN:
            ps = ps[:,:MAXLEN]
        pa = section_to_RGB(np.clip(uniform_filter1d(ps, 5, 1), a_min=0.0, a_max=1.0), color, 1.5)
        ax.imshow(pa, extent=extent, interpolation="nearest")
    ticklabels = np.linspace(0, MAXLEN//bin_size, MAXLEN//bin_size+1).astype(int).astype(str)
    fullaxislen = len(ticklabels)
    ticklabels = ticklabels[:breakat+1]
    xmin, xmax = extent[:2]
    ax.set(
        xticks=np.linspace(xmin, xmax, MAXLEN//bin_size+1)[:breakat+1],
        xticklabels=ticklabels,
        xlabel="Kbp of telomeric tract",
        yticks=[], ylabel=None,
    )
    ax.tick_params(axis="both", which="both", length=0)
    ax.tick_params(axis="x", which="both", length=3)
    # baseline under the labelled part of the axis (its x span is a fraction of the axes width)
    ax.axhline(0, 0, (breakat+1)/fullaxislen, lw=1, c="black")

In [None]:
@lru_cache(maxsize=None)
def convname(cn):
    """Convert a contig name into a short arm label ending in "q".

    If `cn` starts with digits, those digits are used (e.g. "14qtel_..."
    gives "14q"); otherwise whatever follows the first "chr" is used
    (e.g. "chr7" gives "7q"). Results are cached.
    """
    leading_digits = re.search(r'^\d+', cn)
    if leading_digits is None:
        return cn.split("chr")[1] + "q"
    return leading_digits.group() + "q"

In [None]:
def process_lds(raw_global_lds, chrom, densities, ecx, no_plot=False, scale=.2):
    """Cluster the reads of one contig and, unless `no_plot`, draw the figure.

    Reads without "TTAGGG" density data (see `get_absentees`) are removed from
    the distance matrix `raw_global_lds[chrom]` before it is clustered with
    `cluster`. The input matrix itself is left untouched.

    The figure is a single-row grid: dendrogram, heatmap, a spacer, seven
    subject columns, and the motif density panel. Its height grows with the
    number of reads. The y label of the first axes names the contig.

    Returns `(lds, Z, data2d, dispatcher, gf)`, where `gf` is None when
    `no_plot` is true.
    """
    raw_lds = raw_global_lds[chrom]
    absentees = get_absentees(raw_lds, densities, chrom, ecx)
    lds = raw_lds.drop(index=absentees, columns=absentees)
    Z, data2d, dispatcher = cluster(lds, metric="euclidean", method="ward")
    if no_plot:
        return lds, Z, data2d, dispatcher, None
    h, w = 6*len(lds)/50, 30
    gf = GridFig([h/3,h,.3]+[.85]*7+[w], [h], scale=scale)
    plot_dendrogram(Z, gf=gf)
    plot_heatmap(data2d, gf=gf)
    plot_subjects(dispatcher, gf=gf, s=7)
    plot_densities(densities, chrom, data2d, ecx, gf=gf, extent=[0,w,0,h])
    # long contig names are abbreviated in the label
    shown_chrom = chrom[:6]+"…" if len(chrom) > 11 else chrom
    name = "{} ({})".format(convname(chrom), shown_chrom)
    gf.figure.get_axes()[0].set_ylabel(name, fontsize=13)
    return lds, Z, data2d, dispatcher, gf

In [None]:
def cophenetic_correlation(lds, Z):
    """Correlate the distances in `lds` with the cophenetic distances of `Z`.

    Returns `(r, p)` from `pearsonr`. A p-value below 5e-324 is raised to
    5e-324, because a p-value of exactly zero is just a rounding issue.
    """
    observed = squareform(lds)
    implied_by_tree = cophenet(Z)
    r, p = pearsonr(observed, implied_by_tree)
    smallest_positive = 5e-324
    if smallest_positive > p:
        p = smallest_positive
    return r, p

In [None]:
def fixup_labels(gf, chrom):
    """Adjust titles and x labels of a cluster figure according to its contig.

    For "chr7", panel titles are added to the axes at positions 1, 5 and -1.
    For every contig except "18qtel_1-500K_1_12_12_rc", x tick labels and
    x labels are removed from all axes but the last, and the x label is
    removed from the last.
    """
    axes = gf.figure.get_axes()
    if chrom == "chr7":
        titles = (
            (1, "Pairwise relative\nLevenshtein distances", dict(fontsize=13, loc="right")),
            (5, "Subjects", dict(fontsize=13)),
            (-1, "Motif densities", dict(loc="left", fontsize=13)),
        )
        for position, text, options in titles:
            axes[position].set_title(text, **options)
    if chrom == "18qtel_1-500K_1_12_12_rc":
        return
    for ax in axes[:-1]:
        ax.set(xticklabels=[], xlabel=None)
    axes[-1].set(xlabel=None)

In [None]:
def read_to_category(dispatcher, pos):
    """Map every label of `dispatcher.index` to one of its ":"-separated fields.

    Returns a Series indexed like `dispatcher` whose values are field number
    `pos` of each label. Elsewhere in this notebook field 0 is used as the
    trio and field 1 as the subject.
    """
    def pick_field(label):
        return label.split(":")[pos]

    return pd.Series(
        data=dispatcher.index.map(pick_field),
        index=dispatcher.index,
    )

In [None]:
def count_subtrees(dispatcher):
    """Count, per trio, the runs of consecutive reads from that trio.

    Reads are visited in the order of `dispatcher`; a new run starts at the
    first read and wherever the trio differs from that of the previous read.

    Returns a `defaultdict(int)` mapping trio to its number of runs.
    """
    trios = list(read_to_category(dispatcher, 0))
    trio_runs = defaultdict(int)
    for position, trio in enumerate(trios):
        starts_new_run = (position == 0) or (trio != trios[position-1])
        if starts_new_run:
            trio_runs[trio] += 1
    return trio_runs

In [None]:
def reindex_to(row, rtc, cat):
    """Entries of `row` whose labels have category `cat` in `rtc`, without NaNs.

    `rtc` is a Series mapping labels to categories.
    """
    labels_in_category = rtc[rtc==cat].index
    return row.reindex(labels_in_category).dropna()


def reindex_in(row, rtc):
    """Entries of `row` that share the category of `row.name`, without NaNs."""
    own_category = rtc[row.name]
    return row.reindex(rtc[rtc==own_category].index).dropna()


def reindex_out(row, rtc):
    """Entries of `row` whose category differs from that of `row.name`, without NaNs."""
    own_category = rtc[row.name]
    return row.reindex(rtc[rtc!=own_category].index).dropna()

In [None]:
def get_closest_distances(lds, dispatcher):
    """For every read, find its smallest distance to three groups of reads.

    Returns a DataFrame indexed like `lds` with the columns:
      * "subject"  -- closest other read of the same subject;
      * "trio"     -- closest read of a different subject in the same trio;
      * "outgroup" -- closest read of a different trio.
    A value is NaN when the corresponding group has no reads.
    """
    read_to_trio = read_to_category(dispatcher, 0)
    read_to_subject = read_to_category(dispatcher, 1)

    def closest_by_relation(row):
        same_subject = reindex_in(row, read_to_subject).drop(index=row.name)
        other_subjects = reindex_out(row, read_to_subject)
        same_trio = reindex_in(other_subjects, read_to_trio)
        other_trios = reindex_out(row, read_to_trio)
        return pd.Series({
            "subject": same_subject.min(),
            "trio": same_trio.min(),
            "outgroup": other_trios.min(),
        })

    return lds.apply(closest_by_relation, axis=1)

In [None]:
def subject_to_subject_lds(lds, dispatcher, a, b):
    """Return the block of `lds` with reads of subject `a` as rows and `b` as columns."""
    read_to_subject = read_to_category(dispatcher, 1)
    reads_of_a = read_to_subject[read_to_subject==a].index
    reads_of_b = read_to_subject[read_to_subject==b].index
    return lds.loc[reads_of_a, reads_of_b]

In [None]:
def get_closest_family_distances(lds, dispatcher, **kwargs):
    target, test, control = list(kwargs)
    return pd.DataFrame({
        f"{test} to {target}": subject_to_subject_lds(lds, dispatcher, kwargs[test], kwargs[target]).min(axis=1),
        f"{test} to {control}": subject_to_subject_lds(lds, dispatcher, kwargs[test], kwargs[control]).min(axis=1),
    })

In [None]:
def process_distances(lds, dispatcher):
    """Compute closest-distance tables and their Wilcoxon comparisons.

    Returns a flat tuple of 26 items:
      * items 0-4 -- the closest-distance table of `get_closest_distances`,
        followed by the father- and mother-centered family tables of the
        Ashkenazim trio and then of the Chinese trio;
      * items 5-25 -- seven `(yes, no, p)` triples from `wilcoxon_dropna`:
        subject vs trio, subject vs outgroup, trio vs outgroup, and then one
        triple per family table in the order above.
    """
    closest_distances = get_closest_distances(lds, dispatcher)
    ashkenazim_from_father = get_closest_family_distances(lds, dispatcher, son="HG002", father="HG003", mother="HG004")
    ashkenazim_from_mother = get_closest_family_distances(lds, dispatcher, son="HG002", mother="HG004", father="HG003")
    chinese_from_father = get_closest_family_distances(lds, dispatcher, son="HG005", father="HG006", mother="HG007")
    chinese_from_mother = get_closest_family_distances(lds, dispatcher, son="HG005", mother="HG007", father="HG006")
    comparisons = [
        (closest_distances, "subject", "trio"),
        (closest_distances, "subject", "outgroup"),
        (closest_distances, "trio", "outgroup"),
        (ashkenazim_from_father, "father to son", "father to mother"),
        (ashkenazim_from_mother, "mother to son", "mother to father"),
        (chinese_from_father, "father to son", "father to mother"),
        (chinese_from_mother, "mother to son", "mother to father"),
    ]
    wilcoxon_results = []
    for table, a, b in comparisons:
        wilcoxon_results.extend(wilcoxon_dropna(table, a, b))
    return (
        closest_distances,
        ashkenazim_from_father, ashkenazim_from_mother,
        chinese_from_father, chinese_from_mother,
        *wilcoxon_results,
    )

In [None]:
# Per-contig summary: subtree counts per trio, cophenetic correlation (cr)
# with its p-value (cp), and the p-values of the seven Wilcoxon comparisons.
stats = pd.DataFrame(columns=[
    "NA12878", "AshkenazimTrio", "ChineseTrio",
    "cr", "cp",
    "s2t_p", "s2o_p", "t2o_p", "aff_p", "afm_p", "cff_p", "cfm_p",
])

# Per-contig distance tables, collected for the cells further down.
cd_list, aff_list, afm_list, cff_list, cfm_list = [], [], [], [], []
NO_PLOT = False

for chrom in tqdm(RAW_GLOBAL_LDS):
    try:
        lds, Z, data2d, dispatcher, gf = process_lds(RAW_GLOBAL_LDS, chrom, DENSITIES, ecx, no_plot=NO_PLOT, scale=.2)
    except ValueError: # too few observations
        continue
    try:
        cr, cp = cophenetic_correlation(lds, Z)
    except ValueError: # too few observations
        cr, cp = np.nan, np.nan
    if not NO_PLOT:
        try:
            fixup_labels(gf, chrom)
            gf.figure.savefig(
                f"{DATA_DIR}/PacBio/haplotypes/clusters-q_arm/"+chrom+".pdf", bbox_inches="tight",
            )
        finally:
            # release the figure even if labelling or saving failed
            close(gf.figure)
    # `process_distances` returns five tables followed by seven (yes, no, p)
    # triples; only the p-values are kept. Slicing is used instead of
    # unpacking into `_`, because assigning `_` at notebook top level shadows
    # IPython's last-output variable.
    distance_results = process_distances(lds, dispatcher)
    cd, aff, afm, cff, cfm = distance_results[:5]
    s2t_p, s2o_p, t2o_p, aff_p, afm_p, cff_p, cfm_p = distance_results[7::3]
    cd_list.append(cd)
    aff_list.append(aff)
    afm_list.append(afm)
    cff_list.append(cff)
    cfm_list.append(cfm)
    stats.loc[chrom] = [
        np.nan, np.nan, np.nan,
        cr, cp,
        s2t_p, s2o_p, t2o_p, aff_p, afm_p, cff_p, cfm_p,
    ]
    # fill in the subtree counts of the trios that occur on this contig
    for trio, subtree_count in count_subtrees(dispatcher).items():
        stats.loc[chrom, trio] = subtree_count

In [None]:
# Stand-alone legend for the cluster figures, in three panels:
# distance shading, subject columns, and motif density colors.
legend, axs = subplots(figsize=(6.7, 2.5), ncols=3, gridspec_kw=dict(width_ratios=(1, 4.5, 3.4), wspace=0))

# Panel 0: vertical gradient bar for the relative distance shading.
axs[0].imshow(np.vstack([np.linspace(0, 1, 256)]).T, cmap="Greys_r", aspect="auto")
axs[0].set(xticks=[], yticks=[0, 255])
# raw string: "\g" is not a valid escape sequence in a regular string literal
axs[0].set_yticklabels([r"$\geq{}0.15$", "0"], fontsize=14)
axs[0].text(x=-.2, y=128, s="Relative\ndistance\n", rotation=90, ha="right", va="center", fontsize=17)

# Panel 1: one guide line with sample markers per subject, in sorted subject order.
for x, subject in enumerate(sorted(SUBJECT_TO_TRIO)):
    axs[1].plot([x, x], [0, 1], color="#888", lw=1.5)
    axs[1].scatter([x]*4, np.linspace(.1, .7, 4)+x/30, color=POPULATION_COLORS[subject], marker="_", s=125)

axs[1].set(xlim=(-4.5, 8.5), xticks=[], yticks=[])
twiny = axs[1].twiny()
twiny.set(xlim=(-4.5, 8.5), xticks=[])
for tick in twiny.get_xticklabels():
    tick.set_rotation(80)
twiny.tick_params(axis="both", which="both", length=0)
for spine in "top", "bottom":
    axs[1].spines[spine].set_visible(False)
    twiny.spines[spine].set_visible(False)
axs[1].text(x=-2, y=.5, s="Assignment of\nreads to subjects", rotation=90, fontsize=16, ha="center", va="center")
axs[1].text(x=-1.5, y=1.1, s="/ populations", rotation=90, ha="center", va="bottom", fontsize=16)

# Arrows below the guide lines, pointing from positions 2 and 3 to position 1
# and from positions 5 and 6 to position 4.
csty1 = "angle3,angleA=80,angleB=-60"
csty2 = "angle3,angleA=60,angleB=-70"
asty = "Simple, tail_width=.25, head_width=7, head_length=5"
draw_fancy_arrow(-0.02, 2.1, 1, axs[1], lw=1, csty=csty1)
draw_fancy_arrow(-0.02, 3.1, 1, axs[1], lw=1, asty=asty, csty=csty2)
draw_fancy_arrow(-0.02, 5.1, 4, axs[1], lw=1, csty=csty1)
draw_fancy_arrow(-0.02, 6.1, 4, axs[1], lw=1, asty=asty, csty=csty2)

# Caption for the arrows (chr(0x2190) is a leftwards arrow), tied to both
# arrow groups with dashed lines.
axs[1].text(x=2.95, y=-.35, s="child{}parent\nrelatedness".format(chr(0x2190)), va="center", ha="center", fontsize=16)
line = Line2D((2.5, 4.1), (-.25, -.12), lw=1, ls="--", color="#888")
line.set_clip_on(False)
axs[1].add_line(line)
line = Line2D((2.5, 2.1), (-.25, -.16), lw=1, ls="--", color="#888")
line.set_clip_on(False)
axs[1].add_line(line)

# Colored bars and names above the subject columns, one per population.
line = Line2D((-.2, .2), (1.1, 1.1), lw=4, color=POPULATION_COLORS["HG001"])
line.set_clip_on(False)
axs[1].add_line(line)
axs[1].text(x=.4, y=1.1, s="  Utah", ha="center", va="bottom", rotation=50, fontsize=17, color=POPULATION_COLORS["HG001"])

line = Line2D((.8, 3.2), (1.1, 1.1), lw=4, color=POPULATION_COLORS["HG002"])
line.set_clip_on(False)
axs[1].add_line(line)
axs[1].text(x=3.7, y=1.1, s="  Ashkenazim", ha="center", va="bottom", rotation=50, fontsize=17, color=POPULATION_COLORS["HG002"])

line = Line2D((3.8, 6.2), (1.1, 1.1), lw=4, color=POPULATION_COLORS["HG005"])
line.set_clip_on(False)
axs[1].add_line(line)
axs[1].text(x=6, y=1.1, s="  Chinese", ha="center", va="bottom", rotation=50, fontsize=17, color=POPULATION_COLORS["HG005"])

# Panel 2: color swatches for the motif density layers.
axs[2].add_patch(Rectangle((0,3), 1.5, .65, facecolor="#119933", edgecolor="black"))
axs[2].text(x=1.75, y=3.25, s="TTAGGG", fontsize=15, va="center")
axs[2].add_patch(Rectangle((0,2), 1.5, .65, facecolor="#EEDD77", edgecolor="black"))
axs[2].text(x=1.75, y=2.25, s="TGAGGG", fontsize=15, va="center")
axs[2].add_patch(Rectangle((0,1), 1.5, .65, facecolor="#88DFEF", edgecolor="black"))
axs[2].text(x=1.75, y=1.25, s="TTAGGGG", fontsize=15, va="center")
axs[2].add_patch(Rectangle((0,0), 1.5, .65, facecolor="#DDDDDD", edgecolor="black"))
axs[2].text(x=1.75, y=0.25, s="background", fontsize=13, va="center")
axs[2].set(xticks=[], yticks=[], xlim=(-.4, 4.8), ylim=(-0.7, 4.4))
axs[2].set_title("Motif densities", fontsize=17)

# Outer border around the whole legend.
legend.add_artist(Rectangle((.0, -.2), .94, 1.73, edgecolor="black", facecolor="none"))

legend.savefig(f"{DATA_DIR}/PacBio/haplotypes/clusters-q_arm/legend.pdf", bbox_inches="tight")

In [None]:
# When true, p-values of .05 and above are shown as "ns" rather than as a number.
PRINT_NS = False


def _format_small_pval(p):
    """Format a p-value below .05 in scientific notation, capped at "<1.0e-300"."""
    if p < 1e-300:
        return "<1.0e-300"
    return format(p, ".1e")


# The formatter is chosen once, from the value of PRINT_NS at this point.
if PRINT_NS:
    def format_pval(p):
        """Format a p-value for display; values of .05 and above become "ns"."""
        if p >= .05:
            return "ns"
        return _format_small_pval(p)
else:
    def format_pval(p):
        """Format a p-value for display; values of .05 and above get two decimals."""
        if p >= .05:
            return format(p, ".2f")
        return _format_small_pval(p)

In [None]:
# Contigs in natural chromosome order. The first three columns of `stats`
# are the per-trio subtree counts.
st = stats.reindex(natsorted_chromosomes(stats.index)).copy()
st.index.name = "contig"
all_subtree_counts = st.iloc[:,:3].values.flatten()
tc = all_subtree_counts[~np.isnan(all_subtree_counts)]
# largest count of any trio on any contig, printed together with the trio's name
max_count_per_trio = stats.iloc[:,:3].max(axis=0).sort_values()
print("Max subtree count:", max_count_per_trio.iloc[[-1]].to_string())
print("Median subtree count:", np.median(tc))

In [None]:
# All columns of `stats` after the three subtree counts, in natural contig
# order, with the contig name as a regular column and a short chromosome
# label in front of it.
st = (
    stats.reindex(natsorted_chromosomes(stats.index))
    .rename_axis(index="contig")
    .iloc[:,3:]
    .reset_index()
)
st.insert(loc=0, column="chromosome", value=st["contig"].apply(convname))

# Cophenetic correlation table: Bonferroni-adjusted p-values, formatted for display.
coph = st.iloc[:,:4].copy()
adjusted_cp = multipletests(coph["cp"], method="bonferroni")[1]
coph["cp"] = adjusted_cp
coph["r"] = coph["cr"].apply(lambda r: format(r, ".2f"))
coph["p"] = coph["cp"].apply(format_pval)
coph.drop(columns=["cr", "cp"])

In [None]:
# Bundle the per-contig distance tables into one object and pickle it.
Q = Namespace(
    cd_list=cd_list,
    aff_list=aff_list, afm_list=afm_list,
    cff_list=cff_list, cfm_list=cfm_list,
)
with open("for_wilcoxon-q_arm.pkl", mode="wb") as pkl:
    dump(Q, pkl)

In [None]:
cd_all = pd.concat(cd_list)
# Reads whose closest same-subject read is farther away than their closest outgroup read.
# NOTE(review): the denominator 3729 is hard-coded; presumably the total number of reads -- confirm.
farther_from_own_subject = cd_all[cd_all["subject"]>cd_all["outgroup"]]
print(len(farther_from_own_subject), len(farther_from_own_subject)/3729)
# Reads whose closest same-subject read is at least twice as far as their closest outgroup read.
inter_reads = cd_all[cd_all["subject"]>=cd_all["outgroup"]*2].index
subject_of_inter_read = inter_reads.map(lambda s: s.split(":")[1])
inter_dispatcher = pd.DataFrame(index=inter_reads, data={"subject": subject_of_inter_read})
print(len(inter_dispatcher), len(inter_dispatcher)/3729)
inter_dispatcher["subject"].value_counts()

In [None]:
# Look up, for every read in `inter_dispatcher`, the contig whose distance
# matrix contains it. If a read occurs in several matrices, the last contig
# wins; reads found in none map to NaN.
#
# The columns are assigned in one step each. Pre-filling them with float NaN
# and then writing strings cell by cell sets values of an incompatible dtype,
# which pandas has deprecated. The comprehension also keeps its loop
# variables local, so the notebook-level `lds` and `chrom` are not rebound.
read_to_rname = {
    name: rname
    for rname, rname_lds in RAW_GLOBAL_LDS.items()
    for name in rname_lds.index
}
rname_of_inter_read = inter_dispatcher.index.map(read_to_rname)
inter_dispatcher["chromosome"] = rname_of_inter_read.map(convname)
inter_dispatcher["rname"] = rname_of_inter_read

In [None]:
# Number of reads of `inter_dispatcher` per subject (rows) and chromosome (columns).
inter_counts = inter_dispatcher.groupby(["subject", "chromosome"], as_index=False).count().pivot(
    index="subject", columns="chromosome", values="rname",
)
inter_counts = inter_counts[natsorted_chromosomes(inter_counts.columns)]
# `DataFrame.applymap` is deprecated in favour of `DataFrame.map`;
# use `map` where it exists and fall back to `applymap` on older pandas.
apply_elementwise = getattr(inter_counts, "map", None) or inter_counts.applymap
# show counts as integers and leave cells without reads empty
inter_counts = apply_elementwise(lambda x: "" if np.isnan(x) else str(int(x)))
inter_counts