In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from matplotlib.pyplot import subplots, style, rc
from tqdm import tqdm
from venn import venn, pseudovenn
from collections import defaultdict
from itertools import count, islice
from functools import lru_cache
from pysam import AlignmentFile

In [2]:
%matplotlib inline

In [3]:
# Subject IDs grouped under the trio name they belong to.
_TRIO_SUBJECTS = {
    "NA12878": ("HG001",),
    "AshkenazimTrio": ("HG002", "HG003", "HG004"),
    "ChineseTrio": ("HG005", "HG006", "HG007"),
}

# Flat lookup: subject ID -> trio name, in ascending subject order.
SUBJECT_TO_TRIO = {
    subject_id: trio_name
    for trio_name, subject_ids in _TRIO_SUBJECTS.items()
    for subject_id in subject_ids
}

In [4]:
def _chromosome_rank(chromosome):
    """Return a numeric sort key for a 'chrN' name; 'chrX' is ranked last (999)."""
    if chromosome == "chrX":
        return 999
    # strip the three-character "chr" prefix and order by the remaining number
    return int(chromosome[3:])


# Load the tab-separated index, skipping its first line.
# NOTE(review): escapechar="#" presumably keeps a leading '#' on the header
# line out of the first column name -- confirm against the file.
# Only rows with flag == 16384 (0x4000) and blacklist == '-' are kept; columns
# not used below are dropped; rows end up ordered by chromosome number.
ecx = (
    pd.read_csv("assets/hg38ext.fa.ecx", sep="\t", skiprows=1, escapechar="#")
    .query("flag==16384")
    .query("blacklist=='-'")
    .drop(columns=["entry", "pos+1", "main_rname", "flag", "link", "blacklist", "class"])
    .assign(sorter=lambda frame: frame["chromosome"].apply(_chromosome_rank))
    .sort_values(by="sorter")
    .drop(columns="sorter")
)

In [5]:
def load_bam(filename):
    """Collect alignment coordinates of tract-anchor reads, split by arm.

    Iterates over every record of the alignment file at `filename` and keeps
    those that carry the 0x4000 flag bit (tract_anchor), have a sequence, and
    have none of the bits in mask 3844 set.

    Returns a pair `(p_arm, q_arm)` of `defaultdict(list)` objects keyed by
    reference name. Each value is a list with one entry per kept read:
    `[reference_start, query_alignment_start, reference_end,
    query_alignment_end, sequence_length]`. Reads with the 0x8000 flag bit
    (is_q) go to `q_arm`, all others to `p_arm`.
    """
    tract_anchor_bit = 0x4000  # tract_anchor
    q_arm_bit = 0x8000  # is_q
    # 3844 == 4 + 256 + 512 + 1024 + 2048, i.e. the SAM bits for unmapped,
    # secondary, QC-fail, duplicate and supplementary records
    excluded_bits = 3844

    p_arm = defaultdict(list)
    q_arm = defaultdict(list)
    with AlignmentFile(filename) as bam:
        for entry in bam:
            flag = entry.flag
            if not flag & tract_anchor_bit:
                continue
            sequence = entry.seq
            if sequence is None:
                continue
            if flag & excluded_bits:
                continue
            target = q_arm if flag & q_arm_bit else p_arm
            target[entry.reference_name].append([
                entry.reference_start, entry.query_alignment_start,
                entry.reference_end, entry.query_alignment_end,
                len(sequence),
            ])
    return p_arm, q_arm

In [6]:
def arm_to_counts(arm, name):
    """Count the reads stored under each reference name.

    `arm` maps a reference name to a collection of reads. The result is a
    Series called `name`, indexed by reference name, whose values are the
    sizes of those collections.
    """
    reads_per_rname = dict(zip(arm.keys(), map(len, arm.values())))
    return pd.Series(reads_per_rname, name=name)

In [7]:
def _combine_counts(per_subject):
    """Join per-subject count Series into one integer table with an 'rname' column.

    Each Series becomes a column; reference names missing for a subject are
    filled with 0 before the cast to int.
    """
    return (
        pd.concat(per_subject, axis=1, sort=False)
        .fillna(0)
        .astype(int)
        .rename_axis("rname")
        .reset_index()
    )


# One count Series per subject, collected separately for each arm.
p_counts, q_counts = [], []

for subject, trio in SUBJECT_TO_TRIO.items():
    bam_path = f"PacBio/{trio}/{subject}/tailpuller.bam"
    p_arm, q_arm = load_bam(bam_path)
    p_counts.append(arm_to_counts(p_arm, subject))
    q_counts.append(arm_to_counts(q_arm, subject))

raw_p_counts = _combine_counts(p_counts)
raw_q_counts = _combine_counts(q_counts)

In [8]:
def _index_by_arm(raw_counts, prime, arm):
    """Attach chromosome names to `raw_counts` and index it by (arm, chromosome, rname).

    Chromosome names come from the `ecx` rows whose `prime` column equals
    `prime`, joined on the shared `rname` column; rows are ordered by the
    chromosome name.
    """
    anchors = ecx.loc[ecx["prime"] == prime, ["rname", "chromosome"]]
    return (
        pd.merge(raw_counts, anchors)
        .sort_values(by="chromosome")
        .assign(arm=arm)
        .set_index(["arm", "chromosome", "rname"])
    )


p_counts = _index_by_arm(raw_p_counts, prime=5, arm="p")
q_counts = _index_by_arm(raw_q_counts, prime=3, arm="q")

# the merge must keep the row count unchanged
assert len(p_counts) == len(raw_p_counts)
assert len(q_counts) == len(raw_q_counts)

# per-arm totals are appended as an extra row of each table
p_total_key = ("p", "total", "")
q_total_key = ("q", "total", "")
p_counts.loc[p_total_key] = p_counts.sum()
q_counts.loc[q_total_key] = q_counts.sum()

# stack both arms, then add the grand-total row and the per-row total column
counts = pd.concat([p_counts, q_counts])
counts.loc[("total", "", "")] = p_counts.loc[p_total_key] + q_counts.loc[q_total_key]

counts["total"] = counts.sum(axis=1)
counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HG001,HG002,HG003,HG004,HG005,HG006,HG007,total
arm,chromosome,rname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
p,chr10,chr10,2,12,1,5,2,5,4,31
p,chr12,chr12,10,21,16,8,6,16,6,83
p,chr16,chr16,8,2,0,12,0,3,0,25
p,chr17,17ptel_1_500K_1_12_12,0,9,1,7,1,3,8,29
p,chr18,chr18,34,94,46,22,21,36,68,321
p,chr19,19ptel_1-500K_1_12_12,0,7,0,0,2,0,5,14
p,chr2,chr2,10,63,18,25,6,12,11,145
p,chr5,chr5,18,61,14,12,11,10,31,157
p,chr8,chr8,0,0,0,4,0,0,3,7
p,chr9,chr9,9,26,17,16,4,59,73,204


In [9]:
# Write the combined counts table (with its index levels) to a tab-separated file.
counts.to_csv("Table-S1-coverage.tsv", sep="\t")