In [70]:
import glob
import os
import numpy as np
import pandas as pd
import scienceplots
import seaborn as sns
import yaml
from matplotlib import pyplot as plt
from tqdm import tqdm
from scipy import stats
from matplotlib.ticker import PercentFormatter
import marsilea as ma
import marsilea.plotter as mp
from pybedtools import BedTool
# Apply the "science" and "nature" matplotlib style sheets to every figure below
# (presumably these are registered by the `scienceplots` import above -- confirm).
plt.style.use(["science", "nature"])
import pickle

# Notebook-wide matplotlib defaults: small tick/axis/legend fonts, thin lines and
# hatches, and no tick marks on the top and right sides of the axes.
plt.rcParams.update(
    {
        "xtick.labelsize": 5,
        "ytick.labelsize": 5,
        "axes.labelsize": 6,
        "xtick.top": False,
        "ytick.right": False,
        "lines.linewidth": 0.5,
        "legend.fontsize": 6,
        "hatch.linewidth": 0.5,
    }
)

# Short protocol identifier -> label shown in figures. The last three keys are
# spelling variants that all resolve to the same "Annotated PAS" label.
protocol_map = dict(
    Visium="10X Visium",
    VisiumHD="10X Visium HD",
    Chromium="10X Chromium",
    Dropseq="Drop-seq",
    Stereoseq="Stereo-seq",
    Slideseq="Slide-seq V2",
    SpatialTranscriptomics="ST",
    Microwell="Microwell-seq",
    **dict.fromkeys(("annotation", "Annotation", "anno"), "Annotated PAS"),
)

# Short protocol identifier -> technology class of that protocol.
type_map = {
    protocol_key: ("Spatial transcriptome" if is_spatial else "scRNA-seq")
    for protocol_key, is_spatial in (
        ("Visium", True),
        ("VisiumHD", True),
        ("Chromium", False),
        ("Dropseq", False),
        ("Stereoseq", True),
        ("Slideseq", True),
        ("SpatialTranscriptomics", True),
        ("Microwell", False),
    )
}
# Order in which protocol labels are laid out on categorical axes.
# Note: "10X Visium HD" (a value of protocol_map) is not part of this list.
order = [
    "10X Chromium",
    "Drop-seq",
    "Microwell-seq",
    "10X Visium",
    "Stereo-seq",
    "Slide-seq V2",
    "ST",
]
# Alternative order kept for reference:
# order = ["10X Chromium", "Drop-seq", "Microwell-seq", "10X Visium", "10X Visium HD","Stereo-seq", "Slide-seq V2", "Spatial Transcriptomics"]

# Hex colours used throughout the notebook, wrapped as a seaborn palette.
color = [
    "#386b98", "#269a51", "#edaa4d", "#d34123",
    "#7e648a", "#454545", "#929292",
]
palette = sns.color_palette(color, n_colors=len(color))

# Millimetre -> inch factor: multiply a length in mm by `mm` to get inches.
mm = 1 / 25.4

In [48]:
# Paths of all files in the cs directory whose name ends with "multi_cs.bed".
raw_cs_list = list(glob.iglob("../../data/int_data/data/cs/*multi_cs.bed"))


In [49]:
# For every raw BED file, count its records and those of its two sibling files
# ("*_rm_pa.bed" and "*_rm_pa_pt.bed"), producing one dict per sample.
result = []
for bed_path in raw_cs_list:
    name_fields = bed_path.split("/")[-1].split("_")
    # The sample id is the first four "_"-separated fields of the file name;
    # the first field alone is the protocol.
    sample = "_".join(name_fields[:4])
    protocol = name_fields[0]
    n_raw, n_rm_pa, n_rm_pa_pt = (
        len(BedTool(bed_path.replace(".bed", suffix)))
        # Replacing ".bed" with ".bed" leaves the raw path unchanged.
        for suffix in (".bed", "_rm_pa.bed", "_rm_pa_pt.bed")
    )
    result.append(
        dict(
            sample=sample,
            protocol=protocol,
            counts=n_raw,
            rm_pa_counts=n_rm_pa,
            rm_pa_pt_counts=n_rm_pa_pt,
        )
    )


In [None]:
# Per-protocol summary of the record counts gathered in `result`.
#   pa_counts    = records present in the raw file but not in the "_rm_pa" file
#   pt_counts    = records present in the "_rm_pa" file but not in the "_rm_pa_pt" file
#   valid_counts = records left in the "_rm_pa_pt" file
counts_df = pd.DataFrame(result).assign(
    pa_counts=lambda df: df["counts"] - df["rm_pa_counts"],
    pt_counts=lambda df: df["rm_pa_counts"] - df["rm_pa_pt_counts"],
    valid_counts=lambda df: df["rm_pa_pt_counts"],
)
counts_df = (
    counts_df
    # numeric_only=True keeps the string "sample" column out of the aggregation;
    # without it pandas >= 2 concatenates the sample names of each protocol.
    .groupby("protocol")
    .sum(numeric_only=True)
    # Proportions are computed on the pooled counts of each protocol.
    .assign(
        pa_proportion=lambda df: df["pa_counts"] / df["counts"],
        pt_proportion=lambda df: df["pt_counts"] / df["counts"],
        valid_proportion=lambda df: df["valid_counts"] / df["counts"],
    )
    .reset_index()
)
# Replace protocol identifiers by display labels and order rows by `order`.
# Labels that are not listed in `order` become NaN and sort last.
counts_df["protocol"] = pd.Categorical(
    counts_df["protocol"].map(protocol_map), categories=order
)
counts_df = counts_df.sort_values("protocol")

In [56]:
# Cumulative proportions used as bar lengths for the overlaid bar chart:
# everything, everything minus polyA, and everything minus polyA and polyT.
counts_df = counts_df.assign(
    total_proportion=1,
    rm_pa_proportion=lambda df: 1 - df["pa_proportion"],
    rm_pa_pt_proportion=lambda df: 1 - df["pt_proportion"] - df["pa_proportion"],
)

In [None]:
# Figure S2b: horizontal bars showing, per protocol, the proportion of records
# labelled "polyA", "polyT" and "valid".
plt.close()
fig, ax = plt.subplots(figsize=(60 * mm, 36 * mm))

# The three bars are drawn on top of each other, longest first, so the part of
# each bar that stays visible is the share belonging to its label.
for layer_column, layer_color, layer_label in (
    ("total_proportion", palette[2], "polyA"),
    ("rm_pa_proportion", palette[1], "polyT"),
    ("rm_pa_pt_proportion", palette[0], "valid"),
):
    sns.barplot(
        data=counts_df,
        x=layer_column,
        y="protocol",
        color=layer_color,
        label=layer_label,
        ax=ax,
    )

ax.set_ylabel("")
ax.set_xlabel("Proportion")
ax.tick_params(which="minor", left=False)
ax.tick_params(which="both", direction="out")
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
# Save before showing: with inline/non-interactive backends plt.show() releases
# the figure, so saving afterwards writes an empty page.
fig.savefig("../../figures/suppfig/figS2b.pdf")
plt.show()

In [115]:
# Sequence rows of the schematic ("-" marks an empty position).
seq1 = "----ATCCATCACCTACTAAAAAAAAAAAAAAAAAAAAAAAA"
seq2 = "------------------AAAAAAAAAAAAAAAAAAAAAAAA"
seq3 = "ACCGATCCATCACCTACTAAAAAAAAAAAAAAAAACTAGGCT"
# All-gap rows placed above/between the sequence rows.
cigar_1 = "------------------------------------------"
cigar_2 = "------------------------------------------"
cigar_3 = "------------------------------------------"

# One character per cell; rows from top to bottom.
seq_matrix = np.array(
    [list(row) for row in (cigar_1, cigar_2, seq1, cigar_3, seq2, seq3)]
)


In [107]:
# Cell colour per character: the bases A, T, C, G take the first four palette
# colours (in that order) and the gap character is drawn white.
color_encode = dict(zip("ATCG", palette))
color_encode["-"] = "white"

In [None]:
# Figure S2a: the character matrix drawn as a categorical heatmap with the
# letters printed on top of the coloured cells.
row_labels = ["", "", "Read 1", "", "Read 2", "Reference \ngenome"]  # one per matrix row
row_cuts = list(range(1, 6))  # [1, 2, 3, 4, 5]: a cut after each of the first five rows

h = ma.CatHeatmap(
    seq_matrix,
    palette=color_encode,
    height=30 * mm,
    width=90 * mm,
)
h.add_layer(mp.TextMesh(seq_matrix, color="white", fontsize=5))
h.cut_rows(row_cuts, spacing=0.05)
h.add_left(mp.Labels(row_labels, fontsize=6), pad=2 * mm)
h.render()
plt.savefig("../../figures/suppfig/figS2a.pdf", bbox_inches="tight", dpi=300)