In [19]:
import json
import pandas as pd
import plotly.express as px

In [20]:
# Sample identifiers; every cell below builds its input file names from these.
samples = ["test"]

## trim summary

In [21]:
# Directory holding the per-sample JSON summaries.
json_dir = "../data/output/json/"
# Categories shown in the stacked bar chart; "total" is only the denominator.
trim_types = ["without_primer", "no_insert", "yes_insert"]

# Read each sample's trim summary exactly once and build the frame in one go,
# so the columns get a numeric dtype instead of `object`.
trim_records = []
for sample in samples:
    with open(f"{json_dir}/{sample}.trim.json", "r") as f:
        trim_records.append(json.load(f))
trim_wide = pd.DataFrame(trim_records, index=samples)

# Percentage of each sample's "total", computed only for the plotted categories
# (other keys present in the JSON are never divided).
trim_per_wide = trim_wide[trim_types].div(trim_wide["total"], axis=0) * 100

# Long format, one row per (sample, type): raw counts ...
trim_df = trim_wide[trim_types].reset_index().melt(id_vars="index")
trim_df.columns = ["sample", "type", "count"]

# ... and percentages, plus a human-readable count label for the bar text.
trim_df_per = trim_per_wide.reset_index().melt(id_vars="index")
trim_df_per["value"] = trim_df_per["value"].round(2)
trim_df_per.columns = ["sample", "type", "percent"]
# Both long frames are melted from same-shaped inputs, so their default
# integer indexes line up row for row.
trim_df_per["count"] = trim_df["count"].apply(lambda x: f"{x/1e6:.2f}M")

In [22]:
# Bases attributed to each counted read/fragment when estimating volume.
# NOTE(review): presumably 2 x 150 bp paired-end — confirm against the run setup.
bases_per_fragment = 300

# Report the estimated sequencing volume per sample, summed over all trim types.
for sample, sub_df in trim_df.groupby("sample"):
    # The group already holds exactly this sample's rows; no need to re-filter.
    seq_volume = sub_df["count"].sum()
    print(f"{sample}: {seq_volume * bases_per_fragment / 1e9:.1f} Gb")

test: 0.3 Gb


In [23]:
# Stacked bar per sample: bar height is the percentage of total reads,
# segment colour is the trim category, segment text is the count label.
trim_bar_options = dict(
    x="sample",
    y="percent",
    color="type",
    text="count",
    barmode="stack",
    title="Trim results",
    labels={"sample": ""},  # hide the x-axis title
)
fig = px.bar(trim_df_per, **trim_bar_options)
fig.show()

## mapping summary

In [24]:
# Read categories expected in each `<sample>.summary.json`; this column order
# is also the order in which `type` values appear after melting.
map_types = ["unmapped", "rRNA", "miRNA", "snoRNA", "snRNA", "other_mapped", "mRNA", "mRNA_tss"]

# Collect one record per sample and build the frame once instead of growing it
# row by row; this also yields numeric columns rather than `object`.
map_records = []
for sample in samples:
    with open(f"{json_dir}/{sample}.summary.json", "r") as f:
        map_records.append(json.load(f))
# reindex keeps exactly the known categories (missing ones become NaN).
map_wide = pd.DataFrame(map_records, index=samples).reindex(columns=map_types)

# Convert to percentage per row (each sample's categories sum to 100).
map_per_wide = map_wide.div(map_wide.sum(axis=1), axis=0) * 100

map_rrna_per = map_per_wide.reset_index().melt(id_vars="index")
map_rrna_per["value"] = map_rrna_per["value"].round(2)
map_rrna_per.columns = ["sample", "type", "percent"]

map_rrna = map_wide.reset_index().melt(id_vars="index")
map_rrna.columns = ["sample", "type", "count"]

# Bar label such as "1.2M(3.4%)"; both long frames share the same row order.
count_labels = map_rrna["count"].apply(lambda x: f"{x/1e6:.1f}M")
percent_labels = map_rrna_per["percent"].apply(lambda x: f"({x:.1f}%)")
map_rrna_per["count"] = count_labels + percent_labels

In [25]:
# Stacked bar per sample: bar height is the percentage of reads in each
# mapping category, segment text is the combined count/percent label.
map_bar_options = dict(
    x="sample",
    y="percent",
    color="type",
    barmode="stack",
    title="Map results",
    labels={"sample": ""},  # hide the x-axis title
    text="count",
)
fig = px.bar(map_rrna_per, **map_bar_options)
fig.show()

## insert size

In [26]:
bed_dir = "../data/interim/bed/"
# Cap on rows used per sample; larger files are randomly down-sampled to this.
max_frags_per_sample = 100000
# Only insert sizes in (0, max_insert_size] are kept for plotting.
max_insert_size = 500
# Fixed seed so the down-sampling (and therefore the plot) is reproducible.
sampling_seed = 0

insert_size_frames = []
for sample in samples:
    frags = pd.read_csv(f"{bed_dir}/{sample}_frags.sort.mRNA_tss.bed", sep="\t", header=None)
    if len(frags) >= max_frags_per_sample:
        frags = frags.sample(max_frags_per_sample, random_state=sampling_seed)

    # Columns 1 and 2 of the headerless file; presumably BED start/end — confirm.
    sizes = frags[2] - frags[1]
    sizes = sizes[(sizes <= max_insert_size) & (sizes > 0)]
    # Build a fresh two-column frame instead of assigning onto a filtered slice,
    # which avoids pandas' SettingWithCopyWarning.
    insert_size_frames.append(pd.DataFrame({"insert_size": sizes, "sample": sample}))
insert_size = pd.concat(insert_size_frames, ignore_index=True)

In [27]:
import plotly.graph_objects as go
from plotly.colors import n_colors

# One more colour is requested than there are samples; zip() below simply
# ignores the unused last one.
# NOTE(review): presumably the +1 keeps a single-sample run from asking
# n_colors for a 1-step scale — confirm.
colors = n_colors("rgb(5, 200, 200)", "rgb(200, 10, 10)", len(samples) + 1, colortype="rgb")

# One horizontal half-violin per sample, built up front and handed to the figure.
violin_traces = []
for sample_name, line_color in zip(samples, colors):
    sample_sizes = insert_size.loc[insert_size["sample"] == sample_name, "insert_size"].values
    violin_traces.append(go.Violin(x=sample_sizes, line_color=line_color))
fig = go.Figure(data=violin_traces)

fig.update_traces(orientation="h", side="positive", width=3, points=False)
fig.update_layout(
    title="Insert size distribution",
    # Legend is disabled; samples are identified on the y axis instead.
    showlegend=False,
    # Traces are created without a name, so the y ticks are relabelled by
    # trace position with the sample identifiers.
    yaxis=dict(
        tickmode="array",
        tickvals=list(range(len(samples))),
        ticktext=samples,
    ),
)
fig.show()