# Assembly statistics
---

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
sns.set_style("whitegrid")
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [25]:
def read_align(files):
    """Collect per-sample alignment rates from several tab-separated files.

    Each file is read as a two-column, header-less table (sample name and
    alignment rate). The rate may carry a trailing ``%`` sign, which is
    stripped before conversion to float. The assembly name is taken from
    the third-to-last ``/``-separated component of the file path.

    Parameters
    ----------
    files : iterable of str
        Paths to the alignment-rate files. Each path must contain at least
        three ``/``-separated components.

    Returns
    -------
    pandas.DataFrame
        One row per (file, sample) with the columns ``"%"`` (float),
        ``"assembly"`` and ``"sample"``, in that order. The row index of
        each input file is kept as-is, so index labels repeat across files.
        An empty frame with these columns is returned when ``files`` is
        empty.
    """
    # Alphabetical order, matching what concat(sort=True) produced originally.
    columns = ["%", "assembly", "sample"]
    frames = []
    for f in files:
        assembly = f.split("/")[-3]
        _df = pd.read_csv(f, header=None, names=["sample", "%"], sep="\t")
        # Go through str so the conversion also works when pandas already
        # parsed the column as numeric (i.e. the file has no "%" suffix).
        _df["%"] = _df["%"].astype(str).str.rstrip("%").astype(float)
        _df["assembly"] = assembly
        frames.append(_df)
    if not frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, sort=True)[columns]

## Overall assembly statistics

In [None]:
# Read the assembly summary table and melt it to long format: every column
# other than "assembly" becomes a (variable, value) pair, giving one facet
# per statistic below.
stat_result = pd.read_csv(snakemake.input.stat, sep="\t", header=0)
stat_result_m = pd.melt(stat_result, id_vars=["assembly"])

# De-duplicate before sorting so a repeated assembly name cannot produce
# repeated categories on the x axis.
assembly_order = sorted(stat_result["assembly"].unique())

# catplot returns a FacetGrid (not an Axes); each facet keeps its own y scale
# because sharey=False.
g = sns.catplot(
    kind="bar",
    x="assembly",
    y="value",
    order=assembly_order,
    col="variable",
    col_wrap=4,
    data=stat_result_m,
    height=2.5,
    sharey=False,
)
g.set_xticklabels(rotation=90)
g.set_titles("{col_name}")
# Save through the grid itself rather than pyplot's implicit current figure.
g.savefig(snakemake.output[0], dpi=300, bbox_inches="tight")

## Distribution of contig lengths

In [None]:
# Read the contig size distribution table; the plot uses its "assembly",
# "min_length" and "%" columns.
sizedist_result = pd.read_csv(snakemake.input.dist, sep="\t", header=0)

# Draw on an explicitly created figure so the plot never lands on axes left
# over from an earlier cell, and so the saved figure is unambiguous.
fig, ax = plt.subplots()
sns.lineplot(
    data=sizedist_result,
    x="min_length",
    y="%",
    hue="assembly",
    hue_order=sorted(sizedist_result["assembly"].unique()),
    linewidth=1,
    ax=ax,
)
ax.set_ylabel("% of total assembly")
ax.set_xlabel("contig length")
fig.savefig(snakemake.output[1], dpi=300, bbox_inches="tight")

## Alignment frequency

In [None]:
# Gather the per-sample alignment rates of every assembly into one table.
align_stat = read_align(snakemake.input.maps)

# The same sorted list of assembly names drives both the x-axis order and
# the colour order, so compute it once.
assembly_order = sorted(align_stat["assembly"].unique())

# Draw on an explicitly created figure so the plot never lands on axes left
# over from an earlier cell, and so the saved figure is unambiguous.
fig, ax = plt.subplots()
sns.stripplot(
    data=align_stat,
    x="assembly",
    y="%",
    hue="assembly",
    order=assembly_order,
    hue_order=assembly_order,
    ax=ax,
)
ax.set_ylabel("% alignment")
ax.set_xlabel("assembly")
fig.savefig(snakemake.output[2], dpi=300, bbox_inches="tight")