In [None]:
import datetime
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
# Apply matplotlib's 'ggplot' style sheet, then seaborn's "whitegrid" axes style.
plt.style.use('ggplot')
sns.set_style("whitegrid")
# Notebook magics: show figures inline and use SVG as the inline figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [None]:
def read_df(f, i):
    """
    Read a tab-separated table and prepend labels taken from its path.

    The path ``f`` is split on "/" and the components at positions
    ``i-2``, ``i-1`` and ``i`` are used as the ``binner``, ``assembly`` and
    ``min_contig_length`` labels, respectively. The labels are kept as the
    raw path strings and repeated for every row of the table.

    Parameters
    ----------
    f : str
        Path to a tab-separated file with a header row; the first column
        is used as the index.
    i : int
        Position (within the "/"-split path) of the min-contig-length
        component.

    Returns
    -------
    pandas.DataFrame
        The three label columns followed by the columns read from ``f``,
        joined on the index.
    """
    parts = f.split("/")
    labels = (
        ("binner", parts[i - 2]),
        ("assembly", parts[i - 1]),
        ("min_contig_length", parts[i]),
    )
    table = pd.read_csv(f, sep="\t", header=0, index_col=0)
    n_rows = table.shape[0]
    # One constant-valued column per label, aligned on the table's index
    annotations = pd.DataFrame(
        {name: [value] * n_rows for name, value in labels},
        index=table.index,
    )
    return pd.merge(annotations, table, left_index=True, right_index=True)


def read_summary_stats(f):
    """
    Load the per-bin summary table and prepare it for plotting.

    Parameters
    ----------
    f : str
        Path to a tab-separated file with a header row. The code reads the
        columns ``binner``, ``assembly``, ``min_contig_length``, ``bp``,
        ``Mbp``, ``GC``, ``contigs`` and ``n50``.

    Returns
    -------
    dfm : pandas.DataFrame
        Long-format (melted) table of the ``Mbp``, ``GC``, ``contigs`` and
        ``n50`` values, identified by ``binner`` and ``assembly`` (plus
        ``min_contig_length`` when more than one length is present).
    bin_counts : pandas.DataFrame
        Count of non-null ``bp`` entries per binner/assembly/length
        combination, stored in a column named ``bins``.
    row : str or None
        ``"min_contig_length"`` when the table holds several contig
        lengths, otherwise ``None``.
    """
    stats = pd.read_csv(f, sep="\t", header=0)

    # Only facet on contig length when there is more than one to compare
    several_lengths = len(stats.min_contig_length.unique()) > 1
    row = "min_contig_length" if several_lengths else None
    id_vars = ["binner", "assembly"] + (["min_contig_length"] if several_lengths else [])

    # Long format: one row per (bin, statistic)
    dfm = pd.melt(stats, id_vars=id_vars, value_vars=["Mbp", "GC", "contigs", "n50"])

    # Count bins per binner/assembly/length combination
    bin_counts = (
        stats.groupby(["binner", "assembly", "min_contig_length"])["bp"]
        .count()
        .rename("bins")
        .reset_index()
    )
    return dfm, bin_counts, row


def read_checkm_stats(f):
    """
    Load the extended ``checkm qa`` statistics and add two derived columns.

    Parameters
    ----------
    f : str
        Path to a tab-separated file with a header row; the first column
        is used as the index. The columns ``Contamination`` and
        ``Genome size (bp)`` are read.

    Returns
    -------
    pandas.DataFrame
        The table from ``f`` with two columns appended:
        ``Purity`` (``100 - Contamination``) and
        ``Mbp`` (``Genome size (bp)`` divided by 1,000,000).
    """
    return (
        pd.read_csv(f, sep="\t", index_col=0, header=0)
        .assign(Purity=lambda table: 100 - table.Contamination)
        .assign(Mbp=lambda table: table["Genome size (bp)"] / 1000000)
    )

In [None]:
# Assemble the multi-page PDF binning report from the workflow inputs.
# NOTE(review): `snakemake` is not defined in this notebook; presumably it is
# injected by the Snakemake runner -- confirm against the calling rule.

#### Locate the input tables ####
stats = [f for f in snakemake.input if os.path.basename(f) == "binning_summary.tsv"]
checkm_stats = [f for f in snakemake.input if os.path.basename(f) == "checkm.stats.tsv"]
if not stats:
    # Fail with a clear message, and before the PDF is created, instead of
    # an opaque IndexError on stats[0].
    raise FileNotFoundError("No 'binning_summary.tsv' found among the notebook inputs")

# Disable LaTeX text rendering once, so it applies to every page
# (previously it was only set after the first page had been drawn).
plt.rc('text', usetex=False)

with PdfPages(snakemake.output[0]) as pdf:
    #### Read the summary statistics ####
    df, bin_counts, row = read_summary_stats(stats[0])

    #### Plot the summary stats
    # catplot is a figure-level function: it creates its own figure, so the
    # title must be set on (and the page saved from) the grid's figure rather
    # than on a separately created plt.figure().
    g = sns.catplot(kind="strip", col="variable", hue="assembly", y="value", x="binner", data=df,
                    hue_order=sorted(df.assembly.unique()), sharey=False, linewidth=.5, row=row)
    g.fig.suptitle("Overall statistics", y=1.02)
    pdf.savefig(g.fig, bbox_inches="tight")
    plt.close(g.fig)

    #### Plot number of bins
    g = sns.catplot(data=bin_counts, hue="assembly", kind="bar", x="binner", y="bins",
                    hue_order=sorted(bin_counts.assembly.unique()), col="min_contig_length", sharey=True)
    g.fig.suptitle("Number of bins", y=1.02)
    pdf.savefig(g.fig, bbox_inches="tight")
    plt.close(g.fig)

    #### Read checkm stats if available ####
    if checkm_stats:
        checkm_df = read_checkm_stats(checkm_stats[0])
        lengths = sorted(checkm_df.min_contig_length.unique())
        # An empty table yields no lengths; plt.subplots(nrows=0) would raise,
        # so the page is skipped in that case.
        if lengths:
            assembly_order = sorted(checkm_df.assembly.unique())
            # squeeze=False always returns a 2-D array of Axes, so a single
            # contig length needs no special-casing.
            fig, axes = plt.subplots(nrows=len(lengths), ncols=1,
                                     figsize=(8, 6 * len(lengths)), squeeze=False)
            # One completeness-vs-purity panel per minimum contig length
            for ax, length in zip(axes[:, 0], lengths):
                sns.scatterplot(data=checkm_df.loc[checkm_df.min_contig_length == length],
                                y="Completeness", x="Purity", style="binner", hue="assembly",
                                hue_order=assembly_order, size="Mbp", linewidth=.5, ax=ax)
                ax.set_title("min contig length={}".format(length))
                # Anchor the legend at the upper-right corner of the axes
                ax.legend(bbox_to_anchor=(1, 1))
                ax.set_xlabel("Purity (%)")
                ax.set_ylabel("Completeness (%)")
            pdf.savefig(fig, bbox_inches="tight")
            plt.close(fig)

    # Set the file's metadata via the PdfPages object:
    d = pdf.infodict()
    d['Title'] = 'Binning report for NBIS-meta workflow'
    d['Author'] = 'NBIS'
    d['CreationDate'] = datetime.datetime.today()