In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pysam
import os
import glob
import hashlib
import gzip

In [2]:
# Paths to the per-dataset genome-fraction tables (under the
# "assembly-evaluation-quast" results folder). Each one is passed to plotGF,
# which reads it as a header-less, whitespace-delimited table.
GDN001 = "/data/GDN001/results/assembly-evaluation-quast/genomeFractionNew.tsv"
EMN001 = "/data/EMN001/results/assembly-evaluation-quast/genomeFractionNew.tsv"

In [3]:
# Step 1 lookup used by plotGF: GCF_* identifier (as found in the "Species"
# column of the input table) -> NC_/NZ_ identifier.
dic1 = {
    "GCF_000008185.1": "NC_002967.9",
    "GCF_000210715.1": "NC_021038.1",
    "GCF_000238215.1": "NC_016610.1",
    "GCF_003019295.1": "NZ_CP028101.1"
}

# Step 2 lookup used by plotGF: NC_/NZ_ identifier -> species name.
# The embedded "\n" splits genus and species onto two lines in tick labels.
dic2 = {
    "NC_002967.9": "Treponema\ndenticola",
    "NC_021038.1": "Fretibacterium\nfastidiosum",
    "NC_016610.1": "Tannerella\nforsythia",
    "NZ_CP028101.1": "Fusobacterium\nnucleatum"
}

In [4]:
# Assembler run identifiers to include. plotGF keeps only table rows whose
# first column contains one of these strings, and adds a zero-valued row for
# every (run, species) combination it does not find.
assemblers = ['carpedeam2.configSafe', 'carpedeam2.configUnsafe', 'megahit.config0', 'penguin.config0', 'spades.config0'] 


In [5]:
def map_assembler(cell):
    """Translate a raw assembler/run identifier into its display name.

    The identifier is searched for a known lowercase keyword; the first
    keyword found (in the order listed below) decides the result.

    Parameters
    ----------
    cell : str
        Raw identifier, e.g. ``"megahit.config0"``.

    Returns
    -------
    str
        ``"CarpeDeam"``, ``"PenguiN"``, ``"MEGAHIT"`` or ``"metaSPAdes"``;
        the input itself when no keyword is contained in it.
    """
    # Order matters: it mirrors the precedence of the keyword checks.
    keyword_to_name = (
        ("carpedeam", "CarpeDeam"),
        ("penguin", "PenguiN"),
        ("megahit", "MEGAHIT"),
        ("spades", "metaSPAdes"),
    )
    for keyword, display_name in keyword_to_name:
        if keyword in cell:
            return display_name
    # Unknown identifiers pass through untouched.
    return cell

In [6]:
def adjust_assemblerconfig(row):
    """Return the legend label for one row, splitting CarpeDeam by run mode.

    Parameters
    ----------
    row : Mapping or pandas.Series
        Must provide the assembler display name under ``"assembler_clean"``
        (or, as a fallback, ``"assembler"``). For CarpeDeam rows it must also
        provide the raw run/config string under ``"assemblerconfig"`` (or, as
        a fallback, ``"file"``).

    Returns
    -------
    str
        ``"CarpeDeam (safe mode)"`` / ``"CarpeDeam (unsafe mode)"`` when the
        config string names a mode, plain ``"CarpeDeam"`` when it does not,
        and the assembler display name unchanged for every other assembler.

    Raises
    ------
    KeyError
        If none of the accepted column names is present in ``row``.
    """
    def _first_present(*keys):
        # Accept both column-naming schemes used in this notebook.
        for key in keys:
            if key in row:
                return row[key]
        raise KeyError(keys[0])

    assembler = _first_present("assembler_clean", "assembler")
    if assembler != "CarpeDeam":
        return assembler

    # Case-insensitive so that "carpedeamSafe" and "carpedeam2.configSafe"
    # are both recognised.
    config = str(_first_present("assemblerconfig", "file")).lower()
    # "unsafe" contains "safe", so it has to be tested first.
    if "unsafe" in config:
        return "CarpeDeam (unsafe mode)"
    if "safe" in config:
        return "CarpeDeam (safe mode)"
    return "CarpeDeam"

In [7]:
# Rank of each assembler display label (0 = first).
# NOTE(review): presumably the lookup table meant to be passed to sort_key - confirm.
dic_sort_asm = {"CarpeDeam (safe mode)" : 0, "CarpeDeam (unsafe mode)" : 1, "PenguiN" : 2, "MEGAHIT" : 3, "metaSPAdes" : 4}

In [8]:
def sort_key(x, dic_sort_asm):
    """Return the sort rank of assembler label ``x``.

    Parameters
    ----------
    x : str
        Assembler display label, e.g. ``"PenguiN"``.
    dic_sort_asm : dict
        Mapping from label to integer rank (lower sorts first).

    Returns
    -------
    int
        The rank stored for ``x``; labels missing from the mapping get
        ``len(dic_sort_asm)`` so that they sort after all known labels.
    """
    # Look the label up in the mapping (the mapping is the container, the
    # label is the key - not the other way round).
    return dic_sort_asm.get(x, len(dic_sort_asm))

In [39]:
def _match_assembler_run(file_name):
    """Return the entry of ``assemblers`` contained in ``file_name``, else None."""
    for run in assemblers:
        if run in file_name:
            return run
    return None


def _assembler_label(run):
    """Build the legend label for a run identifier, splitting CarpeDeam by mode."""
    label = map_assembler(run)
    if label != "CarpeDeam":
        return label
    lowered = run.lower()
    # "unsafe" contains "safe", so it has to be tested first.
    if "unsafe" in lowered:
        return "CarpeDeam (unsafe mode)"
    if "safe" in lowered:
        return "CarpeDeam (safe mode)"
    return label


def plotGF(path, nameDate):
    """Draw and save a grouped bar chart of genome fraction per species and assembler.

    The table at ``path`` is read as header-less, whitespace-delimited data
    with exactly three columns (file, species GCF id, genome fraction). Rows
    are kept when their file column contains one of the configured
    ``assemblers`` and their species is a key of ``dic1``. Every
    (assembler run, species) combination absent from the table is added with
    a genome fraction of 0 so that each group shows a slot for every run.

    Parameters
    ----------
    path : str
        Location of the genome-fraction table.
    nameDate : str
        Dataset name; used in the plot title and in the output file name.

    Returns
    -------
    None
        The figure is written to
        ``plots/figure7/GenomeFraction_<nameDate>.svg`` and then shown.
    """
    # sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
    df = pd.read_csv(path, sep=r"\s+", header=None)
    df.columns = ["file", "Species", "GenomeFraction"]

    # Resolve each row to the configured run it belongs to. The same substring
    # rule is used for filtering and for the missing-combination check, so a
    # row that passes the filter can never be reported as missing.
    df["run"] = df["file"].astype(str).map(_match_assembler_run)
    # Species without an entry in dic1 have no display name and are left out.
    df = df[df["run"].notna() & df["Species"].isin(list(dic1))].copy()

    # Collect all absent combinations first and append them in one go.
    seen = set(zip(df["run"], df["Species"]))
    missing = [
        {"file": run, "Species": gcf, "GenomeFraction": 0, "run": run}
        for run in assemblers
        for gcf in dic1
        if (run, gcf) not in seen
    ]
    if missing:
        df = pd.concat([df, pd.DataFrame(missing)], ignore_index=True)

    # GCF id -> NC/NZ id -> species name.
    df["SpeciesName"] = df["Species"].map(dic1).map(dic2)
    df["Assembler"] = df["run"].map(_assembler_label)

    # Fixed label -> colour mapping keeps colours identical across datasets,
    # independent of which assemblers happen to be present.
    custom_order = [
        'CarpeDeam (safe mode)',
        'CarpeDeam (unsafe mode)',
        'PenguiN',
        'MEGAHIT',
        'metaSPAdes'
    ]
    custom_palette = ['#a1c9f4', '#b9f2f0', '#8de5a1', '#ffb482', '#fab0e4']
    palette = dict(zip(custom_order, custom_palette))

    present_labels = set(df["Assembler"].unique())
    hue_order = [label for label in custom_order if label in present_labels]
    # Labels outside the fixed order go last in a neutral grey instead of
    # aborting the plot.
    extra_labels = sorted(present_labels - set(custom_order))
    hue_order.extend(extra_labels)
    palette.update({label: "#cfcfcf" for label in extra_labels})

    # Species on the x axis in alphabetical order.
    species_order = sorted(df["SpeciesName"].dropna().unique())

    fig, ax = plt.subplots(figsize=(6, 6))
    sns.barplot(
        data=df,
        x="SpeciesName",
        y="GenomeFraction",
        hue="Assembler",
        hue_order=hue_order,
        palette=palette,
        order=species_order,
        ax=ax,
    )
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_title(f"Dataset: {nameDate}")
    ax.set_ylabel("Genome Fraction (%)", fontsize=10)
    ax.set_xlabel("Species Name")
    ax.legend(bbox_to_anchor=(0.5, -0.5), loc='upper center', ncol=3)
    fig.tight_layout()

    # savefig does not create directories, so make sure the target exists.
    out_dir = "plots/figure7"
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(f"{out_dir}/GenomeFraction_{nameDate}.svg", format="svg", bbox_inches="tight")
    plt.show()

In [None]:
# Plot and save the genome-fraction figure for the GDN001 dataset.
plotGF(GDN001, "GDN001")

In [None]:
# Plot and save the genome-fraction figure for the EMN001 dataset.
plotGF(EMN001, "EMN001")