In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import hashlib
import gzip
import os

In [2]:
def get_md5sum(x):
    """Return the first 10 hex characters of the MD5 digest of the string `x` (UTF-8 encoded)."""
    digest = hashlib.md5(x.encode("utf-8"))
    short_id = digest.hexdigest()[:10]
    return short_id

In [3]:
# Glob patterns locating, per dataset, the gzipped QUAST reports and the
# mapping statistics files. `dataset` is rebound before each pair so the
# f-strings below resolve to that dataset's directory.
dataset = "Gut"
gut_quast, gut_mapping = (
    f"/data/ancient{dataset}/results/assembly-evaluation-quast/*report.tsv.gz",
    f"/data/ancient{dataset}/results/assembly-mapping/*stats.txt",
)

dataset = "Calc"
calculus_quast, calculus_mapping = (
    f"/data/ancient{dataset}/results/assembly-evaluation-quast/*report.tsv.gz",
    f"/data/ancient{dataset}/results/assembly-mapping/*stats.txt",
)

dataset = "Horse"
bone_quast, bone_mapping = (
    f"/data/ancient{dataset}/results/assembly-evaluation-quast/*report.tsv.gz",
    f"/data/ancient{dataset}/results/assembly-mapping/*stats.txt",
)

In [None]:
# Raw sample labels, in the same order as their display names in `labels_clean`.
labels = [
    "gut_sum_high_c3",
    "gut_sum_high_c5",
    "gut_sum_high_c10",
    "calc_2095_high_c3",
    "calc_2095_high_c5",
    "calc_2095_high_c10",
    "horse_sum_high_c3",
    "horse_sum_high_c5",
    "horse_sum_high_c10",
]

# Display names (dataset on the first line, damage/coverage on the second).
labels_clean = [
    "Gut:\nHigh Damage; Cov. 3X",
    "Gut:\nHigh Damage; Cov. 5X",
    "Gut:\nHigh Damage; Cov. 10X",
    "Calculus:\nHigh Damage; Cov. 3X",
    "Calculus:\nHigh Damage; Cov. 5X",
    "Calculus:\nHigh Damage; Cov. 10X",
    "Bone:\nHigh Damage; Cov. 3X",
    "Bone:\nHigh Damage; Cov. 5X",
    "Bone:\nHigh Damage; Cov. 10X",
]

# raw label -> 10-character MD5 prefix, and the reverse lookup.
labels_dict = dict(zip(labels, map(get_md5sum, labels)))
labels_dict_inv = {digest: name for name, digest in labels_dict.items()}
print(labels_dict_inv)

# raw label -> display name, paired by position.
labels_dict_clean = dict(zip(labels, labels_clean))

In [5]:
def map_assembler(cell):
    """Translate a raw assembler identifier into its display name.

    The first keyword (checked in the order below) that occurs as a
    substring of `cell` decides the result; a value matching none of the
    keywords is returned unchanged.
    """
    display_names = (
        ("carpedeam", "CarpeDeam"),
        ("penguin", "Penguin"),
        ("megahit", "MEGAHIT"),
        ("spades", "metaSPAdes"),
    )
    for keyword, pretty in display_names:
        if keyword in cell:
            return pretty
    return cell

In [6]:
def adjust_assemblerconfig(row):
    """Return the final display name of a row's assembler, including CarpeDeam's mode.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "assembler_clean" and "assemblerconfig".

    Returns
    -------
    str
        "CarpeDeam\\n(safe mode)" / "CarpeDeam\\n(unsafe mode)" when a CarpeDeam
        row's configuration names a mode, plain "CarpeDeam" when it does not,
        and the unchanged "assembler_clean" value for every other assembler.
    """
    if row["assembler_clean"] != "CarpeDeam":
        return row["assembler_clean"]

    config = row["assemblerconfig"]
    # "assemblerconfig" is built as "<assembler> <config>" (e.g.
    # "carpedeam2 configSafe"), so the mode has to be detected from the
    # "Safe"/"Unsafe" token itself; the legacy "carpedeamSafe" /
    # "carpedeamUnsafe" spellings contain these tokens too. The match is
    # case-sensitive, hence "Unsafe" does not trigger the "Safe" branch.
    if "Safe" in config:
        return "CarpeDeam\n(safe mode)"
    if "Unsafe" in config:
        return "CarpeDeam\n(unsafe mode)"
    return "CarpeDeam"

In [7]:
def quast_df(path_tsv):
    """Load every gzipped QUAST report matching a glob pattern into one DataFrame.

    Parameters
    ----------
    path_tsv : str
        Glob pattern selecting gzip-compressed, tab-separated report files.
        Each file must provide the columns "assembler", "config" and "label".

    Returns
    -------
    pandas.DataFrame
        All reports concatenated (in sorted file order), plus the columns
        "assemblerconfig" ("<assembler> <config>"), "assembler_clean",
        "assembler_final", "label_human" and "label_clean".

    Raises
    ------
    ValueError
        If no file matches `path_tsv`.
    """
    # glob returns files in arbitrary order; sort so the row order is reproducible.
    files = sorted(glob.glob(path_tsv))
    if not files:
        # pd.concat would raise a ValueError as well, but without naming the pattern.
        raise ValueError(f"No QUAST report files match pattern: {path_tsv!r}")

    dfs = [pd.read_csv(file, compression='gzip', sep='\t') for file in files]
    big_df = pd.concat(dfs, ignore_index=True)
    big_df["assemblerconfig"] = big_df["assembler"] + " " + big_df["config"]
    big_df["assembler_clean"] = big_df["assembler"].apply(map_assembler)
    big_df["assembler_final"] = big_df.apply(adjust_assemblerconfig, axis=1)

    # The "label" column is looked up in labels_dict_inv (hash prefix -> raw label);
    # labels without an entry become NaN in "label_human".
    big_df["label"] = big_df["label"].astype(str)
    big_df["label_human"] = big_df["label"].map(labels_dict_inv)
    # replace() leaves values without a display name untouched.
    big_df["label_clean"] = big_df["label_human"].replace(labels_dict_clean)
    return big_df

In [8]:
def mapped_fraction(file_path):
    """Return the mapped/total read ratio stored in a mapping statistics file.

    The total is the leading integer of the first line and the mapped count
    the leading integer of the seventh line (space-separated). An empty file,
    or a mapped count of zero, yields 0.
    """
    with open(file_path) as handle:
        rows = handle.readlines()

    # Nothing to parse in an empty file.
    if not rows:
        return 0

    total_reads = int(rows[0].split(" ")[0])
    mapped_reads = int(rows[6].split(" ")[0])
    return mapped_reads / total_reads if mapped_reads != 0 else 0
    
    
def get_config(file_path):
    """
    Parse the label, assembler and configuration out of a file name.

    The base name is split on "."; field 0 is the label, field 3 the
    assembler and field 4 the configuration.

    Returns a tuple (label, "<assembler> <config>", config).
    """
    fields = os.path.basename(file_path).split(".")
    # Positional layout assumed: <label>.<...>.<...>.<assembler>.<config>...
    label, assembler, config = fields[0], fields[3], fields[4]
    return label, f"{assembler} {config}", config

def mapping_df(path_tsv):
    """Collect the mapped-read fraction of every statistics file matching a glob pattern.

    Returns a DataFrame with one row per file and the columns "assembly"
    (file name without the ".stats.txt" suffix), "label", "assemblerconfig",
    "config", "mappedfraction" and "label_human" (label looked up in
    labels_dict_inv).
    """
    def build_row(stats_path):
        # File-name derived fields first, then the fraction read from the file.
        label, assemblerconfig, config = get_config(stats_path)
        return [
            os.path.basename(stats_path),
            label,
            assemblerconfig,
            config,
            mapped_fraction(stats_path),
        ]

    table = pd.DataFrame(
        [build_row(stats_path) for stats_path in glob.glob(path_tsv)],
        columns=["assembly", "label", "assemblerconfig", "config", "mappedfraction"],
    )

    table["label_human"] = table["label"].map(labels_dict_inv)
    table["assembly"] = table["assembly"].apply(lambda name: name.replace(".stats.txt", ""))
    return table


In [9]:
def highlight_max(s):
    """Return one CSS string per element of `s`: a yellow background for entries equal to the maximum, '' otherwise."""
    peak = s.max()
    return ['background-color: yellow' if hit else '' for hit in (s == peak)]

In [10]:
# Load the QUAST reports of the gut, calculus and bone datasets (in that order).
gut_q, calc_q, bone_q = (
    quast_df(pattern) for pattern in (gut_quast, calculus_quast, bone_quast)
)

In [11]:
# Load the mapping statistics of the gut, calculus and bone datasets (in that order).
gut_m, calc_m, bone_m = (
    mapping_df(pattern) for pattern in (gut_mapping, calculus_mapping, bone_mapping)
)

In [None]:
## ONE TABLE PER DATASET ###
# For every dataset: attach the mapped-read fraction to the QUAST metrics, keep
# the main assembler configurations, derive the table columns and print LaTeX.

main=['carpedeam2 configSafe', 'carpedeam2 configUnsafe', 'megahit config0', 'penguin config0', 'spades config0']


def _coverage_depth(label_column):
    """Return the numeric coverage of each label (10.0 for "... 10X"), NaN when absent."""
    return label_column.astype(str).str.extract(r"(\d+)X", expand=False).astype(float)


for quast, mapping in zip([gut_q, calc_q, bone_q], [gut_m, calc_m, bone_m]):

    # Rewrite the mapping identifiers before merging on "assembly"
    # (presumably so they match QUAST's assembly names; verify against the data).
    mapping["assembly"] = mapping["assembly"].apply(lambda x: x.replace("raw-raw.mapping", "raw_raw.assm"))
    merge_df = quast.merge(mapping[['assembly', 'mappedfraction']], on='assembly', how='left')
    main_df = merge_df[merge_df["assemblerconfig"].isin(main)]
    main_df = main_df[["assembler_final" ,"label_clean", "genome_fraction_perc", "largest_contig", "largest_alignment", "mappedfraction",
                       "n50", "l50", "na50", "la50", "num_misassemblies", "total_aligned_length", "duplication_ratio", "num_contigs_ge_0_bp",
                       "num_mismatches_per_100_kbp"]]

    # Sort by coverage (3X < 5X < 10X), then by 'assembler_final' alphabetically.
    # The coverage is parsed from the label text rather than looked up in a
    # hard-coded dict, so the order survives changes to the label wording.
    main_df = main_df.sort_values(by=['label_clean', 'assembler_final'],
                                  key=lambda x: _coverage_depth(x) if x.name == 'label_clean' else x)

    main_df = main_df.rename(columns={"assembler_final" : "Assembler" ,"label_clean": "Dataset", "largest_contig": "Largest Contig", "genome_fraction_perc" : "Genome Fraction",
                                      "largest_alignment" : "Largest Alignment", "mappedfraction" : "Reads Mapped Fraction", "n50" : "N50",
                                      "l50" : "L50", "na50" : "NA50", "la50" : "LA50", "num_misassemblies" : "# Misassemblies",
                                      "total_aligned_length" : "Total Aligned Length", "duplication_ratio" : "Duplication Ratio", "num_mismatches_per_100_kbp" : "# Mismatches per 100kb"})

    # Parse damage level and coverage with regexes instead of positional
    # split(" ") indices: the labels in `labels_clean` contain a newline and
    # only four space-separated tokens, so index 4 does not exist.
    dataset_labels = main_df["Dataset"].astype(str)
    main_df["Damage"] = dataset_labels.str.extract(r"(\w+) Damage", expand=False)
    main_df["Coverage"] = dataset_labels.str.extract(r"(\d+X)", expand=False)
    main_df["Misassemblies per contig"] = main_df["# Misassemblies"]/main_df["num_contigs_ge_0_bp"]
    main_df["# Mismatches per 100kb"] = pd.to_numeric(main_df["# Mismatches per 100kb"])

    # .copy() so the formatting assignments below act on an independent frame.
    main_df = main_df[["Assembler" ,"Coverage", "Damage", "Genome Fraction", "Largest Alignment", "Reads Mapped Fraction", "NA50", "LA50", "Misassemblies per contig", "# Mismatches per 100kb", "Duplication Ratio"]].copy()
    numeric_cols = main_df.select_dtypes(include=['int64']).columns

    # Integer columns get thousands separators. Done per column with
    # Series.map because DataFrame.applymap is deprecated in recent pandas.
    for col in numeric_cols:
        main_df[col] = main_df[col].map(lambda x: f'{x:,}')

    # Ratio-like columns are rendered with three decimals.
    for col in ["Reads Mapped Fraction", "Genome Fraction", "Misassemblies per contig", "Duplication Ratio", "# Mismatches per 100kb"]:
        main_df[col] = main_df[col].apply(lambda x: f'{x:.3f}')
    main_df.index = range(len(main_df))

    main_df = main_df[main_df['Damage'] != 'Mid']
    main_df = main_df[["Assembler" ,"Coverage", "Genome Fraction", "Largest Alignment", "Reads Mapped Fraction", "NA50", "LA50", "Misassemblies per contig", "# Mismatches per 100kb", "Duplication Ratio"]]

    # NOTE(review): only the mismatch column is exported although the full
    # table is assembled above; confirm this is intended.
    print(main_df["# Mismatches per 100kb"].to_latex(index=False))

