In [1]:
import pandas as pd
import glob
import os
import hashlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib_venn import venn2, venn3

In [2]:
def get_md5sum(x):
    """Return the first 10 hex characters of the MD5 digest of the string ``x``.

    ``x`` is encoded as UTF-8 before hashing.
    """
    encoded = x.encode("utf-8")
    hex_digest = hashlib.md5(encoded).hexdigest()
    return hex_digest[:10]

In [3]:
def map_assembler(cell):
    """Translate a value containing an assembler keyword into its display name.

    The keywords are tested as substrings in a fixed order (carpedeam,
    penguin, megahit, spades); the first one found decides the result.
    A value containing none of the keywords is returned unchanged.
    """
    keyword_to_name = (
        ("carpedeam", "CarpeDeam"),
        ("penguin", "PenguiN"),
        ("megahit", "MEGAHIT"),
        ("spades", "metaSPAdes"),
    )
    for keyword, display_name in keyword_to_name:
        if keyword in cell:
            return display_name
    # No known assembler keyword: hand the input back untouched.
    return cell

In [None]:
# Labels to hash into short tags.
labels = ["ERR3579753", "ERR3579736"]

# Forward lookup: label -> 10-character MD5 tag.
labels_dict = {}
for label in labels:
    labels_dict[label] = get_md5sum(label)

# Reverse lookup: tag -> label.
labels_dict_inv = {}
for label, tag in labels_dict.items():
    labels_dict_inv[tag] = label

print(labels_dict_inv)

In [5]:
# Lookup from the 10-character prefix of a report file name to a sample name.
# NOTE(review): the keys look like get_md5sum() tags of the run labels — confirm.
sample_map = {
    '87bf691987': 'EMN001',
    'eebf379d54': 'GDN001',
}

In [6]:
def find_files(directory, suffix):
    """Recursively collect the paths of all files below ``directory`` whose
    file name ends with ``suffix``.

    Returns a list of paths (each one joined onto the folder it was found in),
    in the order ``os.walk`` visits them. The list is empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(suffix)
    ]

In [13]:
def venn_diagram_species(dfs, titles, dataset, third, number):
    """Draw a Venn diagram of the unique ``reference`` values shared between DataFrames.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Two or three frames, each with a ``reference`` column. If more than
        three are given, only the first three are used (a notice is printed).
    titles : sequence of str or None
        Set labels, one per DataFrame and in the same order as ``dfs``.
        Surplus titles are dropped together with surplus frames.
    dataset : str
        Dataset name shown in the figure title; ``"RISE"`` is displayed as
        ``"RISE397"``.
    third : str
        ``"penguin"`` colours the third set green, any other value colours it
        red. Ignored when only two frames are plotted.
    number : int
        Running figure number. Only referenced by the disabled save-to-file
        line at the end of the function.

    Raises
    ------
    ValueError
        If ``dfs`` is not a list of DataFrames, holds fewer than two frames,
        or ``titles`` has fewer entries than there are frames.
    """
    if not isinstance(dfs, list) or not all(isinstance(df, pd.DataFrame) for df in dfs):
        raise ValueError("Input must be a list of pandas DataFrames.")
    if len(dfs) > 3:
        print("More than 3 DataFrames provided. Only the first 3 will be used for the Venn diagram.")
        dfs = dfs[:3]
    if len(dfs) < 2:
        # Fail before any figure is created instead of erroring inside the plotting call.
        raise ValueError("At least 2 DataFrames are required for a Venn diagram.")
    if titles is not None:
        if len(titles) < len(dfs):
            raise ValueError("Each DataFrame needs a title: got fewer titles than DataFrames.")
        # Keep labels aligned with the (possibly truncated) list of frames.
        titles = list(titles)[:len(dfs)]

    # Unique 'reference' values of each DataFrame form one set of the diagram.
    sets = [set(df['reference'].unique()) for df in dfs]

    third_colour = "green" if third == "penguin" else "red"

    # Explicit figure/axes so the diagram and its title land on the same axes.
    fig, ax = plt.subplots(figsize=(10, 8))
    if len(sets) == 3:
        venn3(sets, titles, set_colors=("blue", "orange", third_colour), ax=ax)
    else:
        venn2(sets, titles, set_colors=("blue", "orange"), ax=ax)

    if dataset == "RISE":
        dataset = "RISE397"

    ax.set_title(f"Taxonomy Assignment on Species Level:\nNumber of Matches between Translated Contigs and Genome Taxonomy Database (GTDB)\nDataset: {dataset}")
    # Saving is disabled; re-enable to write one SVG per dataset/figure number:
    # fig.savefig(f'taxonomy/taxonomy_{dataset}_{number}.svg', format="svg")



In [39]:
def curate_report_df(file, min_uniq_cov=0.9):
    """
    Load one mmseq taxonomy tophit report and return it as a single filtered DataFrame.

    The report is read as a header-less, tab-separated table. Its fields are:
    (1) Target identifier                                   -> "target"
    (2) Number of sequences aligning to target              -> "coveredBy"
    (3) Unique coverage of target uniqueAlignedResidues / targetLength -> "uniqCov"
    (4) Target coverage alignedResidues / targetLength      -> "totalCov"
    (5) Average sequence identity                           -> "seq.Id."
    (6) Taxonomical information identifier, species, lineage -> "taxId", "species1", "species2"

    Parameters
    ----------
    file : str
        Path of the tophit report.
    min_uniq_cov : float, default 0.9
        Rows with ``uniqCov`` below this value are dropped.

    Returns
    -------
    pandas.DataFrame
        The rows that pass the coverage filter, re-indexed from 0, with these
        extra columns:
        - "index": row position in the unfiltered report,
        - "reference": "target" with everything after its last "_" removed,
        - "file": base name of ``file``,
        - "dataLabel": ``sample_map`` entry for the part of the file name
          before the first "." (missing value if there is no entry),
        - "assembler": ``map_assembler`` applied to the file name.
    """
    column_names = ["target", "coveredBy", "uniqCov", "totalCov", "seq.Id.", "taxId", "species1", "species2"]
    report = pd.read_csv(file, sep='\t', names=column_names)
    report["reference"] = report["target"].str.rsplit('_', n=1).str[0]

    # Filter and re-index in one non-mutating step. reset_index() without
    # drop=True keeps the pre-filter row number in an "index" column, which
    # is part of the output callers already receive.
    report = report.loc[report["uniqCov"] >= min_uniq_cov].reset_index()

    report["file"] = os.path.basename(file)
    report["dataLabel"] = report["file"].str.split(".").str[0].map(sample_map)
    report["assembler"] = report["file"].map(map_assembler)

    return report

In [None]:
### TOPHIT REPORT ANALYSIS
# Step 1: load every tophit report of the selected assemblers into
#         dic[sample][assembler display name].
# Step 2: per sample, draw two three-way Venn diagrams that share CarpeDeam and
#         MEGAHIT and differ in the third set (PenguiN, then metaSPAdes).

assemblers = ['carpedeam.config0', 'megahit.config0', 'penguin.config0', 'spades.config0']
samples = ["GDN001", "EMN001"]

dic = {value: {} for value in samples}
for sample in samples:
    path = f"data/{sample}/results/assembly-easytaxonomy-eval_gtdb"
    for file in find_files(path, "tophit_report"):
        if not any(assembler in file for assembler in assemblers):
            continue
        results = curate_report_df(file)
        if results.empty:
            # Nothing passed the coverage filter, so there is no first row to read labels from.
            print(f"Skipping {file}: no rows left after filtering.")
            continue
        readname = results["dataLabel"].iloc[0]
        assembler_id = results["assembler"].iloc[0]
        if readname not in dic:
            # File-name prefix missing from sample_map, or mapped to a sample not analysed here.
            print(f"Skipping {file}: unrecognised sample label {readname!r}.")
            continue
        dic[readname][assembler_id] = results

# (set titles, value of `third`, figure number) for each diagram drawn per sample.
venn_configs = [
    (["CarpeDeam", "MEGAHIT", "PenguiN"], "penguin", 1),
    (["CarpeDeam", "MEGAHIT", "metaSPAdes"], "spades", 2),
]

for dataset, per_assembler in dic.items():
    for titles, third, number in venn_configs:
        missing = [title for title in titles if title not in per_assembler]
        if missing:
            print(f"Skipping Venn diagram {number} for {dataset}: no results for {', '.join(missing)}.")
            continue
        try:
            venn_diagram_species([per_assembler[title] for title in titles], titles, dataset, third, number)
        except Exception as err:
            # Stay best-effort across samples, but report the failure instead of hiding it.
            print(f"Venn diagram {number} for {dataset} failed: {err!r}")
    
 