# Treemap figure representing the categories of the starting dataset

In [72]:
import pandas as pd
from pathlib import Path
import random
import yaml
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [73]:
# Read the notebook settings from config.yaml and show them as the cell output.
with Path("config.yaml").open("r") as config_file:
    notebook_configuration = yaml.safe_load(config_file)
notebook_configuration

{'bgcflow_dir': '/datadrive/bgcflow'}

In [81]:
# Locate the processed output directories of the two BGCflow projects used below.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
project_name_1 = "qc_strepto_ncbi"
processed_dir_1 = bgcflow_dir / "data" / "processed" / project_name_1

project_name_2 = "qc_gtdbtk"
processed_dir_2 = bgcflow_dir / "data" / "processed" / project_name_2

# Each project keeps its summary tables in a "tables" sub-directory.
tables_dir_1 = processed_dir_1 / "tables"
tables_dir_2 = processed_dir_2 / "tables"

# Read the output tables; the first CSV column becomes the index of each frame.
ncbi_meta_table = tables_dir_1 / "df_ncbi_meta.csv"
df_ncbi_meta = pd.read_csv(ncbi_meta_table, index_col=0)

gtdb_meta_table = tables_dir_1 / "df_gtdb_meta_curated.csv"
df_gtdb_meta = pd.read_csv(gtdb_meta_table, index_col=0)

seqfu_meta_table_1 = tables_dir_1 / "df_seqfu_stats.csv"
df_seqfu_meta_1 = pd.read_csv(seqfu_meta_table_1, index_col=0)

seqfu_meta_table_2 = tables_dir_2 / "df_seqfu_stats.csv"
df_seqfu_meta_2 = pd.read_csv(seqfu_meta_table_2, index_col=0)

# Index labels of df_seqfu_meta_1 that are ALSO present in df_seqfu_meta_2
# (i.e. rows that would be duplicated when the two tables are stacked).
in_both_tables = df_seqfu_meta_1.index.isin(df_seqfu_meta_2.index)
genomes_to_remove = df_seqfu_meta_1.index[in_both_tables].tolist()

# Remove those shared labels from df_seqfu_meta_2, since df_seqfu_meta_1 already covers them.
df_seqfu_meta_NBC = df_seqfu_meta_2.drop(genomes_to_remove)

# Stack the first table on top of the remaining rows of the second one.
df_seqfu_meta = pd.concat([df_seqfu_meta_1, df_seqfu_meta_NBC])

# Quality filter table; a missing genus is replaced by the placeholder "g__".
filters_table = tables_dir_1 / "df_filters.csv"
df_filter_quality = pd.read_csv(filters_table, index_col=0)
df_filter_quality["genus"] = df_filter_quality["genus"].fillna("g__")

In [80]:
# Load the semicolon-separated df_nbc.csv table from the external data folder.
# The location is derived from the configured BGCflow directory (config.yaml)
# rather than a hard-coded absolute path, so the notebook also runs on machines
# where BGCflow lives somewhere else.
nbc_table = Path(notebook_configuration["bgcflow_dir"]) / "data" / "external" / "df_nbc.csv"
df_nbc = pd.read_csv(nbc_table, sep=";", index_col=0)

In [79]:
# Non-null entries per column for every (quality, source) combination.
df_filter_quality.groupby(["quality", "source"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,genome_id.1,genus,species,completeness,contamination,N50,contigs,genome_len,gc,strain
quality,source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
HQ,Jørgensen et. al. 2024,860,860,860,860,860,860,860,860,860,860
HQ,NCBI,501,501,501,501,501,501,501,501,501,501
LQ,Jørgensen et. al. 2024,10,10,10,10,10,10,10,10,10,10
LQ,NCBI,1254,1254,1254,1254,1254,1254,1254,1254,1254,1254
MQ,Jørgensen et. al. 2024,32,32,32,32,32,32,32,32,32,32
MQ,Jørgensen et. al. 2024 (sourced from NCBI on 30 June 2023),121,121,121,121,121,121,121,121,121,121
MQ,NCBI,1062,1062,1062,1062,1062,1062,1062,1062,1062,1062


In [83]:
# df_filter_quality.loc[df_ncbi_meta.index, "strain"] = df_ncbi_meta.strain
# df_filter_quality.loc[df_seqfu_meta_NBC.index, "strain"] = df_seqfu_meta_NBC.index

In [58]:
# df_filter_quality.sort_values(by=["source","genus", "species", "strain"], inplace=True)

In [84]:
# df_filter_quality.to_csv("assets/data/df_quality_meta.csv")

In [86]:
# df_filter_quality[(df_filter_quality["source"] == "NBC") & (df_filter_quality.quality == "LQ")].to_csv("assets/data/checkm_contamination.csv")
# df_filter_quality

In [87]:
# # Replace genera with other if not Streptomyces
# df = df_filter_quality.copy()

# for genome_id in df.index:
#     genus = df.loc[genome_id, "genus"]
#     if genus != "Streptomyces":
#         df.loc[genome_id, "genus"] = "Other"

# Treemap

In [103]:
# NOTE: `go` is not used in this cell; the import is kept in case later cells rely on it.
import plotly.graph_objects as go

# Work on a copy so that df_filter_quality itself is not modified by the plotting code.
df = df_filter_quality.copy()

# Plot label for the genus level: keep "Streptomyces", collapse everything else into "Other".
# Vectorised on purpose: a row-by-row `.loc` loop is slow, fails with an ambiguous
# truth-value error when the index contains duplicate genome ids, and would not create
# the column at all for an empty table.
df["genus_plot"] = df["genus"].where(df["genus"] == "Streptomyces", "Other")

# Define your color scheme
color_dict = {
    'LQ': '#E69F00',  # Orange
    'MQ': '#56B4E9',  # Sky Blue
    'HQ': '#009E73',  # Bluish Green
}

# Nested rectangles: source -> genus (Streptomyces / Other) -> quality class.
treemap_fig = px.treemap(
    df,
    path=['source', 'genus_plot', 'quality'],
    color='quality',  # Use 'quality' to determine color
    hover_data=['source', 'genus_plot', 'quality'],
    color_discrete_map=color_dict,  # Use your color scheme
)

treemap_fig.update_traces(
    textposition='top left',
    textinfo='label+text+value',
    hovertemplate='<b>%{label} </b> <br> Count: %{value}',
    textfont=dict(size=12),  # Adjust the font size of the text displayed in the boxes
)

# Add title, set size, and adjust title position
treemap_fig.update_layout(
    title_text="Overview of starting dataset",
    title_x=0.5,  # Center the title
    title_y=0.95,  # Position the title right above the plot
    width=600,
    height=800,
)

# Make sure the output folder exists; write_image does not create missing directories.
figure_dir = Path("assets/figures/Figure_1")
figure_dir.mkdir(parents=True, exist_ok=True)

# Save as SVG
treemap_fig.write_image(str(figure_dir / "a.svg"))

# Save as PNG
treemap_fig.write_image(str(figure_dir / "a.png"))

treemap_fig.show()

In [102]:
# Non-null "genus" entries for each (source, genus_plot, quality) group used in the treemap.
df.groupby(['source', 'genus_plot', 'quality'])["genus"].count()

source                                                       genus_plot    quality
Jørgensen et. al. 2024                                       Other         HQ          133
                                                                           LQ            6
                                                                           MQ            5
                                                             Streptomyces  HQ          727
                                                                           LQ            4
                                                                           MQ           27
Jørgensen et. al. 2024  (sourced from NCBI on 30 June 2023)  Other         MQ            4
                                                             Streptomyces  MQ          117
NCBI                                                         Other         HQ           13
                                                                           LQ           60
       