# iTOL figures NBC_actino

In [1]:
# Import libraries
import os
from Bio import SeqIO
import pandas as pd
import networkx as nx
from shutil import copyfile
from Bio.Align.Applications import MuscleCommandline
from collections import OrderedDict
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import Phylo
import yaml
from pathlib import Path

In [None]:
# Parse the notebook settings from config.yaml in the working directory.
config_path = Path("config.yaml")
with config_path.open("r") as config_file:
    notebook_configuration = yaml.safe_load(config_file)
# Echo the parsed settings as the cell output.
notebook_configuration

In [3]:
# Resolve the processed-data directory of the "mq_strepto" project and load
# the tables and tree paths used by the rest of the notebook.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
project_name_1 = "mq_strepto"
processed_dir_1 = bgcflow_dir.joinpath("data", "processed", project_name_1)
tables_dir = processed_dir_1 / "tables"

# Every table below uses its first CSV column as the DataFrame index.
ncbi_meta_table = tables_dir / "df_ncbi_meta.csv"
df_ncbi_meta = pd.read_csv(ncbi_meta_table, index_col=0)

gtdb_meta_table = tables_dir / "df_gtdb_meta_curated.csv"
df_gtdb_meta = pd.read_csv(gtdb_meta_table, index_col=0)

seqfu_meta_table = tables_dir / "df_seqfu_stats.csv"
df_seqfu_meta = pd.read_csv(seqfu_meta_table, index_col=0)

mash_table = processed_dir_1 / "mash" / "df_mash.csv"
df_mash = pd.read_csv(mash_table, index_col=0)

filters_table = tables_dir / "df_filters.csv"
df_filter_quality = pd.read_csv(filters_table, index_col=0)

# low_memory=False makes pandas parse the file in one pass instead of chunks.
antismash_table = tables_dir / "df_antismash_7.0.0_summary.csv"
df_antismash = pd.read_csv(antismash_table, index_col=0, low_memory=False)

# Cluster assignments stored with the notebook assets (paths relative to the working directory).
df_mash_clusters = pd.read_csv("assets/tables/df_mash_clusters_main.csv", index_col=0)
df_silhouette_filtered = pd.read_csv("assets/tables/df_mash_clusters_main_reduced.csv", index_col=0)

# Tree files consumed by the three tree sections further down.
gtdbtk_tree_path = processed_dir_1 / "gtdbtk_tree" / "gtdbtk.bac120.rooted.itol.tree"
get_phylo_tree_path = processed_dir_1 / "getphylo" / "getphylo.rooted.tree"
automlst_tree_path = processed_dir_1 / "automlst_wrapper" / "final.newick"

In [None]:
# Stack the reduced subcluster tables of all groups into one DataFrame.
# The frames are read first and concatenated once, which avoids seeding the
# result with an empty DataFrame and re-copying it on every iteration.
subcluster_groups = ["P1", "P2", "P3", "P4", "P5", "P6", "P7"]
df_subclusters = pd.concat(
    [
        pd.read_csv("assets/tables/df_subclusters_" + group + "_reduced.csv", index_col=0)
        for group in subcluster_groups
    ]
)
# Echo the combined table as the cell output.
df_subclusters

In [5]:
# Save the combined subcluster table next to the per-group input tables.
df_subclusters.to_csv(path_or_buf="assets/tables/df_subclusters_reduced.csv")

# Get GTDBTk tree results

In [6]:
# Parse the GTDB-Tk tree file as Newick.
t = Phylo.read(gtdbtk_tree_path, format='newick')

In [7]:
# Leaf names in the order get_terminals() yields them, restricted to the
# genomes that are present in the index of df_seqfu_meta.
genome_tree_order = [
    leaf_name
    for leaf_name in (leaf.name for leaf in t.get_terminals())
    if leaf_name in df_seqfu_meta.index
]

In [8]:
# Output files for the GTDB-Tk tree annotations, all inside the iTOL folder.
# NOTE(review): nothing in this cell creates the folder; it presumably has to exist already.
itol_dir = processed_dir_1 / "iTOL"
phylo_genome_len_bar_path = itol_dir / 'df_gtdb_genome_len.csv'
phylo_colored_range_path = itol_dir / 'df_gtdb_colored_range.csv'

# Color strips
phylo_silhouette_path = itol_dir / 'df_gtdb_color_strip_silhouette.csv'
phylo_subclusters_path = itol_dir / 'df_gtdb_color_strip_subclusters.csv'
phylo_filters_path = itol_dir / 'df_gtdb_color_strip_filters.csv'

In [None]:
# Colour assigned to every "<quality>_<source>" combination.
filter_color_dict = {"HQ_NCBI": "#8B0000",
                    "MQ_NCBI": "#FA8072",
                    "HQ_NBC": "#00008B",
                    "MQ_NBC": "#ADD8E6"}

# One row per tree leaf, in tree order.
df_phylo_filters = pd.DataFrame(index=genome_tree_order)
filter_rows = df_filter_quality.loc[genome_tree_order]
df_phylo_filters['node_label'] = filter_rows['species'].tolist()

# Build the combined label for all genomes in one step instead of assigning
# cell by cell; this also creates both columns when the tree order is empty.
source_quality = (filter_rows["quality"] + "_" + filter_rows["source"]).tolist()
# A combination missing from filter_color_dict raises KeyError, as before.
df_phylo_filters['range_color'] = [filter_color_dict[label] for label in source_quality]
df_phylo_filters['range_label'] = source_quality

df_phylo_filters.to_csv(phylo_filters_path)
# Echo the table as the cell output.
df_phylo_filters

In [None]:
# Table of species label and the seqfu 'Total' value per leaf, in tree order.
df_phylo_genome_len = pd.DataFrame(
    {
        'node_label': df_filter_quality.loc[genome_tree_order, 'species'].tolist(),
        'genome_len': df_seqfu_meta.loc[genome_tree_order, 'Total'].tolist(),
    },
    index=genome_tree_order,
)
df_phylo_genome_len.to_csv(phylo_genome_len_bar_path)
# Echo the table as the cell output.
df_phylo_genome_len

In [None]:
# Cluster colour and cluster name per leaf, looked up in df_mash_clusters.
cluster_columns = df_mash_clusters.loc[genome_tree_order, ['Cluster_Color', 'Cluster']]
df_phylo_colored_range = pd.DataFrame(
    {
        'node_label': df_filter_quality.loc[genome_tree_order, 'species'].tolist(),
        'range_color': cluster_columns['Cluster_Color'].tolist(),
        'range_label': cluster_columns['Cluster'].tolist(),
    },
    index=genome_tree_order,
)

df_phylo_colored_range.to_csv(phylo_colored_range_path)
# Echo the table as the cell output.
df_phylo_colored_range

In [None]:
# Subcluster colour strip: one row per genome listed in df_subclusters.
subcluster_ids = df_subclusters.index
df_phylo_colored_subclusters = pd.DataFrame(index=subcluster_ids).assign(
    node_label=df_filter_quality.loc[subcluster_ids, 'species'].tolist(),
    # The two Series below are aligned to the frame on their index labels.
    range_color=df_subclusters['Cluster_Color'],
    range_label=df_subclusters['Cluster'],
)
df_phylo_colored_subclusters.to_csv(phylo_subclusters_path)
# Echo the table as the cell output.
df_phylo_colored_subclusters

In [None]:
# Silhouette colour strip: one row per genome listed in df_silhouette_filtered.
silhouette_ids = df_silhouette_filtered.index
df_phylo_colored_silhouette = pd.DataFrame(index=silhouette_ids).assign(
    node_label=df_filter_quality.loc[silhouette_ids, 'species'].tolist(),
    # NOTE(review): the colour is read from df_mash_clusters here, whereas the
    # getphylo and autoMLST cells read it from df_silhouette_filtered - confirm
    # which source is intended.
    range_color=df_mash_clusters.loc[silhouette_ids, 'Cluster_Color'].tolist(),
    range_label=df_silhouette_filtered['Cluster'].tolist(),
)

df_phylo_colored_silhouette.to_csv(phylo_silhouette_path)
# Echo the table as the cell output.
df_phylo_colored_silhouette

# Get getphylo tree results

In [14]:
# Parse the getphylo tree file as Newick.
t = Phylo.read(get_phylo_tree_path, format='newick')

In [15]:
# Leaf names in the order get_terminals() yields them, restricted to the
# genomes that are present in the index of df_seqfu_meta.
genome_tree_order = [
    leaf_name
    for leaf_name in (leaf.name for leaf in t.get_terminals())
    if leaf_name in df_seqfu_meta.index
]

In [16]:
# Output files for the getphylo tree annotations, all inside the iTOL folder.
# NOTE(review): nothing in this cell creates the folder; it presumably has to exist already.
itol_dir = processed_dir_1 / "iTOL"
phylo_genome_len_bar_path = itol_dir / 'df_getphylo_genome_len.csv'
phylo_colored_range_path = itol_dir / 'df_getphylo_colored_range.csv'
phylo_subclusters_path = itol_dir / 'df_getphylo_color_strip_subclusters.csv'
phylo_filters_path = itol_dir / 'df_getphylo_color_strip_filters.csv'
phylo_silhouette_path = itol_dir / 'df_getphylo_color_strip_silhouette.csv'

In [None]:
# Colour assigned to every "<quality>_<source>" combination.
filter_color_dict = {"HQ_NCBI": "#8B0000",
                    "MQ_NCBI": "#FA8072",
                    "HQ_NBC": "#00008B",
                    "MQ_NBC": "#ADD8E6"}

# One row per tree leaf, in tree order.
df_phylo_filters = pd.DataFrame(index=genome_tree_order)
filter_rows = df_filter_quality.loc[genome_tree_order]
df_phylo_filters['node_label'] = filter_rows['species'].tolist()

# Build the combined label for all genomes in one step instead of assigning
# cell by cell; this also creates both columns when the tree order is empty.
source_quality = (filter_rows["quality"] + "_" + filter_rows["source"]).tolist()
# A combination missing from filter_color_dict raises KeyError, as before.
df_phylo_filters['range_color'] = [filter_color_dict[label] for label in source_quality]
df_phylo_filters['range_label'] = source_quality

df_phylo_filters.to_csv(phylo_filters_path)
# Echo the table as the cell output.
df_phylo_filters

In [None]:
# Table of species label and the seqfu 'Total' value per leaf, in tree order.
df_phylo_genome_len = pd.DataFrame(
    {
        'node_label': df_filter_quality.loc[genome_tree_order, 'species'].tolist(),
        'genome_len': df_seqfu_meta.loc[genome_tree_order, 'Total'].tolist(),
    },
    index=genome_tree_order,
)
df_phylo_genome_len.to_csv(phylo_genome_len_bar_path)
# Echo the table as the cell output.
df_phylo_genome_len

In [None]:
# Cluster colour and cluster name per leaf, looked up in df_mash_clusters.
cluster_columns = df_mash_clusters.loc[genome_tree_order, ['Cluster_Color', 'Cluster']]
df_phylo_colored_range = pd.DataFrame(
    {
        'node_label': df_filter_quality.loc[genome_tree_order, 'species'].tolist(),
        'range_color': cluster_columns['Cluster_Color'].tolist(),
        'range_label': cluster_columns['Cluster'].tolist(),
    },
    index=genome_tree_order,
)

df_phylo_colored_range.to_csv(phylo_colored_range_path)
# Echo the table as the cell output.
df_phylo_colored_range

In [None]:
# Silhouette colour strip: one row per genome listed in df_silhouette_filtered,
# with colour and label both taken from that same table.
silhouette_ids = df_silhouette_filtered.index
df_phylo_colored_silhouette = pd.DataFrame(index=silhouette_ids).assign(
    node_label=df_filter_quality.loc[silhouette_ids, 'species'].tolist(),
    range_color=df_silhouette_filtered['Cluster_Color'].tolist(),
    range_label=df_silhouette_filtered['Cluster'].tolist(),
)

df_phylo_colored_silhouette.to_csv(phylo_silhouette_path)
# Echo the table as the cell output.
df_phylo_colored_silhouette

In [None]:
# Subcluster colour strip: one row per genome listed in df_subclusters.
subcluster_ids = df_subclusters.index
df_phylo_colored_subclusters = pd.DataFrame(index=subcluster_ids).assign(
    node_label=df_filter_quality.loc[subcluster_ids, 'species'].tolist(),
    # The two Series below are aligned to the frame on their index labels.
    range_color=df_subclusters['Cluster_Color'],
    range_label=df_subclusters['Cluster'],
)

df_phylo_colored_subclusters.to_csv(phylo_subclusters_path)
# Echo the table as the cell output.
df_phylo_colored_subclusters

# Get automlst tree results

In [22]:
# Rewrite the leaf labels of the autoMLST tree so that they equal the
# genome_id values listed in df_genomes_tree.csv, then save the result as
# final_corrected.newick.
import re

# Only the first line of the tree file is used.
with open(automlst_tree_path, "r") as f:
    data = f.readlines()

# Candidate leaf labels: the text before the first ":" of every
# comma-separated item, after dropping all "(" characters.
value_to_replace = [i.split(":")[0] for i in data[0].replace("(", "").split(",")]

# Maps a label found in the tree to the genome_id that replaces it.
new_dict = {}
df = pd.read_csv(processed_dir_1 / "automlst_wrapper/df_genomes_tree.csv")
genome_ids = list(df.genome_id)
for g in genome_ids:
    # A tree label belongs to g when it starts with the part of g before the first ".".
    prefix = g.split(".")[0]
    matched = [v for v in value_to_replace if v.startswith(prefix)]
    for v in matched:
        new_dict[v] = g
    if matched:
        # Rebuild the list instead of removing items while iterating over it,
        # so no label is skipped and a label is claimed by one genome only.
        value_to_replace = [v for v in value_to_replace if v not in matched]

data = data[0]
if new_dict:
    # Substitute all labels in a single pass, longest label first, so that
    # text inserted for one label is never rewritten by another and a label
    # that is a prefix of a longer label cannot corrupt it.
    label_pattern = re.compile(
        "|".join(re.escape(k) for k in sorted(new_dict, key=len, reverse=True))
    )
    data = label_pattern.sub(lambda m: new_dict[m.group(0)], data)

automlst_tree_corrected_path = processed_dir_1 / "automlst_wrapper/final_corrected.newick"
with open(automlst_tree_corrected_path, "w") as f:
    f.write(data)

In [23]:
# Parse the relabelled autoMLST tree and list its leaves that are present in
# the index of df_seqfu_meta, in the order get_terminals() yields them.
automlst_tree_corrected_path = processed_dir_1 / "automlst_wrapper/final_corrected.newick"
t = Phylo.read(automlst_tree_corrected_path, format='newick')
genome_tree_order = [
    leaf_name
    for leaf_name in (leaf.name for leaf in t.get_terminals())
    if leaf_name in df_seqfu_meta.index
]

In [24]:
# Output files for the autoMLST tree annotations, all inside the iTOL folder.
# NOTE(review): nothing in this cell creates the folder; it presumably has to exist already.
itol_dir = processed_dir_1 / "iTOL"
phylo_genome_len_bar_path = itol_dir / 'df_automlst_genome_len.csv'
phylo_colored_range_path = itol_dir / 'df_automlst_colored_range.csv'
phylo_subclusters_path = itol_dir / 'df_automlst_color_strip_subclusters.csv'
# NOTE(review): no visible cell in this section writes to phylo_filters_path - confirm whether a filters strip is intended.
phylo_filters_path = itol_dir / 'df_automlst_color_strip_filters.csv'
phylo_silhouette_path = itol_dir / 'df_automlst_color_strip_silhouette.csv'

In [None]:
# Table of species label and the seqfu 'Total' value per leaf, in tree order.
df_phylo_genome_len = pd.DataFrame(
    {
        'node_label': df_filter_quality.loc[genome_tree_order, 'species'].tolist(),
        'genome_len': df_seqfu_meta.loc[genome_tree_order, 'Total'].tolist(),
    },
    index=genome_tree_order,
)
df_phylo_genome_len.to_csv(phylo_genome_len_bar_path)
# Echo the table as the cell output.
df_phylo_genome_len

In [None]:
# Cluster colour and cluster name per leaf, looked up in df_mash_clusters.
cluster_columns = df_mash_clusters.loc[genome_tree_order, ['Cluster_Color', 'Cluster']]
df_phylo_colored_range = pd.DataFrame(
    {
        'node_label': df_filter_quality.loc[genome_tree_order, 'species'].tolist(),
        'range_color': cluster_columns['Cluster_Color'].tolist(),
        'range_label': cluster_columns['Cluster'].tolist(),
    },
    index=genome_tree_order,
)

df_phylo_colored_range.to_csv(phylo_colored_range_path)
# Echo the table as the cell output.
df_phylo_colored_range

In [None]:
# Subcluster colour strip: one row per genome listed in df_subclusters.
subcluster_ids = df_subclusters.index
df_phylo_colored_subclusters = pd.DataFrame(index=subcluster_ids).assign(
    node_label=df_filter_quality.loc[subcluster_ids, 'species'].tolist(),
    # The two Series below are aligned to the frame on their index labels.
    range_color=df_subclusters['Cluster_Color'],
    range_label=df_subclusters['Cluster'],
)

df_phylo_colored_subclusters.to_csv(phylo_subclusters_path)
# Echo the table as the cell output.
df_phylo_colored_subclusters

In [None]:
# Silhouette colour strip: one row per genome listed in df_silhouette_filtered,
# with colour and label both taken from that same table.
silhouette_ids = df_silhouette_filtered.index
df_phylo_colored_silhouette = pd.DataFrame(index=silhouette_ids).assign(
    node_label=df_filter_quality.loc[silhouette_ids, 'species'].tolist(),
    range_color=df_silhouette_filtered['Cluster_Color'].tolist(),
    range_label=df_silhouette_filtered['Cluster'].tolist(),
)

df_phylo_colored_silhouette.to_csv(phylo_silhouette_path)
# Echo the table as the cell output.
df_phylo_colored_silhouette

# Add RED values to iTOL

In [58]:
# Hex colour for each RED group that gets its own colour.
RED_colors = dict(
    Streptomyces_RG1="#3cb44b",
    Streptomyces_RG2="#ffe119",
    Streptomyces_RG12="#0082c8",
    Streptomyces_RG3="#46f0f0",
    Streptomyces_RG4="#f58231",
    Streptomyces_RG6="#e6194b",
)

In [70]:
# Annotate every genome in df_mash_clusters with the RED group of its species
# and a colour for that group, then save the table for iTOL.
df_RED = pd.read_csv("assets/tables/RED_groups.csv", index_col=0, sep=";")
df_RED_species = df_mash_clusters.copy()

# Map the second space-separated word of each "&"-separated entry to its RED
# group (presumably entries look like "Genus species" - confirm against
# RED_groups.csv). When a word occurs in several groups the last group wins,
# as it did with the previous row-by-row assignment.
species_to_RED = {}
for RED_group, species_field in df_RED["GTDB_species_list"].items():
    for species in species_field.split("&"):
        species = species.strip()
        if not species:
            # A stray "&" leaves an empty entry; there is nothing to map.
            continue
        species_to_RED[species.split(" ")[1]] = RED_group

# Genomes whose Species value is not in the mapping stay NaN at this point.
RED_per_genome = df_RED_species["Species"].map(species_to_RED).astype(object)
# Groups listed in RED_colors get their own colour, every other group gets grey.
RED_color_per_genome = RED_per_genome.map(
    lambda group: RED_colors.get(group, "#808080"), na_action="ignore"
).astype(object)

# Both columns are always created, so the export also works when no species
# matches; unmatched genomes get "NA" and white.
df_RED_species["RED"] = RED_per_genome.fillna("NA")
df_RED_species["RED_color"] = RED_color_per_genome.fillna("#ffffff")
df_RED_species.to_csv(processed_dir_1 / "iTOL" / 'df_RED_values.csv')