# iTOL figures NBC_actino

In [1]:
# Import libraries
import os
from Bio import SeqIO
import pandas as pd
import networkx as nx
from shutil import copyfile
from Bio.Align.Applications import MuscleCommandline
from collections import OrderedDict
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import Phylo
import yaml
from pathlib import Path

In [2]:
# Parse the notebook settings stored in config.yaml (located in the working
# directory) and display them.
notebook_configuration = yaml.safe_load(Path("config.yaml").read_text())
notebook_configuration

{'bgcflow_dir': '/datadrive/bgcflow'}

In [3]:
# Locate the processed BGCFlow outputs of the "mq_strepto" project
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
project_name_1 = "mq_strepto"
processed_dir_1 = bgcflow_dir / "data" / "processed" / project_name_1
tables_dir_1 = processed_dir_1 / "tables"

# Paths of the project tables produced by the workflow
ncbi_meta_table = tables_dir_1 / "df_ncbi_meta.csv"
gtdb_meta_table = tables_dir_1 / "df_gtdb_meta_curated.csv"
seqfu_meta_table = tables_dir_1 / "df_seqfu_stats.csv"
mash_table = processed_dir_1 / "mash" / "df_mash.csv"
filters_table = tables_dir_1 / "df_filters.csv"
antismash_table = tables_dir_1 / "df_antismash_7.0.0_summary.csv"

# Read every table, using its first column as the index
df_ncbi_meta = pd.read_csv(ncbi_meta_table, index_col=0)
df_gtdb_meta = pd.read_csv(gtdb_meta_table, index_col=0)
df_seqfu_meta = pd.read_csv(seqfu_meta_table, index_col=0)
df_mash = pd.read_csv(mash_table, index_col=0)
df_filter_quality = pd.read_csv(filters_table, index_col=0)
df_antismash = pd.read_csv(antismash_table, index_col=0, low_memory=False)

# Cluster assignments stored with the notebook assets
df_mash_clusters = pd.read_csv("assets/tables/df_mash_clusters_main.csv", index_col=0)
df_silhouette_filtered = pd.read_csv("assets/tables/df_mash_clusters_main_reduced.csv", index_col=0)

# Tree files that the sections below annotate
gtdbtk_tree_path = processed_dir_1 / "gtdbtk_tree" / "gtdbtk.bac120.rooted.itol.tree"
get_phylo_tree_path = processed_dir_1 / "getphylo" / "getphylo.rooted.tree"
automlst_tree_path = processed_dir_1 / "automlst_wrapper" / "final.newick"

In [4]:
# Combine the reduced subcluster tables of the groups P1-P7 into a single frame.
# The tables are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies all accumulated rows on every iteration, and seeding
# the result with an empty DataFrame relies on concat behaviour for empty
# entries that recent pandas versions deprecate.
subcluster_frames = []
for group in ["P1", "P2", "P3", "P4", "P5", "P6", "P7"]:
    df_subclusters_group = pd.read_csv("assets/tables/df_subclusters_" + group + "_reduced.csv", index_col=0)
    subcluster_frames.append(df_subclusters_group)
df_subclusters = pd.concat(subcluster_frames)
df_subclusters

Unnamed: 0_level_0,Cluster,Cluster_Color,Species
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GCF_004005495.1,1,#e6194b,sp004005495
NBC_00110,1,#e6194b,sp003403055
NBC_00649,1,#e6194b,sp.
NBC_01280,1,#e6194b,sp.
NBC_01397,1,#e6194b,sp003403055
...,...,...,...
GCF_023376055.1,3,#ffe119,fradiae
GCF_001750785.1,3,#ffe119,fradiae
GCF_024600715.1,3,#ffe119,kanasensis
GCF_000478605.2,3,#ffe119,thermolilacinus


In [5]:
# Store the combined subcluster table next to the per-group tables
subclusters_reduced_path = Path("assets/tables") / "df_subclusters_reduced.csv"
df_subclusters.to_csv(subclusters_reduced_path)

# Get GTDBTk tree results

In [6]:
# Parse the GTDB-Tk tree file (Newick format)
t = Phylo.read(gtdbtk_tree_path, format="newick")

In [7]:
# Leaf names in tree order, restricted to genomes present in the seqfu statistics table
leaf_names = [leaf.name for leaf in t.get_terminals()]
genome_tree_order = [name for name in leaf_names if name in df_seqfu_meta.index]

In [8]:
# Output locations of the iTOL annotation tables for the GTDB-Tk tree.
itol_dir = processed_dir_1 / "iTOL"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before the following cells write into it.
itol_dir.mkdir(parents=True, exist_ok=True)

phylo_genome_len_bar_path = itol_dir / 'df_gtdb_genome_len.csv'
phylo_colored_range_path = itol_dir / 'df_gtdb_colored_range.csv'

# Color strips
phylo_silhouette_path = itol_dir / 'df_gtdb_color_strip_silhouette.csv'
phylo_subclusters_path = itol_dir / 'df_gtdb_color_strip_subclusters.csv'
phylo_filters_path = itol_dir / 'df_gtdb_color_strip_filters.csv'

In [9]:
# Colour of each "<quality>_<source>" combination found in df_filter_quality
filter_color_dict = {"HQ_NCBI": "#8B0000",
                    "MQ_NCBI": "#FA8072",
                    "HQ_NBC": "#00008B",
                    "MQ_NBC": "#ADD8E6"}

# Filter rows of the tree leaves, in tree order
df_filters_in_tree = df_filter_quality.loc[genome_tree_order]

# Label of every leaf, e.g. "HQ_NCBI"
source_quality = (df_filters_in_tree["quality"] + "_" + df_filters_in_tree["source"]).tolist()

# Build the whole table at once instead of assigning cell by cell. The plain
# dict lookup still raises KeyError for a combination without a colour, and all
# three columns exist even when the tree contributes no genomes.
df_phylo_filters = pd.DataFrame(
    {
        "node_label": df_filters_in_tree["species"].tolist(),
        "range_color": [filter_color_dict[label] for label in source_quality],
        "range_label": source_quality,
    },
    index=genome_tree_order,
)

df_phylo_filters.to_csv(phylo_filters_path)
df_phylo_filters

Unnamed: 0,node_label,range_color,range_label
GCF_024436035.1,sp024436035,#FA8072,MQ_NCBI
GCF_000237305.1,cattleya,#8B0000,HQ_NCBI
GCF_000240165.1,cattleya,#8B0000,HQ_NCBI
GCF_009862885.1,cattleya,#FA8072,MQ_NCBI
GCF_003144095.1,sp003144095,#FA8072,MQ_NCBI
...,...,...,...
GCF_007896905.1,albidoflavus,#FA8072,MQ_NCBI
GCF_001704195.1,albidoflavus,#8B0000,HQ_NCBI
GCF_001865315.1,albidoflavus,#FA8072,MQ_NCBI
GCF_009846825.1,albidoflavus,#FA8072,MQ_NCBI


In [10]:
# Bar-chart table: species label and seqfu "Total" value of every leaf, in tree order
df_phylo_genome_len = pd.DataFrame(
    {
        "node_label": df_filter_quality.loc[genome_tree_order, "species"].tolist(),
        "genome_len": df_seqfu_meta.loc[genome_tree_order, "Total"].tolist(),
    },
    index=genome_tree_order,
)
df_phylo_genome_len.to_csv(phylo_genome_len_bar_path)
df_phylo_genome_len

Unnamed: 0,node_label,genome_len
GCF_024436035.1,sp024436035,7069061
GCF_000237305.1,cattleya,8092553
GCF_000240165.1,cattleya,8095515
GCF_009862885.1,cattleya,6283062
GCF_003144095.1,sp003144095,6326228
...,...,...
GCF_007896905.1,albidoflavus,7446125
GCF_001704195.1,albidoflavus,7070328
GCF_001865315.1,albidoflavus,7261502
GCF_009846825.1,albidoflavus,7235067


In [11]:
# Coloured-range table: species label plus the cluster id and colour of every leaf
species_in_tree = df_filter_quality.loc[genome_tree_order, "species"]
clusters_in_tree = df_mash_clusters.loc[genome_tree_order, ["Cluster_Color", "Cluster"]]

df_phylo_colored_range = pd.DataFrame(
    {
        "node_label": species_in_tree.tolist(),
        "range_color": clusters_in_tree["Cluster_Color"].tolist(),
        "range_label": clusters_in_tree["Cluster"].tolist(),
    },
    index=genome_tree_order,
)

df_phylo_colored_range.to_csv(phylo_colored_range_path)
df_phylo_colored_range

Unnamed: 0,node_label,range_color,range_label
GCF_024436035.1,sp024436035,#f58231,5
GCF_000237305.1,cattleya,#f58231,5
GCF_000240165.1,cattleya,#f58231,5
GCF_009862885.1,cattleya,#f58231,5
GCF_003144095.1,sp003144095,#f58231,5
...,...,...,...
GCF_007896905.1,albidoflavus,#0082c8,4
GCF_001704195.1,albidoflavus,#0082c8,4
GCF_001865315.1,albidoflavus,#0082c8,4
GCF_009846825.1,albidoflavus,#0082c8,4


In [12]:
# Colour-strip table for the subclusters, one row per genome of df_subclusters
subcluster_genomes = df_subclusters.index
df_phylo_colored_subclusters = pd.DataFrame(index=subcluster_genomes).assign(
    node_label=df_filter_quality.loc[subcluster_genomes, "species"].tolist(),
    range_color=df_subclusters["Cluster_Color"],
    range_label=df_subclusters["Cluster"],
)
df_phylo_colored_subclusters.to_csv(phylo_subclusters_path)
df_phylo_colored_subclusters

Unnamed: 0_level_0,node_label,range_color,range_label
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GCF_004005495.1,sp004005495,#e6194b,1
NBC_00110,sp003403055,#e6194b,1
NBC_00649,sp.,#e6194b,1
NBC_01280,sp.,#e6194b,1
NBC_01397,sp003403055,#e6194b,1
...,...,...,...
GCF_023376055.1,fradiae,#ffe119,3
GCF_001750785.1,fradiae,#ffe119,3
GCF_024600715.1,kanasensis,#ffe119,3
GCF_000478605.2,thermolilacinus,#ffe119,3


In [13]:
# Colour-strip table for the genomes of df_silhouette_filtered.
# Colour and label are both taken from df_silhouette_filtered, as in the
# corresponding getphylo and autoMLST cells, so the two columns always come from
# the same row and the three trees receive the same annotation.
silhouette_genomes = df_silhouette_filtered.index
df_phylo_colored_silhouette = pd.DataFrame(
    {
        "node_label": df_filter_quality.loc[silhouette_genomes, "species"].tolist(),
        "range_color": df_silhouette_filtered["Cluster_Color"].tolist(),
        "range_label": df_silhouette_filtered["Cluster"].tolist(),
    },
    index=silhouette_genomes,
)

df_phylo_colored_silhouette.to_csv(phylo_silhouette_path)
df_phylo_colored_silhouette

Unnamed: 0_level_0,node_label,range_color,range_label
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GCF_020092725.1,olivaceus,#3cb44b,2
GCF_020092675.1,olivaceus,#3cb44b,2
GCF_020092695.1,olivaceus,#3cb44b,2
GCF_020092565.1,olivaceus,#3cb44b,2
GCF_020092625.1,olivaceus,#3cb44b,2
...,...,...,...
GCF_003935055.1,alboflavus,#e6194b,1
GCF_014650815.1,flavofungini,#e6194b,1
GCF_016411765.1,flavofungini,#e6194b,1
GCF_001482415.1,typhae,#e6194b,1


# Get getphylo tree results

In [14]:
# Parse the getphylo tree file (Newick format); this replaces the tree held in `t`
t = Phylo.read(get_phylo_tree_path, format="newick")

In [15]:
# Keep the tree's leaf order, dropping leaves without an entry in the seqfu statistics table
seqfu_genomes = df_seqfu_meta.index
genome_tree_order = [
    leaf.name
    for leaf in t.get_terminals()
    if leaf.name in seqfu_genomes
]

In [16]:
# Output locations of the iTOL annotation tables for the getphylo tree
getphylo_itol_dir = processed_dir_1 / "iTOL"
phylo_genome_len_bar_path = getphylo_itol_dir / "df_getphylo_genome_len.csv"
phylo_colored_range_path = getphylo_itol_dir / "df_getphylo_colored_range.csv"
phylo_subclusters_path = getphylo_itol_dir / "df_getphylo_color_strip_subclusters.csv"
phylo_filters_path = getphylo_itol_dir / "df_getphylo_color_strip_filters.csv"
phylo_silhouette_path = getphylo_itol_dir / "df_getphylo_color_strip_silhouette.csv"

In [17]:
# Colour of each "<quality>_<source>" combination found in df_filter_quality
filter_color_dict = {"HQ_NCBI": "#8B0000",
                    "MQ_NCBI": "#FA8072",
                    "HQ_NBC": "#00008B",
                    "MQ_NBC": "#ADD8E6"}

# Species, quality and source of the getphylo tree leaves, in tree order
leaf_filters = df_filter_quality.loc[genome_tree_order, ["species", "quality", "source"]]

# Combined label per leaf, e.g. "MQ_NCBI"
source_quality = (leaf_filters["quality"] + "_" + leaf_filters["source"]).tolist()

# One construction replaces the per-genome .loc assignments. An unknown
# combination still raises KeyError through the dict lookup, and the colour and
# label columns are present even for an empty leaf list.
df_phylo_filters = pd.DataFrame(
    {
        "node_label": leaf_filters["species"].tolist(),
        "range_color": [filter_color_dict[label] for label in source_quality],
        "range_label": source_quality,
    },
    index=genome_tree_order,
)

df_phylo_filters.to_csv(phylo_filters_path)
df_phylo_filters

Unnamed: 0,node_label,range_color,range_label
GCF_003144095.1,sp003144095,#FA8072,MQ_NCBI
GCF_019890615.1,parmotrematis,#FA8072,MQ_NCBI
GCF_029269905.1,sp.,#FA8072,MQ_NCBI
GCF_024436035.1,sp024436035,#FA8072,MQ_NCBI
GCF_009862885.1,cattleya,#FA8072,MQ_NCBI
...,...,...,...
GCF_008124985.1,anthocyanicus,#FA8072,MQ_NCBI
GCF_008124905.1,anthocyanicus,#FA8072,MQ_NCBI
GCF_009862445.1,anthocyanicus,#FA8072,MQ_NCBI
GCF_008125035.1,anthocyanicus,#FA8072,MQ_NCBI


In [18]:
# Bar-chart table for the getphylo tree: species label and seqfu "Total" value per leaf
df_phylo_genome_len = pd.DataFrame(index=genome_tree_order).assign(
    node_label=df_filter_quality.loc[genome_tree_order, "species"].tolist(),
    genome_len=df_seqfu_meta.loc[genome_tree_order, "Total"].tolist(),
)
df_phylo_genome_len.to_csv(phylo_genome_len_bar_path)
df_phylo_genome_len

Unnamed: 0,node_label,genome_len
GCF_003144095.1,sp003144095,6326228
GCF_019890615.1,parmotrematis,7401816
GCF_029269905.1,sp.,7881186
GCF_024436035.1,sp024436035,7069061
GCF_009862885.1,cattleya,6283062
...,...,...
GCF_008124985.1,anthocyanicus,8584323
GCF_008124905.1,anthocyanicus,8585353
GCF_009862445.1,anthocyanicus,8667507
GCF_008125035.1,anthocyanicus,8584931


In [19]:
# Coloured-range table for the getphylo tree, rows in leaf order
leaf_species = df_filter_quality.loc[genome_tree_order, "species"].tolist()
leaf_cluster_colors = df_mash_clusters.loc[genome_tree_order, "Cluster_Color"].tolist()
leaf_cluster_ids = df_mash_clusters.loc[genome_tree_order, "Cluster"].tolist()

df_phylo_colored_range = pd.DataFrame(index=genome_tree_order).assign(
    node_label=leaf_species,
    range_color=leaf_cluster_colors,
    range_label=leaf_cluster_ids,
)

df_phylo_colored_range.to_csv(phylo_colored_range_path)
df_phylo_colored_range

Unnamed: 0,node_label,range_color,range_label
GCF_003144095.1,sp003144095,#f58231,5
GCF_019890615.1,parmotrematis,#f58231,5
GCF_029269905.1,sp.,#f58231,5
GCF_024436035.1,sp024436035,#f58231,5
GCF_009862885.1,cattleya,#f58231,5
...,...,...,...
GCF_008124985.1,anthocyanicus,#3cb44b,2
GCF_008124905.1,anthocyanicus,#3cb44b,2
GCF_009862445.1,anthocyanicus,#3cb44b,2
GCF_008125035.1,anthocyanicus,#3cb44b,2


In [20]:
# Colour-strip table for the genomes of df_silhouette_filtered (getphylo output file)
silhouette_ids = df_silhouette_filtered.index
df_phylo_colored_silhouette = pd.DataFrame(index=silhouette_ids).assign(
    node_label=df_filter_quality.loc[silhouette_ids, "species"].tolist(),
    range_color=df_silhouette_filtered["Cluster_Color"].tolist(),
    range_label=df_silhouette_filtered["Cluster"].tolist(),
)

df_phylo_colored_silhouette.to_csv(phylo_silhouette_path)
df_phylo_colored_silhouette

Unnamed: 0_level_0,node_label,range_color,range_label
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GCF_020092725.1,olivaceus,#3cb44b,2
GCF_020092675.1,olivaceus,#3cb44b,2
GCF_020092695.1,olivaceus,#3cb44b,2
GCF_020092565.1,olivaceus,#3cb44b,2
GCF_020092625.1,olivaceus,#3cb44b,2
...,...,...,...
GCF_003935055.1,alboflavus,#e6194b,1
GCF_014650815.1,flavofungini,#e6194b,1
GCF_016411765.1,flavofungini,#e6194b,1
GCF_001482415.1,typhae,#e6194b,1


In [21]:
# Colour-strip table for the subclusters (getphylo output file)
subcluster_species = df_filter_quality.loc[df_subclusters.index, "species"].tolist()

df_phylo_colored_subclusters = pd.DataFrame(index=df_subclusters.index)
df_phylo_colored_subclusters = df_phylo_colored_subclusters.assign(
    node_label=subcluster_species,
    range_color=df_subclusters["Cluster_Color"],
    range_label=df_subclusters["Cluster"],
)

df_phylo_colored_subclusters.to_csv(phylo_subclusters_path)
df_phylo_colored_subclusters

Unnamed: 0_level_0,node_label,range_color,range_label
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GCF_004005495.1,sp004005495,#e6194b,1
NBC_00110,sp003403055,#e6194b,1
NBC_00649,sp.,#e6194b,1
NBC_01280,sp.,#e6194b,1
NBC_01397,sp003403055,#e6194b,1
...,...,...,...
GCF_023376055.1,fradiae,#ffe119,3
GCF_001750785.1,fradiae,#ffe119,3
GCF_024600715.1,kanasensis,#ffe119,3
GCF_000478605.2,thermolilacinus,#ffe119,3


# Get autoMLST tree results

In [22]:
# Disabled (commented-out) cell. The code below reads the first line of the tree at
# `automlst_tree_path`, replaces every leaf token that starts with the accession part
# of a genome id from df_genomes_tree.csv by that full genome id, and writes the result
# to automlst_wrapper/final_corrected.newick - the file the next cell reads.
# NOTE(review): the loop removes items from `value_to_replace` while iterating over it,
# which makes Python skip the element that follows each removal - check before re-enabling.

# with open(automlst_tree_path, "r") as f:
#     data = f.readlines()

# value_to_replace = [i.split(":")[0] for i in data[0].replace("(", "").split(",")]

# new_dict = {}
# df = pd.read_csv(processed_dir_1 / "automlst_wrapper/df_genomes_tree.csv")
# genome_ids = list(df.genome_id)
# for g in genome_ids:
#     for v in value_to_replace:
#         if v.startswith(g.split(".")[0]):
#             new_dict[v] = g
#             value_to_replace.remove(v)

# data = data[0]
# for k in new_dict.keys():
#     data = data.replace(k, new_dict[k])

# automlst_tree_corrected_path = processed_dir_1 / "automlst_wrapper/final_corrected.newick"
# with open(automlst_tree_corrected_path, "w") as f:
#     f.write(data)

In [23]:
# Parse the corrected autoMLST Newick file and list its leaves, in tree order,
# that have an entry in the seqfu statistics table
automlst_tree_corrected_path = processed_dir_1 / "automlst_wrapper/final_corrected.newick"
t = Phylo.read(automlst_tree_corrected_path, format="newick")
automlst_leaf_names = [leaf.name for leaf in t.get_terminals()]
genome_tree_order = [name for name in automlst_leaf_names if name in df_seqfu_meta.index]

In [24]:
# Output locations of the iTOL annotation tables for the autoMLST tree
automlst_itol_dir = processed_dir_1 / "iTOL"
phylo_genome_len_bar_path = automlst_itol_dir / "df_automlst_genome_len.csv"
phylo_colored_range_path = automlst_itol_dir / "df_automlst_colored_range.csv"
phylo_subclusters_path = automlst_itol_dir / "df_automlst_color_strip_subclusters.csv"
# NOTE(review): no cell in this section appears to write to phylo_filters_path - confirm
# whether a filter colour strip for the autoMLST tree is still needed.
phylo_filters_path = automlst_itol_dir / "df_automlst_color_strip_filters.csv"
phylo_silhouette_path = automlst_itol_dir / "df_automlst_color_strip_silhouette.csv"

In [25]:
# Bar-chart table for the autoMLST tree: species label and seqfu "Total" value per leaf
species_per_leaf = df_filter_quality.loc[genome_tree_order, "species"]
total_per_leaf = df_seqfu_meta.loc[genome_tree_order, "Total"]

df_phylo_genome_len = pd.DataFrame(
    {"node_label": species_per_leaf.tolist(), "genome_len": total_per_leaf.tolist()},
    index=genome_tree_order,
)
df_phylo_genome_len.to_csv(phylo_genome_len_bar_path)
df_phylo_genome_len

Unnamed: 0,node_label,genome_len
GCF_016598575.1,sp016598575,5282528
GCF_028752555.1,clavuligerus,9161304
GCF_001693675.1,clavuligerus,7590758
GCF_015767815.1,clavuligerus,7790985
GCF_000148465.1,clavuligerus,9143376
...,...,...
GCF_014203495.1,griseoloalbus,7925849
GCF_000718785.1,sp000718785,7915558
GCF_001905725.1,sp001905725,8931064
GCF_018069565.1,sp018069565,7150798


In [26]:
# Coloured-range table for the autoMLST tree, rows in leaf order
leaf_clusters = df_mash_clusters.loc[genome_tree_order]

df_phylo_colored_range = pd.DataFrame(
    {
        "node_label": df_filter_quality.loc[genome_tree_order, "species"].tolist(),
        "range_color": leaf_clusters["Cluster_Color"].tolist(),
        "range_label": leaf_clusters["Cluster"].tolist(),
    },
    index=genome_tree_order,
)

df_phylo_colored_range.to_csv(phylo_colored_range_path)
df_phylo_colored_range

Unnamed: 0,node_label,range_color,range_label
GCF_016598575.1,sp016598575,#3cb44b,2
GCF_028752555.1,clavuligerus,#f58231,5
GCF_001693675.1,clavuligerus,#f58231,5
GCF_015767815.1,clavuligerus,#f58231,5
GCF_000148465.1,clavuligerus,#f58231,5
...,...,...,...
GCF_014203495.1,griseoloalbus,#3cb44b,2
GCF_000718785.1,sp000718785,#3cb44b,2
GCF_001905725.1,sp001905725,#3cb44b,2
GCF_018069565.1,sp018069565,#3cb44b,2


In [27]:
# Colour-strip table for the subclusters (autoMLST output file)
genomes_with_subcluster = df_subclusters.index
df_phylo_colored_subclusters = pd.DataFrame(index=genomes_with_subcluster)

# Species labels are looked up per genome; colour and id are aligned on the shared index
df_phylo_colored_subclusters["node_label"] = df_filter_quality.loc[genomes_with_subcluster, "species"].tolist()
df_phylo_colored_subclusters["range_color"] = df_subclusters.loc[:, "Cluster_Color"]
df_phylo_colored_subclusters["range_label"] = df_subclusters.loc[:, "Cluster"]

df_phylo_colored_subclusters.to_csv(phylo_subclusters_path)
df_phylo_colored_subclusters

Unnamed: 0_level_0,node_label,range_color,range_label
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GCF_004005495.1,sp004005495,#e6194b,1
NBC_00110,sp003403055,#e6194b,1
NBC_00649,sp.,#e6194b,1
NBC_01280,sp.,#e6194b,1
NBC_01397,sp003403055,#e6194b,1
...,...,...,...
GCF_023376055.1,fradiae,#ffe119,3
GCF_001750785.1,fradiae,#ffe119,3
GCF_024600715.1,kanasensis,#ffe119,3
GCF_000478605.2,thermolilacinus,#ffe119,3


In [28]:
# Colour-strip table for the genomes of df_silhouette_filtered (autoMLST output file)
filtered_genomes = df_silhouette_filtered.index

df_phylo_colored_silhouette = pd.DataFrame(
    {
        "node_label": df_filter_quality.loc[filtered_genomes, "species"].tolist(),
        "range_color": df_silhouette_filtered["Cluster_Color"].tolist(),
        "range_label": df_silhouette_filtered["Cluster"].tolist(),
    },
    index=filtered_genomes,
)

df_phylo_colored_silhouette.to_csv(phylo_silhouette_path)
df_phylo_colored_silhouette

Unnamed: 0_level_0,node_label,range_color,range_label
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GCF_020092725.1,olivaceus,#3cb44b,2
GCF_020092675.1,olivaceus,#3cb44b,2
GCF_020092695.1,olivaceus,#3cb44b,2
GCF_020092565.1,olivaceus,#3cb44b,2
GCF_020092625.1,olivaceus,#3cb44b,2
...,...,...,...
GCF_003935055.1,alboflavus,#e6194b,1
GCF_014650815.1,flavofungini,#e6194b,1
GCF_016411765.1,flavofungini,#e6194b,1
GCF_001482415.1,typhae,#e6194b,1


# Add RED values to iTOL

In [57]:
# Number of rows of df_RED_species per RED group.
# NOTE(review): df_RED_species is only assigned in a later cell (In [70]), so running the
# notebook top to bottom on a fresh kernel raises NameError here; this cell belongs after
# the cell that builds the table.
df_RED_species.RED.value_counts()

RED
NA                    1324
Streptomyces_RG1       436
Streptomyces_RG2       252
Streptomyces_RG12      116
Streptomyces_RG3        65
Streptomyces_RG4        54
Streptomyces_RG6        18
Streptomyces_RG10       15
Streptomyces_RG11       15
Streptomyces_RG5        13
Streptomyces_RG8        12
Streptomyces_RG9        12
Streptomyces_RG7         9
Streptomyces_B_RG2       7
Streptomyces_B_RG1       5
Streptomyces_RG16        5
Streptomyces_RG15        4
Streptomyces_RG14        3
Streptomyces_C_RG1       3
Streptomyces_C_RG2       1
Streptomyces_RG17        1
Streptomyces_RG21        1
Name: count, dtype: int64

In [58]:
# Colour assigned to each of the RED groups listed here, keyed by group name
RED_colors = dict(
    Streptomyces_RG1="#3cb44b",
    Streptomyces_RG2="#ffe119",
    Streptomyces_RG12="#0082c8",
    Streptomyces_RG3="#46f0f0",
    Streptomyces_RG4="#f58231",
    Streptomyces_RG6="#e6194b",
)

In [70]:
# Annotate every genome of df_mash_clusters with the RED group of its species and a
# colour for that group, then write the table to the iTOL folder.
df_RED = pd.read_csv("assets/tables/RED_groups.csv", index_col=0, sep=";")

# Map each species epithet (second word of an "&"-separated entry in
# GTDB_species_list) to its RED group. Groups are visited in table order, so a
# species listed under several groups keeps the last one, as before.
species_to_red = {}
for red_group, species_field in df_RED["GTDB_species_list"].items():
    if not isinstance(species_field, str):
        # Missing species list: nothing to assign for this group
        continue
    for species in species_field.split("&"):
        name_parts = species.split()
        if len(name_parts) < 2:
            # Empty fragment (e.g. trailing "&") or a name without an epithet;
            # indexing [1] on such an entry would raise IndexError
            continue
        species_to_red[name_parts[1]] = red_group

df_RED_species = df_mash_clusters.copy()
species_ids = df_RED_species["Species"].tolist()

# Unmatched species get "NA" / white; matched groups without their own entry in
# RED_colors get grey. Assigning whole columns also guarantees that both columns
# exist when no species matches at all.
df_RED_species["RED"] = [species_to_red.get(species_id, "NA") for species_id in species_ids]
df_RED_species["RED_color"] = [
    RED_colors.get(species_to_red[species_id], "#808080") if species_id in species_to_red else "#ffffff"
    for species_id in species_ids
]
df_RED_species.to_csv(processed_dir_1 / "iTOL" / 'df_RED_values.csv')

In [67]:
# Display the annotated table.
# NOTE(review): the stored output of this cell comes from an earlier execution (In [67])
# than the cell that builds the table (In [70]); it still shows empty RED / RED_color
# values where that cell fills in "NA" / "#ffffff". Re-run to refresh the output.
df_RED_species

Unnamed: 0_level_0,Cluster,Cluster_Color,Species,RED,RED_color
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NBC_00510,1,#e6194b,sp.,,
NBC_00165,1,#e6194b,glauciniger,Streptomyces_B_RG2,#808080
NBC_01746,1,#e6194b,glauciniger,Streptomyces_B_RG2,#808080
GCF_000373665.1,1,#e6194b,glauciniger,Streptomyces_B_RG2,#808080
GCF_009864915.1,1,#e6194b,glauciniger,Streptomyces_B_RG2,#808080
...,...,...,...,...,...
GCF_001905665.1,7,#46f0f0,sp001905665,Streptomyces_RG3,#46f0f0
GCF_019733325.1,7,#46f0f0,sp001905665,Streptomyces_RG3,#46f0f0
NBC_00077,7,#46f0f0,exfoliatus,Streptomyces_RG3,#46f0f0
GCF_001426405.1,7,#46f0f0,sp001426405,Streptomyces_RG3,#46f0f0
