# Overview map of GCFs in M6_hq phylogroup

In [1]:
import os
import pandas as pd
import yaml
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Parse the notebook settings from config.yaml in the working directory.
# safe_load is used, so only plain YAML types are constructed.
notebook_configuration = yaml.safe_load(Path("config.yaml").read_text())
# Bare expression: shows the parsed configuration as the cell output.
notebook_configuration

In [29]:
# Load the genome-, BGC- and GCF-level tables of the mq_strepto project from
# the bgcflow processed directory (nothing is written in this cell).
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
project_name_1 = "mq_strepto"
processed_dir_1 = bgcflow_dir / "data" / "processed" / project_name_1

# Read output tables from the processed directory
ncbi_meta_table = processed_dir_1 / "tables" / "df_ncbi_meta.csv"
df_ncbi_meta = pd.read_csv(ncbi_meta_table, index_col=0)

gtdb_meta_table = processed_dir_1 / "tables" / "df_gtdb_meta_curated.csv"
df_gtdb_meta = pd.read_csv(gtdb_meta_table, index_col=0)

seqfu_meta_table = processed_dir_1 / "tables" / "df_seqfu_stats.csv"
df_seqfu_meta = pd.read_csv(seqfu_meta_table, index_col=0)

mash_table = processed_dir_1 / "mash" / "df_mash.csv"
df_mash = pd.read_csv(mash_table, index_col=0)

# Cluster assignments are read from the notebook's own assets, not from bgcflow.
df_clusters = pd.read_csv("assets/tables/df_clusters.csv", index_col=0)

filters_table = processed_dir_1 / "tables" / "df_filters.csv"
df_filter_quality = pd.read_csv(filters_table, index_col=0)

antismash_summary_table = processed_dir_1 / "tables" / "df_antismash_7.0.0_summary.csv"
df_antismash_summary = pd.read_csv(antismash_summary_table, index_col=0, low_memory=False)

bgcs_summary_table = processed_dir_1 / "tables" / "df_regions_antismash_7.0.0.csv"
df_bgcs_antismash = pd.read_csv(bgcs_summary_table, index_col=0)

arts_table = processed_dir_1 / "tables" / "df_arts_as-7.0.0.csv"
df_arts = pd.read_csv(arts_table, index_col=0)

# Replace missing labels with the literal string "NA". Each column is filled
# from itself: previously Cluster_label was filled from Subcluster_label, which
# silently overwrote every cluster label with the subcluster label.
# NOTE(review): any filter that compared Cluster_label against a
# subcluster-style value was relying on that overwrite and should read
# Subcluster_label instead — confirm against the label values in df_clusters.csv.
df_clusters["Subcluster_label"] = df_clusters.Subcluster_label.fillna("NA")
df_clusters["Cluster_label"] = df_clusters.Cluster_label.fillna("NA")

# Read bigslice results
bigslice_dir_1 = processed_dir_1 / "bigslice" / "cluster_as_7.0.0"
df_gcf_presence = pd.read_csv(bigslice_dir_1 / "df_gcf_presence_combined.csv", index_col=0)
df_gcfs_bigslice = pd.read_csv(bigslice_dir_1 / "df_gcfs_combined.csv", index_col=0)
df_bgcs_bigslice = pd.read_csv(bigslice_dir_1 / "df_bgcs_curated_with_combined.csv", index_col=0)

In [31]:
# Locate the processed outputs of the M6_hq project.
project_name_2 = "M6_hq"
processed_dir_2 = bgcflow_dir / "data" / "processed" / project_name_2
tables_dir_2 = processed_dir_2 / "tables"
cytoscape_dir_2 = processed_dir_2 / "bigscape" / "for_cytoscape_antismash_7.0.0"

# antiSMASH tables
antismash_summary_table_2 = tables_dir_2 / "df_antismash_7.0.0_summary.csv"
bgcs_summary_table_2 = tables_dir_2 / "df_regions_antismash_7.0.0.csv"

# BiG-SCAPE tables of one specific run (timestamp is part of the file name)
bigscape_bgcs_summary_table_2 = cytoscape_dir_2 / "2023-09-19 13_56_57_df_clusters_0.30.csv"
bigscape_net_table_2 = cytoscape_dir_2 / "2023-09-19 13_56_57_df_network_0.30.csv"
bigscape_gcfs_summary_table_2 = cytoscape_dir_2 / "2023-09-19 13_56_57_df_families_0.30.csv"
bigscape_gcfs_presence_table_2 = cytoscape_dir_2 / "2023-09-19 13_56_57_df_family_presence_0.30.csv"

# Every table uses its first column as the index.
df_antismash_summary_phylo = pd.read_csv(antismash_summary_table_2, index_col=0, low_memory=False)
df_bgcs_antismash_phylo = pd.read_csv(bgcs_summary_table_2, index_col=0)
df_bgcs_bigscape_phylo = pd.read_csv(bigscape_bgcs_summary_table_2, index_col=0)
df_bigscape_net_phylo = pd.read_csv(bigscape_net_table_2, index_col=0)
df_gcfs_bigscape_phylo = pd.read_csv(bigscape_gcfs_summary_table_2, index_col=0)
df_gcfs_presence_phylo = pd.read_csv(bigscape_gcfs_presence_table_2, index_col=0)

In [32]:
# Restrict the BiGSLICE tables (loaded for mq_strepto) to the M6_hq genomes and BGCs.
phylo_genome_index = df_antismash_summary_phylo.index
presence_in_phylo = df_gcf_presence.loc[phylo_genome_index]

# Keep only GCF columns whose column sum over these genomes is positive.
gcf_is_present = presence_in_phylo.sum(axis=0) > 0
df_gcfs_presence_bigslice_phylo = presence_in_phylo.loc[:, gcf_is_present]

# GCF rows for the retained columns, and BGC rows listed in the M6_hq regions table.
df_gcfs_bigslice_phylo = df_gcfs_bigslice.loc[df_gcfs_presence_bigslice_phylo.columns]
df_bgcs_bigslice_phylo = df_bgcs_bigslice.loc[df_bgcs_antismash_phylo.index]

In [33]:
# Rows of the filter/quality table for the genomes indexed in the M6_hq antiSMASH summary.
phylo_genome_index = df_antismash_summary_phylo.index
df_genomes_phylo = df_filter_quality.loc[phylo_genome_index]

In [34]:
# Cluster assignments of the genomes whose `quality` value is "HQ".
is_hq = df_filter_quality["quality"] == "HQ"
hq_genome_ids = df_filter_quality.index[is_hq]
df_clusters_hq = df_clusters.loc[hq_genome_ids]

In [35]:
# Select the HQ genomes labelled "P2_5". The label is matched against
# Subcluster_label, the column the values actually come from, so the selection
# does not depend on Cluster_label mirroring it.
df_genomes = df_clusters_hq[df_clusters_hq.Subcluster_label == "P2_5"]

## Create HQ project
# NOTE(review): samples.csv is read and then overwritten in place with the
# selected subset, and the path is hard-coded rather than derived from
# bgcflow_dir. The project name (P2_5_hq) also differs from the M6_hq project
# used in the rest of this notebook — confirm this cell is still needed.
samples_csv = "/datadrive/bgcflow/config/P2_5_hq/samples.csv"
df_samples = pd.read_csv(samples_csv, index_col=0)
df_samples_hq = df_samples.loc[df_genomes.index, :]
df_samples_hq.to_csv(samples_csv)

# Expand the bigscape network with additional connections

### Add connections between neighbouring BGCs

In [36]:
# Existing BiG-SCAPE edges are flagged as non-neighbour edges.
df_bigscape_net_phylo["Neighbours"] = "No"
accn_list = df_bgcs_antismash_phylo.accession.value_counts().index.tolist()
# New edges get consecutive ids after the current largest edge id.
max_id = df_bigscape_net_phylo.index.max()

for accession in accn_list:
    # BGC ids sharing this accession, in the row order of the regions table.
    region_ids = df_bgcs_antismash_phylo.index[df_bgcs_antismash_phylo["accession"] == accession]
    # Link each BGC to the next one listed for the same accession.
    for current_bgc, next_bgc in zip(region_ids[:-1], region_ids[1:]):
        max_id += 1
        df_bigscape_net_phylo.loc[max_id, "Clustername 1"] = current_bgc
        df_bigscape_net_phylo.loc[max_id, "Clustername 2"] = next_bgc
        df_bigscape_net_phylo.loc[max_id, "Neighbours"] = "Yes"

In [37]:
# Add BiGSLICE GCF nodes and antiSMASH known-cluster similarity edges
df_bigscape_net_phylo["bigslice_model"] = "No"

max_id = df_bigscape_net_phylo.index.max()
for bgc_id in df_bgcs_bigslice_phylo.index:
    # Edge BGC -> its combined BiGSLICE GCF; the GCF id is also stored on the edge.
    combined_gcf = df_bgcs_bigslice_phylo.loc[bgc_id, "gcf_combined_id"]
    max_id += 1
    df_bigscape_net_phylo.loc[max_id, "Clustername 1"] = bgc_id
    df_bigscape_net_phylo.loc[max_id, "Clustername 2"] = combined_gcf
    df_bigscape_net_phylo.loc[max_id, "bigslice_model"] = combined_gcf

    # Only BGCs whose `similarity` exceeds 0.2 get a known-cluster edge
    # (a missing similarity compares as False and is skipped).
    similarity = df_bgcs_antismash_phylo.loc[bgc_id, "similarity"]
    if not similarity > 0.2:
        continue

    max_id += 1
    df_bigscape_net_phylo.loc[max_id, "Clustername 1"] = bgc_id
    df_bigscape_net_phylo.loc[max_id, "Clustername 2"] = df_bgcs_antismash_phylo.loc[bgc_id, "most_similar_known_cluster_id"]
    df_bigscape_net_phylo.loc[max_id, "known_cluster_blast"] = similarity

In [None]:
# Save the enriched edge table next to the BiG-SCAPE cytoscape tables.
cytoscape_dir = processed_dir_2 / "bigscape" / "for_cytoscape_antismash_7.0.0"
bigscape_net_table_enriched = cytoscape_dir / "df_network_0.30_enriched.csv"
df_bigscape_net_phylo.to_csv(bigscape_net_table_enriched)

## Create network with GCFs

In [39]:
# Collapse the BGC-level edge list to GCF level: every node name that is a BGC
# id in the BiG-SCAPE clusters table is replaced by that BGC's "fam_id_0.30";
# any other node name is kept unchanged.
#
# .copy() makes df_gcf_net an independent frame, so the column assignments
# below neither trigger SettingWithCopyWarning nor touch df_bigscape_net_phylo.
df_gcf_net = df_bigscape_net_phylo[["Clustername 1", "Clustername 2", "Raw distance", "Neighbours"]].copy()

# A plain dict gives exact-key lookups with a fallback to the original value;
# Series.get is avoided because integer node names could be read as positions.
bgc_to_family = df_bgcs_bigscape_phylo["fam_id_0.30"].to_dict()
for node_column in ("Clustername 1", "Clustername 2"):
    df_gcf_net[node_column] = df_gcf_net[node_column].map(
        lambda node: bgc_to_family.get(node, node)
    )

In [40]:
# Count the neighbour edges between each pair of GCF-level nodes.
df_gcf_nbh = df_gcf_net[df_gcf_net.Neighbours == "Yes"]

# Collect one record per (gcf_1, gcf_2) pair and build the frame once, instead
# of growing it one cell at a time. Row order is unchanged: gcf_1 in order of
# first appearance, gcf_2 in value_counts order within each gcf_1.
edge_count_records = []
for gcf_1 in df_gcf_nbh["Clustername 1"].unique():
    partners = df_gcf_nbh.loc[df_gcf_nbh["Clustername 1"] == gcf_1, "Clustername 2"]
    for gcf_2, n_edges in partners.value_counts().items():
        edge_count_records.append({"gcf_1": gcf_1, "gcf_2": gcf_2, "Edge_count": n_edges})

# Passing `columns` keeps the three headers even when there are no neighbour edges.
df_gcf_nbh_counts = pd.DataFrame(edge_count_records, columns=["gcf_1", "gcf_2", "Edge_count"])

In [41]:
# Save the GCF-to-GCF neighbour edge counts.
cytoscape_dir = processed_dir_2 / "bigscape" / "for_cytoscape_antismash_7.0.0"
bigscape_net_table_gcf_enriched = cytoscape_dir / "df_gcf_nbh.csv"
df_gcf_nbh_counts.to_csv(bigscape_net_table_gcf_enriched)

In [42]:
# Save the GCF-level edge list (this rebinds bigscape_net_table_gcf_enriched to a new file).
cytoscape_dir = processed_dir_2 / "bigscape" / "for_cytoscape_antismash_7.0.0"
bigscape_net_table_gcf_enriched = cytoscape_dir / "df_network_0.30_enriched_gcf.csv"
df_gcf_net.to_csv(bigscape_net_table_gcf_enriched)

In [43]:
# Set-up for a GCF-level node/edge table.
df_gcf_nodes = df_gcfs_bigscape_phylo.copy()
accn_list = df_bgcs_antismash_phylo.accession.value_counts().index.tolist()
df_gcf_edges = pd.DataFrame(columns=["gcf_1", "gcf_2"])
edge_id = 0

# TODO: the loop that should fill df_gcf_edges was never finished. The draft
# below is kept for reference only. As written it was a syntax error (nested
# `for` headers with no body), and its body is a copy of the neighbour-edge
# loop: it relies on stale `accession` / `max_id` values and would append
# duplicate neighbour edges to df_bigscape_net_phylo instead of filling
# df_gcf_edges.
#
# for gcf_id in df_gcf_nodes.index:
#     for gcf_id in df_gcf_nodes.index:
#
#     df_bgcs_selected = df_bgcs_antismash_phylo[df_bgcs_antismash_phylo.accession == accession]
#     for idx in range(df_bgcs_selected.shape[0] - 1):
#         max_id = max_id + 1
#         df_bigscape_net_phylo.loc[max_id, "Clustername 1"] = df_bgcs_selected.index[idx]
#         df_bigscape_net_phylo.loc[max_id, "Clustername 2"] = df_bgcs_selected.index[idx + 1]
#         df_bigscape_net_phylo.loc[max_id, "Neighbours"] = "Yes"

# autoMLST tree

In [44]:
def _relabel_newick_leaves(newick, genome_ids):
    """Return *newick* with leaf labels replaced by their full genome ids.

    A leaf label is the text directly after "(" or "," up to the next
    "(", ")", ",", ":" or ";". A label is replaced by a genome id when the
    label starts with the part of that id before its first "."; if several
    ids qualify, the one with the longest such prefix wins. Labels without a
    match, and everything that is not a leaf label (branch lengths, text after
    ")"), are left untouched.

    Args:
        newick: Newick string.
        genome_ids: iterable of genome id strings.

    Returns:
        The relabelled Newick string.
    """
    import re

    # (prefix, full id) pairs; empty prefixes would match every label, so drop them.
    prefixes = [(genome_id.split(".")[0], genome_id) for genome_id in genome_ids]
    prefixes = [pair for pair in prefixes if pair[0]]
    # Longest prefix first, so a short id cannot claim a longer label that
    # merely begins with the same characters. The sort is stable, so the
    # original genome order breaks ties.
    prefixes.sort(key=lambda pair: len(pair[0]), reverse=True)

    def _full_id(match):
        label = match.group(0)
        for prefix, genome_id in prefixes:
            if label.startswith(prefix):
                return genome_id
        return label

    # Rewriting label tokens one by one (instead of str.replace over the whole
    # string) keeps one replacement from altering another label that contains it.
    return re.sub(r"(?<=[(,])[^(),:;]+", _full_id, newick)


# Only the first line of the tree file is used.
automlst_tree_path = processed_dir_2 / "automlst_wrapper" / "final.newick"
with open(automlst_tree_path, "r") as f:
    data = f.readline()

df = pd.read_csv(processed_dir_2 / "automlst_wrapper/df_genomes_tree.csv")
genome_ids = list(df.genome_id)

# Previously labels were removed from a list while that list was being
# iterated, which skips elements; the helper does the matching without mutation.
data = _relabel_newick_leaves(data, genome_ids)

automlst_tree_corrected_path = processed_dir_2 / "automlst_wrapper/final_corrected.newick"
with open(automlst_tree_corrected_path, "w") as f:
    f.write(data)

In [45]:
# Output locations of the tree annotation tables, all inside the project's iTOL directory.
itol_dir = processed_dir_2 / "iTOL"
phylo_genome_len_bar_path = itol_dir / "df_automlst_genome_len.csv"
phylo_colored_range_path = itol_dir / "df_automlst_colored_range.csv"
phylo_subclusters_path = itol_dir / "df_automlst_color_strip_subclusters.csv"
phylo_filters_path = itol_dir / "df_automlst_color_strip_filters.csv"
phylo_silhouette_path = itol_dir / "df_automlst_color_strip_silhouette.csv"

In [47]:
# Genome length table: one row per genome, labelled by its id, with the
# `Total` value from the seqfu statistics table as the length.
genome_ids_P4 = df_genomes_phylo.index
df_phylo_genome_len = pd.DataFrame(
    {
        'node_label': list(genome_ids_P4),
        'genome_len': df_seqfu_meta.loc[genome_ids_P4, 'Total'].tolist(),
    },
    index=genome_ids_P4,
)
df_phylo_genome_len.to_csv(phylo_genome_len_bar_path)

In [48]:
# Subcluster colour strip: colour and label of each genome's subcluster.
subcluster_annotations = df_clusters.loc[genome_ids_P4, ['Subcluster_Color', 'Subcluster_label']]

df_phylo_colored_subclusters = pd.DataFrame(
    {'node_label': list(genome_ids_P4)},
    index=genome_ids_P4,
)
# Series assignment aligns on the genome id index.
df_phylo_colored_subclusters['range_color'] = subcluster_annotations['Subcluster_Color']
df_phylo_colored_subclusters['range_label'] = subcluster_annotations['Subcluster_label']

df_phylo_colored_subclusters.to_csv(phylo_subclusters_path)

In [49]:
# Colour per "<quality>_<source>" combination.
filter_color_dict = {
    "HQ_NCBI": "#8B0000",
    "MQ_NCBI": "#FA8072",
    "HQ_NBC": "#00008B",
    "MQ_NBC": "#ADD8E6",
}

df_phylo_filters = pd.DataFrame({'node_label': list(genome_ids_P4)}, index=genome_ids_P4)

for genome_id in df_phylo_filters.index:
    # Build the lookup key from the genome's quality and source values;
    # a combination missing from filter_color_dict raises KeyError.
    quality, source = df_filter_quality.loc[genome_id, ["quality", "source"]]
    source_quality = quality + "_" + source
    df_phylo_filters.loc[genome_id, 'range_color'] = filter_color_dict[source_quality]
    df_phylo_filters.loc[genome_id, 'range_label'] = source_quality

df_phylo_filters.to_csv(phylo_filters_path)