# Overview map of GCFs in P4 phylogroup

In [1]:
import os
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml

In [2]:
# Notebook settings are read from config.yaml, resolved against the current
# working directory; the parsed mapping is displayed as the cell output.
notebook_configuration = yaml.safe_load(Path("config.yaml").read_text())
notebook_configuration

{'bgcflow_dir': '/datadrive/bgcflow'}

In [3]:
# Load the processed BGCFlow outputs of the "mq_strepto" project.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
project_name_1 = "mq_strepto"
processed_dir_1 = bgcflow_dir / "data" / "processed" / project_name_1

# Read output tables from the processed directory
ncbi_meta_table = processed_dir_1 / "tables" / "df_ncbi_meta.csv"
df_ncbi_meta = pd.read_csv(ncbi_meta_table, index_col=0)

gtdb_meta_table = processed_dir_1 / "tables" / "df_gtdb_meta_curated.csv"
df_gtdb_meta = pd.read_csv(gtdb_meta_table, index_col=0)

seqfu_meta_table = processed_dir_1 / "tables" / "df_seqfu_stats.csv"
df_seqfu_meta = pd.read_csv(seqfu_meta_table, index_col=0)

mash_table = processed_dir_1 / "mash" / "df_mash.csv"
df_mash = pd.read_csv(mash_table, index_col=0)

# Cluster assignments come from a table shipped with the notebook (relative
# path), not from the BGCFlow processed directory.
df_clusters = pd.read_csv("assets/tables/df_clusters.csv", index_col=0)
# Missing labels become the explicit string "NA". Each column is filled from
# itself: the previous code assigned the filled Subcluster_label to
# Cluster_label, which discarded the Cluster_label values read from the CSV.
df_clusters["Subcluster_label"] = df_clusters["Subcluster_label"].fillna("NA")
df_clusters["Cluster_label"] = df_clusters["Cluster_label"].fillna("NA")

filters_table = processed_dir_1 / "tables" / "df_filters.csv"
df_filter_quality = pd.read_csv(filters_table, index_col=0)

antismash_summary_table = processed_dir_1 / "tables" / "df_antismash_7.0.0_summary.csv"
df_antismash_summary = pd.read_csv(antismash_summary_table, index_col=0, low_memory=False)

bgcs_summary_table = processed_dir_1 / "tables" / "df_regions_antismash_7.0.0.csv"
df_bgcs_antismash = pd.read_csv(bgcs_summary_table, index_col=0)

arts_table = processed_dir_1 / "tables" / "df_arts_as-7.0.0.csv"
df_arts = pd.read_csv(arts_table, index_col=0)

# Read bigslice results
df_gcf_presence = pd.read_csv(processed_dir_1 / "bigslice" / "cluster_as_7.0.0" / "df_gcf_presence_combined.csv", index_col=0)
df_gcfs_bigslice = pd.read_csv(processed_dir_1 / "bigslice" / "cluster_as_7.0.0" / "df_gcfs_combined.csv", index_col=0)
df_bgcs_bigslice = pd.read_csv(processed_dir_1 / "bigslice" / "cluster_as_7.0.0" / "df_bgcs_curated_with_combined.csv", index_col=0)

In [4]:
# Second project: the P4 phylogroup run. Its antiSMASH tables and the
# BiG-SCAPE export (clusters, network, families, family presence) are loaded
# from two sub-directories of the processed folder.
project_name_2 = "P4"
processed_dir_2 = bgcflow_dir / "data" / "processed" / project_name_2
tables_dir_2 = processed_dir_2 / "tables"
bigscape_dir_2 = processed_dir_2 / "bigscape" / "for_cytoscape_antismash_7.0.0"

# antiSMASH summaries
antismash_summary_table_2 = tables_dir_2 / "df_antismash_7.0.0_summary.csv"
bgcs_summary_table_2 = tables_dir_2 / "df_regions_antismash_7.0.0.csv"
df_antismash_summary_phylo = pd.read_csv(antismash_summary_table_2, index_col=0, low_memory=False)
df_bgcs_antismash_phylo = pd.read_csv(bgcs_summary_table_2, index_col=0)

# BiG-SCAPE tables; all four files share the same timestamp prefix
bigscape_bgcs_summary_table_2 = bigscape_dir_2 / "2023-08-04 18_19_14_df_clusters_0.30.csv"
bigscape_net_table_2 = bigscape_dir_2 / "2023-08-04 18_19_14_df_network_0.30.csv"
bigscape_gcfs_summary_table_2 = bigscape_dir_2 / "2023-08-04 18_19_14_df_families_0.30.csv"
bigscape_gcfs_presence_table_2 = bigscape_dir_2 / "2023-08-04 18_19_14_df_family_presence_0.30.csv"

df_bgcs_bigscape_phylo = pd.read_csv(bigscape_bgcs_summary_table_2, index_col=0)
df_bigscape_net_phylo = pd.read_csv(bigscape_net_table_2, index_col=0)
df_gcfs_bigscape_phylo = pd.read_csv(bigscape_gcfs_summary_table_2, index_col=0)
df_gcfs_presence_phylo = pd.read_csv(bigscape_gcfs_presence_table_2, index_col=0)

In [5]:
# Restrict the BiG-SLiCE tables (computed on mq_strepto) to the P4 project:
# rows of the presence matrix are limited to P4 genomes, then GCF columns
# whose column sum is zero in that subset are dropped.
presence_in_phylo = df_gcf_presence.loc[df_antismash_summary_phylo.index, :]
gcf_is_present = presence_in_phylo.sum(axis=0) > 0
df_gcfs_presence_bigslice_phylo = presence_in_phylo.loc[:, gcf_is_present]

# GCF and BGC annotation tables, reduced to the same selection
df_gcfs_bigslice_phylo = df_gcfs_bigslice.loc[df_gcfs_presence_bigslice_phylo.columns, :]
df_bgcs_bigslice_phylo = df_bgcs_bigslice.loc[df_bgcs_antismash_phylo.index, :]

In [6]:
# Quality/source metadata for exactly the genomes in the P4 antiSMASH summary
phylo_genome_ids = df_antismash_summary_phylo.index
df_genomes_phylo = df_filter_quality.loc[phylo_genome_ids, :]

# Expand the bigscape network with additional connections

### Add connection to neighbouring BGCs

In [7]:
# Connect BGCs that follow each other on the same accession: for every
# accession, each pair of consecutive rows of df_bgcs_antismash_phylo becomes
# one network edge flagged Neighbours == "Yes".
# NOTE(review): "consecutive" means row order in df_bgcs_antismash_phylo; this
# presumably matches region order along the record -- confirm.
# NOTE: the cell appends to df_bigscape_net_phylo, so running it twice adds the
# edges twice.
accn_list = df_bgcs_antismash_phylo.accession.value_counts().index.tolist()

# Collect the edges first and append them in a single concat; enlarging the
# frame one .loc assignment at a time re-allocates it for every new row.
neighbour_edges = []
for accession in accn_list:
    bgc_ids = df_bgcs_antismash_phylo.index[df_bgcs_antismash_phylo.accession == accession]
    neighbour_edges.extend(
        {"Clustername 1": left_bgc, "Clustername 2": right_bgc, "Neighbours": "Yes"}
        for left_bgc, right_bgc in zip(bgc_ids[:-1], bgc_ids[1:])
    )

if neighbour_edges:
    # New edges continue the existing integer edge ids
    first_new_id = int(df_bigscape_net_phylo.index.max()) + 1
    df_neighbour_edges = pd.DataFrame(
        neighbour_edges,
        index=pd.Index(
            range(first_new_id, first_new_id + len(neighbour_edges)),
            name=df_bigscape_net_phylo.index.name,
        ),
    )
    df_bigscape_net_phylo = pd.concat([df_bigscape_net_phylo, df_neighbour_edges])

# Highest edge id in use after the additions
max_id = df_bigscape_net_phylo.index.max()

In [8]:
# Add BiGSLICE GCF nodes and antiSMASH known-cluster similarity edges.
# Every BGC gets an edge to its combined BiG-SLiCE GCF id; BGCs whose
# antiSMASH "similarity" exceeds the threshold also get an edge to their most
# similar known cluster, carrying that similarity in "known_cluster_blast".
# NOTE: the cell appends to df_bigscape_net_phylo, so running it twice adds the
# edges twice.
# NOTE(review): 0.2 is presumably a fraction (20 % similarity) -- confirm.
known_cluster_min_similarity = 0.2

# Collect the edges first and append them in a single concat; enlarging the
# frame one .loc assignment at a time re-allocates it for every new row.
annotation_edges = []
for bgc_id in df_bgcs_bigslice_phylo.index:
    gcf_combined_id = df_bgcs_bigslice_phylo.loc[bgc_id, "gcf_combined_id"]
    annotation_edges.append(
        {
            "Clustername 1": bgc_id,
            "Clustername 2": gcf_combined_id,
            "bigslice_model": gcf_combined_id,
        }
    )

    similarity = df_bgcs_antismash_phylo.loc[bgc_id, "similarity"]
    if similarity > known_cluster_min_similarity:
        annotation_edges.append(
            {
                "Clustername 1": bgc_id,
                "Clustername 2": df_bgcs_antismash_phylo.loc[bgc_id, "most_similar_known_cluster_id"],
                "known_cluster_blast": similarity,
            }
        )

if annotation_edges:
    # New edges continue the existing integer edge ids
    first_new_id = int(df_bigscape_net_phylo.index.max()) + 1
    df_annotation_edges = pd.DataFrame(
        annotation_edges,
        index=pd.Index(
            range(first_new_id, first_new_id + len(annotation_edges)),
            name=df_bigscape_net_phylo.index.name,
        ),
    )
    df_bigscape_net_phylo = pd.concat([df_bigscape_net_phylo, df_annotation_edges])

# Highest edge id in use after the additions
max_id = df_bigscape_net_phylo.index.max()

In [9]:
# Save the network, including the edges added above, next to the BiG-SCAPE
# export it was read from.
bigscape_net_table_enriched = processed_dir_2.joinpath(
    "bigscape", "for_cytoscape_antismash_7.0.0", "df_network_0.30_enriched.csv"
)
df_bigscape_net_phylo.to_csv(bigscape_net_table_enriched)

In [107]:
## Create HQ project
# Reduce the samples sheet of the P4_hq project to the high-quality P4 genomes.
# The file is read and then overwritten in place.
# The path is built from bgcflow_dir (config.yaml) instead of a hard-coded
# absolute location, and the HQ genome ids are derived here from
# df_genomes_phylo so the cell does not depend on a frame defined further down.
samples_hq_path = bgcflow_dir / "config" / "P4_hq" / "samples.csv"
hq_genome_ids = df_genomes_phylo.index[df_genomes_phylo.quality == "HQ"]

df_samples = pd.read_csv(samples_hq_path, index_col=0)
df_samples_hq = df_samples.loc[hq_genome_ids, :]
df_samples_hq.to_csv(samples_hq_path)

  values = values.astype(str)


## Create network with GCFs

In [10]:
# Genomes flagged "HQ" in the quality table, and the antiSMASH regions that
# belong to those genomes.
is_hq_genome = df_genomes_phylo["quality"] == "HQ"
df_genomes_phylo_hq = df_genomes_phylo.loc[is_hq_genome]

from_hq_genome = df_bgcs_antismash_phylo["genome_id"].isin(df_genomes_phylo_hq.index)
df_bgcs_antismash_phylo_hq = df_bgcs_antismash_phylo.loc[from_hq_genome]

In [27]:
# Spot-check the BiG-SLiCE annotation of a single region
df_bgcs_bigslice.loc["NZ_CP047147.1.region019"]

dataset_id                                                               1
name                               GCF_009834105.1/NZ_CP047147.1.region019
type                                                                   as7
on_contig_edge                                                           0
length_nt                                                            61511
orig_folder                                                GCF_009834105.1
orig_filename                                  NZ_CP047147.1.region019.gbk
genome_id                                                  GCF_009834105.1
gcf_id                                                                6845
membership_value                                                  0.067275
known_cluster_blast                                             BGC0002358
known_cluster_blast_name                                 cyclofaulknamycin
most_similar_known_cluster_type                                 Polyketide
gcf_combined             

In [81]:
# Cluster assignments of the genomes that carry a BGC from BiG-SLiCE GCF 7818
gcf_7818_genome_ids = df_bgcs_bigslice.loc[df_bgcs_bigslice["gcf_id"] == 7818, "genome_id"]
df_genomes_sel = df_clusters.loc[gcf_7818_genome_ids, :]
# Disabled filter kept from the original cell:
# df_genomes_sel = df_genomes_sel[df_genomes_sel.Cluster != 4]


Unnamed: 0_level_0,Cluster,Cluster_Color,Species,Subcluster_label,Subcluster_Color,Cluster_label,Mash_species
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GCF_003428925.1,4.0,#0082c8,albidoflavus,P4_5,#f58231,P4_5,albidoflavus


In [103]:
# Display the BiG-SCAPE family table read from the P4 "df_families_0.30" export.
df_gcfs_bigscape_phylo

Unnamed: 0_level_0,fam_type,fam_name,clusters_in_fam,mibig_ids
fam_id_0.30,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,known_family,streptamidine,138,BGC0002115
2,known_family,desferrioxamine E;FW0622;desferrioxamin B;desf...,125,BGC0001478;BGC0002690;BGC0000940;BGC0001453;BG...
3,known_family,ectoine;ectoine,121,BGC0000853;BGC0002052
4,unknown_family,u_Others_4,119,
5,known_family,10-epi-HSAF;10-epi-3-deOH-HSAF;10-epi-maltophi...,119,BGC0002365;BGC0001043;BGC0002509
...,...,...,...,...
141,unknown_family,u_RiPPs_141,1,
142,unknown_family,u_RiPPs_142,1,
143,unknown_family,u_RiPPs_143,1,
144,unknown_family,u_RiPPs_144,1,


In [102]:
# BiG-SLiCE and BiG-SCAPE annotations for the regions of HQ genomes only;
# the BiG-SCAPE subset is displayed.
hq_bgc_ids = df_bgcs_antismash_phylo_hq.index
df_bgcs_bigslice_phylo_hq = df_bgcs_bigslice.loc[hq_bgc_ids, :]
df_bgcs_bigscape_phylo_hq = df_bgcs_bigscape_phylo.loc[hq_bgc_ids, :]
df_bgcs_bigscape_phylo_hq

Unnamed: 0_level_0,product,bigscape_class,genome_id,accn_id,gcf_0.30,Clan Number,fam_id_0.30,fam_type_0.30,fam_known_compounds_0.30
bgc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CP108653.region001,lanthipeptide-class-iii.lanthipeptide-class-ii,RiPPs,NBC_01103,CP108653,3164,2528.0,50,unknown_family,u_RiPPs_50
CP108653.region002,transAT-PKS.PKS-like.NRPS-like.NRPS,PKS-NRP_Hybrids,NBC_01103,CP108653,3163,2553.0,43,unknown_family,u_PKS-NRP_Hybrids_43
CP108653.region003,NRPS-like.T1PKS,PKS-NRP_Hybrids,NBC_01103,CP108653,2617,2553.0,39,unknown_family,u_PKS-NRP_Hybrids_39
CP108653.region004,T1PKS.NRPS,PKS-NRP_Hybrids,NBC_01103,CP108653,4479,4479.0,5,known_family,10-epi-HSAF;10-epi-3-deOH-HSAF;10-epi-maltophi...
CP108653.region005,terpene,Terpene,NBC_01103,CP108653,2554,2545.0,8,unknown_family,u_Terpene_8
...,...,...,...,...,...,...,...,...,...
NZ_CP085039.1.region017,NI-siderophore,Others,GCF_020535385.1,NZ_CP085039.1,4606,4592.0,2,known_family,desferrioxamine E;FW0622;desferrioxamin B;desf...
NZ_CP085039.1.region018,LAP.NRPS,Others,GCF_020535385.1,NZ_CP085039.1,5110,5110.0,14,unknown_family,u_Others_14
NZ_CP085040.1.region001,NRPS.terpene.T1PKS,Others,GCF_020535385.1,NZ_CP085040.1,2790,4592.0,65,unknown_family,u_Others_65
NZ_CP085040.1.region002,butyrolactone,Others,GCF_020535385.1,NZ_CP085040.1,4125,4592.0,51,unknown_family,u_Others_51


In [87]:
# How many GCFs share each value of bigslice_gcf_count
df_gcfs_bigslice_phylo["bigslice_gcf_count"].value_counts()

bigslice_gcf_count
1      96
3       5
7       5
4       3
2       2
5       2
9       2
105     2
11      1
12      1
24      1
56      1
138     1
17      1
31      1
8       1
33      1
20      1
6       1
46      1
Name: count, dtype: int64

In [None]:
# Cluster labels of genomes with a cyclofaulknamycin (BGC0002358) hit, selected
# through the BiG-SLiCE table.
# NOTE(review): this cell has no execution count in the saved notebook, and
# the same two variable names are assigned again by the antiSMASH-based cell
# that follows. Confirm that df_bgcs_bigslice has a "similarity" column before
# relying on it.
assigned_to_bgc0002358 = df_bgcs_bigslice.gcf_combined_id == "BGC0002358"
df_bgcs_cyclofaulknamycin = df_bgcs_bigslice[assigned_to_bgc0002358]

above_half = df_bgcs_cyclofaulknamycin.similarity > 0.5
genome_selected = df_bgcs_cyclofaulknamycin[above_half].genome_id.unique()
df_clusters.loc[genome_selected, :].Cluster_label.value_counts()

In [40]:
# Cluster labels of genomes whose antiSMASH region is most similar to
# BGC0002358 with a similarity above 0.5.
most_similar_is_bgc0002358 = df_bgcs_antismash.most_similar_known_cluster_id == "BGC0002358"
df_bgcs_cyclofaulknamycin = df_bgcs_antismash[most_similar_is_bgc0002358]

above_half = df_bgcs_cyclofaulknamycin.similarity > 0.5
genome_selected = df_bgcs_cyclofaulknamycin[above_half].genome_id.unique()
df_clusters.loc[genome_selected, :].Cluster_label.value_counts()

Cluster_label
P4_2    72
P2_2     6
NA       4
P6_3     4
P4_5     2
P6_2     1
P7_3     1
P2_7     1
P5_1     1
Name: count, dtype: int64

# autoMLST tree

In [14]:
def relabel_newick_leaves(newick, genome_ids):
    """Rename the leaves of a Newick string to full genome ids.

    A leaf is renamed to a genome id when the leaf label starts with the part
    of that genome id that precedes its first "." (for "GCF_020092725.1" that
    is "GCF_020092725"). When several genome ids qualify, the one with the
    longest such prefix wins; ties go to the id listed first.

    Only leaf labels are touched, i.e. the token that directly follows "(" or
    ",". Branch lengths, internal node labels and partial matches inside other
    labels are left alone. Quoted Newick labels are not supported.

    Parameters
    ----------
    newick : str
        Tree text in Newick format.
    genome_ids : iterable of str
        Target labels.

    Returns
    -------
    tuple of (str, dict)
        The relabelled Newick text and the mapping {original leaf: genome id}
        of the leaves that were renamed.
    """
    # (prefix, genome id) pairs, longest prefix first; sorted() is stable, so
    # equal-length prefixes keep the order of genome_ids. Empty prefixes would
    # match every leaf and are dropped.
    prefixes = sorted(
        ((str(genome_id).split(".")[0], str(genome_id)) for genome_id in genome_ids),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )
    prefixes = [pair for pair in prefixes if pair[0]]

    # group 1: the "(" or "," that opens a leaf (plus optional whitespace)
    # group 2: the leaf label itself
    leaf_pattern = re.compile(r"([(,]\s*)([^\s(),:;]+)")
    renamed = {}

    def _rename(match):
        opener, leaf = match.groups()
        for prefix, genome_id in prefixes:
            if leaf.startswith(prefix):
                renamed[leaf] = genome_id
                return opener + genome_id
        return match.group(0)

    return leaf_pattern.sub(_rename, newick), renamed


# Rewrite the autoMLST tree so that its leaves carry the genome ids listed in
# df_genomes_tree.csv. The previous implementation removed items from the leaf
# list while iterating over it (which skips leaves), replaced labels with a
# global substring replace, and kept only the first line of the input file.
automlst_tree_path = processed_dir_2 / "automlst_wrapper" / "final.newick"
automlst_tree_corrected_path = processed_dir_2 / "automlst_wrapper/final_corrected.newick"

df = pd.read_csv(processed_dir_2 / "automlst_wrapper/df_genomes_tree.csv")
genome_ids = list(df.genome_id)

# data: relabelled Newick text; new_dict: {original leaf label: genome id}
data, new_dict = relabel_newick_leaves(automlst_tree_path.read_text(), genome_ids)
automlst_tree_corrected_path.write_text(data)

In [15]:
# Output locations of the annotation tables written for the tree; all of them
# live in the "iTOL" folder of the P4 project.
itol_dir = processed_dir_2 / "iTOL"
phylo_genome_len_bar_path = itol_dir / 'df_automlst_genome_len.csv'
phylo_colored_range_path = itol_dir / 'df_automlst_colored_range.csv'
phylo_subclusters_path = itol_dir / 'df_automlst_color_strip_subclusters.csv'
phylo_filters_path = itol_dir / 'df_automlst_color_strip_filters.csv'
phylo_silhouette_path = itol_dir / 'df_automlst_color_strip_silhouette.csv'

In [18]:
# Genome length per P4 genome, taken from the "Total" column of the seqfu
# statistics, written as a table keyed by genome id.
genome_ids_P4 = df_genomes_phylo.index
df_phylo_genome_len = pd.DataFrame(
    {
        'node_label': genome_ids_P4,
        'genome_len': df_seqfu_meta.loc[genome_ids_P4, 'Total'].tolist(),
    },
    index=genome_ids_P4,
)
df_phylo_genome_len.to_csv(phylo_genome_len_bar_path)

In [20]:
# Inspect the cluster/subcluster assignment table read from assets/tables/df_clusters.csv
df_clusters

Unnamed: 0_level_0,Cluster,Cluster_Color,Species,Subcluster_label,Subcluster_Color,Cluster_label,Mash_species
genome_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GCF_020092725.1,2.0,#3cb44b,olivaceus,P2_3,#ffe119,P2_3,olivaceus
GCF_020092675.1,2.0,#3cb44b,olivaceus,P2_3,#ffe119,P2_3,olivaceus
GCF_020092695.1,2.0,#3cb44b,olivaceus,P2_3,#ffe119,P2_3,olivaceus
GCF_020092565.1,2.0,#3cb44b,olivaceus,P2_3,#ffe119,P2_3,olivaceus
GCF_020092625.1,2.0,#3cb44b,olivaceus,P2_3,#ffe119,P2_3,olivaceus
...,...,...,...,...,...,...,...
NBC_01786,0.0,#808080,scopuliridis,,#808080,,scopuliridis
NBC_01794,0.0,#808080,sp.,,#808080,,sp_mash_064
NBC_01795,0.0,#808080,sp.,,#808080,,sp_mash_023
NBC_01803,0.0,#808080,sp.,,#808080,,sp_mash_200


In [22]:
# Colour-strip table: one row per P4 genome with the colour and label of its
# subcluster, looked up in df_clusters.
subcluster_info = df_clusters.loc[genome_ids_P4, ['Subcluster_Color', 'Subcluster_label']]

df_phylo_colored_subclusters = pd.DataFrame({'node_label': genome_ids_P4}, index=genome_ids_P4).assign(
    range_color=subcluster_info['Subcluster_Color'],
    range_label=subcluster_info['Subcluster_label'],
)

df_phylo_colored_subclusters.to_csv(phylo_subclusters_path)

In [24]:
# Colour-strip table that encodes assembly quality and genome source.
# Each genome is labelled "<quality>_<source>" (e.g. "HQ_NCBI") and coloured
# through filter_color_dict.
filter_color_dict = {"HQ_NCBI": "#8B0000",
                    "MQ_NCBI": "#FA8072",
                    "HQ_NBC": "#00008B",
                    "MQ_NBC": "#ADD8E6"}

# Build all labels at once instead of filling the frame row by row
source_quality = (
    df_filter_quality.loc[genome_ids_P4, "quality"]
    + "_"
    + df_filter_quality.loc[genome_ids_P4, "source"]
)

# Fail loudly when a label has no colour; Series.map would otherwise leave
# those rows empty without any error.
unmapped = source_quality[~source_quality.isin(list(filter_color_dict))]
if not unmapped.empty:
    raise KeyError(
        f"No colour defined for quality/source label(s): {sorted(unmapped.astype(str).unique())}"
    )

# .to_numpy() assigns by position, so no index alignment is involved
df_phylo_filters = pd.DataFrame(
    {
        'node_label': genome_ids_P4,
        'range_color': source_quality.map(filter_color_dict).to_numpy(),
        'range_label': source_quality.to_numpy(),
    },
    index=genome_ids_P4,
)

df_phylo_filters.to_csv(phylo_filters_path)

In [25]:
# Spot-check the quality record of a single genome
df_filter_quality.loc["GCF_014216335.1"]

genome_id.1      GCF_014216335.1
genus               Streptomyces
source                      NCBI
species              diastaticus
quality                       HQ
completeness               99.89
contamination               0.21
N50                      6848830
contigs                        2
genome_len               6996347
gc                      0.733315
Name: GCF_014216335.1, dtype: object