# BiG-SCAPE
Summary of GCFs found in each genome from project: `[{{ project().name }}]` using [BiG-SCAPE](https://github.com/medema-group/BiG-SCAPE)

Download all [BiG-SCAPE result]({{ project().file_server() }}/bigscape/result_as6.1.1){:target="_blank" .md-button}

## BGC Distribution
[BiG-SCAPE](https://github.com/medema-group/BiG-SCAPE) constructs sequence similarity networks of Biosynthetic Gene Clusters (BGCs) and groups them into Gene Cluster Families (GCFs). BiG-SCAPE does this by rapidly calculating a distance matrix between gene clusters based on a comparison of their protein domain content, order, copy number and sequence identity.

In [None]:
import pandas as pd
from pathlib import Path

from IPython.display import display, Markdown, HTML
from jinja2 import Template
from itables import to_html_datatable as DT
import itables.options as opt
import altair as alt
# itables settings shared by every table rendered in this notebook:
# the CSS classes put on the tables and the entries of the page-length menu.
opt.classes = ["display", "compact"]
opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]

import warnings
# Suppress all Python warnings (presumably to keep the rendered report clean).
warnings.filterwarnings('ignore')

# Base directory of the report; the "tables/" and "bigscape/" inputs read by
# the following cell are resolved relative to it.
report_dir = Path("../")

In [None]:
# Load every input table of the report. These files are only generated when
# the BiG-SCAPE rule was enabled for the project.
antismash_table = report_dir / "tables/df_antismash_6.1.1_summary.csv"
gtdb_table = report_dir / "tables/df_gtdb_meta.csv"
ncbi_table = report_dir / "tables/df_ncbi_meta.csv"
bigscape_dir = report_dir / "bigscape/for_cytoscape_antismash_6.1.1/"

# The BiG-SCAPE table names carry a run-specific prefix, so each one is located
# by its suffix; the first match of every pattern is used.
bgc_table, gcf_table, mibig_table, gcf_presence_table = (
    list(bigscape_dir.glob(pattern))[0]
    for pattern in (
        "*_df_clusters_0.30.csv",
        "*_df_families_0.30.csv",
        "*_df_known_0.30.csv",
        "*_df_family_presence_0.30.csv",
    )
)

# Genome-level metadata; the first CSV column becomes the index of each frame.
df_antismash, df_gtdb, df_ncbi = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (antismash_table, gtdb_table, ncbi_table)
)

# BiG-SCAPE results: BGCs, GCF presence matrix, GCFs and known (MIBiG) clusters.
df_bgcs, df_gcf_presence, df_gcfs, df_mibig = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (bgc_table, gcf_presence_table, gcf_table, mibig_table)
)

## Result Summary

In [None]:
# Headline numbers of the BiG-SCAPE run: total GCFs and BGCs, GCFs per family
# type, and "unique" GCFs, i.e. families that contain exactly one BGC.
fam_type_counts = df_gcfs["fam_type"].value_counts()

text_line1 = f"""BiG-SCAPE detected {int(df_gcfs.shape[0])} GCFs of the {int(df_bgcs.shape[0])} BGCs"""
# .get(..., 0): a family type that does not occur in this project counts as
# zero instead of aborting the report with a KeyError.
text_line2 = f"""Number of known GCFs: {int(fam_type_counts.get("known_family", 0))}"""
text_line3 = f"""Number of unknown GCFs: {int(fam_type_counts.get("unknown_family", 0))}"""
text_line4 = f"""Number of unique GCFs: {int(df_gcfs[df_gcfs.clusters_in_fam==1].shape[0])}"""

for summary_line in (text_line1, text_line2, text_line3, text_line4):
    display(Markdown(summary_line))

In [None]:
# Two side-by-side stacked bar charts of the number of BGCs per genome:
# the left one coloured by BiG-SCAPE class, the right one by family type.
def _stacked_bgc_bars(data, color_field, x_title):
    """Return an interactive bar chart of BGC counts per genome, stacked by `color_field`."""
    encoded = alt.Chart(data).mark_bar().encode(
        x=alt.X('genome_id', axis=alt.Axis(title=x_title)),
        y=alt.Y('count(product)', axis=alt.Axis(title='Number of BGCs')),
        color=color_field,
        tooltip=['genome_id', color_field, f'count({color_field})'],
    )
    sized = encoded.properties(width=300, height=300, title="BGCs count overview")
    return sized.interactive()


source = df_bgcs.copy()
alt.data_transformers.disable_max_rows()

chart_class = _stacked_bgc_bars(source, 'bigscape_class', 'Genome ID')

# Expose the family type under a name without the "_0.30" suffix for the
# second chart's colour encoding.
source['fam_type'] = source['fam_type_0.30']
chart_known = _stacked_bgc_bars(source, 'fam_type', '')

chart = alt.hconcat(chart_class, chart_known).configure_title(
    fontSize=20, offset=10, orient='top', anchor='middle'
)
chart

## Summary Tables

### Genome overview
Number of BGCs of each type (known, unknown, unique) present in each genome. The number of BGCs in each of the BiG-SCAPE-defined biosynthetic classes is also listed.

In [None]:
# Build the per-genome overview table: identifiers and taxonomy, total BGC
# count, known / unknown / unique BGC counts and one column per BiG-SCAPE class.
df_genomes = pd.DataFrame(index=df_antismash.index)
df_genomes["Genome ID"] = df_antismash['genome_id']
df_genomes['Organism name'] = df_ncbi.loc[df_genomes.index, 'organism']
# Keep the part after the first "__" of the GTDB 'Organism' value.
df_genomes['GTDB species'] = [df_gtdb.loc[idx, 'Organism'].split('__')[1] for idx in df_genomes.index]
df_genomes['BGCs'] = df_antismash.loc[df_genomes.index, 'bgcs_count']

bigscape_class_list = df_bgcs.bigscape_class.unique()

# `"x" in series` tests the Series *index*, not its values, so membership of a
# family type has to be checked against the actual column values.
fam_types_present = set(df_bgcs['fam_type_0.30'])

# Loop-invariant: per-genome sum over the presence columns of singleton GCFs
# (families with exactly one BGC).
singleton_gcf_columns = [str(idx) for idx in df_gcfs[df_gcfs.clusters_in_fam == 1].index]
unique_bgcs_per_genome = df_gcf_presence.loc[:, singleton_gcf_columns].sum(axis=1)

# Plain (non-f) string on purpose: the "{{ ... }}" macro must reach the output
# unchanged.
server_path = "<a href='{{ project().file_server() }}/antismash/6.1.1/"

for i in df_genomes.index:
    gid = df_genomes.loc[i, 'Genome ID']
    genome_bgcs = df_bgcs[df_bgcs.genome_id == gid]

    # Rows are always addressed by the index label `i`; `gid` is only used to
    # look the genome up in the BiG-SCAPE tables.
    fam_type_counts = genome_bgcs['fam_type_0.30'].value_counts()
    if "known_family" in fam_types_present:
        # .get(..., 0): a genome without BGCs of this type gets 0, not a KeyError.
        df_genomes.loc[i, 'Known BGCs'] = fam_type_counts.get('known_family', 0)
    if "unknown_family" in fam_types_present:
        df_genomes.loc[i, 'Unknown BGCs'] = fam_type_counts.get('unknown_family', 0)
    df_genomes.loc[i, 'Unique BGCs'] = unique_bgcs_per_genome[gid]

    # Classes absent from a genome are left unset (NaN) for that row.
    df_bigscape_class_counts = genome_bgcs['bigscape_class'].value_counts()
    for bigscape_class in bigscape_class_list:
        if bigscape_class in df_bigscape_class_counts.index:
            df_genomes.loc[i, bigscape_class] = df_bigscape_class_counts[bigscape_class]

    # Replace the plain genome id by a link to its antiSMASH result page.
    df_genomes.loc[i, "Genome ID"] = server_path + f"{gid}/index.html' target='_blank'>{gid}</a>"
df_genomes = df_genomes.reset_index(drop=True)

In [None]:
# Render the genome overview as an interactive table with centred, searchable
# columns and horizontal scrolling. maxColumns is the full column count and
# maxBytes is 0 (presumably so itables does not truncate the table).
genomes_table_html = DT(
    df_genomes,
    columnDefs=[{"className": "dt-center", "targets": "_all", "searchable": True}],
    maxColumns=df_genomes.shape[1],
    maxBytes=0,
    scrollX=True,
)
display(HTML(genomes_table_html))

### BGC overview
Table of BGCs with the GCF assigned to each one by BiG-SCAPE.

In [None]:
# Render the BGC table as an interactive table with centred, searchable
# columns and horizontal scrolling. maxColumns is the full column count and
# maxBytes is 0 (presumably so itables does not truncate the table).
bgcs_table_html = DT(
    df_bgcs,
    columnDefs=[{"className": "dt-center", "targets": "_all", "searchable": True}],
    maxColumns=df_bgcs.shape[1],
    maxBytes=0,
    scrollX=True,
)
display(HTML(bgcs_table_html))

### GCF overview
GCFs table with metadata and statistics.

In [None]:
# Render the GCF table as an interactive table with centred, searchable
# columns and horizontal scrolling. maxColumns is the full column count and
# maxBytes is 0 (presumably so itables does not truncate the table).
gcfs_table_html = DT(
    df_gcfs,
    columnDefs=[{"className": "dt-center", "targets": "_all", "searchable": True}],
    maxColumns=df_gcfs.shape[1],
    maxBytes=0,
    scrollX=True,
)
display(HTML(gcfs_table_html))

### GCF-presence matrix
GCF presence/absence matrix across all genomes. Only GCFs containing more than one BGC are included. The columns are labelled with the GCF IDs listed in the GCF overview table above.


In [None]:
# Build the GCF presence table shown below: one row per genome, a linked
# "Genome ID" column, and one column per GCF that contains more than one BGC.
df_gcf_presence_final = pd.DataFrame(index=df_antismash.index, columns=["Genome ID"])

# The presence table's columns are addressed by the GCF ids as strings.
non_unique_families = [str(idx) for idx in df_gcfs[df_gcfs.clusters_in_fam>1].index]
df_gcf_presence_final[non_unique_families] = df_gcf_presence[non_unique_families]

# Plain (non-f) string on purpose: the "{{ ... }}" macro must reach the output
# unchanged.
server_path = "<a href='{{ project().file_server() }}/antismash/6.1.1/"
# Link every genome to its antiSMASH result page; the index labels are used as
# the genome ids.
df_gcf_presence_final["Genome ID"] = [
    server_path + f"{gid}/index.html' target='_blank'>{gid}</a>"
    for gid in df_gcf_presence_final.index
]

df_gcf_presence_final = df_gcf_presence_final.reset_index(drop=True)

In [None]:
# Render the GCF presence matrix as an interactive table with centred,
# searchable columns and horizontal scrolling. maxColumns is the full column
# count and maxBytes is 0 (presumably so itables does not truncate the table).
gcf_presence_table_html = DT(
    df_gcf_presence_final,
    columnDefs=[{"className": "dt-center", "targets": "_all", "searchable": True}],
    maxColumns=df_gcf_presence_final.shape[1],
    maxBytes=0,
    scrollX=True,
)
display(HTML(gcf_presence_table_html))

### MIBIG overview
Information on the known clusters from the MIBiG database that BiG-SCAPE detected in the genomes.

In [None]:
# Build the MIBiG overview: one row per known cluster that was assigned to a
# GCF, with its annotation, the GCF id and the sum of that GCF's presence column.
# Drop known clusters that have no family id at the 0.30 cutoff.
df_mibig = df_mibig[~df_mibig['fam_id_0.30'].isna()]

df_mibig_final = pd.DataFrame(index=df_mibig.index)
df_mibig_final['MIBIG ID'] = df_mibig.index
df_mibig_final['BGC type'] = df_mibig['product']
df_mibig_final['Compounds'] = df_mibig['compounds']
df_mibig_final['Activity'] = df_mibig['chem_acts']

server_path = "<a href='https://mibig.secondarymetabolites.org/repository/"

for mibig_id in df_mibig_final.index:
    # The family id is converted to an integer string to match the column
    # labels of the presence table.
    fam_id = str(int((df_mibig.loc[mibig_id, 'fam_id_0.30'])))
    df_mibig_final.loc[mibig_id, 'Genomes'] = df_gcf_presence[fam_id].sum()
    df_mibig_final.loc[mibig_id, 'GCF ID'] = fam_id
    # NOTE(review): [:-2] drops the last two characters of the id for the URL,
    # presumably a ".N" version suffix - confirm ids never carry a longer suffix.
    df_mibig_final.loc[mibig_id, "MIBIG ID"] = server_path + f"{mibig_id[:-2]}/' target='_blank'>{mibig_id}</a>"

df_mibig_final = df_mibig_final.reset_index(drop=True)

In [None]:
# Render the MIBiG overview as an interactive table with centred, searchable
# columns and horizontal scrolling. maxColumns is the full column count and
# maxBytes is 0 (presumably so itables does not truncate the table).
mibig_table_html = DT(
    df_mibig_final,
    columnDefs=[{"className": "dt-center", "targets": "_all", "searchable": True}],
    maxColumns=df_mibig_final.shape[1],
    maxBytes=0,
    scrollX=True,
)
display(HTML(mibig_table_html))

## References
<font size="2">
{% for i in project().rule_used['antismash']['references'] %}
- {{ i }} 
{% endfor %}
</font>