# antiSMASH 
Summary of BGCs detected in each genome for: `[{{ project().name }}]`

## Description
> antiSMASH allows the rapid genome-wide identification, annotation and analysis of secondary metabolite biosynthesis gene clusters in bacterial and fungal genomes.

In [None]:
# Imports and display configuration shared by every cell below.
import pandas as pd
from pathlib import Path
from IPython.display import display, Markdown, HTML
import json
import altair as alt

# NOTE(review): this silences every warning for the whole notebook,
# including deprecation warnings from pandas/altair — confirm this is intended.
import warnings
warnings.filterwarnings('ignore')

# itables renders DataFrames as interactive HTML tables; `DT` returns the HTML string.
from itables import to_html_datatable as DT
import itables.options as opt
# Smaller, italic/oblique fonts for table cells and headers.
opt.css = """
.itables table td { font-style: italic; font-size: .8em;}
.itables table th { font-style: oblique; font-size: .8em; }
"""
opt.classes = ["display", "compact"]
# Page-length choices offered by the table's "show N entries" menu.
opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]

# Light-to-green colormap, used later to shade the summary table.
import seaborn as sns
cm = sns.light_palette("green", as_cmap=True)

# Root of the report; metadata and tables are read relative to this directory.
report_dir = Path("../")

In [None]:
# Read the JSON metadata file and pull out the value stored under "antismash";
# it is used below to build the versioned table file names.
dependency_version_file = report_dir / "metadata/dependency_versions.json"
with dependency_version_file.open("r") as handle:
    dependency_version = json.load(handle)
antismash_version = dependency_version["antismash"]

In [None]:
# Load the per-genome antiSMASH summary and the GTDB metadata, both indexed by
# genome_id (the column is kept as well, drop=False).
antismash_table = report_dir / f"tables/df_antismash_{antismash_version}_summary.csv"
gtdb_table = report_dir / "tables/df_gtdb_meta.csv"

df_antismash = pd.read_csv(antismash_table).set_index("genome_id", drop=False)
df_gtdb = pd.read_csv(gtdb_table).set_index("genome_id", drop=False)

# GTDB values carry a rank prefix ("g__Genus", "s__Genus species"); keep the
# part after the "__" separator.
genus_names = df_gtdb["Genus"].str.split("__").str[1]

# correct organism name: an empty species assignment ("s__") becomes "s__<Genus> sp."
missing_species = df_gtdb["Organism"] == "s__"
df_gtdb["Organism"] = df_gtdb["Organism"].where(~missing_species, "s__" + genus_names + " sp.")
species_names = df_gtdb["Organism"].str.split("__").str[1]

# df_raw: plain-text summary, one row per genome in the antiSMASH table.
# Taxonomy is looked up by genome id and assigned positionally; a genome that
# is missing from the GTDB table raises KeyError here.
df_raw = pd.DataFrame(index=df_antismash.index)
df_raw["Genome ID"] = df_antismash["genome_id"]
df_raw["GTDB genus"] = genus_names.loc[df_raw.index].to_numpy()
df_raw["GTDB species"] = species_names.loc[df_raw.index].to_numpy()
# The strain column is optional in the antiSMASH summary.
if "strain" in df_antismash.columns:
    df_raw["Strain"] = df_antismash["strain"]
df_raw["BGCs"] = df_antismash["bgcs_count"]
df_raw["Incomplete BGCs"] = df_antismash["bgcs_on_contig_edge"]

# df: display copy in which each genome id is a link to its antiSMASH result page.
# The {{ ... }} placeholders are left verbatim in the string (deliberately not
# an f-string), presumably for the report's template engine to fill in later.
server_path = "<a href='{{ project().file_server() }}/antismash/{{project().dependency_version()}}/"
df = df_raw.copy()
df["Genome ID"] = [
    server_path + f"{gid}/index.html' target='_blank'>{gid}</a>"
    for gid in df_raw["Genome ID"]
]
df = df.reset_index(drop=True)

## Result Summary

In [None]:
# Per-genome BGC counts and how many of those sit on a contig edge.
region = df_antismash["bgcs_count"]
incomplete = df_antismash["bgcs_on_contig_edge"]

# Share of all detected BGCs that are not on a contig edge.
complete_fraction = 1 - incomplete.sum() / region.sum()

text = f"""AntiSMASH detected **{int(region.sum())}** BGCs from **{len(region)}** genomes with the median of **{int(region.median())}**. Out of these, **{complete_fraction:.2%}** are deemed as complete."""
display(Markdown(text))

> Note: A BGC is counted as incomplete when antiSMASH reports it on a contig edge, since such regions are likely to be truncated.

In [None]:
# Histogram of the number of BGCs per genome, coloured by GTDB genus.
source = df_raw

base = alt.Chart(source)

bar = base.mark_bar().encode(
    x=alt.X('BGCs:Q', bin=True, axis=alt.Axis(title='BGCs')),
    # Use the Y channel class for the y encoding (was alt.X).
    y=alt.Y('count()', axis=alt.Axis(title='Genomes')),
    color='GTDB genus',
    tooltip=['Genome ID', 'GTDB species', 'BGCs', 'Incomplete BGCs']
).interactive()

bar

## Summary Table
Click on a genome ID to open its antiSMASH result.

[Download Table]({{ project().file_server() }}/tables/df_antismash_{{project().dependency_version()}}_summary.csv){:target="_blank" .md-button}

In [None]:
# Replace missing values with 0, then store the two count columns as integers
# and every other column as strings.
count_columns = ["BGCs", "Incomplete BGCs"]
df = df.fillna(0)
df = df.astype(
    {column: (int if column in count_columns else str) for column in df.columns}
)

In [None]:
# Shade the table with the green colormap (axis=None: one gradient across the
# whole table rather than per column) and render it as a centred datatable.
styled_summary = df.style.background_gradient(cmap=cm, axis=None)
centered_columns = [{"className": "dt-center", "targets": "_all"}]
display(HTML(DT(styled_summary, columnDefs=centered_columns)))

## Regions Summary

In [None]:
def _simplify_cluster_type(cluster_type):
    """Collapse a most-similar-cluster type string into a single label.

    The text before the first ":" is split on "+"; more than one distinct
    part yields "Hybrid", otherwise the single part is returned.
    """
    categories = set(cluster_type.split(":")[0].split("+"))
    return "Hybrid" if len(categories) > 1 else categories.pop()


def _region_anchor(region):
    """Turn a "<record>.<region>" identifier into the page anchor "#r<record>c<region>"."""
    r, c = str(region).split(".")
    return f"#r{r}c{c}"


regions_table = report_dir / f"tables/df_regions_antismash_{antismash_version}.csv"
# Read `region` as text: parsed as a float, "1.10" collapses to 1.1 and the
# link below would point at region 1 instead of region 10.
df_regions_table = pd.read_csv(regions_table, dtype={"region": str})

# The {{ ... }} placeholders are left verbatim in the string (deliberately not
# an f-string), presumably for the report's template engine to fill in later.
server_path = "<a href='{{ project().file_server() }}/antismash/{{project().dependency_version()}}/"

# Regions without a KnownClusterBlast hit get similarity 0; values above 1 are capped at 1.
df_regions_table["similarity"] = df_regions_table["similarity"].fillna(0).clip(upper=1)

df_regions_table["most_similar_known_cluster_type"] = (
    df_regions_table["most_similar_known_cluster_type"].fillna("No Hits")
)
df_regions_table["most_similar_known_cluster_type_simplified"] = [
    _simplify_cluster_type(value)
    for value in df_regions_table["most_similar_known_cluster_type"]
]

# Link columns: genome page, and the same page with the region anchor appended.
df_regions_table["Genome ID"] = [
    server_path + f"{gid}/index.html' target='_blank'>{gid}</a>"
    for gid in df_regions_table["genome_id"]
]
df_regions_table["BGC ID"] = [
    server_path + f"{gid}/index.html{_region_anchor(region)}' target='_blank'>{bgc_id}</a>"
    for gid, region, bgc_id in zip(
        df_regions_table["genome_id"],
        df_regions_table["region"],
        df_regions_table["bgc_id"],
    )
]

# Human-readable percentage; the numeric `similarity` column is kept for plotting.
df_regions_table["Similarity"] = [f"{value:.1%}" for value in df_regions_table["similarity"]]

# Link to the MIBiG entry when an id is present (a string); otherwise keep the
# missing value as it is.
df_regions_table["Most Similar Known Cluster"] = [
    f"<a href='https://mibig.secondarymetabolites.org/repository/{cluster_id}/index.html' target='_blank'>{description}</a>"
    if isinstance(cluster_id, str)
    else cluster_id
    for cluster_id, description in zip(
        df_regions_table["most_similar_known_cluster_id"],
        df_regions_table["most_similar_known_cluster_description"],
    )
]

In [None]:
# Bucket the regions by KnownClusterBlast similarity:
# high (> 0.8), medium (0.4, 0.8], low (0, 0.4], and regions without any hit.
similarity_values = df_regions_table["similarity"]
high_mask = similarity_values > 0.8
medium_mask = (similarity_values > 0.4) & (similarity_values <= 0.8)
low_mask = (similarity_values > 0) & (similarity_values <= 0.4)
no_hit_mask = df_regions_table["most_similar_known_cluster_type_simplified"] == "No Hits"

total_bgcs = len(df_regions_table)
known_count = int(high_mask.sum())
medium_count = int(medium_mask.sum())
low_count = int(low_mask.sum())
unknown_count = int(no_hit_mask.sum())

# Description -> number of high-similarity regions matching it, most frequent first.
items = (
    df_regions_table.loc[high_mask, "most_similar_known_cluster_description"]
    .value_counts()
    .to_dict()
)
known_entries = ', '.join(f'**{k}** ({v})' for k, v in items.items())

text1 = f"""
* KnownClusterBlast hits **{known_count}** out of **{total_bgcs}** BGC regions with *high similarity* (>80%) to MIBIG entries.
* Those entries includes: {known_entries}.
* There are **{medium_count}** BGC regions with **medium** similarity (<80%) to MIBIG entries.
* There are **{low_count}** BGC regions with **low** similarity (<40%) to MIBIG entries.
* There are also **{unknown_count}** BGC regions that **does not have any hits** to the MIBIG entries.
"""
display(Markdown(text1))

In [None]:
# Interactive overview of KnownClusterBlast similarity per BGC region:
# a scatter plot (region length vs. similarity), a similarity histogram and a
# colour key, all filtered by one dropdown on the simplified cluster type.
options = df_regions_table['most_similar_known_cluster_type_simplified'].unique().tolist()
labels = [option + ' ' for option in options]

# Bound to the axes so the charts can be panned and zoomed.
resize = alt.selection_interval(bind='scales')

source = df_regions_table

base = alt.Chart(source)

# The extra None option (labelled "All") clears the filter.
input_dropdown = alt.binding_select(options=options + [None],
                                    labels=labels + ['All '],
                                    name='Most Similar Known Cluster Type ')

selection = alt.selection_point(fields=['most_similar_known_cluster_type_simplified'],
                                bind=input_dropdown)

# Selected cluster types keep their colour; everything else is greyed out.
color = alt.condition(
    selection,
    alt.Color('most_similar_known_cluster_type_simplified:N').legend(None),
    alt.value('lightgray')
)

# add_params replaces the deprecated add_selection, so both parameters are
# registered in a single call (same order as before: selection, then resize).
scatter = base.mark_circle(size=75).encode(
    x=alt.X('region_length:Q', title="BGC Region Length (bp)"),
    y=alt.Y('similarity:Q', title="KnownClusterBlast Similarity (%)").axis(format='%'),
    color=color,
    tooltip=['bgc_id', 'genome_id', 'region', 'accession', 'start_pos', 'end_pos',
             'contig_edge', 'product', 'region_length',
             'most_similar_known_cluster_id',
             'most_similar_known_cluster_description',
             'most_similar_known_cluster_type', 'Similarity']
).add_params(
    selection,
    resize
).properties(
    height=400,
    width=600
)

# The built-in legend is disabled above; this small chart acts as the colour key.
legend = base.mark_circle(size=75).encode(
    alt.Y('most_similar_known_cluster_type_simplified:N', title="Most Similar Known Cluster Type").axis(orient='right'),
    color=color
)

chart2 = base.mark_bar().encode(
    x=alt.X('count()', title='BGC Region Count'),
    y=alt.Y('similarity:Q', title="").axis(format='%').bin(maxbins=30),
    color=color
).add_params(
    selection,
    resize
).properties(
    height=400,
    width=100
)

scatter | chart2 | legend

Click on a BGC ID to open that region's antiSMASH result.

[Download Table]({{ project().file_server() }}/tables/df_regions_antismash_{{project().dependency_version()}}.csv){:target="_blank" .md-button}

In [None]:
# Columns shown in the interactive regions table, in display order.
region_display_columns = [
    "BGC ID",
    "Genome ID",
    "region",
    "product",
    "Most Similar Known Cluster",
    "Similarity",
    "contig_edge",
]
regions_display = df_regions_table.loc[:, region_display_columns]
centered_region_columns = [{"className": "dt-center", "targets": "_all"}]
display(HTML(DT(regions_display, columnDefs=centered_region_columns)))

## References
<font size="2">
{% for i in project().rule_used['antismash']['references'] %}
- {{ i }} 
{% endfor %}
</font>