# GTDB-Tk
Summary of [GTDB-Tk](https://ecogenomics.github.io/GTDBTk/index.html) results from project: `[{{ project().name }}]`

## Description
[GTDB-Tk](https://ecogenomics.github.io/GTDBTk/index.html) is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Taxonomy Database ([GTDB](https://gtdb.ecogenomic.org/)).

## GTDB Taxonomy overview

In [None]:
# Load libraries
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_context("paper")
import altair as alt
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display, Markdown, HTML
from itables import to_html_datatable as DT
import itables.options as opt
# Global itables settings used by every table rendered in this notebook.
# Custom CSS: smaller font everywhere, italic body cells, oblique header cells.
opt.css = """
.itables table td { font-style: italic; font-size: .8em;}
.itables table th { font-style: oblique; font-size: .8em; }
"""
# CSS classes attached to the rendered tables.
opt.classes = ["display", "compact"]
# Choices for the number of rows shown per page (presumably the DataTables
# "lengthMenu" dropdown - confirm against the itables documentation).
opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]

def update_gtdb(df_gtdb, df_gtdbtk):
    """Overlay GTDB-Tk classifications onto a copy of the GTDB metadata table.

    Every ``classification`` string in ``df_gtdbtk`` is split on ";" into
    rank-prefixed items such as ``"g__Streptomyces"``. For each row label of
    ``df_gtdbtk`` the row with the same label in the result is updated:

    * ranks ``d, p, c, o, f, g`` write the whole item (prefix included) into
      ``Domain``, ``Phylum``, ``Class``, ``Order``, ``Family``, ``Genus``;
    * rank ``s`` writes the whole item into ``Organism`` and the text after
      the first space of the name into ``Species`` (the whole name when it
      contains no space);
    * an empty species (``"s__"``) becomes ``Species = "sp."`` and
      ``Organism = "s__<genus> sp."``, using the row's current ``Genus``.

    Items without a ``"__"`` rank prefix (e.g. ``"Unclassified Bacteria"``)
    are printed and skipped, so those rows keep whatever they had in
    ``df_gtdb`` instead of inheriting values parsed from a previous item.

    Args:
        df_gtdb: metadata table indexed by genome id; it is not modified.
        df_gtdbtk: GTDB-Tk summary table indexed by genome id with a
            ``classification`` column of ";"-separated strings.

    Returns:
        A modified copy of ``df_gtdb``.

    Raises:
        KeyError: if an item carries a rank prefix other than
            ``d, p, c, o, f, g, s``.
    """
    df = df_gtdb.copy()
    tax_mapping = {
        'd': 'Domain',
        'p': 'Phylum',
        'c': 'Class',
        'o': 'Order',
        'f': 'Family',
        'g': 'Genus',
        's': 'Organism',
    }
    for i in df_gtdbtk.index:
        classification = df_gtdbtk.loc[i, "classification"]
        for item in classification.split(";"):
            try:
                key, value = item.split("__", 1)
            except ValueError:
                # No rank prefix: report it and move on. Falling through here
                # would reuse `key`/`value` left over from the previous item.
                print(i, item, classification)
                continue

            if key != 's':
                df.loc[i, tax_mapping[key]] = item
                continue

            species = value.split(" ", 1)[-1]
            if len(species) > 0:
                df.loc[i, 'Species'] = species
                df.loc[i, 'Organism'] = item
                continue

            # Genus-level assignment only: build a placeholder organism name.
            df.loc[i, 'Species'] = "sp."
            genus = df.loc[i, 'Genus']
            try:
                df.loc[i, 'Organism'] = "s__" + genus.split("__")[1] + " sp."
            except IndexError:
                # Genus has no "__" prefix; keep the existing Organism value
                # (best effort) and report the offending row.
                print(i, item, genus)
    return df

In [None]:
# Locate the input/output tables relative to the parent of the working directory.
report_dir = Path("../")
#seqfu_table = report_dir / "tables/df_seqfu_stats.csv"
gtdb_table = report_dir / "tables/df_gtdb_meta.csv"  # input: GTDB metadata per genome
gtdbtk_table = report_dir / "tables/gtdbtk.bac120.summary.tsv"  # input: GTDB-Tk summary (tab-separated)
gtdb_gtdbtk_table = report_dir / "tables/df_gtdb_gtdbtk_meta.csv"  # output: merged table written below
#df_seqfu = pd.read_csv(seqfu_table)
#df_seqfu = df_seqfu.rename(columns={'File' : 'genome_id'}).set_index('genome_id')
# Both tables are indexed by genome id so update_gtdb can match rows by label;
# drop=False keeps 'user_genome' available as a regular column as well.
df_gtdb = pd.read_csv(gtdb_table).set_index('genome_id')
df_gtdbtk = pd.read_csv(gtdbtk_table, sep="\t").set_index('user_genome', drop=False)
# update gtdb information with gtdbtk results
df_gtdb = update_gtdb(df_gtdb, df_gtdbtk)
# Persist the merged table, then expose a flat frame (genome_id as a column)
# for the table and chart cells that follow.
df_gtdb.to_csv(gtdb_gtdbtk_table)
df = df_gtdb.reset_index()
#df = pd.concat([df_seqfu, df_gtdb], axis=1).reset_index()

In [None]:
# Render the per-genome taxonomy columns as an HTML datatable; every column
# gets the "dt-center" class.
display(
    HTML(
        DT(
            df.loc[
                :,
                ['genome_id', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Organism'],
            ],
            columnDefs=[{"className": "dt-center", "targets": "_all"}],
        )
    )
)

In [None]:
# Bar chart of record counts per genus (sorted by count, descending),
# coloured by species, with per-genome details in the tooltip.
source = df

chart = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        x='count(Organism)',
        y=alt.Y('Genus', sort='-x'),
        color='Species',
        tooltip=['genome_id', 'Genus', 'Species', 'Organism'],
    )
    .properties(
        title="GTDB Species Distribution",
        width=300,
        height=500,
    )
    .interactive()
    .configure_title(fontSize=20, offset=10, orient='top', anchor='middle')
)

# Last expression of the cell: displays the chart in the notebook output.
chart

[Download Table]({{ project().file_server() }}/tables/gtdbtk.bac120.summary.tsv){:target="_blank" .md-button}

## References
<font size="2">
{% for i in project().rule_used['seqfu']['references'] %}
- *{{ i }}*
{% endfor %}
</font>