# CheckM
Summary of [CheckM](https://github.com/Ecogenomics/CheckM) results from project: `[{{ project().name }}]` 

## Description
CheckM assesses the quality of microbial genomes recovered from isolates, single cells, and metagenomes.

In [None]:
import pandas as pd
from pathlib import Path
import altair as alt
import warnings
warnings.filterwarnings('ignore')

In [None]:
def checkm_eval(df):
    """Label every genome with a MIMAG quality category from its CheckM values.

    A ``genome_quality`` column is written into ``df`` in place, based on the
    ``Completeness`` and ``Contamination`` columns:

    * ``high_quality``   -- completeness > 90 and contamination < 5
    * ``medium_quality`` -- completeness >= 50 and contamination < 10
      (and not already high quality)
    * ``low_quality``    -- completeness < 50 and contamination < 10
    * ``contaminated``   -- any other genome with both values present
    * ``unknown``        -- completeness or contamination is missing

    Each low-quality or contaminated genome is printed on its own line
    (label, index label, selected statistics), followed by a one-line tally
    of all five categories.

    Args:
        df: DataFrame with ``Completeness`` and ``Contamination`` columns.
            The columns ``Count``, ``Total``, ``N50`` and ``gc`` are included
            in the printed diagnostics when they exist.

    Returns:
        The same ``df`` object, now carrying the ``genome_quality`` column.

    Raises:
        KeyError: if ``Completeness`` or ``Contamination`` is absent.
    """
    completeness = df["Completeness"]
    contamination = df["Contamination"]

    # Comparisons against NaN are always False, so a genome lacking either
    # value would otherwise fall through to "contaminated"; it is reported as
    # "unknown" instead because it cannot actually be graded.
    unknown = (completeness.isnull() | contamination.isnull()).to_numpy()
    high = ((completeness > 90) & (contamination < 5)).to_numpy()
    medium = ((completeness >= 50) & (contamination < 10)).to_numpy() & ~high
    low = ((completeness < 50) & (contamination < 10)).to_numpy()
    contaminated = ~(unknown | high | medium | low)

    masks = {
        "high_quality": high,
        "medium_quality": medium,
        "low_quality": low,
        "contaminated": contaminated,
        "unknown": unknown,
    }

    # Create the column up front so that it exists even for an empty table,
    # then fill it category by category. The masks are positional boolean
    # arrays, which keeps this working when index labels are not unique.
    df["genome_quality"] = "unknown"
    for label, mask in masks.items():
        if mask.any():
            df.loc[mask, "genome_quality"] = label

    # Report problematic genomes in table order; only statistics that are
    # actually present are shown so the diagnostics cannot abort the labelling.
    report_cols = [
        col
        for col in ("Completeness", "Contamination", 'Count', 'Total', 'N50', 'gc')
        if col in df.columns
    ]
    for pos in (low | contaminated).nonzero()[0]:
        row = df.iloc[pos]
        print(row["genome_quality"], df.index[pos], row[report_cols].to_list())

    counts = {label: int(mask.sum()) for label, mask in masks.items()}
    print(
        f"high_quality={counts['high_quality']}, "
        f"medium_quality={counts['medium_quality']}, "
        f"low_quality={counts['low_quality']}, "
        f"contaminated={counts['contaminated']}, "
        f"unknown={counts['unknown']}"
    )
    return df

## Result

In [None]:
# Locate the CheckM summary table relative to the report root.
report_dir = Path("../")
checkm_table = report_dir.joinpath("tables/df_checkm_stats.csv")

# NOTE: merging the GTDB-Tk metadata table (tables/df_gtdb_gtdbtk_meta.csv)
# into this frame is currently disabled; only the CheckM statistics are used.

# Read the statistics keyed by genome, then attach the quality label.
# checkm_eval writes the genome_quality column into df_checkm and returns it.
df_checkm = (
    pd.read_csv(checkm_table)
    .set_index('genome_id')
)
df = checkm_eval(df_checkm)

In [None]:
# Scatter plot of completeness against contamination, one point per genome,
# coloured by the quality category stored in the genome_quality column.
# reset_index already returns a new frame, so df itself is left untouched and
# genome_id becomes a regular column that the tooltip can reference.
source = df.reset_index(drop=False)
x_col = 'Completeness'
y_col = 'Contamination'

# Series.min()/max() skip missing values, whereas the builtin min() gives an
# order-dependent result (possibly NaN) as soon as one genome has no value.
x_min = source[x_col].min()
y_max = source[y_col].max()
# With no usable values at all, fall back to fixed bounds instead of passing
# NaN into the axis domain.
x_min = 0 if pd.isna(x_min) else x_min
y_top = 1 if pd.isna(y_max) else round(y_max + 1, 2)

chart_one = alt.Chart(source).mark_point(
    filled=True,
    stroke='black',
    strokeWidth=0.5,
    opacity=0.8,
    size=100
).encode(
    alt.X(x_col,
          scale=alt.Scale(domain=(x_min, 100)),
          title=f'{x_col} (%)'),
    alt.Y(y_col,
          scale=alt.Scale(domain=(0, y_top)),
          title=f"{y_col} (%)"
         ),
    #shape='sequence_quality',
    color="genome_quality",
    tooltip=['genome_id'] + list(df_checkm.columns)
).configure_axis(
    labelFontSize=10,
    titleFontSize=12
).configure_legend(
    labelFontSize=10,
    titleFontSize=12,
).configure_view(
    continuousHeight=500,
    continuousWidth=500,
)


chart_one

## References

<font size="2">

{% for i in project().rule_used['checkm']['references'] %}
- *{{ i }}*
{% endfor %}

</font>