# FastANI
Summary of [FastANI](https://github.com/ParBLiSS/FastANI) results from project: `[{{ project().name }}]`

## Description
FastANI performs fast, alignment-free estimation of whole-genome similarity, reported as Average Nucleotide Identity (ANI).

In [None]:
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering 
import networkx as nx
import community as community_louvain
sns.set_context("paper")

In [None]:
# Base directory for the report inputs: the parent of the current working
# directory. The FastANI and metadata tables are read relative to it.
report_dir = Path("..")

In [None]:
# path setup
# NOTE(review): the base directory is a hard-coded, machine-specific absolute
# path; the sub-directories below are all derived from it.
bgcflow_dir = Path("/datadrive/data2/bgcflow")
external_data_dir = bgcflow_dir.joinpath("data", "external")
interim_data_dir = bgcflow_dir.joinpath("data", "interim")
processed_data_dir = bgcflow_dir.joinpath("data", "processed")
config_dir = bgcflow_dir.joinpath("config")

In [None]:
# Project label. NOTE(review): looks like a placeholder value; it is not
# referenced by the other cells shown in this notebook.
project_name = "test"

In [None]:
# Load the FastANI table (first CSV column becomes the index) and the NCBI and
# GTDB metadata tables (both indexed by their 'genome_id' column).
df_fastani = pd.read_csv(
    report_dir.joinpath("fastani", "df_fastani.csv"),
    index_col=0,
)
df_ncbi = pd.read_csv(
    report_dir.joinpath("tables", "df_ncbi_meta.csv"),
    index_col="genome_id",
)
df_gtdb = pd.read_csv(
    report_dir.joinpath("tables", "df_gtdb_meta.csv"),
    index_col="genome_id",
)

In [None]:
# Pairwise Pearson correlation between the columns of the FastANI table
# (Pearson is the pandas default, spelled out here for clarity).
df_fastani_corr = df_fastani.corr(method="pearson")

In [None]:
# Hierarchical clustering of the correlation matrix: Ward linkage on Euclidean
# distances between its rows, with the leaves optimally reordered.
selected_data = df_fastani_corr.copy()
clusters = shc.linkage(
    selected_data,
    method="ward",
    metric="euclidean",
    optimal_ordering=True,
)

# Draw the resulting tree on a wide canvas.
plt.figure(figsize=(30, 7))
plt.title("FastANI Similarity")
shc.dendrogram(clusters)
plt.show()

In [None]:
# Cut the genomes into flat clusters, colour the most populated clusters and
# draw a clustered heatmap annotated with those colours.
n_clusters = 3
n_top_clusters = 5  # number of largest clusters that get their own colour

# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4. Ward linkage
# only supports Euclidean distances, which is also the default, so the argument
# is omitted to keep the cell working on both old and new releases.
Agg_hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
y_hc = Agg_hc.fit_predict(df_fastani_corr)
color_set3 = ['#8dd3c7','#ffffb3','#bebada','#fb8072','#80b1d3','#fdb462','#b3de69','#fccde5','#d9d9d9','#bc80bd','#ccebc5','#ffed6f']

# One row per genome with its flat cluster label.
df_hclusts = pd.DataFrame({'hcluster': y_hc}, index=df_fastani_corr.index)

# value_counts() sorts by frequency (descending), so the first entries are the
# largest clusters. zip() stops at the shorter input, pairing each of them with
# one palette colour.
top_clusters = df_hclusts['hcluster'].value_counts().index.tolist()[:n_top_clusters]
dict_top_colors = dict(zip(top_clusters, color_set3))

# Genomes in a top cluster get that cluster's colour; everything else is grey.
df_hclusts['color_code'] = df_hclusts['hcluster'].map(dict_top_colors).fillna("#808080")

comm_colors = df_hclusts['color_code']

# clustermap is a figure-level function that creates its own figure, so no
# plt.figure() call is made beforehand (it would only leave an empty figure).
# The linkage computed for the dendrogram is reused for both rows and columns.
g = sns.clustermap(df_fastani_corr,
                  figsize=(50,50), row_linkage=clusters, col_linkage=clusters,
                  row_colors=comm_colors, col_colors=comm_colors)
g.ax_heatmap.set_xlabel('Genomes')
g.ax_heatmap.set_ylabel('Genomes')

## References
<font size="2">
{% for i in project().rule_used['fastani']['references'] %}
- *{{ i }}*
{% endfor %}
</font>