In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import os

# K-mer analysis visualisation
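In this notebook, we visualise the results of the k-mer analysis run with [Simka](../../scripts/simka.pbs): each output matrix is drawn as a clustered heatmap, with colour bars for the month and station parts of the sample labels.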

In [7]:
# Matrix files from the simka k-mer analysis; one clustermap is drawn per file.
KMER_ANALYSIS_DIR = "../../data/assembly/kmer_analysis"
MATRIX_NAMES = (
    "mat_abundance_ab-jaccard",
    "mat_abundance_braycurtis",
    "mat_abundance_simka-jaccard_asym",
    "mat_presenceAbsence_jaccard",
    "mat_presenceAbsence_simka-jaccard",
    "mat_abundance_ab-ochiai",
    "mat_abundance_jaccard",
    "mat_presenceAbsence_braycurtis",
    "mat_presenceAbsence_kulczynski",
    "mat_presenceAbsence_simka-jaccard_asym",
    "mat_abundance_ab-sorensen",
    "mat_abundance_simka-jaccard",
    "mat_presenceAbsence_chord",
    "mat_presenceAbsence_ochiai",
    "mat_presenceAbsence_whittaker",
)

# Every matrix lives in the same directory and is stored as gzip-compressed CSV.
filenames = [f"{KMER_ANALYSIS_DIR}/{matrix_name}.csv.gz" for matrix_name in MATRIX_NAMES]

In [28]:
def get_colors_from_labels(df, identifier_index):
    """Map each row label of ``df`` to a colour based on one part of the label.

    Row labels are split on "_" and the part at ``identifier_index`` selects
    the colour. The distinct parts are sorted before colours are assigned, so
    the mapping does not depend on row order.

    Args:
        df: DataFrame whose index holds string labels with "_"-separated parts.
        identifier_index: Which part of the label to colour by. 0 uses the
            "Set2" palette and yields a column named "Month"; 1 uses the
            "Set1" palette and yields a column named "Station".

    Returns:
        A one-column DataFrame indexed like ``df`` whose values are the
        palette colours for each row.

    Raises:
        ValueError: If ``identifier_index`` is neither 0 nor 1.
    """
    # Palette and column label for each supported label part.
    styles = {0: ("Set2", "Month"), 1: ("Set1", "Station")}
    if identifier_index not in styles:
        # Fail early with a clear message instead of hitting an unbound
        # ``palette`` variable further down.
        raise ValueError(
            f"identifier_index must be one of {sorted(styles)}, got {identifier_index!r}"
        )
    palette_name, column_name = styles[identifier_index]

    keys = [label.split("_")[identifier_index] for label in df.index]
    identifiers = sorted(set(keys))
    palette = sns.color_palette(palette_name, len(identifiers))
    lut = dict(zip(identifiers, palette))

    # Name the Series explicitly: a Series built from ``df.index`` inherits the
    # index name, which would make a later rename of column ``0`` a no-op.
    colors = pd.Series(
        [lut[key] for key in keys],
        index=df.index,
        name=column_name,
        dtype=object,
    )
    return colors.to_frame()

def create_clustermap(filename, output_directory="../../figures/assembly/simka_analysis"):
    """Draw a clustered heatmap of one matrix file and save it as an SVG.

    The file is read as a gzip-compressed, semicolon-separated table whose
    first column holds the row labels. Colour bars on both axes are derived
    from the first ("Month") and second ("Station") "_"-separated parts of
    the row labels.

    Args:
        filename: Path to the gzip-compressed matrix file (e.g. ``name.csv.gz``).
        output_directory: Directory that receives ``<name>_clustermap.svg``.
            It is created if it does not exist yet.

    Side effects:
        Sets the seaborn style to "white", changes the global matplotlib font
        family and size, writes the SVG file and closes the figure.
    """
    # Read the compressed matrix; the first column becomes the index.
    df = pd.read_csv(filename, compression="gzip", index_col=0, sep=";")

    # Derive the matrix name: drop the compression suffix first, then exactly
    # one data extension ("x.csv.gz" -> "x", "a.b.csv" -> "a.b").
    matrix_path = Path(filename)
    if matrix_path.suffix.lower() == ".gz":
        matrix_path = matrix_path.with_suffix("")
    matrix_name = matrix_path.stem

    # Configure plot aesthetics
    sns.set_style("white")
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 8

    # Colour bars for the month (label part 0) and station (label part 1)
    month_colors = get_colors_from_labels(df, 0)
    station_colors = get_colors_from_labels(df, 1)

    row_colors = pd.concat([month_colors, station_colors], axis=1)
    # The column colours reuse the row colours, which are indexed by the row
    # labels. NOTE(review): this presumes the column labels equal the row
    # labels (square sample-by-sample matrix) -- confirm for new inputs.
    col_colors = row_colors.copy()

    # Create the cluster map
    g = sns.clustermap(
        df,
        xticklabels=True,
        yticklabels=True,
        row_cluster=True,
        row_colors=row_colors,
        col_colors=col_colors,
        figsize=(12, 12),
    )

    try:
        # Customize the plot
        g.fig.suptitle(matrix_name.replace("_", " ").capitalize())

        # Make sure the target directory exists so saving works on a fresh checkout.
        output_path = Path(output_directory)
        output_path.mkdir(parents=True, exist_ok=True)
        g.savefig(output_path / f"{matrix_name}_clustermap.svg", dpi=600, bbox_inches="tight")
    finally:
        # Close this specific figure (even if saving failed) so figures do not
        # accumulate when the function is called in a loop.
        plt.close(g.fig)

In [29]:
# Render and save one clustermap for every matrix file listed above
for matrix_file in filenames:
    create_clustermap(matrix_file)