In [None]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment


In [None]:
# Input locations used by the later cells: the folder that is globbed for
# estimated-signature CSV files, and the tab-separated COSMIC reference table.
signatures_folder_path = "nmf_runs/signature_matrices_failing"
cosmic_signatures_path = "cosmic_signatures/COSMIC_v3.4_SBS_GRCh37.txt"

# The scenarios are nested: every larger scenario starts with the signatures of
# the smaller one, so each set is simply a prefix of this ordered list.
_scenario_signature_order = [
    # first 8
    "SBS17b", "SBS86", "SBS98", "SBS39", "SBS22a", "SBS43", "SBS17a", "SBS13",
    # up to 15
    "SBS54", "SBS33", "SBS21", "SBS59", "SBS60", "SBS87", "SBS37",
    # up to 25
    "SBS96", "SBS28", "SBS55", "SBS99", "SBS26",
    "SBS3", "SBS1", "SBS12", "SBS93", "SBS22b",
]

# Scenario size -> list of COSMIC signature names (independent list per key).
signature_sets = {
    size: _scenario_signature_order[:size]
    for size in (8, 15, 25)
}

In [None]:
# Choose which signature set (key of `signature_sets`) serves as ground truth.
scenario = 8  # or 15, 25
signatures_to_extract = signature_sets[scenario]


# LOAD COSMIC DATA

# The reference table is tab-separated; when it carries a "Type" column, that
# column becomes the row index so only signature columns remain as data.
signatures_df = pd.read_csv(cosmic_signatures_path, sep="\t")
if "Type" in signatures_df.columns:
    signatures_df = signatures_df.set_index("Type")

# Keep only the selected signatures, in the order given by the scenario list.
extracted_signatures = signatures_df.loc[:, signatures_to_extract]
true_sig_names = extracted_signatures.columns
true_signatures = extracted_signatures.to_numpy()

In [None]:
# From sig_profiler_extractor
def evaluation(true_sigs, est_sigs, cutoff=0.2, dist="cos", verbose=False):
    """Match estimated signatures to ground-truth signatures and score the match.

    Each column of ``true_sigs`` / ``est_sigs`` is one signature. The smaller
    set is matched one-to-one against the larger set by solving a linear
    assignment problem on the pairwise distance matrix. A matched pair counts
    as a true positive when its similarity (``1 - distance``) is at least
    ``cutoff``.

    Parameters
    ----------
    true_sigs : numpy.ndarray
        2-D matrix of ground-truth signatures, one per column.
    est_sigs : numpy.ndarray
        2-D matrix of estimated signatures, one per column. Must have the same
        number of rows as ``true_sigs`` (``cdist`` requires it).
    cutoff : float, default 0.2
        Minimum similarity for a matched pair to count as a true positive.
    dist : {"cos", "cor"}, default "cos"
        Distance used for matching: cosine or correlation.
    verbose : bool, default False
        Accepted for interface compatibility; not used.

    Returns
    -------
    tuple
        ``(n_true, n_estimated, true_positives, false_positives,
        false_negatives, precision, recall, f1_score, idxPair)``.
        ``idxPair`` maps a column index of the *smaller* set to its matched
        column index in the *larger* set: estimated -> true when
        ``n_true >= n_estimated``, otherwise true -> estimated.
        ``precision`` and ``recall`` are rounded to two decimals and
        ``f1_score`` is computed from those rounded values.

    Raises
    ------
    ValueError
        If ``dist`` is neither ``"cos"`` nor ``"cor"``.
    """
    metric_by_name = {"cos": "cosine", "cor": "correlation"}
    if dist not in metric_by_name:
        raise ValueError(
            f"Unsupported dist {dist!r}; expected one of {sorted(metric_by_name)}"
        )

    number_of_ground_truth_signatures = true_sigs.shape[1]
    number_of_detected_signature = est_sigs.shape[1]
    truth_is_larger = number_of_ground_truth_signatures >= number_of_detected_signature

    # Rows of the distance matrix come from the smaller set so that every one
    # of its signatures receives exactly one partner.
    if truth_is_larger:
        mat1, mat2 = est_sigs, true_sigs
    else:
        mat1, mat2 = true_sigs, est_sigs

    con_mat = cdist(mat1.T, mat2.T, metric_by_name[dist])

    # An all-zero (cosine) or constant (correlation) column has an undefined
    # distance (NaN), which linear_sum_assignment rejects. Treat it as
    # distance 1, i.e. similarity 0, so such a column can never be a match
    # for a positive cutoff.
    con_mat[np.isnan(con_mat)] = 1.0

    row_ind, col_ind = linear_sum_assignment(con_mat)
    con_mat = 1 - con_mat  # Convert distance -> similarity

    idxPair = {}
    true_positives = 0
    for x, y in zip(row_ind, col_ind):
        idxPair[int(x)] = int(y)
        if con_mat[x, y] >= cutoff:
            true_positives += 1

    # Matched pairs below the cutoff are counted both as a spurious detection
    # and as a missed ground-truth signature.
    computedFalsePositives = mat1.shape[1] - true_positives
    computedFalseNegatives = computedFalsePositives

    # Signatures of the larger set that could not be paired at all.
    if truth_is_larger:
        baseFalsePositives = 0
        baseFalseNegatives = number_of_ground_truth_signatures - number_of_detected_signature
    else:
        baseFalsePositives = number_of_detected_signature - number_of_ground_truth_signatures
        baseFalseNegatives = 0

    false_positives = baseFalsePositives + computedFalsePositives
    false_negatives = baseFalseNegatives + computedFalseNegatives

    try:
        precision = round(true_positives / (true_positives + false_positives), 2)
        recall = round(true_positives / (true_positives + false_negatives), 2)
        f1_score = round(2 * precision * recall / (precision + recall), 2)
    except ZeroDivisionError:
        # No detections / no ground truth, or precision + recall == 0.
        precision = 0
        recall = 0
        f1_score = 0

    return (
        number_of_ground_truth_signatures,
        number_of_detected_signature,
        true_positives,
        false_positives,
        false_negatives,
        precision,
        recall,
        f1_score,
        idxPair,
    )

In [None]:
# Evaluate every estimated-signature CSV in `signatures_folder_path` against the
# ground-truth signatures and plot the truth next to the matched estimates.
# Relies on `evaluation`, `true_signatures` and `true_sig_names` defined in the
# cells above.

# Sorted so files are processed in a reproducible order (glob order is arbitrary).
csv_files = sorted(glob.glob(os.path.join(signatures_folder_path, "*.csv")))
cutoff = 0.8
print("With a cutoff of", cutoff)

if not csv_files:
    print(f"No CSV files found in {signatures_folder_path}")

for csv_file in csv_files:
    print(f"Evaluating {os.path.basename(csv_file)}")
    # The CSVs carry no header row; each column is one estimated signature.
    nmf_signatures_df = pd.read_csv(csv_file, header=None)
    nmf_signatures = nmf_signatures_df.to_numpy()

    (
        gt_count, det_count, tp, fp, fn,
        prec, rec, f1, idx_pair
    ) = evaluation(true_signatures, nmf_signatures, cutoff=cutoff, dist="cos")

    print(f"  True Positives: {tp}")

    # -----------------------------
    # LEFT (Ground Truth)
    # -----------------------------
    left_matrix = true_signatures
    left_names = true_sig_names

    # -----------------------------
    # RIGHT (Estimated, Reordered)
    # -----------------------------
    right_matrix = np.zeros_like(left_matrix)
    right_names = ["NoMatch"] * left_matrix.shape[1]

    # evaluation() keys idx_pair by the *smaller* set: est -> true when there
    # are at least as many true as estimated signatures, otherwise true -> est.
    # Normalise to (est_col, true_col) pairs so the columns below are never
    # swapped or indexed out of range.
    if true_signatures.shape[1] >= nmf_signatures.shape[1]:
        matched_pairs = list(idx_pair.items())
    else:
        matched_pairs = [(est_col, true_col) for true_col, est_col in idx_pair.items()]

    for est_col, true_col in matched_pairs:
        right_matrix[:, true_col] = nmf_signatures[:, est_col]
        right_names[true_col] = f"EstSig_{est_col}"

    # -----------------------------
    # PLOT WITH MATPLOTLIB
    # -----------------------------
    fig, axes = plt.subplots(1, 2, figsize=(12, 5), dpi=150)

    # 1) LEFT HEATMAP
    im_left = axes[0].imshow(left_matrix, aspect="auto", cmap="viridis")
    axes[0].set_title("True Signatures")
    # X-axis labels
    axes[0].set_xticks(range(left_matrix.shape[1]))
    axes[0].set_xticklabels(left_names, rotation=90)
    # Hide Y-axis labels if not meaningful
    axes[0].set_yticks([])
    # Add colorbar
    fig.colorbar(im_left, ax=axes[0], fraction=0.046, pad=0.04)

    # 2) RIGHT HEATMAP
    im_right = axes[1].imshow(right_matrix, aspect="auto", cmap="viridis")
    axes[1].set_title("Estimated Signatures")
    axes[1].set_xticks(range(right_matrix.shape[1]))
    axes[1].set_xticklabels(right_names, rotation=90)
    axes[1].set_yticks([])
    fig.colorbar(im_right, ax=axes[1], fraction=0.046, pad=0.04)

    fig.suptitle(f"File: {os.path.basename(csv_file)} | TP: {tp}", y=1.05)

    plt.tight_layout()
    plt.show()
    # Release the figure; otherwise one figure per file stays alive in pyplot.
    plt.close(fig)
    print("-" * 50)
