# Imports

In [None]:
import math

import numpy as np

import pandas as pd
from pandas.plotting import parallel_coordinates

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from source.constants import (
    ALL_EXTRACTOR_MODELS,
    ALL_IMG_NORMS,
    ALL_DISTANCE_METRICS,
    ALL_DIMENSIONALITY_REDUCTION_METHODS,
    ALL_CLUSTERING_ALGORITHMS
)
from source.constants import ORIGINAL_2_PRETTY_MODEL_NAMES

In [None]:
# Cancer type whose evaluation results are analysed; it is interpolated into
# the path of the results CSV that is loaded below.
CANCER_TYPE = 'lung_aca'
# Name of the metric column used to rank rows and as the y-axis of most plots.
OPTIMIZING_METRIC = 'Fowlkes-Mallows Index'

# Load DataFrame

In [None]:
# Load the evaluation results of all evaluated configurations for CANCER_TYPE.
results_path = (
    f"eval_results/cancer_type={CANCER_TYPE}"
    "#extractor_name=all#img_norm=all#distance_metric=all"
    "#dimensionality_reduction=all#clustering=all.csv"
)
results_df = pd.read_csv(results_path)

# Allowed values for every configuration ("info") column.
info_column_2_options = {
    'extractor_name': ALL_EXTRACTOR_MODELS,
    'img_norm': list(ALL_IMG_NORMS),
    'distance_metric': ALL_DISTANCE_METRICS,
    'dimensionality_reduction': ALL_DIMENSIONALITY_REDUCTION_METHODS,
    'clustering': ALL_CLUSTERING_ALGORITHMS,
}
info_columns = list(info_column_2_options)

# The unnamed first CSV column holds the configuration key whose parts are
# joined by "#"; split it into one column per configuration option.
results_df[info_columns] = results_df['Unnamed: 0'].str.split("#", expand=True)
results_df = results_df.drop(columns=['Unnamed: 0'])

# Shorten long metric names so they fit on plot axes.
original_2_shorter_metric_names = {
    'Adjusted Rand Index (ARI)': 'Adjusted Rand Index',
    'Normalized Mutual Information (NMI)': 'Normalized Mutual Info',
}
results_df = results_df.rename(columns=original_2_shorter_metric_names)

# Every column that is neither a configuration option nor a confusion-matrix
# count is treated as a metric. The exclusion set is built once, not per column.
conf_matrix_columns = ["TP", "FP", "FN", "TN"]
non_metric_columns = set(info_columns) | set(conf_matrix_columns)
metrics_columns = [
    col for col in results_df.columns if col not in non_metric_columns
]
metric_columns_wo_precision = [
    col for col in metrics_columns if 'precision' not in col
]

# Reorder columns: configuration, confusion matrix, metrics.
results_df = results_df[info_columns + conf_matrix_columns + metrics_columns]

# Drop rows with 'resize_only_original' img_norm.
results_df = results_df[
    results_df['img_norm'] != 'resize_only_original'
].reset_index(drop=True)

# Sanity check: every remaining configuration value must be a known option.
for col in info_columns:
    unexpected = set(results_df[col].unique()) - set(info_column_2_options[col])
    assert not unexpected, f"unexpected values in column {col!r}: {unexpected}"
    print(f"{col}: {results_df[col].value_counts()}")

# Rank all configurations by the optimizing metric, best first.
results_df = (
    results_df
    .sort_values(by=[OPTIMIZING_METRIC], ascending=False)
    .reset_index(drop=True)
)

results_df

In [None]:
# Distinct image-normalization values present in the results.
results_df['img_norm'].unique()

In [None]:
# Display names for image normalizations; values without an entry keep their
# raw identifier. Disabled alternatives are kept for reference:
#   'imagenet': 'ImageNet'
#   'resize_only': 'resize'
original_2_pretty_normalization_names = {
    'lc25k-lung_aca-resized': 'lung_aca'
}

# Swap raw identifiers for display names in both configuration columns.
for column, pretty_names in (
    ('extractor_name', ORIGINAL_2_PRETTY_MODEL_NAMES),
    ('img_norm', original_2_pretty_normalization_names),
):
    results_df[column] = results_df[column].replace(pretty_names)

results_df

In [None]:
# For every extractor keep the single configuration (row) with the highest
# value of the optimizing metric.
best_row_labels = results_df.groupby('extractor_name')[OPTIMIZING_METRIC].idxmax()

# Rank the extractors from best to worst and renumber the rows. The sort
# returns a new frame instead of mutating a selection of results_df in place.
best_performance_df = results_df.loc[best_row_labels].sort_values(
    by=[OPTIMIZING_METRIC], ascending=False, ignore_index=True)

# Display the best performance dataframe
display(best_performance_df)

In [None]:
# Configuration of the overall best row, i.e. the row with the highest value
# of the optimizing metric. The row is looked up with idxmax so the result
# does not depend on results_df still being sorted when this cell runs.
best_overall_row = results_df.loc[results_df[OPTIMIZING_METRIC].idxmax()]
col_2_default_value = {col: best_overall_row[col] for col in info_columns}
col_2_default_value

In [None]:
# Boolean mask: rows that share distance metric, dimensionality reduction and
# clustering algorithm with the default configuration.
standard_subset = (
    results_df['distance_metric'].eq(col_2_default_value['distance_metric'])
    & results_df['dimensionality_reduction'].eq(col_2_default_value['dimensionality_reduction'])
    & results_df['clustering'].eq(col_2_default_value['clustering'])
)

# Matching rows, ranked by the optimizing metric and renumbered from 0.
standard_subset_df = results_df.loc[standard_subset].sort_values(
    by=[OPTIMIZING_METRIC], ascending=False, ignore_index=True)
standard_subset_df

In [None]:
# Boolean mask: rows evaluated with the plain pipeline (euclidean distance,
# no dimensionality reduction, k-means clustering).
basic_subset = (
    results_df['distance_metric'].eq("euclidean")
    & results_df['dimensionality_reduction'].eq("NoReduction")
    & results_df['clustering'].eq("kmeans")
)

# Matching rows, ranked by the optimizing metric and renumbered from 0.
basic_subset_df = results_df.loc[basic_subset].sort_values(
    by=[OPTIMIZING_METRIC], ascending=False, ignore_index=True)
basic_subset_df

## Plotting functions

In [None]:
def plot_precision_bar_chart(df, title=""):
    """Show precision@1 and precision@5 as side-by-side bars, one pair per row.

    Args:
        df: DataFrame with the columns ``extractor_name`` (used as tick
            labels), ``precision@1`` and ``precision@5`` (bar heights).
        title: Title placed above the chart.

    The y-axis is fixed to the range [0.85, 1.01]. The figure is shown
    immediately and nothing is returned.
    """
    bar_width = 0.4
    # One group centre per row; the two bars sit half a bar width left/right.
    centers = np.arange(len(df['extractor_name']))
    bar_specs = (
        (-bar_width / 2, 'precision@1', 'Precision@1'),
        (+bar_width / 2, 'precision@5', 'Precision@5'),
    )

    _, ax = plt.subplots(figsize=(6, 3))
    for offset, column, legend_label in bar_specs:
        ax.bar(centers + offset, df[column], width=bar_width, label=legend_label)

    ax.set_xlabel('Feature Extractor')
    ax.set_ylabel('Precision')
    ax.set_ylim(0.85, 1.01)
    ax.set_title(title)
    ax.set_xticks(centers)
    ax.set_xticklabels(df['extractor_name'], rotation=30, ha='right')

    ax.legend(loc='lower left')

    plt.show()

In [None]:
def create_radar_chart(df, metrics, title="", baseline=0.5):
    """Draw one closed radar (polar) line per row of ``df``.

    Args:
        df: DataFrame with one row per line to draw. It must contain the
            column ``extractor_name`` (legend label) and every column listed
            in ``metrics``.
        metrics: Names of the metric columns; each becomes one radar axis.
        title: Title placed above the chart.
        baseline: Lower limit of the radial axis; the upper limit is 1.0.

    Every row gets its own (color, line style) pair, assigned by the row's
    position in ``df`` rather than by its index label, and the filled area
    uses the same color as its outline. The figure is shown immediately and
    nothing is returned.
    """
    categories = list(metrics)
    num_vars = len(categories)
    num_rows = len(df)

    # Each color is paired with both line styles, so half as many colors as
    # rows (rounded up) are enough to give every row a distinct combination.
    line_styles = ['-', '--']
    num_colors = max(1, math.ceil(num_rows / len(line_styles)))
    colors = plt.cm.Set1(np.linspace(0, 1, num_colors))
    color_style_combinations = [(color, style)
                                for color in colors for style in line_styles]

    # Compute angle for each axis; repeat the first angle to close the polygon.
    angles = [n / float(num_vars) * 2 * math.pi for n in range(num_vars)]
    angles += angles[:1]

    # Initialize the radar plot
    fig, ax = plt.subplots(figsize=(9, 9), subplot_kw=dict(polar=True))

    # zip pairs rows with styles by position, so arbitrary index labels
    # (e.g. after filtering or sorting) cannot make two rows share a style.
    for (_, row), (color, line_style) in zip(df.iterrows(),
                                             color_style_combinations):
        values = row[categories].tolist()
        values += values[:1]  # close the polygon
        ax.plot(angles, values, linewidth=1,
                linestyle=line_style, color=color, label=row['extractor_name'])
        ax.fill(angles, values, alpha=0.1, color=color)

    # Draw one axis per variable
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, color='black', size=10)

    # Set the baseline for the radial axis
    ax.set_ylim(baseline, 1.0)

    # Add a title
    ax.set_title(title, size=20, color='black', y=1.1)

    # Add legend outside the plot area
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))

    plt.show()

## Heatmap of Correlations

Create a heatmap to visualize the Spearman rank correlation between the evaluation metrics, computed across all evaluated parameter combinations.

In [None]:
# Spearman correlation works on ranks, so it captures monotonic agreement
# between metrics. It is computed over every row of results_df; use
# standard_subset_df[metrics_columns] instead to restrict it to the standard subset.
corr = results_df[metrics_columns].corr(method='spearman')

# Annotated heatmap on a fixed [-1, 1] color scale.
_, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=heatmap_ax)
heatmap_ax.set_title('Heatmap of Metrics Correlation')
plt.show()

## Grouped Bar Chart

Create grouped bar charts to compare the performance of different combinations for specific metrics.

### Aggregate over all evaluated combinations

In [None]:
# Average performance over all distance metrics, dimensionality reduction
# methods and clustering algorithms: seaborn's barplot aggregates the rows of
# each (extractor, normalization) group with its default estimator.
ranked_results = results_df.sort_values(by=[OPTIMIZING_METRIC], ascending=False)

_, aggregate_ax = plt.subplots(figsize=(6, 3))
sns.barplot(
    data=ranked_results,
    x='extractor_name',
    y=OPTIMIZING_METRIC,
    hue='img_norm',
    ax=aggregate_ax,
)
aggregate_ax.set_xlabel('Feature Extractor')
plt.xticks(rotation=30, ha='right')
plt.show()

### Basic subset: euclidean distance, no reduction, k-means clustering

In [None]:
display(basic_subset_df)

# One bar chart per metric for the basic subset, grouped by extractor and
# colored by image normalization. precision@5 gets a zoomed-in y-range; the
# optimizing metric keeps matplotlib's automatic limits.
for y_column, y_limits in (
    ('precision@5', (0.85, 1.01)),
    (OPTIMIZING_METRIC, None),
):
    plt.figure(figsize=(6, 2))
    sns.barplot(
        data=basic_subset_df,
        x='extractor_name',
        y=y_column,
        hue='img_norm'
    )
    plt.xlabel('Feature Extractor')
    if y_limits is not None:
        plt.ylim(*y_limits)
    plt.legend(title='Image Normalization', loc='lower left')
    plt.xticks(rotation=30, ha='right')
    plt.show()

In [None]:
# Create a figure with two subplots (axes) arranged vertically
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 4), sharex=True)

# Plot on the first axes
sns.barplot(
    data=basic_subset_df,
    x='extractor_name',
    y='precision@5',
    hue='img_norm',
    ax=ax1
)
ax1.set_xlabel(None)  # Remove x-label for the first plot
ax1.set_ylim(0.85, 1.01)
ax1.legend(title='Image Normalization', loc='lower left')
ax1.tick_params(axis='x', rotation=30)  # Rotate x-tick labels

# Plot on the second axes
sns.barplot(
    data=basic_subset_df,
    x='extractor_name',
    y=OPTIMIZING_METRIC,
    hue='img_norm',
    ax=ax2
)
ax2.set_xlabel(None)  # Set x-label for the second plot
ax2.legend().set_visible(False)  # Hide legend for the first plot
ax2.tick_params(axis='x', rotation=30,)  # Rotate x-tick labels
ax2.set_xticklabels(ax2.get_xticklabels(), ha='right')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

### Standard subset: distance metric, dimensionality reduction, and clustering are fixed as in the best-performing model

In [None]:
display(standard_subset_df)

# Optimizing metric per extractor for the standard subset, one bar per image
# normalization.
_, standard_ax = plt.subplots(figsize=(6, 3))
sns.barplot(
    data=standard_subset_df,
    x='extractor_name',
    y=OPTIMIZING_METRIC,
    hue='img_norm',
    ax=standard_ax,
)
standard_ax.set_xlabel('Feature Extractor')
plt.xticks(rotation=30, ha='right')
standard_ax.legend(title='Image Normalization', loc='lower left')
plt.show()

### Take best combination of (distance, dimensionality reduction, clustering) for each normalization-extractor combination

In [None]:
# For every (extractor, normalization) pair keep the row with the highest
# optimizing metric, i.e. the best combination of distance metric,
# dimensionality reduction method and clustering algorithm.
best_labels_per_pair = results_df.groupby(
    ['extractor_name', 'img_norm'])[OPTIMIZING_METRIC].idxmax()
tmp_df = results_df.loc[best_labels_per_pair].sort_values(
    by=[OPTIMIZING_METRIC], ascending=False)

display(tmp_df)

_, best_ax = plt.subplots(figsize=(6, 3))
sns.barplot(
    data=tmp_df,
    x='extractor_name',
    y=OPTIMIZING_METRIC,
    hue='img_norm',
    ax=best_ax,
)
best_ax.set_xlabel('Feature Extractor')
plt.xticks(rotation=30, ha='right')
best_ax.legend(title='Image Normalization', loc='lower left')
plt.show()

## Radar Chart

Radar charts can help compare multiple metrics for a specific combination of parameters.

### Basic subset (euclidean, no reduction, k-means) with best-performing image normalization for each extractor

In [None]:
# Per extractor, keep the row of the basic subset with the highest optimizing
# metric (its best image normalization), ranked from best to worst.
# ignore_index=True renumbers the rows 0..n-1, as done for the other subset
# frames, so any per-row styling keyed on the row label stays distinct.
tmp_df = basic_subset_df.loc[
    basic_subset_df.groupby(['extractor_name'])[OPTIMIZING_METRIC].idxmax()
].sort_values(by=[OPTIMIZING_METRIC], ascending=False, ignore_index=True)

display(tmp_df)

# Create radar chart for best performance data
create_radar_chart(
    tmp_df,
    metric_columns_wo_precision,
    title='Clustering Metrics: Best-performing Image Normalization',
    baseline=0.4
)

# Same data as parallel coordinates: one line per extractor, one vertical
# axis per metric.
plt.figure(figsize=(7.5, 3))
parallel_coordinates(
    tmp_df,
    'extractor_name',
    cols=metric_columns_wo_precision,
    color=plt.cm.Set1.colors,
)
plt.title('Clustering Metrics: Best-performing Image Normalization')
plt.xticks(rotation=30, ha='right')
plt.legend(bbox_to_anchor=(0.01, 0.01, 1.05, 1.05),
           ncol=3, borderaxespad=0.)
plt.ylim(0.15, 1.0)
plt.show()


plot_precision_bar_chart(tmp_df, title='Precision by Extractor: Best-performing Image Normalization')

### Basic subset (euclidean, no reduction, k-means) with ImageNet image normalization

In [None]:
img_norm = 'imagenet'
assert img_norm in ALL_IMG_NORMS

# results_df['img_norm'] holds display names at this point, so translate the
# raw identifier the same way before filtering. Without this the filter would
# silently match nothing as soon as a display alias is defined for img_norm.
img_norm_label = original_2_pretty_normalization_names.get(img_norm, img_norm)

basic_subset_img_norm_restricted = basic_subset & (
    results_df['img_norm'] == img_norm_label
)
basic_subset_img_norm_restricted_df = results_df[basic_subset_img_norm_restricted].sort_values(
    by=[OPTIMIZING_METRIC], ascending=False, ignore_index=True)

display(basic_subset_img_norm_restricted_df)

# Create radar chart for the selected normalization.
# NOTE(review): this passes metrics_columns (precision columns included),
# whereas the best-normalization radar chart uses metric_columns_wo_precision
# - confirm the difference is intended.
create_radar_chart(
    basic_subset_img_norm_restricted_df,
    metrics_columns,
    title=f'Clustering Metrics: {img_norm} Image Normalization',
    baseline=0.3
)

# ------------------------------------------------------------------------

plot_precision_bar_chart(basic_subset_img_norm_restricted_df, title='Precision by Extractor: ImageNet Normalization')

# plot precision@5 only as a bar chart
plt.figure(figsize=(6, 3))
sns.barplot(
    data=basic_subset_img_norm_restricted_df,
    x='extractor_name',
    y='precision@5'
)
plt.ylim(0.85, 1.01)
plt.title('Precision@5 by Extractor: ImageNet Normalization')
plt.xlabel('Feature Extractor')
plt.ylabel('Precision@5')
plt.xticks(rotation=30, ha='right')
plt.show()