## Imports

In [14]:
# standard library
import os
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory;
# it is appended to sys.path before the `utils` imports below
# (presumably so that the project-local `utils` package resolves - confirm).
PROJECT_DIR = Path.cwd().parent
sys.path.append(str(PROJECT_DIR))

# viz
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
# basics
import numpy as np
import pandas as pd
from matplotlib.lines import Line2D

# project-local helpers
from utils import config
from utils.reader import read_file_yaml
from utils.utils import get_last_modification_directory

# Seed NumPy's global random generator.
np.random.seed(0)

## Parameters

In [15]:
# Configuration file that drives every path and extension used in this notebook.
path_conf = PROJECT_DIR / "conf"
file_path_parameters = path_conf / "parameters.yml"

params = read_file_yaml(file_path_parameters)

# Folder holding the simulation results, and folder receiving the figures.
path_root = PROJECT_DIR / params["results"]["filepath"]
path_outputs = PROJECT_DIR / "outputs"

# exist_ok=True makes the creation idempotent and avoids the
# check-then-create race of `os.path.exists` + `os.makedirs`.
path_outputs.mkdir(parents=True, exist_ok=True)

# Base name (no extension) of the global figure.
file_path_simulation_plot = path_outputs / params["outputs"]["simulation_random"]["filepath"]

# Base name (no extension) of the per-dataset figures; it is later passed
# through `str.format(dataset_name)`.
file_path_plot_i_dataset = path_outputs / params["outputs"]["simulation_random"]["data_filepath"]

# Collect the run folders named "random_n<digits>". Any other entry of
# `path_root` (placeholder files, editor/OS artefacts, ...) is ignored instead
# of being handed to `int()`, which would raise ValueError.
_run_prefix = "random_n"
n_random = np.sort(
    [
        int(entry[len(_run_prefix):])
        for entry in os.listdir(path_root)
        if entry.startswith(_run_prefix) and entry[len(_run_prefix):].isdecimal()
    ]
)
path_random = [_run_prefix + str(i) for i in n_random]
path_results = [path_root / i for i in path_random]

# Project helper; from its name it presumably narrows each run folder to its
# most recently modified sub-directory - confirm in utils.utils.
path_results, path_random = get_last_modification_directory(path_results, path_random, params)

# File extensions read from the configuration.
ext_type = params["outputs"]["extension_type"]
ext_local_img = params["outputs"]["extension_local_img"]
ext_best_img = params["outputs"]["extension_best_img"]

In [16]:
# Print the project root framed by two dashed rules of the same width.
under_line = "\n{}\n"
title_part_n2 = "PROJECT_DIR: [ {} ]".format(PROJECT_DIR)
# Both rules are identical: one dash per character of the title line.
title_part_n1 = title_part_n3 = under_line.format("-" * len(title_part_n2))
print("".join((title_part_n1, title_part_n2, title_part_n3)))


-------------------------------------------------------------
PROJECT_DIR: [ /home/manuel/projects/aaai-claire-clustering ]
-------------------------------------------------------------



## Read metrics

In [17]:
# NOTE(review): this re-reads the same YAML file that was already loaded into
# `params` in the Parameters section; `parameters` is not referenced by any
# other cell visible in this notebook.
parameters = read_file_yaml(file_path_parameters)

In [18]:
# Value from the configuration used later to pick the seed run ("random_n<init>").
init = params["outputs"]["init_values"]

# metrics[<run name>][<dataset name>] -> DataFrame read from
# <run dir>/<dataset>/metrics/metrics<ext_type>, first CSV column as index.
metrics_filename = "metrics" + ext_type
metrics = {}
for name, url in zip(path_random, path_results):
    metrics[name] = {}
    for dataset in os.listdir(url):
        dataset_dir = url / Path(dataset)
        # Only directories are datasets; a stray file next to them would
        # otherwise make read_csv fail on a non-existent nested path.
        if not dataset_dir.is_dir():
            continue
        metrics[name][dataset] = pd.read_csv(dataset_dir / "metrics" / metrics_filename, index_col=0)

## Concatenate all results

In [19]:
def _abilities_column(frame, label):
    """Extract the ``abilities`` column of one metrics table in mergeable form.

    Parameters
    ----------
    frame : pandas.DataFrame
        Metrics table indexed by model name, with an ``abilities`` column.
    label : str
        Name given to the extracted ``abilities`` column.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``model`` (the former index) and ``label``. Rows whose
        index label contains ``random_model`` are dropped.
    """
    # Filtering on axis=0 replaces the former `.T.filter(...).T` round trip,
    # which copies the table twice and upcasts mixed-dtype columns to object.
    return (
        frame.filter(regex="^(?!.*random_model)", axis=0)[["abilities"]]
        .rename(columns={"abilities": label})
        .rename_axis("model")
        .reset_index()
    )


# Every (run, dataset) pair, with the seed pair moved to the front so that its
# column stays first, as before. The sort is stable, so all other pairs keep
# their original order; if the seed table is absent nothing is reordered
# (previously that was a KeyError).
_seed_pair = (f"random_n{init}", "aniso")
_pairs = [(run, dataset) for run, by_dataset in metrics.items() for dataset in by_dataset]
_pairs.sort(key=lambda pair: pair != _seed_pair)

if not _pairs:
    raise ValueError("no metrics tables were loaded; nothing to concatenate")

# Outer-merge all columns on the model name: one row per model, one
# "n<run>_<dataset>" column per metrics table.
data = None
for _run, _dataset in _pairs:
    _column = _abilities_column(
        metrics[_run][_dataset],
        "n{}_{}".format(_run.replace("random_n", ""), _dataset),
    )
    data = _column if data is None else data.merge(_column, on="model", how="outer")

data = data.set_index("model")

In [20]:
# Split `data` into one frame per dataset: rows are models, columns are the
# run labels ("n<k>") in the order given by `path_random`.
#
# Columns are selected by their exact "<run label>_<dataset>" name. The former
# approach used the dataset name as an unanchored, unescaped regex in
# `filter`, then relied on `str.replace`, whose `regex` default differs
# between pandas versions.
_run_labels = [run.replace("random_", "") for run in path_random]

datasets = {}
for i in config.file_names:
    # full column name -> short run label
    _wanted = {label + "_" + i: label for label in _run_labels}
    # Runs with no column for this dataset are skipped, as before.
    _present = [column for column in _wanted if column in data.columns]
    datasets[i] = data[_present].rename(columns=_wanted)

## Plots

In [21]:
# Model families to draw, plus the two aggregate rows.
models = list(config.models.keys()) + ["average_model", "optimal_clustering"]
# NOTE(review): `gray_scale` is not used by any cell visible in this notebook.
gray_scale = [str(i / len(models)) for i in range(1, len(models))]
# One colour per model, evenly spaced over the coolwarm colormap.
colors = plt.cm.coolwarm(np.linspace(0, 1, len(models)))
markers = list(mlines.Line2D.markers.keys())

# Line style per model. The marker index wraps around so that having more
# models than available markers no longer raises IndexError.
_params = {
    _model: {"color": colors[_idx], "marker": markers[_idx % len(markers)], "linestyle": "--"}
    for _idx, _model in enumerate(models)
}
# Proxy artists for the legends, in the same order as `models`.
handler_lines = [Line2D([], [], **_style) for _style in _params.values()]

In [22]:
# by dataset: one figure per dataset, kept in `figs_dataset` for the Save section
figs_dataset = {}
for name, content in datasets.items():
    _fig, ax = plt.subplots(1, 1, figsize=(25, 8))
    # Keep only the models that have a value for every run.
    content = content.dropna()
    # Tick labels depend only on the columns ("n<k>" -> "$p_{(<k>)}$"), so
    # they are built once per dataset instead of once per plotted line.
    _line_index = ["$p_{(" + column.split("n")[1] + ")}$" for column in content.columns]
    for line_name, line_data in content.iterrows():
        for i in models:
            # NOTE(review): substring test - a model key contained in another
            # model's row label would also style that row; confirm that the
            # keys of config.models cannot overlap this way.
            if i in line_name:
                ax.plot(_line_index, line_data, **_params[i])
    ax.grid(True)
    ax.set_title(name)
    ax.set_ylabel("$abilities$")
    ax.set_xlabel(r"$n\_random\_model$")
    ax.legend(handler_lines, models, loc="upper left", bbox_to_anchor=(1.00, 1.0))
    figs_dataset[name] = _fig
    plt.ioff()

In [23]:
# global plot: all datasets stacked vertically in a single figure
# squeeze=False always returns a 2-D array of axes, so the single-dataset case
# needs no special handling; the only column is taken as a 1-D sequence.
fig, axes = plt.subplots(len(datasets), 1, figsize=(25, 8 * len(datasets)), squeeze=False)
axes = axes[:, 0]

for ax, (name, content) in zip(axes, datasets.items()):
    # Keep only the models that have a value for every run.
    content = content.dropna()
    # Tick labels depend only on the columns ("n<k>" -> "$p_{(<k>)}$"), so
    # they are built once per dataset instead of once per plotted line.
    _line_index = ["$p_{(" + column.split("n")[1] + ")}$" for column in content.columns]
    for line_name, line_data in content.iterrows():
        for i in models:
            # NOTE(review): substring test - a model key contained in another
            # model's row label would also style that row; confirm that the
            # keys of config.models cannot overlap this way.
            if i in line_name:
                ax.plot(_line_index, line_data, **_params[i])
    ax.grid(True)
    ax.set_title(name)
    ax.set_ylabel("$abilities$")

# Only the bottom panel carries the x label; the legend sits beside the top
# one and reuses the proxy artists built with the line styles.
axes[-1].set_xlabel(r"$p\_random\_model$")
axes[0].legend(handler_lines, models, loc="upper left", bbox_to_anchor=(1.00, 1.0))
plt.ioff()

<contextlib.ExitStack at 0x7f6da91b9e10>

In [33]:
# Display the metrics table of one run/dataset pair; the keys are hard-coded
# to the "random_n6" run and the "aniso" dataset.
metrics["random_n6"]["aniso"]

Unnamed: 0,abilities,v_measure,mutual_info,adjusted_rand_score,calinski_harabasz,davies_bouldin,silhouette
optimal_clustering,0.509038,,,,,,
spectral_clustering_n_clusters_1_eigen_solver_arpack_affinity_nearest_neighbors,0.507173,0.958005,1.052461,0.976072,902.045151,0.883421,0.462729
kernel_kmeans_n_clusters_1_kernel_gak_random_state_0,0.506526,0.971133,1.092017,0.984961,424.889244,4.614135,0.277312
spectral_clustering_n_clusters_8_eigen_solver_arpack_affinity_nearest_neighbors,0.505443,0.936025,1.073629,0.95802,505.389306,9.430885,0.452949
kmeans_n_clusters_3,0.503607,0.631689,0.693941,0.605685,1226.738186,0.703821,0.50552
dbscan_eps_0_3_min_samples_2,0.502415,0.586506,0.644291,0.572083,1232.441445,0.711837,0.502867
dbscan_eps_0_4_min_samples_2,0.499515,0.553608,0.673218,0.451494,1137.418844,0.73909,0.458721
spectral_clustering_n_clusters_3_eigen_solver_arpack_affinity_nearest_neighbors,0.499461,0.806182,1.071783,0.713607,904.288345,0.871982,0.413852
kmeans_n_clusters_4,0.498035,0.554935,0.671732,0.426969,1121.633412,0.729018,0.45624
dbscan_eps_0_6_min_samples_2,0.495114,0.669361,0.965071,0.501724,1242.272996,0.687849,0.473917


## Save

In [None]:
# Write the global figure once per configured image extension
# (noted as eps and png in the original comments).
for _ext in (ext_best_img, ext_local_img):
    fig.savefig(str(file_path_simulation_plot) + _ext)

In [None]:
# Write each per-dataset figure once per configured image extension
# (noted as eps and png in the original comments).
for name, content in figs_dataset.items():
    # The configured path is a template; the dataset name fills it in.
    _base_path = str(file_path_plot_i_dataset).format(name)
    for _ext in (ext_best_img, ext_local_img):
        content.savefig(_base_path + _ext)