In [None]:
import os.path as op
import gc

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from nimare.reports.figures import _reorder_matrix
# ['single', 'complete', 'average', 'weighted', 'ward']
def reorder_matrix(temp_data_df, flip_rows=False, flip_cols=False):
    """Reorder a DataFrame with NiMARE's ``_reorder_matrix`` helper.

    The frame's values, index labels and column labels are handed to
    ``_reorder_matrix`` together with the string ``"complete"``. The matrix
    and labels it returns are optionally reversed along either axis and
    wrapped in a new DataFrame.

    Parameters
    ----------
    temp_data_df : pandas.DataFrame
        Matrix to reorder; the index and columns supply the labels.
    flip_rows : bool, optional
        Reverse the row order of the reordered result.
    flip_cols : bool, optional
        Reverse the column order of the reordered result.

    Returns
    -------
    pandas.DataFrame
        Reordered (and optionally flipped) matrix with matching labels.
    """
    reordered, row_names, col_names = _reorder_matrix(
        temp_data_df.to_numpy(),
        temp_data_df.index.to_list(),
        temp_data_df.columns.to_list(),
        "complete",
    )

    # A step of -1 reverses an axis; a step of 1 leaves it as returned.
    row_step = -1 if flip_rows else 1
    col_step = -1 if flip_cols else 1

    return pd.DataFrame(
        reordered[::row_step, ::col_step],
        index=row_names[::row_step],
        columns=col_names[::col_step],
    )

In [None]:
def bin_df(df, percentile_threshold=90):
    """Binarize a numeric DataFrame at a global percentile.

    The threshold is the ``percentile_threshold``-th percentile of all
    non-missing values in ``df`` taken together (not per row or column).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric values to binarize.
    percentile_threshold : float, optional
        Percentile in the range 0-100 used as the cut-off. Default is 90.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``df`` holding 1 where the
        value is strictly greater than the threshold and 0 elsewhere.
        Missing values compare as False and therefore become 0.
    """
    # NaN-aware quantile over every cell at once.
    threshold_value = np.nanquantile(
        df.to_numpy(dtype=float), percentile_threshold / 100
    )
    # Vectorized comparison instead of the deprecated element-wise applymap.
    return (df > threshold_value).astype("int64")

In [None]:
def plot_profile(temp_data_df, metric, hue_order, cmap="tab20"):
    """Save one per-segment line plot of ``metric`` for every segment solution.

    For each of the 31 segment solutions (``segment_solution`` values 2 to
    32) the rows of that solution are plotted as ``metric`` versus
    ``segment``, one line per ``method``. The seaborn legend is replaced by
    a legend listing "mean ± std" of ``metric`` per method, sorted by
    descending mean. Each figure is written to
    ``./Fig/performance/{metric}_profile_{seg_sol}.eps``, where ``seg_sol``
    is the zero-based loop index. A separate legend-only figure with the
    abbreviated method names is written to
    ``./Fig/performance/{metric}_profile_legend.eps``.

    Parameters
    ----------
    temp_data_df : pandas.DataFrame
        Table with columns ``segment_solution``, ``segment``, ``method`` and
        ``metric``.
    metric : str
        Column to plot. ``"max_corr"``, ``"information_content"`` and
        ``"tfidf"`` get a dedicated y-axis label.
    hue_order : list of str
        Method names in plotting order. Each name must have the form
        ``"<model>_<dataset>_<segmentation>"`` because the legend labels are
        split on ``"_"``.
    cmap : str, optional
        Palette passed to ``sns.lineplot``.
    """
    sns.set(style="whitegrid")

    n_segments = 31
    fontsize = 14
    out_dir = op.join("./Fig", "performance")

    for seg_sol in range(n_segments):
        n_seg = seg_sol + 2
        fig, ax = plt.subplots(1, 1)
        fig.set_size_inches(9 + seg_sol * 0.2, 4)

        test_df = temp_data_df[temp_data_df["segment_solution"] == n_seg]
        test_df = test_df.reset_index()
        # String segment ids make the x axis categorical.
        test_df["segment"] = test_df["segment"].astype(str)

        sns.lineplot(
            data=test_df,
            x="segment",
            y=metric,
            palette=cmap,
            hue="method",
            hue_order=hue_order,
            marker="o",
            ax=ax,
        )
        if seg_sol == 0:
            # Keep the first figure's handles for the standalone legend
            # figure drawn after the loop.
            handles, labels = ax.get_legend_handles_labels()
            for handle in handles:
                handle.set_linewidth(8)
            new_labels = []
            for label in labels:
                method, dset_nm, seg = label.split("_")
                method = method.upper()
                seg = "PCT" if seg == "Percentile" else seg
                dset_nm = "NS" if dset_nm == "neurosynth" else "NQ"
                new_labels.append(f"{dset_nm}-{method}-{seg}")

        ax.get_legend().remove()

        # Per-method summary text shown in place of the method names.
        text_lst = []
        mean_lst = []
        for approach in hue_order:
            approach_df = test_df[test_df["method"] == approach]
            mean_corr = approach_df[metric]
            text_lst.append(f"{mean_corr.mean():.3f} ± {mean_corr.std():.3f}")
            mean_lst.append(mean_corr.mean())

        ax_handles, _ax_labels = ax.get_legend_handles_labels()
        sort_idx = np.argsort(-np.array(mean_lst))  # descending mean
        for handle in ax_handles:
            handle.set_linewidth(6)

        # Raw string: "\p" and "\s" are not valid Python escape sequences.
        legend_title = r"$Mean \pm \sigma$"
        ax.legend(
            np.array(ax_handles)[sort_idx],
            np.array(text_lst)[sort_idx],
            loc="upper left",
            bbox_to_anchor=(1.04, 1),
            ncol=2,
            title=legend_title,
            fontsize=fontsize,
        )

        ax.set_xlabel('Segment ID', fontsize=fontsize)
        plt.xticks(fontsize=fontsize)
        if metric == "max_corr":
            ax.set_ylabel('Max Correlation Coefficient', fontsize=fontsize)
            plt.yticks(fontsize=fontsize)
            ax.set_yticks([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
            ax.set_yticklabels([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], fontsize=fontsize)
        elif metric == "information_content":
            ax.set_ylabel('Information Content', fontsize=fontsize)
            plt.yticks(fontsize=fontsize)
        elif metric == "tfidf":
            ax.set_ylabel('TFIDF', fontsize=fontsize)

        ax.set_title(f"Segment Solution {n_seg:02d}", fontsize=fontsize)
        fig.tight_layout()
        fig.savefig(op.join(out_dir, f"{metric}_profile_{seg_sol}.eps"), bbox_inches="tight")
        # Close explicitly: 31 figures are created per call.
        plt.close(fig)
        gc.collect()

    # Legend-only figure built from the first iteration's handles.
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(4, 4)
    fig.legend(
        handles,
        new_labels,
        loc="center",
        ncol=9,
        fontsize=fontsize,
    )
    ax.axis('off')
    fig.tight_layout()
    fig.savefig(op.join(out_dir, f"{metric}_profile_legend.eps"), bbox_inches="tight")
    plt.close(fig)
    gc.collect()

In [None]:
def plot_mean_profile(temp_data_df, metric, hue_order, cmap="tab20"):
    """Plot a per-method metric against segment solution and save it as EPS.

    ``metric`` is drawn on the x axis and ``segment_solution`` (as a string
    category) on the y axis, one line per ``method``. The figure is written
    to ``./Fig/performance/mean_{metric}_profile.eps``.

    Parameters
    ----------
    temp_data_df : pandas.DataFrame
        Table with columns ``segment_solution``, ``method`` and ``metric``.
        It is not modified; the string cast is applied to a copy.
    metric : str
        Column to plot. ``"max_corr"``, ``"ic"``, ``"tfidf"`` and ``"snr"``
        get a dedicated x-axis label and font size; any other value uses
        the default font size and no x-axis label.
    hue_order : list of str
        Method names in plotting order.
    cmap : str, optional
        Palette passed to ``sns.lineplot``.
    """
    # Work on a copy so the caller's frame keeps its original dtype.
    plot_df = temp_data_df.assign(
        segment_solution=temp_data_df["segment_solution"].astype(str)
    )

    sns.set(style="white")
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(3, 15)

    sns.lineplot(
        data=plot_df,
        x=metric,
        y="segment_solution",
        palette=cmap,
        hue="method",
        hue_order=hue_order,
        sort=False,
        marker="o",
        ax=ax,
        estimator=None,
    )
    ax.get_legend().remove()
    ax.spines[['right', "bottom"]].set_visible(False)
    ax.xaxis.set_ticks_position('top')
    ax.xaxis.set_label_position('top')

    # Default keeps the tick/label calls below valid for unlisted metrics.
    fontsize = 12
    if metric == "max_corr":
        fontsize = 12
        ax.set_xlabel('Mean Correlation Coefficient', fontsize=fontsize)
        ax.set_xticks([0.1, 0.3, 0.5, 0.7, 0.9])
        ax.set_xticklabels([0.1, 0.3, 0.5, 0.7, 0.9], fontsize=fontsize)
    elif metric == "ic":
        fontsize = 16
        ax.set_xlabel('Information Content', fontsize=fontsize, labelpad=10)
    elif metric == "tfidf":
        fontsize = 16
        ax.set_xlabel('Mean TFIDF', fontsize=fontsize, labelpad=10)
    elif metric == "snr":
        fontsize = 12
        ax.set_xlabel('SNR', fontsize=fontsize, labelpad=10)

    plt.xticks(fontsize=fontsize)
    ax.set_ylabel('Segment Solution', fontsize=fontsize)
    plt.yticks(fontsize=fontsize)

    fig.savefig(op.join("./Fig", "performance", f"mean_{metric}_profile.eps"), bbox_inches="tight")
    plt.close(fig)
    gc.collect()

In [None]:
def plot_mean_sbars(sub_mean_data_df, metric, hue_order, cmap="tab20"):
    """Draw a stacked horizontal bar chart of a metric and save it as EPS.

    Each row of ``sub_mean_data_df`` becomes one stacked bar and each column
    one coloured segment. The y axis is inverted so the first row is at the
    top. The figure is written to
    ``./Fig/performance/mean_{metric}_barh.eps``.

    Parameters
    ----------
    sub_mean_data_df : pandas.DataFrame
        Wide table to plot with ``DataFrame.plot.barh``.
    metric : str
        Name used for the output file. ``"max_corr"``, ``"ic"``, ``"tfidf"``
        and ``"snr"`` get a dedicated x-axis label and font size.
    hue_order : list of str
        Method names; only its length is used, to pick the number of colours.
    cmap : str, optional
        Matplotlib colormap name supplying the bar colours. Default
        ``"tab20"``.
    """
    sns.set(style="white")

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(3, 15)

    # Honour the cmap argument instead of a hard-coded "tab20".
    palette = plt.get_cmap(cmap)
    n_colors = len(hue_order)
    if hasattr(palette, "colors"):
        # Listed colormaps: take the first n entries.
        bar_colors = palette.colors[:n_colors]
    else:
        # Continuous colormaps have no colour list; sample them evenly.
        bar_colors = [palette(pos) for pos in np.linspace(0, 1, n_colors)]

    sub_mean_data_df.plot.barh(
        rot=0,
        width=.81,
        stacked=True,
        color=bar_colors,
        ax=ax,
    )
    ax.invert_yaxis()
    ax.get_legend().remove()
    ax.spines[['right', "bottom"]].set_visible(False)
    ax.xaxis.set_ticks_position('top')
    ax.xaxis.set_label_position('top')

    # Default used when the metric has no dedicated branch below.
    fontsize = 12
    if metric == "max_corr":
        ax.set_xlabel('Mean Correlation Coefficient', fontsize=fontsize)
        ax.set_xticks([0.1, 0.2, 0.3, 0.4, 0.5])
        ax.set_xticklabels([0.1, 0.2, 0.3, 0.4, 0.5], fontsize=fontsize)
    elif metric == "ic":
        fontsize = 16
        ax.set_xlabel('Information Content', fontsize=fontsize, labelpad=10)
    elif metric == "tfidf":
        fontsize = 16
        ax.set_xlabel('Mean TFIDF', fontsize=fontsize, labelpad=10)
    elif metric == "snr":
        fontsize = 16
        ax.set_xlabel('Normalized SNR', fontsize=fontsize)

    plt.xticks(fontsize=fontsize)
    ax.set_ylabel('Segment Solution', fontsize=fontsize)
    plt.yticks(fontsize=fontsize)

    fig.savefig(op.join("./Fig", "performance", f"mean_{metric}_barh.eps"), bbox_inches="tight")
    plt.close(fig)
    gc.collect()

In [None]:
# Input/output locations, resolved against the current working directory.
result_dir = op.abspath("../results")
figure_dir = op.abspath("./Fig")

# Row order for the uni-dimensional segmentation heatmaps.
method_order = ["PCT", "KMeans", "KDE"]

# Row order for the high-dimensional heatmaps: "G1", "G1:G2", ..., "G1:G9".
component_order = ["G1"] + [f"G1:G{last}" for last in range(2, 10)]

# Building blocks of the "<model>_<dataset>_<segmentation>" method names.
_decoder_models = ("term", "lda", "gclda")
_dataset_abbrev = {"neurosynth": "NS", "neuroquery": "NQ"}

# Plotting order: grouped by model, then segmentation, then dataset.
hue_order = [
    f"{model}_{dset}_{segm}"
    for model in _decoder_models
    for segm in method_order
    for dset in _dataset_abbrev
]

# Display labels of the form "<NS|NQ>-<MODEL>-<segmentation>".
model_dict = {
    f"{model}_{dset}_{segm}": f"{abbrev}-{model.upper()}-{segm}"
    for model in _decoder_models
    for segm in method_order
    for dset, abbrev in _dataset_abbrev.items()
}

In [None]:
# Scratch check: mean and (population) standard deviation of two values.
scratch_values = [0.739303, 0.702998]
print(np.mean(scratch_values), np.std(scratch_values))

In [None]:
# Per-segment performance table (tab-separated).
performance_path = op.join(result_dir, "performance", "performance.tsv")
data_df = pd.read_csv(performance_path, sep="\t")
data_df

In [None]:
# Keep the first segment and the last segment of every segment solution.
is_last_segment = data_df["segment_solution"] == data_df["segment"]
is_first_segment = data_df["segment"] == 1
suub_data_df = data_df[is_last_segment | is_first_segment]
suub_data_df.head(10)

In [None]:
# Pool every model/dataset combination per segmentation approach and print
# the mean and standard deviation of the pooled max correlation.
decoder_models = ("term", "lda", "gclda")
decoder_dsets = ("neurosynth", "neuroquery")

for segm in ("PCT", "KMeans", "KDE"):
    ic_lst, tfidf_lst, corr_lst = [], [], []
    for model in decoder_models:
        for dset in decoder_dsets:
            model_nm = "_".join((model, dset, segm))
            is_method = data_df["method"] == model_nm

            ic_lst.append(data_df.loc[is_method, "information_content"].to_list())
            tfidf_lst.append(data_df.loc[is_method, "tfidf"].to_list())
            coor = data_df.loc[is_method, "max_corr"].to_list()
            corr_lst.append(coor)

    # Flatten the per-method lists into one array per metric.
    ic_arr = np.hstack(ic_lst)
    tfidf_arr = np.hstack(tfidf_lst)
    corr_arr = np.hstack(corr_lst)
    print(segm, "Corr", corr_arr.mean(), corr_arr.std())

In [None]:
# Performance averaged per method and segment solution (tab-separated).
performance_average_path = op.join(result_dir, "performance", "performance_average.tsv")
mean_data_df = pd.read_csv(performance_average_path, sep="\t")
mean_data_df

In [None]:
# Segmentation scores for the uni-dimensional case.
ld_scores_path = op.join(result_dir, "segmentation", "scores_uni-dimensional.csv")
ld_scores_df = pd.read_csv(ld_scores_path)
ld_scores_df

In [None]:
# Segmentation scores for the high-dimensional case.
hd_scores_path = op.join(result_dir, "segmentation", "scores_high-dimensional.csv")
hd_scores_df = pd.read_csv(hd_scores_path)
hd_scores_df

In [None]:
# Fig-S7: max correlation versus segment size, for all segments (left) and
# for the first/last segments only (right), each with a log-x regression.
sns.set(style="whitegrid")
from matplotlib.patches import Rectangle

fig, axes = plt.subplots(1, 2)
fig.set_size_inches(8, 3)

panel_frames = [data_df, suub_data_df]
panel_titles = ["All Segments", "End Segments"]

for panel_ax, panel_frame in zip(axes, panel_frames):
    sns.regplot(
        data=panel_frame,
        x="segment_size",
        y="max_corr",
        logx=True,
        scatter_kws={"s": 10},
        line_kws={"color": "r"},
        ax=panel_ax,
    )

for panel_i, (panel_ax, panel_title) in enumerate(zip(axes, panel_titles)):
    panel_ax.set_xlabel("Segment Size", fontsize=12)
    if panel_i == 0:
        panel_ax.set_ylabel("Max Correlation Coefficient", fontsize=12)
    else:
        # The right panel uses the same y limits, so its labels are hidden.
        panel_ax.set_ylabel("")
        panel_ax.set_yticklabels([])
    panel_ax.set_title(panel_title, fontsize=12)
    panel_ax.set_xlim(1, 42000)
    panel_ax.set_ylim(0, 0.8)

plt.tight_layout()
plt.savefig(op.join("./Fig", "Fig-S7.png"), dpi=600, bbox_inches="tight")
plt.show()

In [None]:
# Decoding metrics: one method-by-segment-solution table per metric.
glue_corr, glue_ic, glue_tfidf, glue_snr = (
    mean_data_df.pivot(index="method", columns="segment_solution", values=value_col)
    for value_col in ("max_corr", "ic", "tfidf", "snr")
)

# Clustering scores: uni-dimensional tables are indexed by method,
# high-dimensional tables by component.
score_cols = ("silhouette", "variance_ratio", "cluster_separation")
glue_silhouette, glue_variance, glue_separation = (
    ld_scores_df.pivot(index="method", columns="segment", values=value_col)
    for value_col in score_cols
)
glue_high_silhouette, glue_high_variance, glue_high_separation = (
    hd_scores_df.pivot(index="component", columns="segment", values=value_col)
    for value_col in score_cols
)

In [None]:
# Order every table for plotting and build the matching 0/1 masks with
# bin_df. The masks must be built from the *sorted* frames: the heatmap
# cells place the red rectangles by row/column position, so a mask in a
# different row order than the plotted frame marks the wrong rows.
percent = 70

# Decoding metrics: rows follow hue_order and are renamed to display labels.
glue_corr_sorted = glue_corr.reindex(hue_order).rename(index=model_dict)
glue_corr_sorted_bin = bin_df(glue_corr_sorted, 90)

glue_ic_sorted = glue_ic.reindex(hue_order).rename(index=model_dict)
glue_ic_sorted_bin = bin_df(glue_ic_sorted, percent)

glue_tfidf_sorted = glue_tfidf.reindex(hue_order).rename(index=model_dict)
glue_tfidf_sorted_bin = bin_df(glue_tfidf_sorted, percent)

glue_snr_sorted = glue_snr.reindex(hue_order).rename(index=model_dict)
glue_snr_sorted_bin = bin_df(glue_snr_sorted, 90)

# Uni-dimensional clustering scores: rows follow method_order.
glue_silhouette_sorted = glue_silhouette.reindex(method_order)
glue_silhouette_sorted_bin = bin_df(glue_silhouette_sorted, 98)

glue_variance_sorted = glue_variance.reindex(method_order)
glue_variance_sorted_bin = bin_df(glue_variance_sorted, 98)

# Separation is negated before thresholding, so the mask flags the
# smallest values.
glue_separation_sorted = glue_separation.reindex(method_order)
glue_separation_sorted_bin = bin_df(glue_separation_sorted * -1, 98)

# High-dimensional clustering scores: rows follow component_order.
glue_high_silhouette_sorted = glue_high_silhouette.reindex(component_order)
glue_high_silhouette_sorted_bin = bin_df(glue_high_silhouette_sorted, 98)

glue_high_variance_sorted = glue_high_variance.reindex(component_order)
glue_high_variance_sorted_bin = bin_df(glue_high_variance_sorted, 98)

glue_high_separation_sorted = glue_high_separation.reindex(component_order)
glue_high_separation_sorted_bin = bin_df(glue_high_separation_sorted * -1, 98)

In [None]:
# Fig-S10a: high-dimensional clustering scores, one heatmap per score, with
# red outlines on the cells flagged in the matching 0/1 mask.
sns.set(style="whitegrid")
from matplotlib.patches import Rectangle

titles = ["Mean Silhouette Coefficient", "Variance Ratio", "Cluster Separation"]
fig, axes = plt.subplots(3, 1)
fig.set_size_inches(8, 8)

data_df_lst = [glue_high_silhouette_sorted, glue_high_variance_sorted, glue_high_separation_sorted]
data_bin_df_lst = [glue_high_silhouette_sorted_bin, glue_high_variance_sorted_bin, glue_high_separation_sorted_bin]
for met_i, (dat_df, data_bin_df, title) in enumerate(zip(data_df_lst, data_bin_df_lst, titles)):
    ax = axes[met_i]
    is_bottom_panel = met_i == 2

    # Only the bottom (cluster separation) panel caps the colour scale.
    heatmap_kws = {"vmax": 3} if is_bottom_panel else {}
    sns.heatmap(dat_df, cmap="Blues", yticklabels=True, ax=ax, **heatmap_kws)

    # Show every second x tick label (even segment solutions only).
    x_lab = []
    for lab in ax.get_xticklabels():
        x_lab.append(lab if int(lab.get_text()) % 2 == 0 else "")
    ax.set_xticklabels(x_lab, rotation=0, fontsize=14)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

    # Outline each flagged cell; x is the column position, y the row position.
    non_zero_indices = np.nonzero(data_bin_df)
    for row_pos, col_pos in zip(non_zero_indices[0], non_zero_indices[1]):
        ax.add_patch(Rectangle((col_pos, row_pos), 1, 1, fill=False, edgecolor='red', lw=1))

    ax.set_ylabel("")
    ax.set_title(title, fontsize=16)
    if is_bottom_panel:
        ax.set_xlabel('Segment Solution', fontsize=16)
        ax.tick_params(axis='x', labelsize=14)
    else:
        ax.set_xlabel("")
        ax.set_xticklabels([])

plt.tight_layout()
plt.savefig(op.join("./Fig", "Fig-S10a.png"), dpi=600, bbox_inches="tight")
plt.show()

In [None]:
from functools import reduce

# Per cell, count how many of the three high-dimensional masks flag it.
clust_high_df_sum = glue_high_silhouette_sorted_bin
for extra_bin_df in (glue_high_variance_sorted_bin, glue_high_separation_sorted_bin):
    clust_high_df_sum = clust_high_df_sum.add(extra_bin_df, fill_value=0)
clust_high_df_sum

In [None]:
# Fig-S10b: number of high-dimensional masks that flag each cell, drawn with
# one discrete colour per possible count.
import matplotlib.colors as colors

fig, ax = plt.subplots(1, 1)
fig.set_size_inches(8, 3)

# Counts run from 0 to the observed maximum, so there are max + 1 colours.
n_bins = int(clust_high_df_sum.max().max()) + 1
vals = np.arange(n_bins)
vals_ticks = vals + 0.5  # centre of each unit-wide colour band
vals_labels = [str(lab) for lab in vals]
# plt.get_cmap replaces plt.cm.get_cmap, which recent Matplotlib removed.
cmap = plt.get_cmap('Blues', n_bins)

sns.heatmap(
    clust_high_df_sum,
    cmap=cmap,
    vmin=0,
    # vmax equals the number of colours so band k covers [k, k + 1) and
    # lines up with the tick at k + 0.5.
    vmax=n_bins,
    xticklabels=True,
    yticklabels=True,
    ax=ax
)
# Show every second x tick label (even segment solutions only).
x_lab = [lab if int(lab.get_text()) % 2 == 0 else "" for lab in ax.get_xticklabels()]
ax.set_xticklabels(x_lab, rotation=0, fontsize=14)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=14)

# Hand-placed highlight at column position 2, row position 6.
ax.add_patch(Rectangle((2, 6), 1, 1, fill=False, edgecolor='red', lw=3))

ax.set_ylabel("")
ax.set_xlabel('Segment Solution', fontsize=16)
colorbar = ax.collections[0].colorbar
colorbar.set_ticks(vals_ticks)
colorbar.set_ticklabels(vals_labels)
ax.set_title("Overall Performance", fontsize=16)

plt.tight_layout()
plt.savefig(op.join("./Fig", "Fig-S10b.png"), dpi=600, bbox_inches="tight")
plt.show()

In [None]:
# Fig-S08a: uni-dimensional clustering scores, one heatmap per score, with
# red outlines on the cells flagged in the matching 0/1 mask.
sns.set(style="whitegrid")
from matplotlib.patches import Rectangle

titles = ["Mean Silhouette Coefficient", "Variance Ratio", "Cluster Separation"]
fig, axes = plt.subplots(3, 1)
fig.set_size_inches(8, 4)

data_df_lst = [glue_silhouette_sorted, glue_variance_sorted, glue_separation_sorted]
data_bin_df_lst = [glue_silhouette_sorted_bin, glue_variance_sorted_bin, glue_separation_sorted_bin]
for met_i, (dat_df, data_bin_df, title) in enumerate(zip(data_df_lst, data_bin_df_lst, titles)):
    ax = axes[met_i]
    is_bottom_panel = met_i == 2

    sns.heatmap(dat_df, cmap="Blues", yticklabels=True, ax=ax)

    # Show every second x tick label (even segment solutions only).
    x_lab = []
    for lab in ax.get_xticklabels():
        x_lab.append(lab if int(lab.get_text()) % 2 == 0 else "")
    ax.set_xticklabels(x_lab, rotation=0, fontsize=14)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

    # Outline each flagged cell; x is the column position, y the row position.
    non_zero_indices = np.nonzero(data_bin_df)
    for row_pos, col_pos in zip(non_zero_indices[0], non_zero_indices[1]):
        ax.add_patch(Rectangle((col_pos, row_pos), 1, 1, fill=False, edgecolor='red', lw=1))

    ax.set_ylabel("")
    ax.set_title(title, fontsize=16)
    if is_bottom_panel:
        ax.set_xlabel('Segment Solution', fontsize=16)
        ax.tick_params(axis='x', labelsize=14)
    else:
        ax.set_xlabel("")
        ax.set_xticklabels([])

plt.tight_layout()
plt.savefig(op.join("./Fig", "Fig-S08a.png"), dpi=600, bbox_inches="tight")
plt.show()

In [None]:
from functools import reduce

# Per cell, count how many of the three uni-dimensional masks flag it.
clust_df_sum = glue_silhouette_sorted_bin
for extra_bin_df in (glue_variance_sorted_bin, glue_separation_sorted_bin):
    clust_df_sum = clust_df_sum.add(extra_bin_df, fill_value=0)
clust_df_sum

In [None]:
# Fig-S08b: number of uni-dimensional masks that flag each cell, drawn with
# one discrete colour per possible count.
import matplotlib.colors as colors

fig, ax = plt.subplots(1, 1)
fig.set_size_inches(8, 2)

# Counts run from 0 to the observed maximum, so there are max + 1 colours.
n_bins = int(clust_df_sum.max().max()) + 1
vals = np.arange(n_bins)
vals_ticks = vals + 0.5  # centre of each unit-wide colour band
vals_labels = [str(lab) for lab in vals]
# plt.get_cmap replaces plt.cm.get_cmap, which recent Matplotlib removed.
cmap = plt.get_cmap('Blues', n_bins)

sns.heatmap(
    clust_df_sum,
    cmap=cmap,
    vmin=0,
    # vmax equals the number of colours so band k covers [k, k + 1) and
    # lines up with the tick at k + 0.5.
    vmax=n_bins,
    xticklabels=True,
    yticklabels=True,
    ax=ax
)
# Show every second x tick label (even segment solutions only).
x_lab = [lab if int(lab.get_text()) % 2 == 0 else "" for lab in ax.get_xticklabels()]
ax.set_xticklabels(x_lab, rotation=0, fontsize=14)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=14)

# Hand-placed highlight at column position 0, row position 1.
ax.add_patch(Rectangle((0, 1), 1, 1, fill=False, edgecolor='red', lw=3))

ax.set_ylabel("")
ax.set_xlabel('Segment Solution', fontsize=16)
colorbar = ax.collections[0].colorbar
colorbar.set_ticks(vals_ticks)
colorbar.set_ticklabels(vals_labels)
ax.set_title("Overall Performance", fontsize=16)

plt.tight_layout()
plt.savefig(op.join("./Fig", "Fig-S08b.png"), dpi=600, bbox_inches="tight")
plt.show()

In [None]:
# Fig-08a: the four decoding metrics as a 2x2 grid of heatmaps, with red
# outlines on the cells flagged in the matching 0/1 mask.
sns.set(style="whitegrid")
from matplotlib.patches import Rectangle

titles = ["Mean Correlation Coefficient", "Information Content", "Mean TFIDF", "Normalized SNR"]
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(15, 10)

data_df_lst = [glue_corr_sorted, glue_ic_sorted, glue_tfidf_sorted, glue_snr_sorted]
data_bin_df_lst = [glue_corr_sorted_bin, glue_ic_sorted_bin, glue_tfidf_sorted_bin, glue_snr_sorted_bin]

# Panel placement per metric: correlation top-left, information content
# bottom-left, TFIDF bottom-right, SNR top-right.
panel_axes = [axes[0, 0], axes[1, 0], axes[1, 1], axes[0, 1]]

# The loop variable is named dat_df, not data_df, so the performance table
# held in the notebook-level data_df is still intact for later cells.
for met_i, (dat_df, data_bin_df, title) in enumerate(zip(data_df_lst, data_bin_df_lst, titles)):
    ax = panel_axes[met_i]

    sns.heatmap(dat_df, cmap="Blues", yticklabels=True, ax=ax)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

    # Outline each flagged cell; x is the column position, y the row position.
    non_zero_indices = np.nonzero(data_bin_df)
    for i in range(len(non_zero_indices[0])):
        ax.add_patch(Rectangle((non_zero_indices[1][i], non_zero_indices[0][i]), 1, 1, fill=False, edgecolor='red', lw=1))

    ax.set_ylabel("")
    ax.set_title(title, fontsize=20)

    # Only the bottom-row panels keep their x axis label and tick labels.
    if met_i == 1 or met_i == 2:
        ax.set_xlabel('Segment Solution', fontsize=18)
        ax.tick_params(axis='x', labelsize=16)
    else:
        ax.set_xlabel("")
        ax.set_xticklabels([])

    # Only the left-column panels keep their y tick labels.
    if met_i == 2 or met_i == 3:
        ax.set_yticklabels([])

plt.tight_layout()
plt.savefig(op.join("./Fig", "Fig-08a.png"), dpi=600, bbox_inches="tight")
plt.show()

In [None]:
from functools import reduce

# Per cell, count how many of the decoding-metric masks flag it.
df_sum = data_bin_df_lst[0]
for extra_bin_df in data_bin_df_lst[1:]:
    df_sum = df_sum.add(extra_bin_df, fill_value=0)

# Hand-entered bonus added to column 2, one value per row in row order.
# NOTE(review): the 1s appear to line up with the KMeans rows when the rows
# follow hue_order - confirm against the index before reuse.
manual_bonus = [0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]
df_sum[2] = df_sum[2] + manual_bonus

In [None]:
# Fig-08b: overall score per method and segment solution, drawn with one
# discrete colour per possible count.
import matplotlib.colors as colors

fig, ax = plt.subplots(1, 1)
fig.set_size_inches(15, 6)

# Counts run from 0 to the observed maximum, so there are max + 1 colours.
n_bins = int(df_sum.max().max()) + 1
vals = np.arange(n_bins)
vals_ticks = vals + 0.5  # centre of each unit-wide colour band
vals_labels = [str(lab) for lab in vals]
# plt.get_cmap replaces plt.cm.get_cmap, which recent Matplotlib removed.
cmap = plt.get_cmap('Blues', n_bins)

sns.heatmap(
    df_sum,
    cmap=cmap,
    vmin=0,
    # vmax equals the number of colours so band k covers [k, k + 1) and
    # lines up with the tick at k + 0.5.
    vmax=n_bins,
    xticklabels=True,
    yticklabels=True,
    ax=ax
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=16)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=16)

# Hand-placed highlight at column position 0, row position 9.
ax.add_patch(Rectangle((0, 9), 1, 1, fill=False, edgecolor='red', lw=3))

ax.set_ylabel("")
ax.set_xlabel('Segment Solution', fontsize=18)
colorbar = ax.collections[0].colorbar
colorbar.set_ticks(vals_ticks)
colorbar.set_ticklabels(vals_labels)
ax.set_title("Overall Performance", fontsize=20)

plt.tight_layout()
plt.savefig(op.join("./Fig", "Fig-08b.png"), dpi=600, bbox_inches="tight")
plt.show()

In [None]:
# Per-segment profile figures for each per-segment metric.
profile_metrics = ("max_corr", "information_content", "tfidf")
for metric in profile_metrics:
    plot_profile(data_df, metric, hue_order)

In [None]:
# Print LaTeX \includegraphics lines for the information-content profile
# figures (file indices 0-29), with a legend line after every fifth figure.
# Raw strings are used because "\i" is not a valid Python escape sequence;
# the trailing "\n" keeps the blank line after each entry.
for i in range(1, 31):
    print(rf"\includegraphics[scale=0.47]{{information_content_profile_{i-1}.eps}}" + "\n")

    if i % 5 == 0:
        print(r"\includegraphics[scale=0.37]{legend.pdf}" + "\n")

In [None]:
# Mean-profile figure for each averaged metric.
mean_metrics = ("max_corr", "ic", "tfidf", "snr")
for metric in mean_metrics:
    plot_mean_profile(mean_data_df, metric, hue_order)

In [None]:
# Stacked-bar figure for each averaged metric: rows are segment solutions in
# numeric order, columns are methods in hue_order.
bar_metrics = ("max_corr", "ic", "tfidf", "snr")
for metric in bar_metrics:
    by_solution = mean_data_df.pivot_table(
        values=metric,
        index=mean_data_df["segment_solution"],
        columns="method",
    )
    by_solution = by_solution.reindex(columns=hue_order)

    # Sort numerically, then use string labels for the categorical bar axis.
    by_solution.index = by_solution.index.astype(int)
    by_solution = by_solution.sort_index()
    by_solution.index = by_solution.index.astype(str)
    sub_mean_data_df = by_solution

    plot_mean_sbars(sub_mean_data_df, metric, hue_order)