In [None]:
from statistics import stdev
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

%reload_ext autoreload
%autoreload 2

In [None]:
# Dataset names. Each one is substituted into the log-file path
# (imp_top_<name>.txt) and drawn in its own subplot by the cells below.
datasets = [
    "cancer",
    "card",
    "gene",
    "glass",
    "heart",
    "horse",
    "mushroom",
    "soybean",
    "thyroid",
]

# Opacity of the training-score curve; the opacity of its stdev band is
# derived from this value as well.
train_alpha = 0.3

In [None]:
# With enhanced%
# One panel per dataset. Left axis: mean F1 on the test and train splits with a
# +/- one sample-stdev band. Right (twin) axis: mean fraction of the selected
# features that are enhanced, with its own band. The x axis is the fraction of
# input features that were selected (nsel / nin).
fig, axs = plt.subplots(3, 3, figsize=(10, 11))
# Called before anything is drawn, so this only establishes the base grid spacing.
fig.tight_layout()

# Loop-invariant opacity of the (deliberately fainter) train band.
train_band_alpha = 0.1 * train_alpha * 1.5

# Pair each dataset with its own axes; no manual index counter is needed.
for ax, dataset in zip(axs.flat, datasets):
    df = pd.read_csv(f"../../log/prelim_imp_top/imp_top_{dataset}.txt")
    df["frac_enhanced"] = df.nenhanced / df.nsel

    # Aggregate all rows that share the same number of selected features.
    means = pd.pivot_table(df, index="nsel")
    # pandas' "std" is the sample standard deviation (ddof=1), i.e. the same
    # estimator as statistics.stdev, but it yields NaN for a group with a single
    # row instead of raising StatisticsError.
    stds = pd.pivot_table(df, index="nsel", aggfunc="std")
    x = means.index / means.nin

    f1test = ax.plot(x, means.ftest, label="$F_1$ (test)", color="C0")
    ax.fill_between(x, means.ftest - stds.ftest, means.ftest + stds.ftest, color="C0", alpha=0.1)
    f1train = ax.plot(x, means.ftrain, label="$F_1$ (train)", color="C1", alpha=train_alpha)
    ax.fill_between(x, means.ftrain - stds.ftrain, means.ftrain + stds.ftrain, color="C1", alpha=train_band_alpha)

    # Second y axis sharing the same x axis.
    # NOTE(review): the plotted quantity is a 0-1 fraction although the labels
    # below say "%" - confirm the intended wording/units.
    ax2 = ax.twinx()
    enhanced = ax2.plot(x, means.frac_enhanced, label="enhanced%", color="k")
    ax2.fill_between(
        x,
        means.frac_enhanced - stds.frac_enhanced,
        means.frac_enhanced + stds.frac_enhanced,
        color="k",
        alpha=0.1,
    )

    # The lines live on two different axes, so collect the handles manually to
    # get a single combined legend.
    lns = f1train + f1test + enhanced
    labs = [line.get_label() for line in lns]
    ax.legend(lns, labs, loc="upper right")

    ax.set_title(f"{dataset} dataset")
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax2.set_ylim([0, 1])
    ax.set(xlabel="Fraction of features selected", ylabel="$F_1$-score")
    ax2.set_ylabel("% of selected features is enhanced")
    # Keep axis labels / tick labels only on the outer edge of the grid.
    ax.label_outer()
    ax2.label_outer()

# Leave headroom above the grid for the figure-level title.
fig.subplots_adjust(top=0.92)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
fig.suptitle("$F_1$-scores of features selected by importance scores", fontsize=20);

In [None]:
# Without enhanced%
# One panel per dataset: mean F1 on the test and train splits with a +/- one
# sample-stdev band, plotted against the fraction of input features that were
# selected (nsel / nin).
fig, axs = plt.subplots(3, 3, figsize=(10, 11))
# Called before anything is drawn, so this only establishes the base grid spacing.
fig.tight_layout()

# Loop-invariant opacity of the (deliberately fainter) train band.
train_band_alpha = 0.1 * train_alpha * 1.5

# Pair each dataset with its own axes; no manual index counter is needed.
for ax, dataset in zip(axs.flat, datasets):
    df = pd.read_csv(f"../../log/prelim_imp_top/imp_top_{dataset}.txt")

    # Aggregate all rows that share the same number of selected features.
    means = pd.pivot_table(df, index="nsel")
    # pandas' "std" is the sample standard deviation (ddof=1), i.e. the same
    # estimator as statistics.stdev, but it yields NaN for a group with a single
    # row instead of raising StatisticsError.
    stds = pd.pivot_table(df, index="nsel", aggfunc="std")
    x = means.index / means.nin

    ax.plot(x, means.ftest, label="$F_1$ (test)", color="C0")
    ax.fill_between(x, means.ftest - stds.ftest, means.ftest + stds.ftest, color="C0", alpha=0.1)
    ax.plot(x, means.ftrain, label="$F_1$ (train)", color="C1", alpha=train_alpha)
    ax.fill_between(x, means.ftrain - stds.ftrain, means.ftrain + stds.ftrain, color="C1", alpha=train_band_alpha)

    ax.legend(loc="lower right")
    ax.set_title(f"{dataset} dataset")
    ax.set_xlim([0, 1])
    # The y-limits are intentionally not fixed here, so each panel autoscales.

    # Label only the outer edge of the grid: y label on the first column,
    # x label on the last row.
    sps = ax.get_subplotspec()
    if sps.is_first_col():
        # Raw string: "\p" is not a valid Python escape sequence, so the
        # backslash must be kept literally for mathtext's \pm.
        ax.set_ylabel(r"$F_1$-score (mean $\pm$ stdev)")
    if sps.is_last_row():
        ax.set_xlabel("Fraction of features selected")

# Leave headroom above the grid for the figure-level title.
fig.subplots_adjust(top=0.92, hspace=0.20)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
fig.suptitle("$F_1$-scores of features selected by importance scores", fontsize=20);