In [None]:
from matplotlib.colors import LinearSegmentedColormap
from statistics import median, stdev
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

%reload_ext autoreload
%autoreload 2

In [None]:
# Names of the datasets analysed below; each is substituted into a log-file path.
datasets = [
    "abalone", "adult", "cancer", "card", "covtype", "gene",
    "glass", "heart", "horse", "madelon", "optdigits", "page-blocks",
    "pendigits", "poker", "satimage", "segmentation", "shuttle", "soybean",
    "spect", "thyroid", "vehicle", "waveform",
]
# The values 5, 10, ..., 200; used below for tick positions/labels and colour scaling.
realbins = range(5, 201, 5)

In [None]:
def alsogood(q):
    """Build a predicate that tests whether a value is at least ``q``.

    Parameters
    ----------
    q
        Threshold to compare against.

    Returns
    -------
    callable
        A one-argument function returning the result of ``x >= q``.
    """
    def at_least_q(x):
        return x >= q

    return at_least_q

def prettify_m(ms, interval=5):
    """Render a sequence of values as a compact, comma-separated string.

    Neighbouring values that differ by exactly ``interval`` are merged into a
    run and written as ``"first - last"``; a value that is not part of a longer
    run is written on its own. Runs are joined with ``", "``.

    Parameters
    ----------
    ms
        Values to render, in the order they should appear.
    interval
        Step between two neighbouring values that counts as "consecutive".

    Returns
    -------
    str
        The compact representation; an empty string when ``ms`` is empty.
    """
    runs = []  # each entry is a mutable [first, last] pair describing one run
    for value in ms:
        if runs and value == runs[-1][1] + interval:
            # Still stepping by `interval`: stretch the current run.
            runs[-1][1] = value
        else:
            # Gap (or very first value): open a new run.
            runs.append([value, value])

    pieces = []
    for first, last in runs:
        if first != last:
            pieces.append(f"{first} - {last}")
        else:
            pieces.append(f"{first}")
    return ", ".join(pieces)


In [None]:
# Summarise, per dataset, which enhancement-node counts m reach good median "ftest" values,
# then store the summary table as CSV.
tobedf = []
for dataset in datasets:
    df = pd.read_csv(f"../../log/prelim_rvfl_enh/rvfl_enh_{dataset}.txt")
    # One list of "ftest" values per distinct m, indexed by m (groupby sorts the keys).
    scores_by_m = df.groupby("m")["ftest"].apply(list)
    # Read the m values off the group keys rather than rebuilding them as
    # (position + 1) * 5: that formula mislabels every later entry as soon as
    # a log file lacks one of the expected m values.
    ms = scores_by_m.index.tolist()
    g = scores_by_m.tolist()
    medians = [median(row) for row in g]
    maxval = max(medians)
    maxind = medians.index(maxval)  # first position that reaches the best median
    maxm = ms[maxind]
    # An m counts as "also good" when its median is at least the 25th percentile
    # of the scores of the best m. The threshold does not depend on the loop
    # variable, so it is computed once instead of once per candidate.
    is_good = alsogood(np.percentile(g[maxind], 25))
    alsogoodinds = [i for i, med in enumerate(medians) if is_good(med)]
    alsogoodvals = [medians[i] for i in alsogoodinds]
    alsogoodms = [ms[i] for i in alsogoodinds]
    alsogoodtxt = prettify_m(alsogoodms)
    # Smallest good m together with the median obtained at that m.
    min_good_m, min_good_val = min(alsogoodms), medians[min(alsogoodinds)]
    tobedf.append([dataset, maxm, str(round(maxval, 3)), alsogoodtxt, min_good_m, str(round(min_good_val, 3)), alsogoodms, alsogoodvals])

resultdf = pd.DataFrame(tobedf, columns=["dataset", "best_m", "f1_best_m", "also_good_pretty", "mingood_m", "f1_mingood_m", "also_good_ms", "also_good_medians"])
resultdf.to_csv("rvfl_enh_nodes.csv")

In [None]:
# Two-stop colour ramp: blue for the smallest value in realbins, red for the largest.
cm = LinearSegmentedColormap.from_list("BlueRed", [(0.0, "blue"), (1.0, "red")])
lo_m, hi_m = min(realbins), max(realbins)
colors = [cm((m - lo_m) / (hi_m - lo_m)) for m in realbins]

# Histogram of the best m per dataset, one bar per value in realbins.
fig, ax = plt.subplots(figsize=(14, 7))
# Edges are shifted by 2.5 so that every multiple of 5 sits in the middle of its bin.
bin_edges = np.arange(5, 206, 5) - 2.5
_, _, patches = ax.hist(resultdf.best_m, bins=bin_edges, edgecolor="k")
for bar, colour in zip(patches, colors):
    bar.set_facecolor(colour)
ax.set_xticks(realbins)
for label in ax.get_xticklabels():
    label.set_rotation("vertical")
ax.set_xlim([-2.5, 207.5])
ax.set_xlabel("Number of enhancement nodes $m$", fontsize=18)
ax.set_ylabel("Number of datasets for which $m$ is optimal", fontsize=18)
ax.set_title("Distribution of optimal $m$ values per dataset", fontsize=24)
ax.grid(True, axis="y", color="k", alpha=0.2)
plt.show()

In [None]:
# Pool the "also good" m values of all datasets into one flat sequence.
goodms = np.concatenate(resultdf["also_good_ms"]).flat

# Histogram of all good m values, one bar per multiple of 5.
fig, ax = plt.subplots(figsize=(14, 7))
# Edges are shifted by 2.5 so that every multiple of 5 sits in the middle of its bin.
bin_edges = np.arange(5, 206, 5) - 2.5
_, _, patches = ax.hist(goodms, bins=bin_edges, edgecolor="k")
# `colors` is the per-bin colour list built in an earlier cell.
for bar, colour in zip(patches, colors):
    bar.set_facecolor(colour)
ax.set_xticks(realbins)
for label in ax.get_xticklabels():
    label.set_rotation("vertical")
ax.set_xlim([-2.5, 207.5])
ax.set_xlabel("Number of enhancement nodes $m$", fontsize=18)
ax.set_ylabel("Number of datasets for which $m$ is good", fontsize=18)
ax.set_title("Distribution of good $m$ values per dataset", fontsize=24)
ax.grid(True, axis="y", color="k", alpha=0.2)
plt.show()

In [None]:
# Histogram of the smallest good m per dataset, one bar per multiple of 5.
fig, ax = plt.subplots(figsize=(14, 7))
# Edges are shifted by 2.5 so that every multiple of 5 sits in the middle of its bin.
bin_edges = np.arange(5, 206, 5) - 2.5
_, _, patches = ax.hist(resultdf.mingood_m, bins=bin_edges, edgecolor="k")
# `colors` is the per-bin colour list built in an earlier cell.
for bar, colour in zip(patches, colors):
    bar.set_facecolor(colour)
ax.set_xticks(realbins)
for label in ax.get_xticklabels():
    label.set_rotation("vertical")
ax.set_xlim([-2.5, 207.5])
ax.set_xlabel("Number of enhancement nodes $m$", fontsize=18)
ax.set_ylabel("Number of datasets for which $m$ is minimal while good", fontsize=14)
ax.set_title("Distribution of minimal good $m$ values per dataset", fontsize=24)
ax.grid(True, axis="y", color="k", alpha=0.2)
plt.show()

In [None]:
# Box plots of the "ftest" values per m for the first eight datasets (4 x 2 grid).
fig, axs = plt.subplots(4, 2, figsize=(14, 40/11*4))
fig.tight_layout()

# Shared box-plot styling; the later box-plot cells reuse these dictionaries.
boxprops = {"color": "b"}
flierprops = {"markeredgecolor": "#D3691D", "markersize": 5}
medianprops = {"color": "darkred"}
whiskerprops = {"color": "b"}

for ax, dataset in zip(axs.flat, datasets[0:8]):
    df = pd.read_csv(f"../../log/prelim_rvfl_enh/rvfl_enh_{dataset}.txt")
    # One list of "ftest" values per distinct m, in ascending order of m.
    scores_by_m = df.groupby("m")["ftest"].apply(list)
    g = scores_by_m.tolist()
    bp = ax.boxplot(
        g,
        sym=".",
        patch_artist=True,  # boxes become patches so they can be filled below
        boxprops=boxprops,
        medianprops=medianprops,
        whiskerprops=whiskerprops,
        flierprops=flierprops,
    )
    for box in bp["boxes"]:
        box.set_facecolor("azure")
    ax.set_xticklabels(realbins, rotation="vertical")
    ax.set_title(f"{dataset} dataset", fontsize=14)
    ax.grid(True, color="#DDDDDD")
    # Axis labels only on the outer edge of the grid.
    sps = ax.get_subplotspec()
    if sps.is_first_col():
        ax.set_ylabel("$F_1$-score")
    if sps.is_last_row():
        ax.set_xlabel("number of enhancement nodes")

# Leave room above the grid for the figure-level title.
fig.subplots_adjust(top=0.93, hspace=0.25)
fig.suptitle("$F_1$-scores of RVFL-nets containing different numbers of enhancement nodes (part 1 of 3)", fontsize=20)

In [None]:
# Box plots of the "ftest" values per m for datasets 9-16 (4 x 2 grid).
# The styling dictionaries (boxprops, medianprops, ...) come from an earlier cell.
fig, axs = plt.subplots(4, 2, figsize=(14, 40/11*4))
fig.tight_layout()

for ax, dataset in zip(axs.flat, datasets[8:16]):
    df = pd.read_csv(f"../../log/prelim_rvfl_enh/rvfl_enh_{dataset}.txt")
    # One list of "ftest" values per distinct m, in ascending order of m.
    scores_by_m = df.groupby("m")["ftest"].apply(list)
    g = scores_by_m.tolist()
    bp = ax.boxplot(
        g,
        sym=".",
        patch_artist=True,  # boxes become patches so they can be filled below
        boxprops=boxprops,
        medianprops=medianprops,
        whiskerprops=whiskerprops,
        flierprops=flierprops,
    )
    for box in bp["boxes"]:
        box.set_facecolor("azure")
    ax.set_xticklabels(realbins, rotation="vertical")
    ax.set_title(f"{dataset} dataset", fontsize=14)
    ax.grid(True, color="#DDDDDD")
    # Axis labels only on the outer edge of the grid.
    sps = ax.get_subplotspec()
    if sps.is_first_col():
        ax.set_ylabel("$F_1$-score")
    if sps.is_last_row():
        ax.set_xlabel("number of enhancement nodes")

# Leave room above the grid for the figure-level title.
fig.subplots_adjust(top=0.93, hspace=0.25)
fig.suptitle("$F_1$-scores of RVFL-nets containing different numbers of enhancement nodes (part 2 of 3)", fontsize=20)

In [None]:
# Box plots of the "ftest" values per m for datasets 17-22 (3 x 2 grid).
# The styling dictionaries (boxprops, medianprops, ...) come from an earlier cell.
fig, axs = plt.subplots(3, 2, figsize=(14, 40/11*3))
fig.tight_layout()

for ax, dataset in zip(axs.flat, datasets[16:22]):
    df = pd.read_csv(f"../../log/prelim_rvfl_enh/rvfl_enh_{dataset}.txt")
    # One list of "ftest" values per distinct m, in ascending order of m.
    scores_by_m = df.groupby("m")["ftest"].apply(list)
    g = scores_by_m.tolist()
    bp = ax.boxplot(
        g,
        sym=".",
        patch_artist=True,  # boxes become patches so they can be filled below
        boxprops=boxprops,
        medianprops=medianprops,
        whiskerprops=whiskerprops,
        flierprops=flierprops,
    )
    for box in bp["boxes"]:
        box.set_facecolor("azure")
    ax.set_xticklabels(realbins, rotation="vertical")
    ax.set_title(f"{dataset} dataset", fontsize=14)
    ax.grid(True, color="#DDDDDD")
    # Axis labels only on the outer edge of the grid.
    sps = ax.get_subplotspec()
    if sps.is_first_col():
        ax.set_ylabel("$F_1$-score")
    if sps.is_last_row():
        ax.set_xlabel("number of enhancement nodes")

# Leave room above the grid for the figure-level title (shorter figure, so a larger margin).
fig.subplots_adjust(top=0.91, hspace=0.25)
fig.suptitle("$F_1$-scores of RVFL-nets containing different numbers of enhancement nodes (part 3 of 3)", fontsize=20)

In [None]:
# Median "ftest" value per m, one Series per dataset, in the order of `datasets`.
per_dataset_medians = [
    pd.read_csv(f"../../log/prelim_rvfl_enh/rvfl_enh_{dataset}.txt").groupby("m")["ftest"].median()
    for dataset in datasets
]
# Align the Series on m and label every column with its dataset name.
mediandf = pd.concat(per_dataset_medians, axis="columns")
mediandf.columns = datasets

# One line per dataset: median score as a function of m.
fig, ax = plt.subplots(figsize=(14, 9))
ax.plot(mediandf, ".-")
ax.set_xlim(0, 205)
ax.set_ylim(0, 1)
ax.legend(datasets, ncol=2, framealpha=0.3, loc=(0.14, 0.01))
ax.set_title("$F_1$-scores per $m$ value for each dataset", fontsize=24)
ax.set_xlabel("Number of enhancement nodes $m$", fontsize=18)
ax.set_ylabel("$F_1$-score (median)", fontsize=18)