In [None]:
from scipy.stats import ttest_rel, ttest_ind
from statistics import mean, stdev
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

%reload_ext autoreload
%autoreload 2

In [None]:
# Benchmark datasets covered by this analysis, in the order used for the plots
# and for the columns of the significance table.
# NOTE(review): "diabetes" is absent from this list - TODO confirm why it was dropped.
datasets = [
    "cancer",
    "card",
    "gene",
    "glass",
    "heart",
    "horse",
    "mushroom",
    "soybean",
    "thyroid",
]

# Regressor key (as it appears in the log file names) -> title shown above its plot.
# Both parallel lists below are derived from this one mapping.
_REGRESSOR_LABELS = {
    "importance": "Importance scores",
    "importance_corfilter": "Importance scores with correlation filtering",
    "inipg": "IniPG",
    "l1l2": "L1-L2-Hybrid Regularization",
    "lasso": "LASSO",
    "sparse_encoded_lasso": "Sparse Encoded LASSO",
    "stepwise": "Stepwise Regression",
}
regressors = list(_REGRESSOR_LABELS)
regressor_names = list(_REGRESSOR_LABELS.values())

In [None]:
def _read_intercept_log(regressor, variant):
    """Load the preliminary intercept log of one regressor as a DataFrame.

    `variant` is the file-name suffix, either "with" or "without".
    """
    return pd.read_csv(f"../../log/prelim_intercept/intercept_{regressor}_{variant}.txt")


# One DataFrame per regressor, in the same order as `regressors`.
withi = [_read_intercept_log(name, "with") for name in regressors]
withouti = [_read_intercept_log(name, "without") for name in regressors]

In [None]:
# Per-dataset summary tables, one per regressor: the mean (pivot_table's default
# aggregation) and the sample standard deviation of every column in the logs.
# pivot_table sorts its index, so the rows are selected with .loc[datasets]:
# every table then follows the order of `datasets` (the order the plots assume),
# datasets that are logged but not listed are dropped, and a listed dataset
# that is missing from a log raises a KeyError instead of silently shifting rows.
withmeans = [pd.pivot_table(df, index="dataset").loc[datasets] for df in withi]
withstds = [pd.pivot_table(df, index="dataset", aggfunc=stdev).loc[datasets] for df in withi]
withoutmeans = [pd.pivot_table(df, index="dataset").loc[datasets] for df in withouti]
withoutstds = [pd.pivot_table(df, index="dataset", aggfunc=stdev).loc[datasets] for df in withouti]

# Tick labels of the form "<dataset>\n(<nout>)". The value is looked up by
# dataset name: an integer key on a string-labelled Series relies on a
# deprecated positional fallback and can pair a name with the wrong row.
# Since `nout` comes from the table of means, it is printed as a float.
nout_by_dataset = withmeans[0].nout
nclass = [name + "\n(" + str(nout_by_dataset.loc[name]) + ")" for name in datasets]

In [None]:
# Figure, part 1 of 2: one panel per regressor (the first four), each showing
# grouped bars of the mean F1-score per dataset with and without intercept.
spc = np.arange(len(datasets))  # x position of each dataset group
w = 0.45  # bar width; the "without intercept" bars are shifted right by w
errstyle = dict(elinewidth=1, capsize=5)
errstyletr = dict(elinewidth=1, capsize=5, alpha=0.4)  # fainter error bars for the train bars
fig, axs = plt.subplots(2, 2, figsize=(18, 14))
fig.tight_layout()
plt.rcParams.update({"font.size": 13})  # global setting: stays active for later figures
for i in range(4):
    ax = axs.flat[i]
    wm = withmeans[i]
    ws = withstds[i]
    wom = withoutmeans[i]
    wos = withoutstds[i]
    # The train bars are drawn at the same x position as the test bars, almost
    # transparent (alpha=0.1), so they appear as an overlay on the test bars.
    ax.bar(spc, wm.ftest, width=w, label="With intercept (test)", edgecolor="k", yerr=ws.ftest, error_kw=errstyle)
    ax.bar(spc, wm.ftrain, width=w, label="With intercept (train)", edgecolor="k", yerr=ws.ftrain, alpha=0.1, error_kw=errstyletr)
    ax.bar(spc + w, wom.ftest, width=w, label="Without intercept (test)", edgecolor="k", yerr=wos.ftest, error_kw=errstyle)
    ax.bar(spc + w, wom.ftrain, width=w, label="Without intercept (train)", edgecolor="k", yerr=wos.ftrain, alpha=0.1, error_kw=errstyletr)
    ax.legend(loc="lower left")
    # Centre each tick between the "with" and "without" bar of its group.
    ax.set_xticks(spc + w / 2, nclass)
    # Only the outer panels get axis labels.
    sps = ax.get_subplotspec()
    if sps.is_last_row():
        ax.set_xlabel("Dataset (#classes)", fontsize=18)
    if sps.is_first_col():
        # Raw string: "\p" is not a valid escape sequence in a normal string literal.
        ax.set_ylabel(r"$F_1$-score (mean $\pm$ stdev)", fontsize=18)
    ax.set_title(regressor_names[i])

# Make room for the two-line figure title above the panels.
plt.subplots_adjust(top=0.92, hspace=0.22)
fig.suptitle("F1-scores of FS methods with vs without intercept\n(part 1 of 2)", fontsize=22)

In [None]:
# Figure, part 2 of 2: the remaining three regressors (indices 4-6), stacked in
# a single column. `spc`, `w`, `errstyle` and `errstyletr` are the module-level
# values set up by the cell that draws part 1.
fig, axs = plt.subplots(3, 1, figsize=(10, 16))
fig.tight_layout()
for i in range(4, 7):
    ax = axs.flat[i - 4]  # regressor index 4..6 -> panel 0..2
    wm = withmeans[i]
    ws = withstds[i]
    wom = withoutmeans[i]
    wos = withoutstds[i]
    # The train bars are drawn at the same x position as the test bars, almost
    # transparent (alpha=0.1), so they appear as an overlay on the test bars.
    ax.bar(spc, wm.ftest, width=w, label="With intercept (test)", edgecolor="k", yerr=ws.ftest, error_kw=errstyle)
    ax.bar(spc, wm.ftrain, width=w, label="With intercept (train)", edgecolor="k", yerr=ws.ftrain, alpha=0.1, error_kw=errstyletr)
    ax.bar(spc + w, wom.ftest, width=w, label="Without intercept (test)", edgecolor="k", yerr=wos.ftest, error_kw=errstyle)
    ax.bar(spc + w, wom.ftrain, width=w, label="Without intercept (train)", edgecolor="k", yerr=wos.ftrain, alpha=0.1, error_kw=errstyletr)
    ax.legend(loc="lower left")
    # Centre each tick between the "with" and "without" bar of its group.
    ax.set_xticks(spc + w / 2, nclass)
    sps = ax.get_subplotspec()
    if sps.is_last_row():
        ax.set_xlabel("Dataset (#classes)", fontsize=18)
    # Single column, so every panel gets the y-label.
    # Raw string: "\p" is not a valid escape sequence in a normal string literal.
    ax.set_ylabel(r"$F_1$-score (mean $\pm$ stdev)", fontsize=18)
    ax.set_title(regressor_names[i])

# Make room for the two-line figure title above the panels.
plt.subplots_adjust(top=0.92, hspace=0.20)
fig.suptitle("F1-scores of FS methods with vs without intercept\n(part 2 of 2)", fontsize=22)

In [None]:
def _intercept_verdict(with_scores, without_scores):
    """Summarise a paired t-test between two series of test F1-scores.

    Returns "equal" when the t statistic is 0 or NaN. Otherwise returns
    "<winner>, <flag>", where <winner> is "with" for a positive statistic and
    "without" for a negative one, and <flag> is True/False for p < 0.05.
    """
    stat, pval = ttest_rel(with_scores, without_scores)
    if stat == 0 or np.isnan(stat):
        return "equal"
    winner = "with" if stat > 0 else "without"
    return f"{winner}, {pval < 0.05}"


# One row per regressor, one column per dataset.
diffs = [
    [
        _intercept_verdict(
            with_df.loc[with_df.dataset == name, "ftest"],
            without_df.loc[without_df.dataset == name, "ftest"],
        )
        for name in datasets
    ]
    for with_df, without_df in zip(withi, withouti)
]

diffdf = pd.DataFrame(diffs, index=regressors, columns=datasets)
diffdf.to_csv("intercept_diffs.csv")