In [1]:
import os
os.chdir(os.path.abspath(".."))

In [2]:
import numpy as np
import pandas as pd

from dataio import load_metadataset
from sklearn.feature_selection import SelectFpr, VarianceThreshold

In [3]:
for feature_set in ["minimal", "catch22", "efficient", "comprehensive"]:
    print(f"--- Analysing {feature_set} features ---")
    meta_X, meta_y = load_metadataset(feature_set, problem_type="regression")
    total_features = meta_X.shape[1]

    variance_selector = VarianceThreshold((1e-5)**2).fit(meta_X)
    meta_X_threshold = meta_X[variance_selector.get_feature_names_out()]
    threshold_features = meta_X_threshold.shape[1]

    removed_features = total_features - threshold_features
    print(f"{removed_features} feature(s) removed due to low variance ({100 * removed_features / total_features:.2f}%)")

    anova_selector = SelectFpr(alpha=0.05).fit(meta_X_threshold, meta_y)
    meta_X_final = meta_X_threshold[anova_selector.get_feature_names_out()]
    final_features = meta_X_final.shape[1]

    print(f"{final_features} out of {threshold_features} features are significant ({(100 * final_features / threshold_features):.2f}%)")

    print("")
    


--- Analysing minimal features ---
1 feature(s) removed due to low variance (3.57%)
1 out of 27 features are significant (3.70%)

--- Analysing catch22 features ---
0 feature(s) removed due to low variance (0.00%)
12 out of 57 features are significant (21.05%)

--- Analysing efficient features ---
9 feature(s) removed due to low variance (0.58%)
114 out of 1553 features are significant (7.34%)

--- Analysing comprehensive features ---
9 feature(s) removed due to low variance (0.57%)
119 out of 1565 features are significant (7.60%)

