In [1]:
_ = """
1. Feature importance meesures based on RF:
    a) impurities based on measures
    b) permutations based on measures

2. Boruta algorithm (POSLA GUROM) - to niżej to nie jest 100% poprawne:
    default variables + "shadow" variables
    default scores + "shadow" scores

    Use random forest

    if some default score is bigger then max(shadow_scores):
        variable is important (stay)
    elif some default score is lower then min(shadow_scores):
        variable is not important (to delete)
    else:
        we do not have knowledge about the variable

    Repeat until all variable will be deleted or stay
"""


TODO:
pip install Boruta

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chi2
import numpy as np

np.int = np.int32
np.float = np.float64
np.bool = np.bool_

from boruta import BorutaPy


In [3]:
def get_dataset_1(
        size: int,
        features: int,
        significant_features: int
        ) -> tuple[np.array, np.array]:

    X = np.random.normal(0, 1, size=(size, features))

    chi2_median = chi2.ppf(0.5, significant_features)

    X_k = X[:, :significant_features]
    X_k_sumsq = np.sum(X_k ** 2, axis=1)

    Y = (X_k_sumsq > chi2_median).astype(int)

    return X, Y


def get_dataset_2(
        size: int,
        features: int,
        significant_features: int
        ) -> tuple[np.array, np.array]:

    X = np.random.normal(0, 1, size=(size, features))

    X_k = X[:, :significant_features]
    X_k_sum_abs = np.sum(np.abs(X_k), axis=1)

    Y = (X_k_sum_abs > significant_features).astype(int)

    return X, Y


def get_random_forest_features(
        x: np.array,
        y: np.array,
        n: int
        ) -> list[tuple[str, float]]:
    """
    Input:
     * x: np.array - features of dataset
     * y: np.array - binary labels of dataset
     * n: int - how many the best scores return

    Output:
     * list[tuple[str, float]] - list with results.
        Every tuple represent featurer name and the score
    """

    feature_names = [f"feature {i}" for i in range(x.shape[1])]
    forest = RandomForestClassifier(random_state=0)
    forest.fit(x, y)

    feature_importances = forest.feature_importances_

    forest_importances = {
        feature_names[i]: feature_importances[i]
        for i in range(len(feature_names))
    }

    sorted_features = sorted(
        forest_importances.items(),
        key=lambda x:x[1],
        reverse=True
        )
    
    return sorted_features[:n]


def get_boruta_features(
        x: np.array,
        y: np.array
        ) -> list[tuple[str, float]]:
    """
    Input:
     * x: np.array - features of dataset
     * y: np.array - binary labels of dataset

    Output:
     * list[tuple[str, float]] - list with results.
        Every tuple represent featurer name and the score
    """

    rf = RandomForestClassifier(n_jobs=-1, class_weight="balanced")
    boruta_model = BorutaPy(rf, n_estimators="auto")
    boruta_model.fit(x, y)

    feature_names = [f"feature {i}" for i in range(x.shape[1])]
    feature_importances = boruta_model.support_

    the_best_features = [
        (feature_names[i], 1.0)
        for i in range(len(feature_names))
        if feature_importances[i] == True
    ]

    return the_best_features


In [4]:
X1, y1 = get_dataset_1(100, 50, 10)
X2, y2 = get_dataset_1(100, 50, 10)

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2)


In [5]:
boruta_features = get_boruta_features(X1, y1)
rf_feafuter = get_random_forest_features(X1, y1, len(boruta_features))

print(rf_feafuter)
print(boruta_features)

print()

boruta_features = get_boruta_features(X2, y2)
rf_feafuter = get_random_forest_features(X2, y2, len(boruta_features))

print(rf_feafuter)
print(boruta_features)


[('feature 22', 0.041641940013642305), ('feature 6', 0.041486579981411545), ('feature 9', 0.04057802130580221), ('feature 3', 0.040217164331192164)]
[('feature 6', 1.0), ('feature 9', 1.0), ('feature 12', 1.0), ('feature 22', 1.0)]

[('feature 0', 0.07043565289725294)]
[('feature 0', 1.0)]
