In [1]:
from tqdm import tqdm
from custom_ml_toolkit.feature_selector.combination import CombinatoricFeatureGenerator
from custom_ml_toolkit.feature_selector.utils import split_feature_category

import pandas as pd
import numpy as np
from custom_ml_toolkit.preprocessor.encoder import SupportMissingDatasetEncoder
from custom_ml_toolkit.eval.classification import process_eval_dict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



In [2]:
random_state = 77
data_df = pd.read_csv('example_data/titanic.csv')
data_df['Deck'] = data_df['Cabin'].str[0]

In [3]:
feature_categories = {
    'numerical_cols': ['Age', 'SibSp', 'Parch', 'Fare'],
    'norminal_cols': ['Sex', 'Embarked'],
    'ordinal_cols': ['Pclass', 'Deck']
}

target_col = 'Survived'

train_data_df, test_data_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=random_state,
    stratify=data_df['Survived']
)

In [4]:
cfg = CombinatoricFeatureGenerator(
    r_start=1,
    r_end=5,
    selected_cols=['SibSp', 'Parch', 'Fare', 'Embarked', ['Pclass', 'Deck']],
    required_cols=['Age', 'Sex'],
    budget=-1
)

cfg.info()

results = list()

for removed_features, combination in tqdm(cfg):
    print('Remaining:', (cfg.number_of_combinations - cfg.remaining), '/', cfg.number_of_combinations)
    print('Removed Features:', removed_features)
    print('Remained Features:', combination)

    classified_feature = split_feature_category(
        col_names=combination,
        categories=feature_categories
    )

    comb_selected_cols = combination + [target_col]

    # -------------------------------------------------------------------------
    de = SupportMissingDatasetEncoder(
        numerical_cols=classified_feature['numerical_cols'],
        norminal_cols=classified_feature['norminal_cols'],
        ordinal_cols=classified_feature['ordinal_cols'],
        target_col=target_col,
        drop_binary=True,
        oe_unknown_value=np.nan,
        oe_missing_value=np.nan,
        encode_target=True
    )
    clf = XGBClassifier(
        random_state=random_state,
        n_jobs=-1,
    )
    # -------------------------------------------------------------------------
    de.fit(train_data_df[comb_selected_cols])
    encoded_train_data_df = de.transform(train_data_df[comb_selected_cols])
    encoded_test_data_df = de.transform(test_data_df[comb_selected_cols])

    X_train = encoded_train_data_df.drop(columns=[target_col])
    y_train = encoded_train_data_df[target_col]

    X_test = encoded_test_data_df.drop(columns=[target_col])
    y_test = encoded_test_data_df[target_col]
    # -------------------------------------------------------------------------
    clf.fit(
        X=X_train,
        y=y_train
    )
    # -------------------------------------------------------------------------
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    # -------------------------------------------------------------------------
    train_eval_dict = classification_report(y_train, y_train_pred, output_dict=True)
    train_eval_df = process_eval_dict(
        eval_dict=train_eval_dict
    )
    train_eval_df['SET'] = 'Train'
    train_eval_df['removed_features'] = str(removed_features)
    train_eval_df['remained_features'] = str(combination)
    results.append(train_eval_df)

    test_eval_dict = classification_report(y_test, y_test_pred, output_dict=True)
    test_eval_df = process_eval_dict(
        eval_dict=test_eval_dict
    )
    test_eval_df['SET'] = 'Test'
    test_eval_df['removed_features'] = str(removed_features)
    test_eval_df['remained_features'] = str(combination)
    results.append(test_eval_df)

result_df = pd.concat(results)

---------------------------------------------------------------
n: 5
r = 1: 5 combinations
r = 2: 10 combinations
r = 3: 10 combinations
r = 4: 5 combinations
r = 5: 1 combinations
Remaining 31/31
---------------------------------------------------------------


100%|██████████| 31/31 [00:03<00:00,  9.01it/s]

Run out of combination





In [5]:
result_df

Unnamed: 0,0_f1-score,0_precision,0_recall,0_support,1_f1-score,1_precision,1_recall,1_support,accuracy_f1-score,accuracy_precision,...,macro avg_precision,macro avg_recall,macro avg_support,weighted avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_support,SET,removed_features,remained_features
0,0.881319,0.85138,0.91344,439.0,0.789883,0.842324,0.74359,273.0,0.848315,0.848315,...,0.846852,0.828515,712.0,0.84626,0.847908,0.848315,712.0,Train,"['Parch', 'Fare', 'Embarked', ['Pclass', 'Deck']]","['Age', 'Sex', 'SibSp']"
0,0.846512,0.866667,0.827273,110.0,0.769231,0.743243,0.797101,69.0,0.815642,0.815642,...,0.804955,0.812187,179.0,0.816722,0.81909,0.815642,179.0,Test,"['Parch', 'Fare', 'Embarked', ['Pclass', 'Deck']]","['Age', 'Sex', 'SibSp']"
0,0.880713,0.862445,0.899772,439.0,0.796964,0.826772,0.769231,273.0,0.849719,0.849719,...,0.844609,0.834501,712.0,0.848602,0.848767,0.849719,712.0,Train,"['SibSp', 'Fare', 'Embarked', ['Pclass', 'Deck']]","['Age', 'Sex', 'Parch']"
0,0.837209,0.857143,0.818182,110.0,0.755245,0.72973,0.782609,69.0,0.804469,0.804469,...,0.793436,0.800395,179.0,0.805614,0.808028,0.804469,179.0,Test,"['SibSp', 'Fare', 'Embarked', ['Pclass', 'Deck']]","['Age', 'Sex', 'Parch']"
0,0.974359,0.954148,0.995444,439.0,0.956357,0.992126,0.923077,273.0,0.967697,0.967697,...,0.973137,0.959261,712.0,0.967456,0.96871,0.967697,712.0,Train,"['SibSp', 'Parch', 'Embarked', ['Pclass', 'Dec...","['Age', 'Sex', 'Fare']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.877828,0.873874,0.881818,110.0,0.80292,0.808824,0.797101,69.0,0.849162,0.849162,...,0.841349,0.83946,179.0,0.848953,0.848799,0.849162,179.0,Test,['Parch'],"['Age', 'Sex', 'SibSp', 'Fare', 'Embarked', 'P..."
0,0.983089,0.973214,0.993166,439.0,0.972067,0.988636,0.956044,273.0,0.978933,0.978933,...,0.980925,0.974605,712.0,0.978863,0.979128,0.978933,712.0,Train,['SibSp'],"['Age', 'Sex', 'Parch', 'Fare', 'Embarked', 'P..."
0,0.877828,0.873874,0.881818,110.0,0.80292,0.808824,0.797101,69.0,0.849162,0.849162,...,0.841349,0.83946,179.0,0.848953,0.848799,0.849162,179.0,Test,['SibSp'],"['Age', 'Sex', 'Parch', 'Fare', 'Embarked', 'P..."
0,0.982022,0.968958,0.995444,439.0,0.970037,0.992337,0.948718,273.0,0.977528,0.977528,...,0.980648,0.972081,712.0,0.977427,0.977922,0.977528,712.0,Train,[],"['Age', 'Sex', 'SibSp', 'Parch', 'Fare', 'Emba..."
