In [7]:
from tqdm import tqdm
from custom_ml_toolkit.feature_selector.combination import CombinatoricFeatureGenerator
from custom_ml_toolkit.feature_selector.utils import split_feature_category

import pandas as pd
import numpy as np
from custom_ml_toolkit.preprocessor.encoder import SupportMissingDatasetEncoder
from custom_ml_toolkit.eval.classification import process_eval_dict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



In [8]:
random_state = 77
data_df = pd.read_csv('example_data/titanic.csv')
data_df['Deck'] = data_df['Cabin'].str[0]

In [9]:
feature_categories = {
    'numerical_cols': ['Age', 'SibSp', 'Parch', 'Fare'],
    'norminal_cols': ['Sex', 'Embarked'],
    'ordinal_cols': ['Pclass', 'Deck']
}
target_col = 'Survived'

train_data_df, test_data_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=random_state,
    stratify=data_df[target_col]
)

In [10]:
cfg = CombinatoricFeatureGenerator(
    r_start=1,
    r_end=5,
    selected_cols=['SibSp', 'Parch', 'Fare', 'Embarked', ['Pclass', 'Deck']],
    required_cols=['Age', 'Sex'],
    budget=-1
)

cfg.info()

---------------------------------------------------------------
n: 5
r = 1: 5 combinations
r = 2: 10 combinations
r = 3: 10 combinations
r = 4: 5 combinations
r = 5: 1 combinations
Remaining 31/31
---------------------------------------------------------------


In [11]:
results = list()
for removed_features, combination in tqdm(cfg):
    remaining = f'{(cfg.number_of_combinations - cfg.remaining)} / {cfg.number_of_combinations}'
    # print('Remaining:', remaining)
    # print('Removed Features:', removed_features)
    # print('Remained Features:', combination)

    classified_feature = split_feature_category(
        col_names=combination,
        categories=feature_categories
    )

    comb_selected_cols = combination + [target_col]

    # -------------------------------------------------------------------------
    # de = SupportMissingDatasetEncoder(
    #     numerical_cols=classified_feature['numerical_cols'],
    #     norminal_cols=classified_feature['norminal_cols'],
    #     ordinal_cols=classified_feature['ordinal_cols'],
    #     target_col=target_col,
    #     drop_binary=True,
    #     oe_unknown_value=np.nan,
    #     oe_missing_value=np.nan,
    #     encode_target=True
    # )
    # clf = XGBClassifier(
    #     random_state=random_state,
    #     n_jobs=-1,
    # )
    # -------------------------------------------------------------------------
    de = SupportMissingDatasetEncoder(
        numerical_cols=classified_feature['numerical_cols'],
        norminal_cols=None,
        ordinal_cols=classified_feature['norminal_cols'] + classified_feature['ordinal_cols'],
        target_col=target_col,
        drop_binary=True,
        oe_unknown_value=-1,
        oe_missing_value=-1,
        encode_target=False
    )
    clf = LGBMClassifier(
        random_state=random_state,
        n_jobs=-1,
        verbose=-1
    )
    # -------------------------------------------------------------------------
    de.fit(train_data_df[comb_selected_cols])
    encoded_train_data_df = de.transform(train_data_df[comb_selected_cols])
    encoded_test_data_df = de.transform(test_data_df[comb_selected_cols])

    X_train = encoded_train_data_df.drop(columns=[target_col])
    y_train = encoded_train_data_df[target_col]

    X_test = encoded_test_data_df.drop(columns=[target_col])
    y_test = encoded_test_data_df[target_col]
    # -------------------------------------------------------------------------
    clf.fit(X=X_train, y=y_train)
    # -------------------------------------------------------------------------
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    # -------------------------------------------------------------------------
    train_eval_dict = classification_report(y_train, y_train_pred, output_dict=True)
    train_eval_df = process_eval_dict(
        eval_dict=train_eval_dict
    )
    train_eval_df['SET'] = 'Train'
    train_eval_df['experiment_no'] = remaining
    train_eval_df['removed_features'] = str(removed_features)
    train_eval_df['remained_features'] = str(combination)
    results.append(train_eval_df)

    test_eval_dict = classification_report(y_test, y_test_pred, output_dict=True)
    test_eval_df = process_eval_dict(
        eval_dict=test_eval_dict
    )
    test_eval_df['SET'] = 'Test'
    test_eval_df['experiment_no'] = remaining
    test_eval_df['removed_features'] = str(removed_features)
    test_eval_df['remained_features'] = str(combination)
    results.append(test_eval_df)

result_df = pd.concat(results)

100%|██████████| 31/31 [00:03<00:00, 10.28it/s]

Run out of combination





In [12]:
result_df

Unnamed: 0,0_f1-score,0_precision,0_recall,0_support,1_f1-score,1_precision,1_recall,1_support,accuracy_f1-score,accuracy_precision,...,macro avg_recall,macro avg_support,weighted avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_support,SET,experiment_no,removed_features,remained_features
0,0.871965,0.845824,0.899772,439.0,0.776062,0.820408,0.736264,273.0,0.837079,0.837079,...,0.818018,712.0,0.835193,0.836079,0.837079,712.0,Train,1 / 31,"['Parch', 'Fare', 'Embarked', ['Pclass', 'Deck']]","['Age', 'Sex', 'SibSp']"
0,0.846512,0.866667,0.827273,110.0,0.769231,0.743243,0.797101,69.0,0.815642,0.815642,...,0.812187,179.0,0.816722,0.81909,0.815642,179.0,Test,1 / 31,"['Parch', 'Fare', 'Embarked', ['Pclass', 'Deck']]","['Age', 'Sex', 'SibSp']"
0,0.863839,0.846827,0.881549,439.0,0.768939,0.796078,0.74359,273.0,0.828652,0.828652,...,0.812569,712.0,0.827452,0.827369,0.828652,712.0,Train,2 / 31,"['SibSp', 'Fare', 'Embarked', ['Pclass', 'Deck']]","['Age', 'Sex', 'Parch']"
0,0.833333,0.849057,0.818182,110.0,0.746479,0.726027,0.768116,69.0,0.798883,0.798883,...,0.793149,179.0,0.799853,0.801632,0.798883,179.0,Test,2 / 31,"['SibSp', 'Fare', 'Embarked', ['Pclass', 'Deck']]","['Age', 'Sex', 'Parch']"
0,0.947012,0.9375,0.95672,439.0,0.912477,0.92803,0.897436,273.0,0.933989,0.933989,...,0.927078,712.0,0.93377,0.933869,0.933989,712.0,Train,3 / 31,"['SibSp', 'Parch', 'Embarked', ['Pclass', 'Dec...","['Age', 'Sex', 'Fare']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.878924,0.867257,0.890909,110.0,0.8,0.818182,0.782609,69.0,0.849162,0.849162,...,0.836759,179.0,0.848501,0.84834,0.849162,179.0,Test,29 / 31,['Parch'],"['Age', 'Sex', 'SibSp', 'Fare', 'Embarked', 'P..."
0,0.965594,0.941558,0.990888,439.0,0.940727,0.984,0.901099,273.0,0.956461,0.956461,...,0.945994,712.0,0.956059,0.957832,0.956461,712.0,Train,30 / 31,['SibSp'],"['Age', 'Sex', 'Parch', 'Fare', 'Embarked', 'P..."
0,0.891892,0.883929,0.9,110.0,0.823529,0.835821,0.811594,69.0,0.865922,0.865922,...,0.855797,179.0,0.86554,0.865384,0.865922,179.0,Test,30 / 31,['SibSp'],"['Age', 'Sex', 'Parch', 'Fare', 'Embarked', 'P..."
0,0.969967,0.947826,0.993166,439.0,0.948571,0.988095,0.912088,273.0,0.962079,0.962079,...,0.952627,712.0,0.961763,0.963266,0.962079,712.0,Train,31 / 31,[],"['Age', 'Sex', 'SibSp', 'Parch', 'Fare', 'Emba..."
