In [None]:
import pandas as pd
import numpy as np
from custom_ml_toolkit.preprocessor.encoder import SupportMissingDatasetEncoder
from custom_ml_toolkit.feature_selector.importance_explaner import plot_feature_importances, plot_shap_values
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt

In [None]:
# Shared configuration and data loading used by every experiment cell below.
random_state = 77  # single seed reused for the split and for each model

# Load the Titanic data and derive `Deck` from the first character of `Cabin`;
# rows whose cabin is missing end up with a missing deck as well.
data_df = pd.read_csv('example_data/titanic.csv')
data_df['Deck'] = data_df['Cabin'].str[0]

# Column roles handed to the dataset encoder in the experiment cells.
numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare']
norminal_cols = ['Sex', 'Embarked']
ordinal_cols = ['Pclass', 'Deck']
target_col = 'Survived'

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class balance the same in both splits; the column is looked up through
# `target_col` so the target name is defined in exactly one place.
train_data_df, test_data_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=random_state,
    stratify=data_df[target_col],
)

## XGBClassifier

In [None]:
# XGBoost experiment: encode the data, fit the model, report train/test
# metrics and show feature importances.

# Missing and unknown ordinal categories are emitted as NaN, which XGBoost
# treats as missing by default. The target is encoded too
# (encode_target=True).
de = SupportMissingDatasetEncoder(
    numerical_cols=numerical_cols,
    norminal_cols=norminal_cols,
    ordinal_cols=ordinal_cols,
    target_col=target_col,
    drop_binary=True,
    oe_unknown_value=np.nan,
    oe_missing_value=np.nan,
    encode_target=True
)

# Fit the encoder on the training split only, then apply it to both splits.
de.fit(train_data_df)
encoded_train_data_df = de.transform(train_data_df)
encoded_test_data_df = de.transform(test_data_df)

# Separate features from the target; `target_col` is used instead of a
# repeated literal so the cell follows the configured target name.
X_train = encoded_train_data_df.drop(columns=[target_col])
y_train = encoded_train_data_df[target_col]

X_test = encoded_test_data_df.drop(columns=[target_col])
y_test = encoded_test_data_df[target_col]

clf = XGBClassifier(
    random_state=random_state,
    n_jobs=-1,
)
clf.fit(
    X=X_train,
    y=y_train
)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Label each report so the two outputs can be told apart.
print('Train:')
print(classification_report(y_train, y_train_pred))
print('Test:')
print(classification_report(y_test, y_test_pred))

feat_imp_df = plot_feature_importances(
    feature_importance=clf.feature_importances_,
    feature_names=X_train.columns,
    top_n=30
)

# Last expression of the cell: display the importance table.
feat_imp_df

In [None]:
# NOTE: this SHAP exploration is disabled (every line is commented out).
# When re-enabled it reads `clf` and `X_train` from an earlier cell, and it
# needs the `import shap` line below because `shap` is not imported elsewhere
# in the visible notebook.

# import shap

# explainer = shap.TreeExplainer(clf)
# shap_values = explainer.shap_values(X_train)

# shap.summary_plot(
#     shap_values=shap_values,
#     features=X_train,
#     plot_type='bar',
#     show=False
# )
# # shap.plots.waterfall(shap_values[3])

# shap_values.shape

## LGBMClassifier

In [None]:
# LightGBM experiment: encode the data, fit the model, report train/test
# metrics and show feature importances.

# No column is given to the encoder as nominal here; the nominal columns are
# routed through the ordinal encoder together with the ordinal ones
# (presumably so each stays a single integer-coded column - confirm against
# the encoder). Missing and unknown categories are coded as -1, and the
# target is left unencoded.
de = SupportMissingDatasetEncoder(
    numerical_cols=numerical_cols,
    norminal_cols=None,
    ordinal_cols=norminal_cols + ordinal_cols,
    target_col=target_col,
    drop_binary=True,
    oe_unknown_value=-1,
    oe_missing_value=-1,
    encode_target=False
)

# Fit the encoder on the training split only, then apply it to both splits.
de.fit(train_data_df)
encoded_train_data_df = de.transform(train_data_df)
encoded_test_data_df = de.transform(test_data_df)

# Separate features from the target; `target_col` is used instead of a
# repeated literal so the cell follows the configured target name.
X_train = encoded_train_data_df.drop(columns=[target_col])
y_train = encoded_train_data_df[target_col]

X_test = encoded_test_data_df.drop(columns=[target_col])
y_test = encoded_test_data_df[target_col]

clf = LGBMClassifier(
    random_state=random_state,
    n_jobs=-1,
    verbose=-1
)

# The nominal columns are declared as categorical features so LightGBM
# splits on them as categories rather than as ordered numbers.
clf.fit(
    X=X_train,
    y=y_train,
    categorical_feature=norminal_cols
)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Label each report so the two outputs can be told apart.
print('Train:')
print(classification_report(y_train, y_train_pred))
print('Test:')
print(classification_report(y_test, y_test_pred))

feat_imp_df = plot_feature_importances(
    feature_importance=clf.feature_importances_,
    feature_names=X_train.columns,
    top_n=30
)
# Last expression of the cell: display the importance table.
feat_imp_df

## Overfit DecisionTreeClassifier

In [None]:
# Deliberately overfit decision tree: the encoder and the model are both
# fitted on the full dataset and evaluated on that same data, with no depth
# limit on the tree.
de = SupportMissingDatasetEncoder(
    numerical_cols=numerical_cols,
    norminal_cols=norminal_cols,
    ordinal_cols=ordinal_cols,
    target_col=target_col,
    drop_binary=True,
    oe_unknown_value=np.nan,
    oe_missing_value=np.nan,
    encode_target=False
)

de.fit(data_df)
encoded_data_df = de.transform(data_df)

# Replace the remaining NaNs with explicit values before fitting the tree:
# -999 as a sentinel for numerical/ordinal columns, 0 for the encoded
# nominal columns.
norminal_feature_name_out = de.features_encoder.get_norminal_feature_name_out()
encoded_data_df[numerical_cols + ordinal_cols] = encoded_data_df[numerical_cols + ordinal_cols].fillna(-999)
encoded_data_df[norminal_feature_name_out] = encoded_data_df[norminal_feature_name_out].fillna(0)

# Separate features from the target; `target_col` is used instead of a
# repeated literal so the cell follows the configured target name.
X_train = encoded_data_df.drop(columns=[target_col])
y_train = encoded_data_df[target_col]

clf = DecisionTreeClassifier(
    random_state=random_state,
    # max_depth=15
)
clf.fit(
    X=X_train,
    y=y_train
)

y_train_pred = clf.predict(X_train)

# Training-set metrics only: there is no held-out data in this cell.
print(classification_report(y_train, y_train_pred))

# Disabled tree rendering, kept for reference.
# NOTE(review): `de.classes_` is used below while encode_target=False above;
# confirm the encoder exposes it in that mode before re-enabling.
# plt.figure(figsize=(300,50))
# plt.figure(figsize=(100,50))
# plot_tree(
#     decision_tree=clf,
#     feature_names=X_train.columns,
#     class_names=de.classes_,
#     filled=True,
#     fontsize=6
# )
# plt.savefig(
#     fname ='tree_high_dpi',
#     dpi=100
# )