In [1]:
from data_cleaning import clean_data
from data_balancing import balance_data_sets
from feature_selection import pca_selection, info_gain_selection, boruta_selection

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

import pandas as pd

Data Cleaning

In [2]:
data = clean_data() 

Class distribution:
Class
N    4161
Y     838
Name: count, dtype: int64


Data balancing

In [3]:
df_us, df_r, df_s = balance_data_sets(data)


✅ Class distribution after balancing:

Under-sampling:
Class
N    838
Y    838
Name: count, dtype: int64

ROSE (RandomOverSampler approximation):
Class
Y    4161
N    4161
Name: count, dtype: int64

SMOTE:
Class
Y    4161
N    4161
Name: count, dtype: int64



[WinError 2] The system cannot find the file specified
  File "c:\Users\ghl14\anaconda3\envs\gpu-env\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\ghl14\anaconda3\envs\gpu-env\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ghl14\anaconda3\envs\gpu-env\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\ghl14\anaconda3\envs\gpu-env\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Feature selection

In [4]:
feature_selected_sets = {}
balanced_datasets = {
    "us": df_us,
    "r": df_r,
    "s": df_s
}

for key, df in balanced_datasets.items():
    X = df.drop("Class", axis=1)
    y = df["Class"]
    
    # PCA
    X_pca, _, _ = pca_selection(X)
    df_pca = pd.DataFrame(X_pca)
    df_pca["Class"] = y.values
    feature_selected_sets[f"{key}_pca"] = df_pca

    # Info Gain
    selected_info, _ = info_gain_selection(X, y, top_k=10)
    df_info = df[selected_info + ["Class"]]
    feature_selected_sets[f"{key}_info"] = df_info

    # Boruta
    selected_boruta, _ = boruta_selection(X, y)
    df_boruta = df[selected_boruta + ["Class"]]
    feature_selected_sets[f"{key}_boruta"] = df_boruta

[PCA] Selected 1 components to retain 95.0% variance.
[InfoGain] Top 10 features:
['EMPLOY1', 'PNEUVAC4', 'CHCCOPD2', 'CHILDREN', 'BPHIGH4', 'ASTHMA3', 'DIABETE4', 'HTM4', 'FRUITJU2', 'WTKG3']
[Boruta] Selected features:
['EMPLOY1']
[PCA] Selected 1 components to retain 95.0% variance.
[InfoGain] Top 10 features:
['EMPLOY1', 'WTKG3', 'MARITAL', 'CHILDREN', 'PNEUVAC4', 'BPHIGH4', 'HLTHPLN1', 'DIFFWALK', 'FLUSHOT7', 'PERSDOC2']
[Boruta] Selected features:
['GENHLTH', 'MARITAL', 'EMPLOY1', 'ALCDAY5', 'FRUIT2', 'FVGREEN1', 'VEGETAB2', 'PNEUVAC4', 'HTM4', 'WTKG3']
[PCA] Selected 1 components to retain 95.0% variance.
[InfoGain] Top 10 features:
['HTM4', 'FVGREEN1', 'INCOME2', 'FRUIT2', 'VEGETAB2', 'EMPLOY1', 'GENHLTH', 'EDUCA', 'MARITAL', 'TETANUS1']
[Boruta] Selected features:
['GENHLTH', 'PERSDOC2', 'BPHIGH4', 'TOLDHI2', 'HAVARTH4', 'MARITAL', 'EDUCA', 'RENTHOM1', 'VETERAN3', 'EMPLOY1', 'INCOME2', 'DIFFWALK', 'SMOKE100', 'EXERANY2', 'FRUIT2', 'FVGREEN1', 'POTATOE1', 'FLUSHOT7', 'TETANUS1'

Model training

In [5]:
# ==================== NAIVE BAYES ====================
for fs_name, df_fs in feature_selected_sets.items():
    print(f"\n Processing [{fs_name}] with GaussianNB")

    # Preparing data
    X = df_fs.drop("Class", axis=1)
    y = df_fs["Class"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Train and fit
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=gnb.classes_)
    print("Classes order:", list(gnb.classes_))
    print("Confusion Matrix:\n", cm)


 Processing [us_pca] with GaussianNB
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[ 40 128]
 [ 40 128]]

 Processing [us_info] with GaussianNB
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[126  42]
 [ 68 100]]

 Processing [us_boruta] with GaussianNB
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[114  54]
 [ 53 115]]

 Processing [r_pca] with GaussianNB
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[187 646]
 [178 654]]

 Processing [r_info] with GaussianNB
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[433 400]
 [186 646]]

 Processing [r_boruta] with GaussianNB
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[481 352]
 [228 604]]

 Processing [s_pca] with GaussianNB
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[211 622]
 [211 621]]

 Processing [s_info] with GaussianNB
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[318 515]
 [ 81 751]]

 Pr

In [7]:
# ==================== KNN ====================
# Search grid
knn_param_grid = {
    "n_neighbors": [3, 5, 7, 9],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"]
}

for fs_name, df_fs in feature_selected_sets.items():
    print(f"\nProcessing [{fs_name}] with KNN Grid Search")

    # Split data
    X = df_fs.drop("Class", axis=1)
    y = df_fs["Class"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # GridSearchCV
    grid_knn = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=knn_param_grid,
        cv=5,
        scoring="f1_macro",
        n_jobs=-1
    )
    grid_knn.fit(X_train, y_train)

    # Best model and predict
    best_knn = grid_knn.best_estimator_
    y_pred = best_knn.predict(X_test)

    # Best parameters
    print("▶ Best params:", grid_knn.best_params_)

    # Output
    cm = confusion_matrix(y_test, y_pred, labels=best_knn.classes_)
    print("Classes order:", list(best_knn.classes_))
    print("Confusion Matrix:\n", cm)


Processing [us_pca] with KNN Grid Search
▶ Best params: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Classes order: ['N', 'Y']
Confusion Matrix:
 [[76 92]
 [84 84]]

Processing [us_info] with KNN Grid Search
▶ Best params: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Classes order: ['N', 'Y']
Confusion Matrix:
 [[90 78]
 [77 91]]

Processing [us_boruta] with KNN Grid Search
▶ Best params: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
Classes order: ['N', 'Y']
Confusion Matrix:
 [[ 37 131]
 [ 14 154]]

Processing [r_pca] with KNN Grid Search
▶ Best params: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Classes order: ['N', 'Y']
Confusion Matrix:
 [[584 249]
 [ 11 821]]

Processing [r_info] with KNN Grid Search
▶ Best params: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Classes order: ['N', 'Y']
Confusion Matrix:
 [[584 249]
 [ 45 787]]

Processing [r_boruta] with KNN Grid Search
▶ Best params:

In [8]:
# ==================== Random Forest ====================
# Search grid
rf_param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", "log2"]
}

for fs_name, df_fs in feature_selected_sets.items():
    print(f"\nProcessing [{fs_name}] with Random Forest Grid Search")

    # Split data
    X = df_fs.drop("Class", axis=1)
    y = df_fs["Class"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # GridSearchCV
    grid_rf = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=rf_param_grid,
        cv=5,
        scoring="f1_macro",
        n_jobs=-1
    )
    grid_rf.fit(X_train, y_train)

    # Best model and predict
    best_rf = grid_rf.best_estimator_
    y_pred = best_rf.predict(X_test)

    # Best parameters
    print("▶ Best params:", grid_rf.best_params_)

    # Output
    cm = confusion_matrix(y_test, y_pred, labels=best_rf.classes_)
    print("Classes order:", list(best_rf.classes_))
    print("Confusion Matrix:\n", cm)


Processing [us_pca] with Random Forest Grid Search
▶ Best params: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Classes order: ['N', 'Y']
Confusion Matrix:
 [[ 65 103]
 [ 83  85]]

Processing [us_info] with Random Forest Grid Search
▶ Best params: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Classes order: ['N', 'Y']
Confusion Matrix:
 [[118  50]
 [ 50 118]]

Processing [us_boruta] with Random Forest Grid Search
▶ Best params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Classes order: ['N', 'Y']
Confusion Matrix:
 [[127  41]
 [ 61 107]]

Processing [r_pca] with Random Forest Grid Search
▶ Best params: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Classes order: ['N', 'Y']
Confusion Matrix:
 [[644 189]
 [ 23 809]]

Processing [r_in

In [9]:
# ==================== XGBoost ====================
# Search grid
xgb_param_dist = {
    "n_estimators": [50, 100],       
    "max_depth": [3, 5],             
    "learning_rate": [0.1],          
    "subsample": [1.0],              
    "colsample_bytree": [1.0],       
    "gamma": [0]                     
}

for fs_name, df_fs in feature_selected_sets.items():
    print(f"\nProcessing[{fs_name}] with XGB RandomizedSearchCV")

    # Data preparing
    X = df_fs.drop("Class", axis=1)
    y = df_fs["Class"].map({"N":0,"Y":1})
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Create model
    model = XGBClassifier(
        use_label_encoder=False,
        tree_method="gpu_hist",
        predictor="gpu_predictor",
        eval_metric="logloss"
    )

    # Random grid search
    rand_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=xgb_param_dist,
        n_iter=10,
        cv=2,
        scoring="f1_macro",
        n_jobs=-1,
        random_state=42
    )

    rand_search.fit(X_train, y_train, verbose=False)

    best = rand_search.best_estimator_
    print("▶ Best params:", rand_search.best_params_)

    # Output
    y_pred = best.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=[0,1])
    print("Labels order: [0='N', 1='Y']")
    print("Confusion Matrix:\n", cm)


Processing[us_pca] with XGB RandomizedSearchCV



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


▶ Best params: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Labels order: [0='N', 1='Y']
Confusion Matrix:
 [[77 91]
 [90 78]]

Processing[us_info] with XGB RandomizedSearchCV



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


▶ Best params: {'subsample': 1.0, 'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Labels order: [0='N', 1='Y']
Confusion Matrix:
 [[114  54]
 [ 53 115]]

Processing[us_boruta] with XGB RandomizedSearchCV



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


▶ Best params: {'subsample': 1.0, 'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Labels order: [0='N', 1='Y']
Confusion Matrix:
 [[127  41]
 [ 61 107]]

Processing[r_pca] with XGB RandomizedSearchCV



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


▶ Best params: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Labels order: [0='N', 1='Y']
Confusion Matrix:
 [[468 365]
 [327 505]]

Processing[r_info] with XGB RandomizedSearchCV



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


▶ Best params: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Labels order: [0='N', 1='Y']
Confusion Matrix:
 [[572 261]
 [172 660]]

Processing[r_boruta] with XGB RandomizedSearchCV



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


▶ Best params: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Labels order: [0='N', 1='Y']
Confusion Matrix:
 [[636 197]
 [162 670]]

Processing[s_pca] with XGB RandomizedSearchCV



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


▶ Best params: {'subsample': 1.0, 'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Labels order: [0='N', 1='Y']
Confusion Matrix:
 [[472 361]
 [363 469]]

Processing[s_info] with XGB RandomizedSearchCV



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


▶ Best params: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Labels order: [0='N', 1='Y']
Confusion Matrix:
 [[811  22]
 [157 675]]

Processing[s_boruta] with XGB RandomizedSearchCV
▶ Best params: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Labels order: [0='N', 1='Y']
Confusion Matrix:
 [[818  15]
 [162 670]]



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


In [10]:
# ==================== MLP ====================
# Search grid
mlp_param_grid = {
    "hidden_layer_sizes": [(50,), (100,), (100, 50)],
    "activation": ["relu", "tanh"],
    "alpha": [0.0001, 0.001],
    "learning_rate": ["constant", "adaptive"],
    "max_iter": [300]
}

# MLP Grid Search for all feature selections
for fs_name, df_fs in feature_selected_sets.items():
    print(f"\nProcessing [{fs_name}] with MLP Grid Search")

    # Split data
    X = df_fs.drop("Class", axis=1)
    y = df_fs["Class"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=42
    )

    # GridSearchCV
    grid_mlp = GridSearchCV(
        estimator=MLPClassifier(),
        param_grid=mlp_param_grid,
        cv=2,
        scoring="f1_macro",
        n_jobs=-1
    )
    grid_mlp.fit(X_train, y_train)

    # Best model and predict
    best_mlp = grid_mlp.best_estimator_
    y_pred = best_mlp.predict(X_test)

    # Print best parameter
    print("▶ Best params:", grid_mlp.best_params_)

    # Output
    cm = confusion_matrix(y_test, y_pred, labels=best_mlp.classes_)
    print("Classes order:", list(best_mlp.classes_))
    print("Confusion Matrix:\n", cm)


Processing [us_pca] with MLP Grid Search
▶ Best params: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate': 'adaptive', 'max_iter': 300}
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[130  38]
 [124  44]]

Processing [us_info] with MLP Grid Search
▶ Best params: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'max_iter': 300}
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[168   0]
 [168   0]]

Processing [us_boruta] with MLP Grid Search
▶ Best params: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive', 'max_iter': 300}
Classes order: [np.str_('N'), np.str_('Y')]
Confusion Matrix:
 [[114  54]
 [ 53 115]]

Processing [r_pca] with MLP Grid Search
▶ Best params: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'max_iter': 300}
Classes order: [np.str_('N'), np.str_('Y')]
Con