In [11]:
import numpy as np
import pandas as pd
import os, time, warnings, shap, optuna, random
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer, make_column_transformer
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')


In [2]:
time0 = time.time()
df = pd.read_csv('../input/tabular-playground-series-aug-2022/train.csv')
display(df.shape, df.head(), df.failure.value_counts(), df.count())
df0 = df.copy()
df.drop(columns = ['id'], inplace=True)

(26570, 26)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


0    20921
1     5649
Name: failure, dtype: int64

id                26570
product_code      26570
loading           26320
attribute_0       26570
attribute_1       26570
attribute_2       26570
attribute_3       26570
measurement_0     26570
measurement_1     26570
measurement_2     26570
measurement_3     26189
measurement_4     26032
measurement_5     25894
measurement_6     25774
measurement_7     25633
measurement_8     25522
measurement_9     25343
measurement_10    25270
measurement_11    25102
measurement_12    24969
measurement_13    24796
measurement_14    24696
measurement_15    24561
measurement_16    24460
measurement_17    24286
failure           26570
dtype: int64

In [3]:
# train-test split:

test_size = 0.1
df.reset_index(inplace=True, drop=True)
test_index = random.sample(list(df.index), int(test_size*df.shape[0]))
train = df.iloc[list(set(df.index)-set(test_index))]
test = df.iloc[test_index]
display(train.shape, test.shape, train.head(3), test.head(3), train.count())

(23913, 25)

(2657, 25)

Unnamed: 0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0


Unnamed: 0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
11155,C,103.85,material_7,material_8,5,8,10,4,2,18.255,...,11.455,15.454,18.445,,15.942,13.809,15.946,18.737,661.085,0
19387,D,165.06,material_7,material_5,6,6,7,5,4,17.831,...,,15.918,19.039,14.476,16.091,17.573,11.425,16.409,410.328,0
11581,C,66.04,material_7,material_8,5,8,7,12,3,19.097,...,11.863,16.767,17.399,,16.072,18.781,18.145,13.405,972.508,0


product_code      23913
loading           23682
attribute_0       23913
attribute_1       23913
attribute_2       23913
attribute_3       23913
measurement_0     23913
measurement_1     23913
measurement_2     23913
measurement_3     23572
measurement_4     23416
measurement_5     23317
measurement_6     23191
measurement_7     23059
measurement_8     22973
measurement_9     22818
measurement_10    22741
measurement_11    22579
measurement_12    22471
measurement_13    22318
measurement_14    22231
measurement_15    22080
measurement_16    22024
measurement_17    21865
failure           23913
dtype: int64

In [4]:
num_feat = [col for col in train.columns if train[col].nunique()>=20]
cat_te_feat = [col for col in train.columns if train[col].nunique() in (range(5,20))]
cat_ohe_feat = [col for col in train.columns if train[col].nunique()<5]
cat_ohe_feat.remove('failure')
display('num features: ', num_feat, '/n',
       'cat features: ', cat_te_feat+cat_ohe_feat)

'num features: '

['loading',
 'measurement_0',
 'measurement_1',
 'measurement_2',
 'measurement_3',
 'measurement_4',
 'measurement_5',
 'measurement_6',
 'measurement_7',
 'measurement_8',
 'measurement_9',
 'measurement_10',
 'measurement_11',
 'measurement_12',
 'measurement_13',
 'measurement_14',
 'measurement_15',
 'measurement_16',
 'measurement_17']

'/n'

'cat features: '

['product_code', 'attribute_0', 'attribute_1', 'attribute_2', 'attribute_3']

In [5]:
# fill na:

for col in num_feat:
    train[col] = train[col].fillna(train[col].median())
    test[col] = test[col].fillna(train[col].median())

for col in cat_te_feat+cat_ohe_feat:
    train[col]=train[col].fillna(train[col].mode()[0])
    test[col]=test[col].fillna(train[col].mode()[0])
    
display(train.count(), test.count())

product_code      23913
loading           23913
attribute_0       23913
attribute_1       23913
attribute_2       23913
attribute_3       23913
measurement_0     23913
measurement_1     23913
measurement_2     23913
measurement_3     23913
measurement_4     23913
measurement_5     23913
measurement_6     23913
measurement_7     23913
measurement_8     23913
measurement_9     23913
measurement_10    23913
measurement_11    23913
measurement_12    23913
measurement_13    23913
measurement_14    23913
measurement_15    23913
measurement_16    23913
measurement_17    23913
failure           23913
dtype: int64

product_code      2657
loading           2657
attribute_0       2657
attribute_1       2657
attribute_2       2657
attribute_3       2657
measurement_0     2657
measurement_1     2657
measurement_2     2657
measurement_3     2657
measurement_4     2657
measurement_5     2657
measurement_6     2657
measurement_7     2657
measurement_8     2657
measurement_9     2657
measurement_10    2657
measurement_11    2657
measurement_12    2657
measurement_13    2657
measurement_14    2657
measurement_15    2657
measurement_16    2657
measurement_17    2657
failure           2657
dtype: int64

In [6]:
# take out target

X_train = train
y_train = X_train.pop('failure')
X_test = test
y_test = X_test.pop('failure')

display(X_train.shape, X_test.shape, y_train.shape)

(23913, 24)

(2657, 24)

(23913,)

In [7]:
# encode cat features, ohe for now:

feature_transformer = ColumnTransformer([
    ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), cat_te_feat+cat_ohe_feat)],
    remainder="passthrough")

print('Number of features before transformation: ', X_train.shape)
X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
print('Number of features after transformation: ', X_train.shape)

Number of features before transformation:  (23913, 24)
Number of features after transformation:  (23913, 37)


In [8]:
X_train

Unnamed: 0,cat__product_code_A,cat__product_code_B,cat__product_code_C,cat__product_code_D,cat__product_code_E,cat__attribute_0_material_5,cat__attribute_0_material_7,cat__attribute_1_material_5,cat__attribute_1_material_6,cat__attribute_1_material_8,...,remainder__measurement_8,remainder__measurement_9,remainder__measurement_10,remainder__measurement_11,remainder__measurement_12,remainder__measurement_13,remainder__measurement_14,remainder__measurement_15,remainder__measurement_16,remainder__measurement_17
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,20.155,10.672,15.859,17.594,15.193,15.029,16.039,13.034,14.684,764.100
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,18.288,12.715,15.607,19.211,13.798,16.711,18.631,14.094,17.946,663.376
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,18.093,10.337,17.082,19.932,12.428,16.182,12.760,13.153,16.412,579.885
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,20.810,10.622,14.904,19.107,13.327,15.354,19.251,14.965,17.625,832.902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23908,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,19.354,11.433,12.177,17.942,10.112,15.795,18.572,16.144,16.434,729.131
23909,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,19.563,11.242,14.179,20.564,10.234,14.450,14.322,13.146,16.471,853.924
23910,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,19.279,11.407,16.437,17.476,8.668,15.069,16.599,15.590,14.065,750.364
23911,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,19.358,11.392,17.064,17.814,14.928,16.273,15.485,13.624,12.865,730.156


In [15]:
# fit XGBoost

optuna_xgb = XGBClassifier(tree_method = 'gpu_hist')
optuna_xgb.fit(X_train, y_train)

precision_t, recall_t, threshold = precision_recall_curve(y_train, optuna_xgb.predict_proba(X_train)[:, 1])
auc_precision_recall_train = auc(recall_t, precision_t)
precision_t, recall_t, threshold = precision_recall_curve(y_test, optuna_xgb.predict_proba(X_test)[:, 1])
auc_precision_recall_test = auc(recall_t, precision_t)

display('Accuracy: ', accuracy_score(y_train,optuna_xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,optuna_xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,optuna_xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,optuna_xgb.predict(X_train)))
display('ROCAUC: ', roc_auc_score(y_train,optuna_xgb.predict(X_train)))
#display('Precision at 20% recall: ', r20prec_train)
#display('Precision at 50% recall: ', r50prec_train)
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,optuna_xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,optuna_xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,optuna_xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,optuna_xgb.predict(X_test)))
display('ROCAUC: ', roc_auc_score(y_test,optuna_xgb.predict(X_test)))
#display('Precision at 20% recall: ', r20prec_test)
#display('Precision at 50% recall: ', r50prec_test)
display(time.time()-time0)

'Accuracy: '

0.878936143520261

'F1 score: '

0.6043460434604346

'Recall score: '

0.4335294117647059

'Precision score: '

0.9972936400541272

'ROCAUC: '

0.7166052416820659

'Accuracy: '

0.7813323296951449

'F1 score: '

0.061389337641357025

'Recall score: '

0.03460837887067395

'Precision score: '

0.2714285714285714

'ROCAUC: '

0.5052074152417886

NameError: name 'time0' is not defined

In [14]:
time1=time.time()

### Fit XGBoost using Optuna hyperparameter optimization ###

def objective(trial, cv_runs=2, n_splits=2, n_jobs=-1, scale_pos_weight=1, early_stopping_rounds=50):

    cv_regularizer=0.1
    # Usually values between 0.1 and 0.2 work fine.

    params = {
        "tree_method": 'gpu_hist',
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        #"n_estimators": trial.suggest_int("n_estimators", 500, 1000),
        "n_estimators": 500,
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_uniform("learning_rate", 0.01, 0.20),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 0.95),
        "subsample": trial.suggest_uniform("subsample", 0.5, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 0.1, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 0.1, 150.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-10, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 10),
        "n_jobs": n_jobs,
    }
    # usually it makes sense to resrtict hyperparameter space from some solutions which Optuna will find
    # e.g., for tmx-joined data only (downsampled tmx), optuna keeps selecting depths of 2 and 3.
    # for my purposes (smooth left side of prc, close to 1), those solutions are no good.

    temp_out = []

    for i in range(cv_runs):

        X = X_train
        y = y_train

        model = XGBClassifier(**params)
        rkf = KFold(n_splits=n_splits, shuffle=True)
        X_values = X.values
        y_values = y.values
        y_pred = np.zeros_like(y_values)
        y_pred_train = np.zeros_like(y_values)
        for train_index, test_index in rkf.split(X_values):
            X_A, X_B = X_values[train_index, :], X_values[test_index, :]
            y_A, y_B = y_values[train_index], y_values[test_index]
            model.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                      early_stopping_rounds=early_stopping_rounds, verbose = False)
            y_pred[test_index] += model.predict(X_B)
            y_pred_train[train_index] += model.predict(X_A)
        score_train = roc_auc_score(y_train, y_pred_train)
        score_test = roc_auc_score(y_train, y_pred) 
        overfit = score_train-score_test
        #return (score_test)
        #return (score_test-cv_regularizer*overfit)
        temp_out.append(score_test-cv_regularizer*overfit)

    return (np.mean(temp_out))

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print('Total time for hypermarameter optimization ', time.time()-time1)
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

optuna_hyperpars = study.best_params
optuna_hyperpars['tree_method']='gpu_hist'
optuna_hyperpars['scale_pos_weight']=1
#optuna_hyperpars['early_stopping_rounds']=50

optuna_xgb = XGBClassifier(**optuna_hyperpars)
optuna_xgb.fit(X_train, y_train)


display('Accuracy: ', accuracy_score(y_train,optuna_xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,optuna_xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,optuna_xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,optuna_xgb.predict(X_train)))
display('ROCAUC: ', roc_auc_score(y_train,optuna_xgb.predict(X_train)))
#display('Precision at 20% recall: ', r20prec_train)
#display('Precision at 50% recall: ', r50prec_train)
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,optuna_xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,optuna_xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,optuna_xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,optuna_xgb.predict(X_test)))
display('ROCAUC: ', roc_auc_score(y_test,optuna_xgb.predict(X_test)))
#display('Precision at 20% recall: ', r20prec_test)
#display('Precision at 50% recall: ', r50prec_test)
display(time.time()-time0)

[32m[I 2022-08-05 22:38:12,494][0m A new study created in memory with name: no-name-1f62ed23-05ea-4c6f-8730-70eeba4c27bf[0m
[32m[I 2022-08-05 22:38:17,134][0m Trial 0 finished with value: 0.5 and parameters: {'max_depth': 9, 'learning_rate': 0.035392210656885144, 'colsample_bytree': 0.8646760749667454, 'subsample': 0.5730406969903468, 'alpha': 0.931153680567377, 'lambda': 63.44070124175965, 'gamma': 2.2057917051674848e-08, 'min_child_weight': 7.976670393177083}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-08-05 22:38:20,322][0m Trial 1 finished with value: 0.49998039215686274 and parameters: {'max_depth': 9, 'learning_rate': 0.06386084279897095, 'colsample_bytree': 0.5388395223934204, 'subsample': 0.5954239994313864, 'alpha': 0.42082228407316724, 'lambda': 11.575219669231412, 'gamma': 1.7828626025703436e-09, 'min_child_weight': 6.931471702626036}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-08-05 22:38:23,151][0m Trial 2 finished with value: 0.4999657746051698 and pa

Total time for hypermarameter optimization  44.15415024757385
           max_depth : 5
       learning_rate : 0.19720705346554546
    colsample_bytree : 0.41287057476193434
           subsample : 0.7508167094467701
               alpha : 0.20493453711786586
              lambda : 1.6413020613020086
               gamma : 1.3123403806976627e-05
    min_child_weight : 1.6346623608934567
best objective value : 0.5001618209873648


'Accuracy: '

0.7984359971563585

'F1 score: '

0.10872781065088757

'Recall score: '

0.05764705882352941

'Precision score: '

0.9545454545454546

'PRUC: '

0.9363524648684831

'Accuracy: '

0.792623259315017

'F1 score: '

0.021314387211367674

'Recall score: '

0.01092896174863388

'Precision score: '

0.42857142857142855

'PRUC: '

0.22924522114388052

NameError: name 'time0' is not defined