In [29]:
import numpy as np
import pandas as pd
import os, time, warnings, shap, optuna, random
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer, make_column_transformer
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')


In [30]:
df = pd.read_csv('../input/tabular-playground-series-aug-2022/train.csv')
display(df.shape, df.head(), df.failure.value_counts(), df.count())
df0 = df.copy()
df.drop(columns = ['id'], inplace=True)

(26570, 26)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


0    20921
1     5649
Name: failure, dtype: int64

id                26570
product_code      26570
loading           26320
attribute_0       26570
attribute_1       26570
attribute_2       26570
attribute_3       26570
measurement_0     26570
measurement_1     26570
measurement_2     26570
measurement_3     26189
measurement_4     26032
measurement_5     25894
measurement_6     25774
measurement_7     25633
measurement_8     25522
measurement_9     25343
measurement_10    25270
measurement_11    25102
measurement_12    24969
measurement_13    24796
measurement_14    24696
measurement_15    24561
measurement_16    24460
measurement_17    24286
failure           26570
dtype: int64

In [31]:
# train-test split:

test_size = 0.1
df.reset_index(inplace=True, drop=True)
test_index = random.sample(list(df.index), int(test_size*df.shape[0]))
train = df.iloc[list(set(df.index)-set(test_index))]
test = df.iloc[test_index]
display(train.shape, test.shape, train.head(3), test.head(3), train.count())

(23913, 25)

(2657, 25)

Unnamed: 0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0


Unnamed: 0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
16696,D,111.87,material_7,material_5,6,6,14,10,9,18.891,...,11.235,13.929,20.017,11.42,15.858,14.037,15.879,16.117,716.727,0
1844,A,160.94,material_7,material_8,9,5,11,3,9,19.579,...,13.492,15.395,19.502,11.095,16.172,14.563,15.546,,761.583,0
9143,B,129.1,material_5,material_5,8,8,5,3,10,17.016,...,12.239,16.598,18.45,12.768,15.77,15.311,16.137,16.094,645.889,0


product_code      23913
loading           23695
attribute_0       23913
attribute_1       23913
attribute_2       23913
attribute_3       23913
measurement_0     23913
measurement_1     23913
measurement_2     23913
measurement_3     23565
measurement_4     23434
measurement_5     23300
measurement_6     23196
measurement_7     23083
measurement_8     22953
measurement_9     22795
measurement_10    22737
measurement_11    22600
measurement_12    22468
measurement_13    22315
measurement_14    22240
measurement_15    22085
measurement_16    22024
measurement_17    21885
failure           23913
dtype: int64

In [32]:
num_feat = [col for col in train.columns if train[col].nunique()>=20]
cat_te_feat = [col for col in train.columns if train[col].nunique() in (range(5,20))]
cat_ohe_feat = [col for col in train.columns if train[col].nunique()<5]
cat_ohe_feat.remove('failure')
display('num features: ', num_feat, '/n',
       'cat features: ', cat_te_feat+cat_ohe_feat)

'num features: '

['loading',
 'measurement_0',
 'measurement_1',
 'measurement_2',
 'measurement_3',
 'measurement_4',
 'measurement_5',
 'measurement_6',
 'measurement_7',
 'measurement_8',
 'measurement_9',
 'measurement_10',
 'measurement_11',
 'measurement_12',
 'measurement_13',
 'measurement_14',
 'measurement_15',
 'measurement_16',
 'measurement_17']

'/n'

'cat features: '

['product_code', 'attribute_0', 'attribute_1', 'attribute_2', 'attribute_3']

In [33]:
# fill na:

for col in num_feat:
    train[col] = train[col].fillna(train[col].median())
    test[col] = test[col].fillna(train[col].median())

for col in cat_te_feat+cat_ohe_feat:
    train[col]=train[col].fillna(train[col].mode()[0])
    test[col]=test[col].fillna(train[col].mode()[0])
    
display(train.count(), test.count())

product_code      23913
loading           23913
attribute_0       23913
attribute_1       23913
attribute_2       23913
attribute_3       23913
measurement_0     23913
measurement_1     23913
measurement_2     23913
measurement_3     23913
measurement_4     23913
measurement_5     23913
measurement_6     23913
measurement_7     23913
measurement_8     23913
measurement_9     23913
measurement_10    23913
measurement_11    23913
measurement_12    23913
measurement_13    23913
measurement_14    23913
measurement_15    23913
measurement_16    23913
measurement_17    23913
failure           23913
dtype: int64

product_code      2657
loading           2657
attribute_0       2657
attribute_1       2657
attribute_2       2657
attribute_3       2657
measurement_0     2657
measurement_1     2657
measurement_2     2657
measurement_3     2657
measurement_4     2657
measurement_5     2657
measurement_6     2657
measurement_7     2657
measurement_8     2657
measurement_9     2657
measurement_10    2657
measurement_11    2657
measurement_12    2657
measurement_13    2657
measurement_14    2657
measurement_15    2657
measurement_16    2657
measurement_17    2657
failure           2657
dtype: int64

In [34]:
# take out target

X_train = train
y_train = X_train.pop('failure')
X_test = test
y_test = X_test.pop('failure')

display(X_train.shape, X_test.shape, y_train.shape)

(23913, 24)

(2657, 24)

(23913,)

In [35]:
# encode cat features, ohe for now:

feature_transformer = ColumnTransformer([
    ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), cat_te_feat+cat_ohe_feat)],
    remainder="passthrough")

print('Number of features before transformation: ', X_train.shape)
X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
print('Number of features after transformation: ', X_train.shape)

Number of features before transformation:  (23913, 24)
Number of features before transformation:  (23913, 37)


In [37]:
X_train

Unnamed: 0,cat__product_code_A,cat__product_code_B,cat__product_code_C,cat__product_code_D,cat__product_code_E,cat__attribute_0_material_5,cat__attribute_0_material_7,cat__attribute_1_material_5,cat__attribute_1_material_6,cat__attribute_1_material_8,...,remainder__measurement_8,remainder__measurement_9,remainder__measurement_10,remainder__measurement_11,remainder__measurement_12,remainder__measurement_13,remainder__measurement_14,remainder__measurement_15,remainder__measurement_16,remainder__measurement_17
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,20.155,10.672,15.859,17.5940,15.193,15.029,16.034,13.034,14.684,764.100
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,18.288,12.715,15.607,19.2105,13.798,16.711,18.631,14.094,17.946,663.376
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,19.060,12.471,16.346,18.3770,10.020,15.250,15.562,16.154,17.172,826.282
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,18.093,10.337,17.082,19.9320,12.428,16.182,12.760,13.153,16.412,579.885
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,20.810,10.622,14.904,19.1070,13.327,15.354,19.251,14.961,17.625,832.902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23908,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,19.354,11.432,12.177,17.9420,10.112,15.795,18.572,16.144,16.426,729.131
23909,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,19.563,11.242,14.179,20.5640,10.234,14.450,14.322,13.146,16.471,853.924
23910,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,19.279,11.407,16.437,17.4760,8.668,15.069,16.599,15.590,14.065,750.364
23911,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,19.358,11.392,17.064,17.8140,14.928,16.273,15.485,13.624,12.865,730.156


In [38]:
# fir XGBoost

optuna_xgb = XGBClassifier()
optuna_xgb.fit(X_train, y_train)

display('Accuracy: ', accuracy_score(y_train,optuna_xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,optuna_xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,optuna_xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,optuna_xgb.predict(X_train)))
#display('PRUC: ', auc_precision_recall_train)
#display('Precision at 20% recall: ', r20prec_train)
#display('Precision at 50% recall: ', r50prec_train)
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,optuna_xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,optuna_xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,optuna_xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,optuna_xgb.predict(X_test)))
#display('PRUC: ', auc_precision_recall_test)
#display('Precision at 20% recall: ', r20prec_test)
#display('Precision at 50% recall: ', r50prec_test)


'Accuracy: '

0.8533851879730691

'F1 score: '

0.47452038369304556

'Recall score: '

0.3113077679449361

'Precision score: '

0.9974795211090107

'Accuracy: '

0.7745577719232217

'F1 score: '

0.044657097288676235

'Recall score: '

0.024822695035460994

'Precision score: '

0.2222222222222222