In [1]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split

import pandas as pd
import os

In [2]:
ROOT_DIR = "../data"
RANDOM_STATE = 200

train_data = pd.read_csv(os.path.join(ROOT_DIR, "pca_train.csv"))
train_data.rename(columns={'12': 'target'}, inplace=True)

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

normal_ratio = 3.0

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)

## under sampling
df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

train_data = df_concat

# df_train, df_val = train_test_split(
#     df_concat,
#     test_size=0.3,
#     stratify=df_concat["target"],
#     random_state=RANDOM_STATE,
# )

In [3]:
setup_clf = setup(data=train_data, target="target", train_size=0.7, session_id=333, fold_shuffle=True)

Unnamed: 0,Description,Value
0,Session id,333
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(9400, 37)"
5,Transformed data shape,"(9400, 61)"
6,Transformed train set shape,"(6580, 61)"
7,Transformed test set shape,"(2820, 61)"
8,Numeric features,24
9,Categorical features,12


In [4]:
model = compare_models(sort='F1', n_select=10)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.747,0.6447,0.747,0.7112,0.716,0.1956,0.212,0.316
gbc,Gradient Boosting Classifier,0.7523,0.6483,0.7523,0.7126,0.7119,0.1796,0.2046,0.917
rf,Random Forest Classifier,0.724,0.6323,0.724,0.6999,0.7085,0.1918,0.1957,0.286
lda,Linear Discriminant Analysis,0.7464,0.6438,0.7464,0.7037,0.7057,0.1617,0.1841,0.034
ada,Ada Boost Classifier,0.7495,0.6411,0.7495,0.708,0.7055,0.1612,0.1884,0.199
ridge,Ridge Classifier,0.7543,0.6454,0.7543,0.7106,0.7002,0.1434,0.1807,0.037
et,Extra Trees Classifier,0.7081,0.6307,0.7081,0.6938,0.6998,0.181,0.1823,0.123
knn,K Neighbors Classifier,0.7229,0.5964,0.7229,0.6779,0.6889,0.119,0.1295,0.044
qda,Quadratic Discriminant Analysis,0.7529,0.5874,0.7529,0.733,0.6801,0.0954,0.1592,0.033
dt,Decision Tree Classifier,0.6801,0.5679,0.6801,0.6766,0.6782,0.1373,0.1374,0.08


In [5]:
tuned_model = [tune_model(i) for i in model]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7508,0.6111,0.7508,0.7065,0.7033,0.1506,0.1802
1,0.7492,0.6394,0.7492,0.7117,0.7153,0.1897,0.2087
2,0.7508,0.6485,0.7508,0.7136,0.7165,0.1926,0.2126
3,0.7553,0.6452,0.7553,0.7176,0.7162,0.1893,0.2156
4,0.7447,0.639,0.7447,0.701,0.7045,0.1562,0.1772
5,0.7325,0.6414,0.7325,0.6829,0.6909,0.1217,0.1378
6,0.7462,0.6502,0.7462,0.7139,0.7199,0.2117,0.2247
7,0.769,0.6632,0.769,0.7445,0.7465,0.2876,0.3034
8,0.7538,0.635,0.7538,0.7189,0.7203,0.2074,0.2286
9,0.7492,0.6691,0.7492,0.7076,0.7069,0.1662,0.192


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7447,0.6114,0.7447,0.6963,0.6976,0.1344,0.1592
1,0.7508,0.6405,0.7508,0.7158,0.7199,0.2044,0.2218
2,0.7629,0.6571,0.7629,0.729,0.7205,0.2004,0.2351
3,0.7508,0.6499,0.7508,0.7031,0.6957,0.128,0.1626
4,0.7568,0.6293,0.7568,0.7176,0.7106,0.1714,0.205
5,0.7432,0.6355,0.7432,0.6946,0.6956,0.1329,0.1578
6,0.7432,0.6623,0.7432,0.7072,0.7134,0.1909,0.2053
7,0.7675,0.6711,0.7675,0.7386,0.7342,0.2459,0.2736
8,0.7614,0.6414,0.7614,0.7278,0.7212,0.2067,0.2387
9,0.7568,0.6733,0.7568,0.7189,0.7111,0.177,0.2111


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7508,0.6117,0.7508,0.5636,0.6439,0.0,0.0
1,0.7508,0.6666,0.7508,0.5636,0.6439,0.0,0.0
2,0.7508,0.646,0.7508,0.5636,0.6439,0.0,0.0
3,0.7508,0.6467,0.7508,0.5636,0.6439,0.0,0.0
4,0.7508,0.6294,0.7508,0.5636,0.6439,0.0,0.0
5,0.7492,0.6317,0.7492,0.5614,0.6418,0.0,0.0
6,0.7492,0.6701,0.7492,0.5614,0.6418,0.0,0.0
7,0.7492,0.6653,0.7492,0.5614,0.6418,0.0,0.0
8,0.7492,0.6302,0.7492,0.5614,0.6418,0.0,0.0
9,0.7492,0.6792,0.7492,0.5614,0.6418,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7508,0.5835,0.7508,0.5636,0.6439,0.0,0.0
1,0.7508,0.61,0.7508,0.5636,0.6439,0.0,0.0
2,0.7508,0.5542,0.7508,0.5636,0.6439,0.0,0.0
3,0.7508,0.5637,0.7508,0.5636,0.6439,0.0,0.0
4,0.7508,0.6142,0.7508,0.5636,0.6439,0.0,0.0
5,0.7492,0.5869,0.7492,0.5614,0.6418,0.0,0.0
6,0.7492,0.5719,0.7492,0.5614,0.6418,0.0,0.0
7,0.7492,0.5712,0.7492,0.5614,0.6418,0.0,0.0
8,0.7492,0.5483,0.7492,0.5614,0.6418,0.0,0.0
9,0.7492,0.6184,0.7492,0.5614,0.6418,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7538,0.6124,0.7538,0.707,0.6774,0.0797,0.1315
1,0.769,0.6462,0.769,0.7479,0.7083,0.1651,0.2322
2,0.7538,0.6485,0.7538,0.7079,0.6927,0.1199,0.1622
3,0.7584,0.6462,0.7584,0.7224,0.6864,0.1044,0.1642
4,0.7523,0.6244,0.7523,0.7039,0.6899,0.112,0.1529
5,0.7477,0.6244,0.7477,0.6971,0.6893,0.1143,0.1488
6,0.769,0.6652,0.769,0.7424,0.72,0.2018,0.2519
7,0.7644,0.6676,0.7644,0.7327,0.7181,0.1965,0.2382
8,0.7629,0.6345,0.7629,0.7378,0.6961,0.1355,0.2023
9,0.7568,0.6739,0.7568,0.7257,0.6782,0.0882,0.1543


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7568,0.6277,0.7568,0.7159,0.6893,0.1114,0.1637
1,0.766,0.6536,0.766,0.7369,0.7095,0.1679,0.2232
2,0.7523,0.6579,0.7523,0.703,0.6863,0.1022,0.1452
3,0.7599,0.6394,0.7599,0.7269,0.6894,0.1126,0.1744
4,0.7553,0.6309,0.7553,0.7113,0.6883,0.1083,0.1573
5,0.7477,0.6303,0.7477,0.6955,0.6859,0.1048,0.141
6,0.7736,0.6662,0.7736,0.7531,0.7234,0.2113,0.2692
7,0.766,0.6538,0.766,0.7372,0.7133,0.1827,0.2344
8,0.7599,0.6439,0.7599,0.7258,0.6976,0.1387,0.1936
9,0.7568,0.6741,0.7568,0.7192,0.6883,0.1133,0.1692


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7538,0.6216,0.7538,0.7086,0.6961,0.1293,0.1692
1,0.7416,0.656,0.7416,0.7058,0.713,0.1871,0.2001
2,0.7538,0.6496,0.7538,0.7149,0.7138,0.1822,0.2083
3,0.7629,0.6481,0.7629,0.7293,0.7074,0.1616,0.2114
4,0.7477,0.6278,0.7477,0.7009,0.6997,0.1403,0.1678
5,0.7401,0.6234,0.7401,0.6869,0.689,0.1136,0.1377
6,0.7508,0.6676,0.7508,0.7159,0.7191,0.2053,0.2237
7,0.7553,0.6697,0.7553,0.7214,0.7226,0.2142,0.2354
8,0.7508,0.6357,0.7508,0.7104,0.7094,0.1734,0.1994
9,0.7477,0.6753,0.7477,0.6963,0.6876,0.1096,0.145


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7538,0.5936,0.7538,0.7069,0.6854,0.1002,0.1474
1,0.7553,0.6563,0.7553,0.7113,0.6901,0.1132,0.1609
2,0.7523,0.5966,0.7523,0.7018,0.6805,0.0871,0.133
3,0.7568,0.6202,0.7568,0.7214,0.6772,0.0806,0.1436
4,0.7538,0.6196,0.7538,0.7069,0.6854,0.1002,0.1474
5,0.7477,0.6115,0.7477,0.6875,0.6683,0.0593,0.0999
6,0.7477,0.5981,0.7477,0.6947,0.6841,0.1,0.1369
7,0.7568,0.6015,0.7568,0.7237,0.6803,0.0933,0.1572
8,0.7644,0.6148,0.7644,0.7434,0.6971,0.1387,0.2097
9,0.7568,0.6159,0.7568,0.7222,0.6824,0.0984,0.1601


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7599,0.5845,0.7599,0.7576,0.6722,0.0707,0.1578
1,0.7492,0.6222,0.7492,0.6891,0.6704,0.06,0.1008
2,0.7568,0.5974,0.7568,0.7303,0.6704,0.0644,0.1342
3,0.7477,0.5921,0.7477,0.6785,0.6628,0.0406,0.0757
4,0.7599,0.5945,0.7599,0.7432,0.6768,0.0815,0.1599
5,0.7584,0.5758,0.7584,0.7309,0.6813,0.0964,0.1655
6,0.7675,0.6147,0.7675,0.7749,0.6913,0.1256,0.2244
7,0.7568,0.5847,0.7568,0.7237,0.6803,0.0933,0.1572
8,0.7629,0.5904,0.7629,0.7519,0.6863,0.1109,0.1942
9,0.7508,0.6145,0.7508,0.6995,0.6723,0.0706,0.119


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7508,0.5,0.7508,0.5636,0.6439,0.0,0.0
1,0.7508,0.5,0.7508,0.5636,0.6439,0.0,0.0
2,0.7508,0.5,0.7508,0.5636,0.6439,0.0,0.0
3,0.7508,0.5,0.7508,0.5636,0.6439,0.0,0.0
4,0.7508,0.5,0.7508,0.5636,0.6439,0.0,0.0
5,0.7492,0.5,0.7492,0.5614,0.6418,0.0,0.0
6,0.7492,0.5,0.7492,0.5614,0.6418,0.0,0.0
7,0.7492,0.5,0.7492,0.5614,0.6418,0.0,0.0
8,0.7492,0.5,0.7492,0.5614,0.6418,0.0,0.0
9,0.7492,0.5,0.7492,0.5614,0.6418,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [7]:
tuned_model

[LGBMClassifier(bagging_fraction=1.0, bagging_freq=2, boosting_type='gbdt',
                class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
                importance_type='split', learning_rate=0.1, max_depth=-1,
                min_child_samples=86, min_child_weight=0.001, min_split_gain=0.1,
                n_estimators=130, n_jobs=-1, num_leaves=6, objective=None,
                random_state=333, reg_alpha=1e-07, reg_lambda=0.005,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.2, loss='log_loss', max_depth=2,
                            max_features=1.0, max_leaf_nodes=None,
                            min_impurity_decrease=0.3, min_samples_leaf=4,
                            min_samples_split=4, min_weight_fraction_leaf=0.0,
                            n_estimators=170, n_iter_no_change=None,
                      

In [8]:
blended_hard = blend_models(estimator_list=tuned_model[:8],
                       fold=10,
                       method="hard",
                       optimize="F1")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7584,0.0,0.7584,0.7207,0.6903,0.1144,0.1704
1,0.772,0.0,0.772,0.75,0.7186,0.1938,0.2536
2,0.7553,0.0,0.7553,0.7115,0.6937,0.1229,0.1678
3,0.7644,0.0,0.7644,0.741,0.6962,0.1319,0.2015
4,0.7523,0.0,0.7523,0.7044,0.6917,0.1168,0.1567
5,0.7492,0.0,0.7492,0.7001,0.6904,0.1173,0.1539
6,0.7705,0.0,0.7705,0.7446,0.724,0.2132,0.2614
7,0.769,0.0,0.769,0.7415,0.7228,0.21,0.256
8,0.766,0.0,0.766,0.7418,0.7053,0.1605,0.2241
9,0.7584,0.0,0.7584,0.7243,0.6893,0.1164,0.1761


In [9]:
final_model = finalize_model(blended_hard)
evaluate_model(final_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
final_model

In [10]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "pca_test.csv"))

train_columns = train_data.columns
#test_data = test_data[train_columns]

df_test_x = test_data#.drop(columns=['target'])

In [11]:
test_pred = predict_model(final_model, data=test_data)
test_pred



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,prediction_label
0,Dam dispenser #2,AJX75334501,3J1XF767-1,AJX75334501,3J1XF767-1,OK,Fill1 dispenser #2,AJX75334501,3J1XF767-1,Fill2 dispenser #2,...,0.455273,0.269325,-1.238153,0.492180,6.959485,-5.926402,2.009640,-1.824390,-2.626029,Normal
1,Dam dispenser #2,AJX75334501,4B1XD472-2,AJX75334501,4B1XD472-2,OK,Fill1 dispenser #2,AJX75334501,4B1XD472-2,Fill2 dispenser #2,...,9.639068,20.259314,3.928456,-4.542729,-22.891991,-21.994287,0.685102,10.901917,14.508316,Normal
2,Dam dispenser #1,AJX75334501,3H1XE355-1,AJX75334501,3H1XE355-1,OK,Fill1 dispenser #1,AJX75334501,3H1XE355-1,Fill2 dispenser #1,...,-0.000310,-0.845370,-2.121011,1.438319,12.259931,-8.430806,4.413903,-2.896132,-13.369293,Normal
3,Dam dispenser #2,AJX75334501,3L1XA128-1,AJX75334501,3L1XA128-1,OK,Fill1 dispenser #2,AJX75334501,3L1XA128-1,Fill2 dispenser #2,...,7.138623,-20.968945,-10.638637,5.445040,19.410360,18.703112,-12.371858,2.363786,23.805340,Normal
4,Dam dispenser #1,AJX75334501,4A1XA639-1,AJX75334501,4A1XA639-1,OK,Fill1 dispenser #1,AJX75334501,4A1XA639-1,Fill2 dispenser #1,...,1.432299,-10.076963,-4.993634,1.617880,9.122725,8.559978,-4.423807,-1.227007,2.612746,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,Dam dispenser #2,AJX75334501,3K1XB597-1,AJX75334501,3K1XB597-1,OK,Fill1 dispenser #2,AJX75334501,3K1XB597-1,Fill2 dispenser #2,...,4.206943,-23.325636,-22.888144,6.355010,23.373249,23.745441,-1.180442,-9.862477,-8.717693,Normal
17357,Dam dispenser #2,AJX75334501,4A1XB974-1,AJX75334501,4A1XB974-1,OK,Fill1 dispenser #2,AJX75334501,4A1XB974-1,Fill2 dispenser #2,...,-0.705675,9.064880,5.279408,-1.777022,-8.853498,-7.986886,4.092596,1.211678,-2.399325,Normal
17358,Dam dispenser #1,AJX75334501,3L1XA998-1,AJX75334501,3L1XA998-1,OK,Fill1 dispenser #1,AJX75334501,3L1XA998-1,Fill2 dispenser #1,...,11.462799,-2.212937,-18.890753,2.500339,-0.450027,2.360564,0.850209,-1.942946,-4.352603,Normal
17359,Dam dispenser #1,AJX75334501,3F1XC376-1,AJX75334501,3F1XC376-1,OK,Fill1 dispenser #1,AJX75334501,3F1XC376-1,Fill2 dispenser #1,...,2.285775,0.859113,-2.722195,-0.715221,-22.827568,21.756470,-1.748279,-0.944664,-2.326972,Normal


In [12]:
df_sub = pd.read_csv("../data/submission.csv")
df_sub["target"] = test_pred["prediction_label"]
df_sub.to_csv("../data/automl/submission-12.csv", index=False)