In [1]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split

import pandas as pd
import os

In [15]:
ROOT_DIR = "../data"
RANDOM_STATE = 200

train_data = pd.read_csv(os.path.join(ROOT_DIR, "pca_train.csv"))
train_data.rename(columns={'12': 'target'}, inplace=True)

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

normal_ratio = 1.0

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)

## under sampling
df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

train_data = df_concat

# df_train, df_val = train_test_split(
#     df_concat,
#     test_size=0.3,
#     stratify=df_concat["target"],
#     random_state=RANDOM_STATE,
# )

In [16]:
setup_clf = setup(data=train_data, target="target", train_size=0.7, session_id=333, fold_shuffle=True)

Unnamed: 0,Description,Value
0,Session id,333
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(4700, 37)"
5,Transformed data shape,"(4700, 61)"
6,Transformed train set shape,"(3290, 61)"
7,Transformed test set shape,"(1410, 61)"
8,Numeric features,24
9,Categorical features,12


In [17]:
model = compare_models(sort='F1', fold=10, n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.6036,0.6318,0.6036,0.6058,0.602,0.2073,0.2094,0.024
ridge,Ridge Classifier,0.603,0.6328,0.603,0.6052,0.6013,0.2061,0.2082,0.023
lightgbm,Light Gradient Boosting Machine,0.5954,0.6286,0.5954,0.5955,0.5953,0.1909,0.1909,0.3
ada,Ada Boost Classifier,0.593,0.6262,0.593,0.5945,0.5911,0.1859,0.1874,0.106
gbc,Gradient Boosting Classifier,0.5909,0.6273,0.5909,0.5911,0.5907,0.1817,0.182,0.427
rf,Random Forest Classifier,0.59,0.6242,0.59,0.5902,0.5896,0.1799,0.1801,0.135
et,Extra Trees Classifier,0.5863,0.6204,0.5863,0.5865,0.5861,0.1727,0.1728,0.077
knn,K Neighbors Classifier,0.5733,0.6115,0.5733,0.5734,0.573,0.1465,0.1466,0.025
dt,Decision Tree Classifier,0.5672,0.5672,0.5672,0.5673,0.5669,0.1343,0.1345,0.039
lr,Logistic Regression,0.5641,0.5925,0.5641,0.5651,0.562,0.1282,0.1292,0.233


In [18]:
tuned_model = [tune_model(i) for i in model]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5532,0.543,0.5532,0.556,0.5467,0.1057,0.1088
1,0.5897,0.6131,0.5897,0.5996,0.5785,0.1785,0.1886
2,0.5593,0.6091,0.5593,0.5618,0.5541,0.118,0.1207
3,0.535,0.5445,0.535,0.5395,0.5192,0.0689,0.0738
4,0.5745,0.5895,0.5745,0.581,0.565,0.1482,0.1549
5,0.5623,0.5596,0.5623,0.567,0.5552,0.1253,0.1296
6,0.5471,0.5633,0.5471,0.5517,0.5379,0.095,0.0991
7,0.5866,0.5615,0.5866,0.5918,0.5812,0.1738,0.1786
8,0.535,0.5631,0.535,0.5374,0.5285,0.0706,0.0726
9,0.6109,0.6362,0.6109,0.6168,0.6064,0.2224,0.2279


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5745,0.5995,0.5745,0.5748,0.5739,0.1487,0.1491
1,0.6413,0.6698,0.6413,0.6474,0.6373,0.2822,0.2885
2,0.5714,0.6061,0.5714,0.5714,0.5714,0.1428,0.1428
3,0.6079,0.6583,0.6079,0.6082,0.6076,0.2157,0.216
4,0.6261,0.6388,0.6261,0.6306,0.6227,0.2518,0.2564
5,0.6018,0.6172,0.6018,0.603,0.6008,0.2039,0.205
6,0.5228,0.5407,0.5228,0.5229,0.5226,0.0457,0.0457
7,0.6717,0.7269,0.6717,0.6763,0.6697,0.3438,0.3481
8,0.5714,0.6006,0.5714,0.5736,0.5686,0.1433,0.1452
9,0.6596,0.7156,0.6596,0.6602,0.6593,0.3193,0.3199


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5562,0.5897,0.5562,0.5566,0.5557,0.1127,0.113
1,0.6201,0.6488,0.6201,0.6201,0.6201,0.2401,0.2401
2,0.5866,0.609,0.5866,0.5874,0.5859,0.1735,0.1741
3,0.6109,0.6562,0.6109,0.611,0.6109,0.2219,0.2219
4,0.6353,0.642,0.6353,0.6353,0.6352,0.2705,0.2705
5,0.5684,0.5967,0.5684,0.5684,0.5682,0.1367,0.1368
6,0.5015,0.5312,0.5015,0.5015,0.5015,0.003,0.003
7,0.6596,0.7291,0.6596,0.6598,0.6595,0.3192,0.3194
8,0.5775,0.6046,0.5775,0.5775,0.5774,0.1549,0.155
9,0.6535,0.6999,0.6535,0.6545,0.6528,0.3068,0.3079


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5805,0.5973,0.5805,0.581,0.5797,0.1609,0.1615
1,0.6383,0.6533,0.6383,0.6425,0.6354,0.2762,0.2806
2,0.5775,0.6035,0.5775,0.5778,0.577,0.1548,0.1552
3,0.6079,0.6498,0.6079,0.6087,0.607,0.2156,0.2165
4,0.614,0.64,0.614,0.6169,0.6113,0.2276,0.2307
5,0.614,0.6067,0.614,0.6153,0.613,0.2282,0.2294
6,0.5198,0.5331,0.5198,0.52,0.5192,0.0397,0.0398
7,0.6505,0.7198,0.6505,0.657,0.647,0.3013,0.3076
8,0.5836,0.5913,0.5836,0.5874,0.5795,0.1677,0.1711
9,0.6474,0.7014,0.6474,0.6493,0.6464,0.2951,0.2968


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5623,0.5992,0.5623,0.5627,0.5618,0.1248,0.1251
1,0.6261,0.6474,0.6261,0.6262,0.6261,0.2523,0.2523
2,0.5684,0.6174,0.5684,0.5704,0.5658,0.1372,0.1389
3,0.5897,0.6573,0.5897,0.5897,0.5896,0.1794,0.1794
4,0.6261,0.6478,0.6261,0.6262,0.6261,0.2522,0.2523
5,0.5957,0.6113,0.5957,0.5957,0.5957,0.1915,0.1915
6,0.5198,0.5393,0.5198,0.5198,0.5188,0.0393,0.0394
7,0.6657,0.729,0.6657,0.6661,0.6655,0.3314,0.3318
8,0.5775,0.6086,0.5775,0.5778,0.5772,0.1551,0.1554
9,0.6505,0.7015,0.6505,0.6505,0.6504,0.3009,0.3009


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [19]:
tuned_model

[LinearDiscriminantAnalysis(covariance_estimator=None, n_components=None,
                            priors=None, shrinkage=None, solver='svd',
                            store_covariance=False, tol=0.0001),
 RidgeClassifier(alpha=7.46, class_weight=None, copy_X=True, fit_intercept=False,
                 max_iter=None, positive=False, random_state=333, solver='auto',
                 tol=0.0001),
 LGBMClassifier(bagging_fraction=1.0, bagging_freq=2, boosting_type='gbdt',
                class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
                importance_type='split', learning_rate=0.1, max_depth=-1,
                min_child_samples=86, min_child_weight=0.001, min_split_gain=0.1,
                n_estimators=130, n_jobs=-1, num_leaves=6, objective=None,
                random_state=333, reg_alpha=1e-07, reg_lambda=0.005,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 AdaBoostClassifier(algorithm='SAMME', estimator=None, learning_ra

In [20]:
blended_soft = blend_models(estimator_list=tuned_model[2:],
                       fold=10,
                       method="hard",
                       optimize="F1")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5623,0.0,0.5623,0.5624,0.5623,0.1247,0.1247
1,0.6353,0.0,0.6353,0.6354,0.6351,0.2704,0.2706
2,0.5897,0.0,0.5897,0.5903,0.5891,0.1795,0.1801
3,0.6018,0.0,0.6018,0.602,0.6016,0.2035,0.2037
4,0.6322,0.0,0.6322,0.6326,0.6319,0.2643,0.2647
5,0.5988,0.0,0.5988,0.5988,0.5988,0.1976,0.1976
6,0.5106,0.0,0.5106,0.5106,0.5105,0.0212,0.0212
7,0.6444,0.0,0.6444,0.6465,0.6432,0.289,0.2909
8,0.5836,0.0,0.5836,0.584,0.5832,0.1673,0.1677
9,0.6565,0.0,0.6565,0.6565,0.6565,0.3131,0.3131


In [21]:
final_model = finalize_model(blended_soft)
evaluate_model(final_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [25]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "pca_test.csv"))

train_columns = train_data.columns
test_data = test_data[train_columns]

df_test_x = test_data#.drop(columns=['target'])

KeyError: "['target'] not in index"

In [26]:
test_pred = predict_model(final_model, data=test_data)
test_pred



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,prediction_label
0,Dam dispenser #2,AJX75334501,3J1XF767-1,AJX75334501,3J1XF767-1,OK,Fill1 dispenser #2,AJX75334501,3J1XF767-1,Fill2 dispenser #2,...,0.455273,0.269325,-1.238153,0.492180,6.959485,-5.926402,2.009640,-1.824390,-2.626029,AbNormal
1,Dam dispenser #2,AJX75334501,4B1XD472-2,AJX75334501,4B1XD472-2,OK,Fill1 dispenser #2,AJX75334501,4B1XD472-2,Fill2 dispenser #2,...,9.639068,20.259314,3.928456,-4.542729,-22.891991,-21.994287,0.685102,10.901917,14.508316,Normal
2,Dam dispenser #1,AJX75334501,3H1XE355-1,AJX75334501,3H1XE355-1,OK,Fill1 dispenser #1,AJX75334501,3H1XE355-1,Fill2 dispenser #1,...,-0.000310,-0.845370,-2.121011,1.438319,12.259931,-8.430806,4.413903,-2.896132,-13.369293,AbNormal
3,Dam dispenser #2,AJX75334501,3L1XA128-1,AJX75334501,3L1XA128-1,OK,Fill1 dispenser #2,AJX75334501,3L1XA128-1,Fill2 dispenser #2,...,7.138623,-20.968945,-10.638637,5.445040,19.410360,18.703112,-12.371858,2.363786,23.805340,Normal
4,Dam dispenser #1,AJX75334501,4A1XA639-1,AJX75334501,4A1XA639-1,OK,Fill1 dispenser #1,AJX75334501,4A1XA639-1,Fill2 dispenser #1,...,1.432299,-10.076963,-4.993634,1.617880,9.122725,8.559978,-4.423807,-1.227007,2.612746,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,Dam dispenser #2,AJX75334501,3K1XB597-1,AJX75334501,3K1XB597-1,OK,Fill1 dispenser #2,AJX75334501,3K1XB597-1,Fill2 dispenser #2,...,4.206943,-23.325636,-22.888144,6.355010,23.373249,23.745441,-1.180442,-9.862477,-8.717693,AbNormal
17357,Dam dispenser #2,AJX75334501,4A1XB974-1,AJX75334501,4A1XB974-1,OK,Fill1 dispenser #2,AJX75334501,4A1XB974-1,Fill2 dispenser #2,...,-0.705675,9.064880,5.279408,-1.777022,-8.853498,-7.986886,4.092596,1.211678,-2.399325,Normal
17358,Dam dispenser #1,AJX75334501,3L1XA998-1,AJX75334501,3L1XA998-1,OK,Fill1 dispenser #1,AJX75334501,3L1XA998-1,Fill2 dispenser #1,...,11.462799,-2.212937,-18.890753,2.500339,-0.450027,2.360564,0.850209,-1.942946,-4.352603,AbNormal
17359,Dam dispenser #1,AJX75334501,3F1XC376-1,AJX75334501,3F1XC376-1,OK,Fill1 dispenser #1,AJX75334501,3F1XC376-1,Fill2 dispenser #1,...,2.285775,0.859113,-2.722195,-0.715221,-22.827568,21.756470,-1.748279,-0.944664,-2.326972,AbNormal


In [27]:
df_sub = pd.read_csv("../data/submission.csv")
df_sub["target"] = test_pred["prediction_label"]
df_sub.to_csv("../data/automl/submission-8.csv", index=False)