In [16]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split

import pandas as pd
import os

In [27]:
# --- Configuration ------------------------------------------------------------
ROOT_DIR = "../data"
RANDOM_STATE = 42

# Number of "Normal" rows kept per "AbNormal" row after undersampling.
normal_ratio = 1.5

# --- Load the preprocessed training set ---------------------------------------
train_data = pd.read_csv(os.path.join(ROOT_DIR, "preprocessing_train.csv"))

# Split by class label so the "Normal" rows can be undersampled on their own.
df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)

# Fail early with a readable message instead of handing an empty or
# single-class frame to the modelling cells further down.
if num_normal == 0 or num_abnormal == 0:
    raise ValueError(
        "Expected both 'Normal' and 'AbNormal' rows in the 'target' column, "
        f"got {num_normal} Normal and {num_abnormal} AbNormal rows."
    )

# --- Undersample the "Normal" class -------------------------------------------
# sample(replace=False) raises when n exceeds the number of available rows, so
# the requested size is capped at the number of "Normal" rows actually present.
num_normal_sampled = min(int(num_abnormal * normal_ratio), num_normal)
df_normal = df_normal.sample(
    n=num_normal_sampled, replace=False, random_state=RANDOM_STATE
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

# Later cells read `train_data`, so point it at the rebalanced frame.
train_data = df_concat

In [28]:
# Initialise the PyCaret classification experiment on the undersampled frame.
# 70% of the rows go to the training split; the remainder is held out as the
# test split reported in the setup summary.
# NOTE(review): session_id is a separate seed (333) from RANDOM_STATE (42) used
# for the undersampling step - confirm this is intentional.
setup_clf = setup(
    data=train_data,
    target="target",
    train_size=0.7,
    session_id=333,
    fold_shuffle=True,
)

Unnamed: 0,Description,Value
0,Session id,333
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(5875, 146)"
5,Transformed data shape,"(5875, 170)"
6,Transformed train set shape,"(4112, 170)"
7,Transformed test set shape,"(1763, 170)"
8,Numeric features,133
9,Categorical features,12


In [29]:
# Cross-validate the candidate classifiers with 10 folds, rank them by F1 and
# keep the two best. Because n_select=2, `model` is a list of two estimators
# (the next cell iterates over it), not a single model.
model = compare_models(
    sort='F1',
    fold=10,
    n_select=2,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6413,0.6478,0.6413,0.6336,0.6341,0.2311,0.2339,0.434
rf,Random Forest Classifier,0.6367,0.6495,0.6367,0.6315,0.6321,0.2289,0.2306,0.138
ridge,Ridge Classifier,0.6452,0.6475,0.6452,0.6353,0.632,0.2262,0.2331,0.044
lda,Linear Discriminant Analysis,0.6425,0.647,0.6425,0.6325,0.6299,0.2216,0.2279,0.05
lightgbm,Light Gradient Boosting Machine,0.6342,0.647,0.6342,0.6281,0.6294,0.2223,0.2237,0.316
et,Extra Trees Classifier,0.6325,0.6524,0.6325,0.6286,0.629,0.2233,0.2247,0.12
ada,Ada Boost Classifier,0.6308,0.6462,0.6308,0.6269,0.6282,0.2219,0.2223,0.134
dt,Decision Tree Classifier,0.589,0.5749,0.589,0.5917,0.5899,0.1488,0.149,0.055
knn,K Neighbors Classifier,0.5968,0.5906,0.5968,0.5864,0.5881,0.1349,0.1366,0.051
nb,Naive Bayes,0.5827,0.6145,0.5827,0.5906,0.5853,0.1456,0.1464,0.044


In [30]:
# Tune the hyperparameters of each shortlisted estimator.
# The candidates were ranked by F1 in compare_models and the ensemble below is
# also configured around F1, so the search is scored on F1 as well instead of
# tune_model's default metric (Accuracy).
tuned_model = [tune_model(candidate, optimize="F1") for candidate in model]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6456,0.6573,0.6456,0.6362,0.6352,0.2327,0.2375
1,0.6675,0.6818,0.6675,0.6599,0.6581,0.2807,0.2862
2,0.6399,0.6462,0.6399,0.6304,0.6307,0.2221,0.2257
3,0.6204,0.6349,0.6204,0.6117,0.6136,0.1869,0.1887
4,0.6131,0.5877,0.6131,0.6038,0.6058,0.1704,0.1722
5,0.6521,0.68,0.6521,0.645,0.6461,0.2555,0.2577
6,0.6302,0.6572,0.6302,0.628,0.6289,0.2241,0.2242
7,0.6667,0.6469,0.6667,0.6599,0.6599,0.2857,0.289
8,0.6545,0.6686,0.6545,0.6456,0.6429,0.2496,0.256
9,0.6229,0.6446,0.6229,0.6138,0.6152,0.1919,0.1941


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6456,0.6591,0.6456,0.6351,0.6312,0.2246,0.2323
1,0.6626,0.6789,0.6626,0.6551,0.6544,0.2733,0.2774
2,0.6423,0.6404,0.6423,0.6319,0.6305,0.2216,0.227
3,0.618,0.6154,0.618,0.6067,0.6078,0.1739,0.1769
4,0.6131,0.5791,0.6131,0.5994,0.5994,0.1562,0.1604
5,0.6423,0.6688,0.6423,0.6325,0.6321,0.2249,0.2293
6,0.6277,0.6383,0.6277,0.6191,0.6207,0.2017,0.2038
7,0.6545,0.6338,0.6545,0.6462,0.6451,0.2543,0.259
8,0.6642,0.6703,0.6642,0.6563,0.6497,0.2646,0.2746
9,0.6156,0.6292,0.6156,0.6033,0.6035,0.1668,0.1707


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [33]:
# Combine the tuned estimators into a single voting ensemble.
# NOTE(review): the variable is called `blended_soft` but the ensemble is built
# with method="hard" - confirm which voting scheme is intended. The name is
# kept as-is because a later cell passes it to finalize_model.
blended_soft = blend_models(
    estimator_list=tuned_model,
    method="hard",
    optimize="F1",
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6578,0.0,0.6578,0.6502,0.6501,0.2644,0.2679
1,0.665,0.0,0.665,0.6593,0.6602,0.2867,0.2886
2,0.6375,0.0,0.6375,0.6308,0.6325,0.2275,0.2289
3,0.6204,0.0,0.6204,0.6117,0.6136,0.1869,0.1887
4,0.5961,0.0,0.5961,0.587,0.5895,0.1366,0.1377
5,0.6545,0.0,0.6545,0.6478,0.6489,0.2615,0.2635
6,0.6375,0.0,0.6375,0.635,0.636,0.2387,0.2388
7,0.6569,0.0,0.6569,0.652,0.6533,0.2738,0.2749
8,0.6569,0.0,0.6569,0.6485,0.6465,0.2572,0.2628
9,0.6204,0.0,0.6204,0.6143,0.6162,0.1958,0.1966


In [32]:
# Display the list of tuned estimators together with their hyperparameters.
tuned_model

[GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='log_loss', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_samples_leaf=1,
                            min_samples_split=2, min_weight_fraction_leaf=0.0,
                            n_estimators=100, n_iter_no_change=None,
                            random_state=333, subsample=1.0, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={},
                        criterion='entropy', max_depth=9, max_features=1.0,
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.002, min_samples_leaf=2,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
        

In [34]:
# Produce the final version of the blended ensemble used for test predictions.
# NOTE(review): finalize_model presumably refits on the full dataset including
# the hold-out split - confirm against the PyCaret documentation.
final_model = finalize_model(blended_soft)
# Render the interactive evaluation widget (plot-type selector) for the model.
evaluate_model(final_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [35]:
# Column layout of the training frame; the test frame is aligned to it so both
# share the same columns in the same order.
train_columns = train_data.columns

# Read the preprocessed test set and keep only the training columns.
test_csv_path = os.path.join(ROOT_DIR, "preprocessing_test.csv")
test_data = pd.read_csv(test_csv_path).loc[:, train_columns]

# Feature matrix for prediction: everything except the label column.
df_test_x = test_data.drop(columns="target")

In [36]:
# Score the test features with the finalized ensemble. The returned frame holds
# the input columns plus a `prediction_label` column with the predicted class.
test_pred = predict_model(final_model, data=df_test_x)
# Display the prediction frame as the cell output.
test_pred

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,prediction_label
0,Dam dispenser #2,AJX75334501,3J1XF767-1,1000.0,12.5,90,70,280,90,10,...,91.800003,270.000000,50,85,19.799999,13.000000,195,1,0,AbNormal
1,Dam dispenser #2,AJX75334501,4B1XD472-2,1000.0,12.5,90,70,280,90,16,...,50.000000,91.800003,270,50,85.000000,19.799999,14,256,1,Normal
2,Dam dispenser #1,AJX75334501,3H1XE355-1,240.0,2.5,-90,70,1030,-90,10,...,91.800003,270.000000,50,85,19.700001,1.000000,98,1,0,AbNormal
3,Dam dispenser #2,AJX75334501,3L1XA128-1,1000.0,12.5,90,70,280,90,10,...,50.000000,91.800003,270,50,85.000000,20.000000,14,0,1,Normal
4,Dam dispenser #1,AJX75334501,4A1XA639-1,240.0,2.5,-90,70,1030,-90,16,...,50.000000,91.800003,270,50,85.000000,19.799999,1,215,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,Dam dispenser #2,AJX75334501,3K1XB597-1,1000.0,12.5,90,70,280,90,10,...,50.000000,91.800003,270,50,85.000000,19.500000,14,131,1,Normal
17357,Dam dispenser #2,AJX75334501,4A1XB974-1,1000.0,12.5,90,70,280,90,16,...,50.000000,91.800003,270,50,85.000000,19.799999,12,279,1,Normal
17358,Dam dispenser #1,AJX75334501,3L1XA998-1,240.0,2.5,-90,70,1030,-90,16,...,50.000000,91.800003,270,50,85.000000,20.500000,4,66,1,AbNormal
17359,Dam dispenser #1,AJX75334501,3F1XC376-1,240.0,2.5,-90,70,1030,-90,10,...,91.800003,270.000000,50,85,18.900000,1.000000,117,1,0,AbNormal


In [37]:
# Load the submission template and fill in the predicted labels.
df_sub = pd.read_csv(os.path.join(ROOT_DIR, "submission.csv"))

# The template and the predictions must describe the same rows; a mismatch
# would otherwise yield a silently corrupted submission.
if len(df_sub) != len(test_pred):
    raise ValueError(
        f"submission template has {len(df_sub)} rows but there are "
        f"{len(test_pred)} predictions"
    )

# Assign by position rather than by index label, so differing indexes between
# the two frames cannot introduce NaN targets.
df_sub["target"] = test_pred["prediction_label"].to_numpy()

# Make sure the output folder exists before writing the submission file.
submission_dir = os.path.join(ROOT_DIR, "automl")
os.makedirs(submission_dir, exist_ok=True)
df_sub.to_csv(os.path.join(submission_dir, "submission-22.csv"), index=False)