In [1]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split

import pandas as pd
import os

In [2]:
ROOT_DIR = "../data"
RANDOM_STATE = 200

train_data = pd.read_csv(os.path.join(ROOT_DIR, "preprocessing_train.csv"))

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

normal_ratio = 1.0

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)

## under sampling
df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

train_data = df_concat

# df_train, df_val = train_test_split(
#     df_concat,
#     test_size=0.3,
#     stratify=df_concat["target"],
#     random_state=RANDOM_STATE,
# )

In [3]:
setup_clf = setup(data=train_data, target="target", train_size=0.7, session_id=333, fold_shuffle=True)

Unnamed: 0,Description,Value
0,Session id,333
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(4700, 146)"
5,Transformed data shape,"(4700, 170)"
6,Transformed train set shape,"(3290, 170)"
7,Transformed test set shape,"(1410, 170)"
8,Numeric features,133
9,Categorical features,12


In [4]:
model = compare_models(sort='F1', fold=10, n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.6021,0.6393,0.6021,0.6024,0.6018,0.2043,0.2045,0.109
ridge,Ridge Classifier,0.6006,0.6302,0.6006,0.602,0.5995,0.2013,0.2026,0.033
lightgbm,Light Gradient Boosting Machine,0.5982,0.628,0.5982,0.5984,0.5979,0.1963,0.1966,0.289
et,Extra Trees Classifier,0.5967,0.6349,0.5967,0.5968,0.5965,0.1933,0.1934,0.099
gbc,Gradient Boosting Classifier,0.596,0.6262,0.596,0.5964,0.5958,0.1921,0.1924,0.34
lda,Linear Discriminant Analysis,0.5948,0.6284,0.5948,0.596,0.5939,0.1897,0.1909,0.04
ada,Ada Boost Classifier,0.5875,0.6185,0.5875,0.5882,0.5863,0.175,0.1757,0.113
dt,Decision Tree Classifier,0.5702,0.5702,0.5702,0.5705,0.5697,0.1403,0.1406,0.046
nb,Naive Bayes,0.5675,0.6002,0.5675,0.5679,0.5668,0.1349,0.1353,0.033
knn,K Neighbors Classifier,0.5653,0.603,0.5653,0.5656,0.565,0.1307,0.1309,0.042


In [7]:
tuned_model = [tune_model(i) for i in model[:2]]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5775,0.6016,0.5775,0.5787,0.5762,0.1553,0.1563
1,0.6201,0.6643,0.6201,0.6202,0.62,0.2402,0.2403
2,0.6049,0.6243,0.6049,0.605,0.6048,0.2098,0.2099
3,0.6079,0.656,0.6079,0.6079,0.6079,0.2158,0.2158
4,0.6353,0.673,0.6353,0.6358,0.635,0.2706,0.2711
5,0.5623,0.6212,0.5623,0.5629,0.5609,0.1243,0.1251
6,0.5289,0.5623,0.5289,0.5297,0.5245,0.0572,0.0582
7,0.6748,0.7347,0.6748,0.6749,0.6747,0.3495,0.3496
8,0.5805,0.618,0.5805,0.5807,0.5803,0.161,0.1612
9,0.6535,0.716,0.6535,0.6601,0.6496,0.3065,0.3133


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5836,0.5931,0.5836,0.584,0.5829,0.167,0.1675
1,0.6474,0.6621,0.6474,0.6523,0.6444,0.2944,0.2995
2,0.5714,0.607,0.5714,0.5715,0.5714,0.1429,0.1429
3,0.6109,0.6632,0.6109,0.6111,0.6107,0.2218,0.222
4,0.6383,0.646,0.6383,0.6409,0.6364,0.2763,0.279
5,0.6109,0.618,0.6109,0.6119,0.6103,0.2221,0.2229
6,0.5046,0.5425,0.5046,0.5046,0.5046,0.0091,0.0091
7,0.6717,0.7288,0.6717,0.675,0.6703,0.3437,0.3468
8,0.5745,0.5949,0.5745,0.5766,0.5719,0.1493,0.1512
9,0.6383,0.7085,0.6383,0.6393,0.6378,0.2768,0.2776


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [8]:
tuned_model

[RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                        class_weight='balanced_subsample', criterion='entropy',
                        max_depth=7, max_features='log2', max_leaf_nodes=None,
                        max_samples=None, min_impurity_decrease=0,
                        min_samples_leaf=3, min_samples_split=7,
                        min_weight_fraction_leaf=0.0, monotonic_cst=None,
                        n_estimators=140, n_jobs=-1, oob_score=False,
                        random_state=333, verbose=0, warm_start=False),
 RidgeClassifier(alpha=9.86, class_weight=None, copy_X=True, fit_intercept=False,
                 max_iter=None, positive=False, random_state=333, solver='auto',
                 tol=0.0001)]

In [8]:
#a = [tuned_model[0], tuned_model[], tuned_model[3], tuned_model[4]]

tuned_model.pop(1)

In [9]:
blended_soft = blend_models(estimator_list=tuned_model,
                       fold=10,
                       method="hard",
                       optimize="F1")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5866,0.0,0.5866,0.5872,0.5861,0.1734,0.1739
1,0.6292,0.0,0.6292,0.6292,0.6291,0.2583,0.2584
2,0.5866,0.0,0.5866,0.5878,0.5855,0.1735,0.1746
3,0.6079,0.0,0.6079,0.608,0.6078,0.2159,0.216
4,0.6353,0.0,0.6353,0.6355,0.6352,0.2706,0.2708
5,0.5684,0.0,0.5684,0.5688,0.5675,0.1365,0.137
6,0.535,0.0,0.535,0.5357,0.5317,0.0694,0.0704
7,0.6626,0.0,0.6626,0.6626,0.6626,0.3252,0.3252
8,0.5897,0.0,0.5897,0.5899,0.5893,0.1792,0.1795
9,0.6565,0.0,0.6565,0.6601,0.6545,0.3127,0.3164


In [9]:
final_model = finalize_model(tuned_model[1])
evaluate_model(final_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [10]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "preprocessing_test.csv"))

train_columns = train_data.columns
test_data = test_data[train_columns]

df_test_x = test_data.drop(columns=['target'])

In [11]:
test_pred = predict_model(final_model, data=df_test_x)
test_pred

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,prediction_label
0,Dam dispenser #2,AJX75334501,3J1XF767-1,1000.0,12.5,90,70,280,90,10,...,91.800003,270.000000,50,85,19.799999,13.000000,195,1,0,AbNormal
1,Dam dispenser #2,AJX75334501,4B1XD472-2,1000.0,12.5,90,70,280,90,16,...,50.000000,91.800003,270,50,85.000000,19.799999,14,256,1,Normal
2,Dam dispenser #1,AJX75334501,3H1XE355-1,240.0,2.5,-90,70,1030,-90,10,...,91.800003,270.000000,50,85,19.700001,1.000000,98,1,0,AbNormal
3,Dam dispenser #2,AJX75334501,3L1XA128-1,1000.0,12.5,90,70,280,90,10,...,50.000000,91.800003,270,50,85.000000,20.000000,14,0,1,Normal
4,Dam dispenser #1,AJX75334501,4A1XA639-1,240.0,2.5,-90,70,1030,-90,16,...,50.000000,91.800003,270,50,85.000000,19.799999,1,215,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,Dam dispenser #2,AJX75334501,3K1XB597-1,1000.0,12.5,90,70,280,90,10,...,50.000000,91.800003,270,50,85.000000,19.500000,14,131,1,AbNormal
17357,Dam dispenser #2,AJX75334501,4A1XB974-1,1000.0,12.5,90,70,280,90,16,...,50.000000,91.800003,270,50,85.000000,19.799999,12,279,1,Normal
17358,Dam dispenser #1,AJX75334501,3L1XA998-1,240.0,2.5,-90,70,1030,-90,16,...,50.000000,91.800003,270,50,85.000000,20.500000,4,66,1,AbNormal
17359,Dam dispenser #1,AJX75334501,3F1XC376-1,240.0,2.5,-90,70,1030,-90,10,...,91.800003,270.000000,50,85,18.900000,1.000000,117,1,0,AbNormal


In [12]:
df_sub = pd.read_csv("../data/submission.csv")
df_sub["target"] = test_pred["prediction_label"]
df_sub.to_csv("../data/automl/submission-6.csv", index=False)