In [1]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split

import pandas as pd
import os

In [2]:
# Notebook-wide configuration: the data directory and the seed used by the
# under-sampling step.
ROOT_DIR = "../data"
RANDOM_STATE = 42

# Training features.
train_path = os.path.join(ROOT_DIR, "feature_crosses_train.csv")
train_data = pd.read_csv(train_path)

# The labels live in a separate CSV. They are copied over by position
# (`.values` discards the index), so both files are assumed to list the same
# rows in the same order — TODO confirm.
label_path = os.path.join(ROOT_DIR, "preprocessing_train.csv")
a_ = pd.read_csv(label_path)
train_data["target"] = a_["target"].values


In [3]:
# Balance the two classes by randomly under-sampling the "Normal" rows down to
# `normal_ratio` times the number of "AbNormal" rows.
is_normal = train_data["target"] == "Normal"
is_abnormal = train_data["target"] == "AbNormal"
df_normal = train_data[is_normal]
df_abnormal = train_data[is_abnormal]

normal_ratio = 1.0  # Normal rows kept per AbNormal row

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)

## under sampling
# Sampling without replacement raises ValueError when more rows are requested
# than exist, so cap the request at the number of available Normal rows.
num_normal_sampled = min(num_normal, int(num_abnormal * normal_ratio))
df_normal = df_normal.sample(n=num_normal_sampled, replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

# Downstream cells read the balanced frame through the name `train_data`.
train_data = df_concat

# No manual train/validation split is made here: the hold-out split is
# configured in the later `setup(...)` call (train_size=0.7).

In [4]:
# Initialise the PyCaret classification experiment on `train_data`.
# 70% of the rows go to the training split; `session_id` fixes PyCaret's seed
# (note that it is a different value from RANDOM_STATE).
setup_clf = setup(
    data=train_data,
    target="target",
    train_size=0.7,
    session_id=333,
    fold_shuffle=True,
)

Unnamed: 0,Description,Value
0,Session id,333
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(4700, 170)"
5,Transformed data shape,"(4700, 170)"
6,Transformed train set shape,"(3290, 170)"
7,Transformed test set shape,"(1410, 170)"
8,Numeric features,169
9,Preprocess,True


In [5]:
# Cross-validate PyCaret's candidate classifiers with 10 folds, rank them by F1
# and keep the top three. NOTE: despite the singular name, `model` is a list
# (the next cell iterates over it).
model = compare_models(
    n_select=3,
    sort="F1",
    fold=10,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6106,0.6426,0.6106,0.6116,0.6098,0.2213,0.2223,0.312
ada,Ada Boost Classifier,0.61,0.6384,0.61,0.6104,0.6097,0.22,0.2204,0.086
ridge,Ridge Classifier,0.6061,0.6378,0.6061,0.6077,0.6044,0.2121,0.2138,0.015
lda,Linear Discriminant Analysis,0.5954,0.6315,0.5954,0.5969,0.5939,0.1909,0.1923,0.021
rf,Random Forest Classifier,0.593,0.6278,0.593,0.5932,0.5928,0.1861,0.1863,0.095
et,Extra Trees Classifier,0.5884,0.6166,0.5884,0.5886,0.5883,0.1769,0.177,0.075
lightgbm,Light Gradient Boosting Machine,0.5875,0.6262,0.5875,0.5877,0.5874,0.1751,0.1753,0.258
lr,Logistic Regression,0.5745,0.5987,0.5745,0.5748,0.574,0.149,0.1493,0.286
knn,K Neighbors Classifier,0.5702,0.5865,0.5702,0.5704,0.57,0.1405,0.1406,0.021
nb,Naive Bayes,0.5632,0.5894,0.5632,0.5641,0.5621,0.1265,0.1273,0.015


In [6]:
# Tune each of the selected models. The tuning objective is set to F1 so that it
# matches the metric used to rank the models in `compare_models` and to blend
# them later; `tune_model` would otherwise optimise Accuracy by default.
tuned_model = [tune_model(candidate, optimize="F1") for candidate in model]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5957,0.6145,0.5957,0.5975,0.5937,0.1911,0.193
1,0.6474,0.6703,0.6474,0.6476,0.6473,0.2947,0.2949
2,0.5593,0.6078,0.5593,0.5593,0.5591,0.1185,0.1185
3,0.614,0.6491,0.614,0.6146,0.6133,0.2278,0.2285
4,0.6261,0.645,0.6261,0.6262,0.626,0.2522,0.2523
5,0.6201,0.6628,0.6201,0.6215,0.6191,0.2404,0.2416
6,0.6201,0.6713,0.6201,0.6215,0.6191,0.2404,0.2416
7,0.6353,0.6622,0.6353,0.6356,0.6351,0.2706,0.2709
8,0.5684,0.5921,0.5684,0.5692,0.5674,0.137,0.1377
9,0.6505,0.6754,0.6505,0.6508,0.6503,0.301,0.3013


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.535,0.578,0.535,0.5349,0.5348,0.0698,0.0698
1,0.5957,0.6288,0.5957,0.5964,0.5948,0.1913,0.1921
2,0.5866,0.6131,0.5866,0.5868,0.5865,0.1733,0.1735
3,0.6109,0.6208,0.6109,0.6111,0.6108,0.222,0.2221
4,0.6201,0.6454,0.6201,0.6204,0.6197,0.24,0.2403
5,0.5714,0.5932,0.5714,0.5725,0.5701,0.1432,0.1441
6,0.5653,0.6285,0.5653,0.5656,0.5651,0.1308,0.131
7,0.5988,0.6503,0.5988,0.5988,0.5987,0.1975,0.1976
8,0.5745,0.5864,0.5745,0.5751,0.5738,0.1491,0.1497
9,0.6201,0.6824,0.6201,0.6203,0.6199,0.2402,0.2404


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5805,0.6315,0.5805,0.5818,0.5786,0.1607,0.1622
1,0.614,0.6314,0.614,0.6151,0.6129,0.2277,0.2289
2,0.5836,0.6132,0.5836,0.5839,0.5831,0.167,0.1674
3,0.6109,0.6343,0.6109,0.6128,0.6091,0.2216,0.2236
4,0.6201,0.6426,0.6201,0.6227,0.6178,0.2398,0.2426
5,0.5866,0.6265,0.5866,0.5894,0.5837,0.1737,0.1762
6,0.6049,0.662,0.6049,0.6087,0.6017,0.2102,0.2137
7,0.6596,0.6792,0.6596,0.6598,0.6595,0.3192,0.3194
8,0.6079,0.6023,0.6079,0.6121,0.6045,0.2162,0.2202
9,0.6231,0.6697,0.6231,0.6233,0.623,0.2463,0.2464


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [7]:
# Display the tuned estimators together with their hyperparameters.
tuned_model

[GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.2, loss='log_loss', max_depth=2,
                            max_features=1.0, max_leaf_nodes=None,
                            min_impurity_decrease=0.3, min_samples_leaf=4,
                            min_samples_split=4, min_weight_fraction_leaf=0.0,
                            n_estimators=170, n_iter_no_change=None,
                            random_state=333, subsample=1.0, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 AdaBoostClassifier(algorithm='SAMME.R', estimator=None, learning_rate=1.0,
                    n_estimators=50, random_state=333),
 RidgeClassifier(alpha=4.87, class_weight=None, copy_X=True, fit_intercept=False,
                 max_iter=None, positive=False, random_state=333, solver='auto',
                 tol=0.0001)]

In [9]:
# Hard-voting ensemble of the tuned models, cross-validated with 10 folds and
# optimised for F1.
# NOTE(review): in the recorded run this call raised
# "ValueError: cannot set WRITEABLE flag to True of this array"; the later
# cells do not use `blended_hard`.
blended_hard = blend_models(
    estimator_list=list(tuned_model),
    method="hard",
    fold=10,
    optimize="F1",
)

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

ValueError: cannot set WRITEABLE flag to True of this array

In [10]:
# Inspect the top-ranked tuned model BEFORE finalizing it. `finalize_model`
# refits the estimator on the whole dataset, hold-out split included, so
# evaluating the finalized model would score it on rows it was trained on and
# give optimistic diagnostics.
best_tuned_model = tuned_model[0]
evaluate_model(best_tuned_model)

# Refit on all available data (train + hold-out) for the test-set predictions.
final_model = finalize_model(best_tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [11]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "feature_crosses_test.csv"))

train_columns = train_data.columns
# The model inputs are every training column except the label.
feature_columns = train_columns.drop("target")

# Keep the test frame in training column order. A `target` column is carried
# along only when the test CSV actually contains one, so an unlabeled test file
# no longer fails with KeyError.
test_data = test_data[[col for col in train_columns if col in test_data.columns]]

# Selecting by name raises KeyError if any training feature is missing from the
# test file, instead of silently predicting on a different column set.
df_test_x = test_data[feature_columns]

In [12]:
# Score the test features with the finalized model. The displayed output shows
# the input columns plus `prediction_label` and `prediction_score`.
test_pred = predict_model(
    final_model,
    data=df_test_x,
)
test_pred

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Equipment_Dam_Dam dispenser #2 Equipment_Fill1_Fill1 dispenser #1,Equipment_Dam_Dam dispenser #2 Equipment_Fill1_Fill1 dispenser #2,Equipment_Fill2_Fill2 dispenser #1 Equipment_Fill2_Fill2 dispenser #2,Equipment_Fill2_Fill2 dispenser #1 Equipment_Fill1_Fill1 dispenser #1,Equipment_Fill2_Fill2 dispenser #1 Equipment_Fill1_Fill1 dispenser #2,Equipment_Fill2_Fill2 dispenser #2 Equipment_Fill1_Fill1 dispenser #1,Equipment_Fill2_Fill2 dispenser #2 Equipment_Fill1_Fill1 dispenser #2,Equipment_Fill1_Fill1 dispenser #1 Equipment_Fill1_Fill1 dispenser #2,prediction_label,prediction_score
0,1000.0,12.5,90,70,280,90,10,17.000000,4.9,17.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,Normal,0.5468
1,1000.0,12.5,90,70,280,90,16,14.200000,8.3,14.200000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,Normal,0.6672
2,240.0,2.5,-90,70,1030,-90,10,9.700000,4.9,9.700000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,AbNormal,0.5485
3,1000.0,12.5,90,70,280,90,10,21.299999,10.6,21.299999,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,AbNormal,0.7370
4,240.0,2.5,-90,70,1030,-90,16,13.200000,7.5,13.200000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Normal,0.6695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,1000.0,12.5,90,70,280,90,10,21.299999,10.6,21.299999,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,Normal,0.6535
17357,1000.0,12.5,90,70,280,90,16,13.200000,7.6,13.200000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,Normal,0.6058
17358,240.0,2.5,-90,70,1030,-90,16,13.200000,6.6,13.200000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Normal,0.5560
17359,240.0,2.5,-90,70,1030,-90,10,9.700000,3.9,9.700000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,AbNormal,0.6278


In [13]:
# Fill the submission template with the predicted labels and write it out.
df_sub = pd.read_csv(os.path.join(ROOT_DIR, "submission.csv"))

# A plain Series assignment aligns on the index and would silently produce NaN
# or dropped labels if the two frames disagree, so check the row counts and
# assign by position instead.
# NOTE(review): assumes the template lists rows in the same order as the test
# CSV — confirm.
if len(df_sub) != len(test_pred):
    raise ValueError(
        f"submission template has {len(df_sub)} rows but there are "
        f"{len(test_pred)} predictions"
    )
df_sub["target"] = test_pred["prediction_label"].to_numpy()

# Create the output directory if needed so `to_csv` does not fail on a fresh
# checkout.
output_dir = os.path.join(ROOT_DIR, "automl")
os.makedirs(output_dir, exist_ok=True)
df_sub.to_csv(os.path.join(output_dir, "submission-29.csv"), index=False)