In [1]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split

import pandas as pd
import os

In [2]:
ROOT_DIR = "../data"
RANDOM_STATE = 42

train_data = pd.read_csv(os.path.join(ROOT_DIR, "preprocessing_train.csv"))

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

normal_ratio = 1.2

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)

## under sampling
df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

train_data = df_concat

# df_train, df_val = train_test_split(
#     df_concat,
#     test_size=0.3,
#     stratify=df_concat["target"],
#     random_state=RANDOM_STATE,
# )

In [3]:
setup_clf = setup(data=train_data, target="target", train_size=0.7, session_id=333, fold_shuffle=True)

Unnamed: 0,Description,Value
0,Session id,333
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(5170, 146)"
5,Transformed data shape,"(5170, 170)"
6,Transformed train set shape,"(3618, 170)"
7,Transformed test set shape,"(1552, 170)"
8,Numeric features,133
9,Categorical features,12


In [4]:
ridge_model = create_model('ridge')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6271,0.6417,0.6271,0.625,0.6249,0.2415,0.2424
1,0.6105,0.6184,0.6105,0.6073,0.603,0.1994,0.2034
2,0.6133,0.6359,0.6133,0.6101,0.6074,0.2072,0.2102
3,0.6188,0.6209,0.6188,0.6164,0.6126,0.2193,0.223
4,0.6077,0.6408,0.6077,0.6053,0.605,0.2022,0.2033
5,0.6243,0.6792,0.6243,0.6222,0.6182,0.2306,0.2345
6,0.6022,0.6184,0.6022,0.5995,0.599,0.1902,0.1914
7,0.6685,0.7156,0.6685,0.6674,0.6654,0.3245,0.327
8,0.5817,0.6053,0.5817,0.5774,0.5758,0.1437,0.1457
9,0.6066,0.6448,0.6066,0.6034,0.6013,0.1952,0.1977


In [5]:
tuned_ridge_model = tune_model(
    ridge_model,
    search_library='scikit-learn',  # 기본 설정
    search_algorithm='grid',  # 하이퍼파라미터 탐색 방법 ('grid' 또는 'random')
    custom_grid={'alpha': [0.1, 1.0, 10.0, 100.0,200.0, 500,0]}  # 하이퍼파라미터 'alpha' 값 설정
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.616,0.656,0.616,0.6131,0.6093,0.2116,0.2154
1,0.6022,0.6316,0.6022,0.5985,0.5956,0.1837,0.1868
2,0.6492,0.6654,0.6492,0.6478,0.6436,0.2804,0.2848
3,0.6409,0.6198,0.6409,0.6426,0.6298,0.2586,0.2689
4,0.6409,0.6547,0.6409,0.6394,0.6356,0.2653,0.2692
5,0.6602,0.6925,0.6602,0.6588,0.6577,0.3086,0.3104
6,0.5967,0.6431,0.5967,0.5932,0.5908,0.1748,0.1774
7,0.6602,0.7343,0.6602,0.6624,0.6515,0.3003,0.3097
8,0.6177,0.6357,0.6177,0.6156,0.6088,0.2129,0.2187
9,0.6482,0.6761,0.6482,0.6478,0.6409,0.2768,0.2832


Fitting 10 folds for each of 7 candidates, totalling 70 fits


In [6]:
tuned_ridge_model

In [9]:
final_model = finalize_model(tuned_ridge_model)
evaluate_model(final_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [10]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "preprocessing_test.csv"))

train_columns = train_data.columns
test_data = test_data[train_columns]

df_test_x = test_data.drop(columns=['target'])

In [11]:
test_pred = predict_model(final_model, data=df_test_x)
test_pred

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,prediction_label
0,Dam dispenser #2,AJX75334501,3J1XF767-1,1000.0,12.5,90,70,280,90,10,...,91.800003,270.000000,50,85,19.799999,13.000000,195,1,0,Normal
1,Dam dispenser #2,AJX75334501,4B1XD472-2,1000.0,12.5,90,70,280,90,16,...,50.000000,91.800003,270,50,85.000000,19.799999,14,256,1,Normal
2,Dam dispenser #1,AJX75334501,3H1XE355-1,240.0,2.5,-90,70,1030,-90,10,...,91.800003,270.000000,50,85,19.700001,1.000000,98,1,0,AbNormal
3,Dam dispenser #2,AJX75334501,3L1XA128-1,1000.0,12.5,90,70,280,90,10,...,50.000000,91.800003,270,50,85.000000,20.000000,14,0,1,Normal
4,Dam dispenser #1,AJX75334501,4A1XA639-1,240.0,2.5,-90,70,1030,-90,16,...,50.000000,91.800003,270,50,85.000000,19.799999,1,215,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,Dam dispenser #2,AJX75334501,3K1XB597-1,1000.0,12.5,90,70,280,90,10,...,50.000000,91.800003,270,50,85.000000,19.500000,14,131,1,Normal
17357,Dam dispenser #2,AJX75334501,4A1XB974-1,1000.0,12.5,90,70,280,90,16,...,50.000000,91.800003,270,50,85.000000,19.799999,12,279,1,Normal
17358,Dam dispenser #1,AJX75334501,3L1XA998-1,240.0,2.5,-90,70,1030,-90,16,...,50.000000,91.800003,270,50,85.000000,20.500000,4,66,1,Normal
17359,Dam dispenser #1,AJX75334501,3F1XC376-1,240.0,2.5,-90,70,1030,-90,10,...,91.800003,270.000000,50,85,18.900000,1.000000,117,1,0,AbNormal


In [12]:
df_sub = pd.read_csv("../data/submission.csv")
df_sub["target"] = test_pred["prediction_label"]
df_sub.to_csv("../data/automl/submission-26.csv", index=False)