# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


In [None]:
from pycaret.classification import *

import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from sklearn.model_selection import KFold
import catboost
from catboost import CatBoostClassifier, Pool, cv
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt


### 데이터 읽어오기


### train data + test data 병합

In [None]:
# Directory that holds the input CSV files, and the seed that is passed to the
# OneSidedSelection undersampler later in this notebook.
ROOT_DIR = "data"
RANDOM_STATE = 1004

# Load the training split.
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

In [None]:
# Display the (rows, columns) shape of the training frame.
train.shape

(40506, 464)

In [None]:
# Load the test split from the same directory as the training split.
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [None]:
# Display the (rows, columns) shape of the test frame.
test.shape

(17361, 465)

In [None]:
# Stack the training rows on top of the test rows so the cells below can
# preprocess both splits in one pass.
# NOTE(review): ignore_index is not set, so the row labels of `train` and
# `test` are kept as-is and may repeat inside `data` — confirm that nothing
# downstream relies on unique row labels.
data = pd.concat([train, test], axis = 0)

In [None]:
# Display the (rows, columns) shape of the combined frame.
data.shape

(57867, 465)

### 전처리 및 파생변수 생성


전처리 진행


In [None]:
# 1. Keep only the columns that hold more than one distinct value.
distinct_counts = data.nunique()
data = data.loc[:, distinct_counts > 1]

# 2. When a column name occurs more than once, keep only its first occurrence.
repeated_name = data.columns.duplicated()
data = data.loc[:, ~repeated_name]

# 3. Drop the columns whose values are all NaN.
data = data.dropna(axis=1, how='all')

### 파생변수 생성

In [None]:
def _shift_group_columns(process, dam_only=False):
    """Build the ordered column names selected for one process block.

    ``process`` is the suffix after "Collect Result_" ("Dam", "Fill1" or
    "Fill2"). ``dam_only`` adds the "Head Zero Position", per-stage
    "Distance Speed" and "THICKNESS" columns, which are selected for the Dam
    block only. The order of the returned names matters: the rows of these
    blocks are later shifted along the column axis.
    """
    tail = f"Collect Result_{process}"
    axes = ("X", "Y", "Z")
    stages = (1, 2, 3)

    names = [f"HEAD NORMAL COORDINATE {axis} AXIS(Stage{stage}) {tail}"
             for axis in axes
             for stage in stages]

    head_positions = ["HEAD Standby Position", "Head Clean Position", "Head Purge Position"]
    if dam_only:
        head_positions.append("Head Zero Position")
    names += [f"{position} {axis} {tail}"
              for position in head_positions
              for axis in axes]

    names += [f"{label} {tail}"
              for label in ("Machine Tact time", "PalletID", "Production Qty", "Receip No")]

    if dam_only:
        names += [f"Stage{stage} {path}{number} Distance Speed {tail}"
                  for stage in stages
                  for path in ("Circle", "Line")
                  for number in (1, 2, 3, 4)]
        names += [f"THICKNESS {number} {tail}" for number in (1, 2, 3)]

    names.append(f"WorkMode {tail}")
    return names


# One sub-frame per process, each ending with that process's WorkMode column.
data_dam = data[_shift_group_columns("Dam", dam_only=True)]
data_fill1 = data[_shift_group_columns("Fill1")]
data_fill2 = data[_shift_group_columns("Fill2")]

In [None]:
# Realign rows that contain an 'OK' marker or a missing value.
def check_and_shift_row(row):
    """Shift a row one position towards its first column when it needs it.

    A row qualifies when any of its cells equals the string 'OK' or is NaN.
    For such a row every value moves one column to the left (the first value
    is discarded) and the last column is set to NaN. Any other row is
    returned unchanged.
    """
    needs_shift = ((row == 'OK') | row.isna()).any()
    if not needs_shift:
        return row

    shifted = row.shift(-1)
    # Make the vacated last column explicitly NaN.
    shifted.iloc[-1] = np.nan
    return shifted

# Run check_and_shift_row over every row of the three process sub-frames.
data_dam, data_fill1, data_fill2 = (
    block.apply(check_and_shift_row, axis=1)
    for block in (data_dam, data_fill1, data_fill2)
)

In [None]:
# Write the realigned sub-frames back into the matching columns of `data`.
for shifted_block in (data_dam, data_fill1, data_fill2):
    data[shifted_block.columns] = shifted_block

In [None]:
# Remove the three WorkMode columns from the main frame.
data.drop(
    columns=[
        'WorkMode Collect Result_Dam',
        'WorkMode Collect Result_Fill1',
        'WorkMode Collect Result_Fill2',
    ],
    inplace=True,
)

In [None]:
# Per-stage resin dispensing rate: dispensed volume / (discharge time + 1).
# The +1 keeps the denominator away from zero when the time is 0.
for process in ('Dam', 'Fill1'):
    for stage in (1, 2, 3):
        volume = data[f'Dispense Volume(Stage{stage}) Collect Result_{process}']
        discharge_time = data[f'DISCHARGED TIME OF RESIN(Stage{stage}) Collect Result_{process}']
        data[f'Rate(Stage{stage}) Result_{process}'] = volume / (discharge_time + 1)

# Per-stage "amount of work": dispensed volume * discharge time.
for process in ('Dam', 'Fill1'):
    for stage in (1, 2, 3):
        volume = data[f'Dispense Volume(Stage{stage}) Collect Result_{process}']
        discharge_time = data[f'DISCHARGED TIME OF RESIN(Stage{stage}) Collect Result_{process}']
        data[f'Product(Stage{stage}) Result_{process}'] = volume * discharge_time
# AutoClave pressure: mean of the three readings plus their pairwise differences.
first_pressure = data['1st Pressure Collect Result_AutoClave']
second_pressure = data['2nd Pressure Collect Result_AutoClave']
third_pressure = data['3rd Pressure Collect Result_AutoClave']
data['Mean Pressure Collect Result_AutoClave'] = (first_pressure + second_pressure + third_pressure) / 3
data['Pressure_1'] = second_pressure - first_pressure
data['Pressure_2'] = third_pressure - first_pressure
data['Pressure_3'] = third_pressure - second_pressure

# Cure travel (end position - start position) for each (axis, process) pair,
# followed by travel / cure speed of the same process.
cure_targets = (('X', 'Dam'), ('X', 'Fill2'), ('Z', 'Fill2'))
for axis, process in cure_targets:
    data[f'CURE POSITION {axis} Collect Result_{process}'] = (
        data[f'CURE END POSITION {axis} Collect Result_{process}']
        - data[f'CURE START POSITION {axis} Collect Result_{process}']
    )
for axis, process in cure_targets:
    data[f'CURE TIME {axis} Collect Result_{process}'] = (
        data[f'CURE POSITION {axis} Collect Result_{process}']
        / data[f'CURE SPEED Collect Result_{process}']
    )
# Summed resin discharge time of the three Dam stages divided by the resin
# discharge speed. The third term is Stage3: it previously repeated Stage1,
# which counted Stage1 twice and left Stage3 out of the sum.
data['DISCHARGED RESIN Collect Result_Dam'] = (data['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] + data['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] + data['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'])/data['DISCHARGED SPEED OF RESIN Collect Result_Dam']
# Mean Dam circle/line "Distance Speed" across stages.
def _dam_path_speed(stage, path):
    """Return the Dam "Distance Speed" column of one stage and path."""
    return data[f'Stage{stage} {path} Distance Speed Collect Result_Dam']


# NOTE(review): Circle1 averages Stage2 and Stage3 only, while every other
# path below averages all three stages — confirm this is intentional.
data['Circle1'] = (_dam_path_speed(2, 'Circle1') + _dam_path_speed(3, 'Circle1')) / 2
for path in ('Circle2', 'Circle3', 'Circle4', 'Line1', 'Line2', 'Line3', 'Line4'):
    data[path] = (
        _dam_path_speed(1, path) + _dam_path_speed(2, path) + _dam_path_speed(3, path)
    ) / 3

In [None]:
# CL1..CL12: circle speed / (line speed + 1) for path 1-4 of each Dam stage,
# numbered stage by stage (Stage1 -> CL1-4, Stage2 -> CL5-8, Stage3 -> CL9-12).
for stage in (1, 2, 3):
    for path_no in (1, 2, 3, 4):
        circle_speed = data[f'Stage{stage} Circle{path_no} Distance Speed Collect Result_Dam']
        line_speed = data[f'Stage{stage} Line{path_no} Distance Speed Collect Result_Dam']
        data[f'CL{(stage - 1) * 4 + path_no}'] = circle_speed / (line_speed + 1)

In [None]:
# Features built around the production quantity; the original author notes
# that 'Production Qty Collect Result_Dam' has the highest feature importance.
qty_dam = data['Production Qty Collect Result_Dam']
qty_fill1 = data['Production Qty Collect Result_Fill1']
qty_fill2 = data['Production Qty Collect Result_Fill2']

# Pairwise quantity differences between the processes.
data['Qty_1'] = qty_fill1 - qty_dam
data['Qty_2'] = qty_fill2 - qty_dam
data['Qty_3'] = qty_fill2 - qty_fill1

# Dam quantity relative to the average circle / line speed features.
circle_average = (data['Circle1'] + data['Circle2'] + data['Circle3'] + data['Circle4']) / 4
data['Qty_4'] = qty_dam / circle_average
line_average = (data['Line1'] + data['Line2'] + data['Line3'] + data['Line4']) / 4
data['Qty_5'] = qty_dam / line_average

# Quantity scaled by the average per-stage dispensing rate of the same process.
dam_rate_average = (data['Rate(Stage1) Result_Dam'] + data['Rate(Stage2) Result_Dam'] + data['Rate(Stage3) Result_Dam']) / 3
data['Qty_6'] = qty_dam * dam_rate_average
fill1_rate_average = (data['Rate(Stage1) Result_Fill1'] + data['Rate(Stage2) Result_Fill1'] + data['Rate(Stage3) Result_Fill1']) / 3
data['Qty_7'] = qty_fill1 * fill1_rate_average

In [None]:
# 'Machine Tact time Collect Result_Fill1' contains zeros, which would make
# the divisions that use it impossible, so zeros are replaced by the mean of
# the non-zero values.
fill1_tact_column = 'Machine Tact time Collect Result_Fill1'
fill1_tact = data[fill1_tact_column]

# Mean computed with the zeros turned into NaN so they do not count.
mean_tact_time = fill1_tact.replace(0, np.nan).mean()

# Swap every zero for that mean.
data[fill1_tact_column] = fill1_tact.replace(0, mean_tact_time)

In [None]:
# Tact-time features: pairwise differences and the mean across the processes.
tact_dam = data['Machine Tact time Collect Result_Dam']
tact_fill1 = data['Machine Tact time Collect Result_Fill1']
tact_fill2 = data['Machine Tact time Collect Result_Fill2']

data['Machine1'] = tact_dam - tact_fill1
data['Machine2'] = tact_dam - tact_fill2
data['Machine3'] = tact_fill1 - tact_fill2
data['Machine4'] = (tact_dam + tact_fill1 + tact_fill2) / 3

# Production quantity per unit of tact time, one feature per process.
data['Machine5'] = data['Production Qty Collect Result_Dam'] / tact_dam
data['Machine6'] = data['Production Qty Collect Result_Fill1'] / tact_fill1
data['Machine7'] = data['Production Qty Collect Result_Fill2'] / tact_fill2

카테고리화 전처리

In [None]:
# Stage1 X-axis head coordinate of each process, cast to float below.
columns_to_convert = [
    f'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_{process_name}'
    for process_name in ('Dam', 'Fill1', 'Fill2')
]

# Convert the listed columns to float dtype.
as_float = data[columns_to_convert].astype(float)
data[columns_to_convert] = as_float

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# every other column as numeric.
object_frame = data.select_dtypes(include=['object'])
non_object_frame = data.select_dtypes(exclude=['object'])
cat_features = list(object_frame.columns)
num_features = list(non_object_frame.columns)

In [None]:
# Take 'target' out of the categorical list so the label-encoding loop over
# cat_features does not touch it.
cat_features.remove('target')

In [None]:
# Take 'Set ID' out of the categorical list so the label-encoding loop over
# cat_features does not touch it.
cat_features.remove('Set ID')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes. A fresh encoder is
# fitted per column, on the combined train + test rows held in `data`.
for col in cat_features:
    le = LabelEncoder()
    le.fit(data[col])
    data[col] = le.transform(data[col])

### train, test 데이터 분할

In [None]:
# Split the combined frame back into its two parts by position: the first
# train.shape[0] rows are the training rows, the remainder the test rows.
# Explicit copies are taken because new columns are assigned to both frames
# in later cells; assigning into a bare slice of `data` triggers pandas'
# SettingWithCopyWarning.
n_train_rows = train.shape[0]
train_data = data.iloc[:n_train_rows, :].copy()
test_data = data.iloc[n_train_rows:, :].copy()

In [None]:
# # 3D 산점도 그리기
# fig = plt.figure(figsize=(8, 6))
# ax = fig.add_subplot(111, projection='3d')

# # 각 축에 해당하는 데이터 지정
# ax.scatter(data['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2'], 
#            data['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2'], 
#            data['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2'], c='b', marker='o')

# # 축 레이블 지정
# ax.set_xlabel('X axis')
# ax.set_ylabel('Y axis')
# ax.set_zlabel('Z axis')

# # 그래프 보여주기
# plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Stage1 Dam head coordinates into 5 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
coord_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
              'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
              'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']
kmeans = KMeans(n_clusters=5, random_state=42)
train_data['Cluster_D1'] = kmeans.fit_predict(train_data[coord_cols])
test_data['Cluster_D1'] = kmeans.predict(test_data[coord_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_D1'] = label_encoder.fit_transform(train_data['Cluster_D1'])
test_data['Cluster_Label_D1'] = label_encoder.transform(test_data['Cluster_D1'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Stage1 Fill1 head coordinates into 7 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
coord_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
              'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
              'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1']
kmeans = KMeans(n_clusters=7, random_state=42)
train_data['Cluster_F1'] = kmeans.fit_predict(train_data[coord_cols])
test_data['Cluster_F1'] = kmeans.predict(test_data[coord_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_F1'] = label_encoder.fit_transform(train_data['Cluster_F1'])
test_data['Cluster_Label_F1'] = label_encoder.transform(test_data['Cluster_F1'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Stage1 Fill2 head coordinates into 2 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
coord_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
              'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
              'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2']
kmeans = KMeans(n_clusters=2, random_state=42)
train_data['Cluster_FF1'] = kmeans.fit_predict(train_data[coord_cols])
test_data['Cluster_FF1'] = kmeans.predict(test_data[coord_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_FF1'] = label_encoder.fit_transform(train_data['Cluster_FF1'])
test_data['Cluster_Label_FF1'] = label_encoder.transform(test_data['Cluster_FF1'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Stage2 Dam head coordinates into 4 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
coord_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
              'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
              'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam']
kmeans = KMeans(n_clusters=4, random_state=42)
train_data['Cluster_D2'] = kmeans.fit_predict(train_data[coord_cols])
test_data['Cluster_D2'] = kmeans.predict(test_data[coord_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_D2'] = label_encoder.fit_transform(train_data['Cluster_D2'])
test_data['Cluster_Label_D2'] = label_encoder.transform(test_data['Cluster_D2'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Stage2 Fill1 head coordinates into 5 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
# NOTE(review): only the X and Y axes are used here, whereas the sibling
# cluster cells also include Z — confirm this is intentional.
coord_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
              'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1']
kmeans = KMeans(n_clusters=5, random_state=42)
train_data['Cluster_F2'] = kmeans.fit_predict(train_data[coord_cols])
test_data['Cluster_F2'] = kmeans.predict(test_data[coord_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_F2'] = label_encoder.fit_transform(train_data['Cluster_F2'])
test_data['Cluster_Label_F2'] = label_encoder.transform(test_data['Cluster_F2'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Stage2 Fill2 head coordinates into 2 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
coord_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
              'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
              'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2']
kmeans = KMeans(n_clusters=2, random_state=42)
train_data['Cluster_FF2'] = kmeans.fit_predict(train_data[coord_cols])
test_data['Cluster_FF2'] = kmeans.predict(test_data[coord_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_FF2'] = label_encoder.fit_transform(train_data['Cluster_FF2'])
test_data['Cluster_Label_FF2'] = label_encoder.transform(test_data['Cluster_FF2'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Stage3 Dam head coordinates into 4 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
coord_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
              'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
              'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam']
kmeans = KMeans(n_clusters=4, random_state=42)
train_data['Cluster_D3'] = kmeans.fit_predict(train_data[coord_cols])
test_data['Cluster_D3'] = kmeans.predict(test_data[coord_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_D3'] = label_encoder.fit_transform(train_data['Cluster_D3'])
test_data['Cluster_Label_D3'] = label_encoder.transform(test_data['Cluster_D3'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Stage3 Fill1 head coordinates into 5 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
coord_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
              'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
              'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1']
kmeans = KMeans(n_clusters=5, random_state=42)
train_data['Cluster_F3'] = kmeans.fit_predict(train_data[coord_cols])
test_data['Cluster_F3'] = kmeans.predict(test_data[coord_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_F3'] = label_encoder.fit_transform(train_data['Cluster_F3'])
test_data['Cluster_Label_F3'] = label_encoder.transform(test_data['Cluster_F3'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Stage3 Fill2 head coordinates into 3 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
coord_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
              'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
              'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2']
kmeans = KMeans(n_clusters=3, random_state=42)
train_data['Cluster_FF3'] = kmeans.fit_predict(train_data[coord_cols])
test_data['Cluster_FF3'] = kmeans.predict(test_data[coord_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_FF3'] = label_encoder.fit_transform(train_data['Cluster_FF3'])
test_data['Cluster_Label_FF3'] = label_encoder.transform(test_data['Cluster_FF3'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Dam head clean positions into 3 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
position_cols = ['Head Clean Position X Collect Result_Dam',
                 'Head Clean Position Y Collect Result_Dam',
                 'Head Clean Position Z Collect Result_Dam']
kmeans = KMeans(n_clusters=3, random_state=42)
train_data['Cluster_C'] = kmeans.fit_predict(train_data[position_cols])
test_data['Cluster_C'] = kmeans.predict(test_data[position_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_C'] = label_encoder.fit_transform(train_data['Cluster_C'])
test_data['Cluster_Label_C'] = label_encoder.transform(test_data['Cluster_C'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Dam head purge positions into 3 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
position_cols = ['Head Purge Position X Collect Result_Dam',
                 'Head Purge Position Y Collect Result_Dam',
                 'Head Purge Position Z Collect Result_Dam']
kmeans = KMeans(n_clusters=3, random_state=42)
train_data['Cluster_P'] = kmeans.fit_predict(train_data[position_cols])
test_data['Cluster_P'] = kmeans.predict(test_data[position_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_P'] = label_encoder.fit_transform(train_data['Cluster_P'])
test_data['Cluster_Label_P'] = label_encoder.transform(test_data['Cluster_P'])

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the Dam head zero positions into 3 groups. The clusters are
# learned on the training rows only and then used to label the test rows.
position_cols = ['Head Zero Position X Collect Result_Dam',
                 'Head Zero Position Y Collect Result_Dam',
                 'Head Zero Position Z Collect Result_Dam']
kmeans = KMeans(n_clusters=3, random_state=42)
train_data['Cluster_Z'] = kmeans.fit_predict(train_data[position_cols])
test_data['Cluster_Z'] = kmeans.predict(test_data[position_cols])

# Label-encode the cluster ids (encoder fitted on the training ids).
label_encoder = LabelEncoder()
train_data['Cluster_Label_Z'] = label_encoder.fit_transform(train_data['Cluster_Z'])
test_data['Cluster_Label_Z'] = label_encoder.transform(test_data['Cluster_Z'])

In [None]:
# object 타입인 컬럼만 추출
object_columns_train = train_data.select_dtypes(include=['object']).columns
object_columns_test = test_data.select_dtypes(include=['object']).columns
# 추출한 컬럼들 출력
print(object_columns_train)
print(object_columns_test)

Index(['target', 'Set ID'], dtype='object')
Index(['target', 'Set ID'], dtype='object')


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the columns listed in num_features. The scaler is fitted on the
# training rows only and the same fitted scaler is applied to the test rows.
scaler = StandardScaler()
scaler.fit(train_data[num_features])
train_data[num_features] = scaler.transform(train_data[num_features])
test_data[num_features] = scaler.transform(test_data[num_features])

In [None]:
# Drop the 'Set ID' column from the training frame in place; test_data is not
# modified by this cell.
train_data.drop('Set ID', axis = 1, inplace = True)

In [None]:
# Separate the label from the features.
y = train_data['target']               # label column
X = train_data.drop(columns='target')  # every remaining column

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from pycaret.classification import setup
from imblearn.under_sampling import OneSidedSelection
# Stage 1: under-sample with One-Sided Selection.
oss = OneSidedSelection(random_state=RANDOM_STATE)
X_res, y_res = oss.fit_resample(X, y)

# Stage 2: random under-sampling. sampling_strategy is the AbNormal-to-Normal
# ratio (a value between 0 and 1).
rus = RandomUnderSampler(sampling_strategy=0.3, random_state=1010)
X_resampled, y_resampled = rus.fit_resample(X_res, y_res)

# Rebuild the training frame from the resampled features and attach the labels.
train_data = pd.DataFrame(X_resampled, columns=X.columns).assign(target=y_resampled)

In [None]:
# Show the class balance of the training labels after under-sampling.
train_data['target'].value_counts()

target
Normal      7833
AbNormal    2350
Name: count, dtype: int64

In [None]:
# Split the resampled training frame into the feature matrix and label vector.
X_train = train_data.drop(columns='target')
y_train = train_data['target']

In [None]:
# Pairwise correlation matrix of all training features.
correlation_matrix = X_train.corr()

# Find feature pairs whose correlation is >= 1. Only the strict upper triangle
# is inspected so that every pair is reported once and the diagonal is skipped.
# A boolean mask replaces the nested Python loop of per-cell .iloc lookups.
corr_values = correlation_matrix.to_numpy()
strict_upper = np.triu(np.ones(corr_values.shape, dtype=bool), k=1)
with np.errstate(invalid="ignore"):
    # NaN coefficients compare as False, exactly as in a scalar comparison.
    perfect_mask = strict_upper & (corr_values >= 1)

# np.argwhere walks the mask row by row, so pairs come out ordered by the
# first feature's position and then by the second feature's position.
corr_columns = correlation_matrix.columns
high_correlations = [
    (corr_columns[row], corr_columns[col])
    for row, col in np.argwhere(perfect_mask)
]

# For each perfectly correlated pair keep the first feature and drop the second.
# dict.fromkeys removes duplicates while keeping first-seen order, so the drop
# list is the same on every run (a set would give an arbitrary order).
features_to_drop = list(dict.fromkeys(pair[1] for pair in high_correlations))

print("제거할 피처들:", features_to_drop)

제거할 피처들: ['Stage3 Line2 Distance Speed Collect Result_Dam', 'Cluster_Label_D2', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2', 'Workorder_Fill1', 'CURE END POSITION Θ Collect Result_Dam', 'Cluster_Label_P', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam', 'Line4', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2', 'CURE END POSITION X Collect Result_Dam', 'Cluster_Label_FF1', 'Stage1 Line2 Distance Speed Collect Result_Dam', 'Stage3 Circle4 Distance Speed Collect Result_Dam', 'Cluster_FF2', 'Cluster_Label_F1', 'Stage1 Circle3 Distance Speed Collect Result_Dam', 'CURE START POSITION Z Collect Result_Fill2', 'Stage2 Circle3 Distance Speed Collect Result_Dam', 'Cluster_Label_Z', 'Stage3 Circle3 Distance Speed Collect Result_Dam', 'Cluster_Label_D3', 'CURE START POSITION Θ Collect Result_Dam', 'CURE END POSITION Z Collect Result_Dam', 'Cluster_Label_C', 'CL4', 'Circle4', 'Cluster_Label_FF2', 'Workorder_AutoClave', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) 

In [None]:
# Display the (kept feature, dropped feature) pairs found by the correlation scan.
high_correlations

[('Equipment_Dam', 'CURE END POSITION X Collect Result_Dam'),
 ('Equipment_Dam', 'CURE END POSITION Z Collect Result_Dam'),
 ('Equipment_Dam', 'CURE END POSITION Θ Collect Result_Dam'),
 ('Equipment_Dam', 'CURE START POSITION Θ Collect Result_Dam'),
 ('Equipment_Dam', 'CURE POSITION X Collect Result_Dam'),
 ('Model.Suffix_Dam', 'Model.Suffix_AutoClave'),
 ('Model.Suffix_Dam', 'Model.Suffix_Fill1'),
 ('Model.Suffix_Dam', 'Model.Suffix_Fill2'),
 ('Workorder_Dam', 'Workorder_AutoClave'),
 ('Workorder_Dam', 'Workorder_Fill1'),
 ('Workorder_Dam', 'Workorder_Fill2'),
 ('CURE END POSITION X Collect Result_Dam',
  'CURE END POSITION Z Collect Result_Dam'),
 ('CURE END POSITION X Collect Result_Dam',
  'CURE END POSITION Θ Collect Result_Dam'),
 ('CURE END POSITION X Collect Result_Dam',
  'CURE START POSITION Θ Collect Result_Dam'),
 ('CURE END POSITION X Collect Result_Dam',
  'CURE POSITION X Collect Result_Dam'),
 ('CURE END POSITION Z Collect Result_Dam',
  'CURE END POSITION Θ Collect Res

In [None]:
# Remove the redundant, perfectly correlated features from the training matrix in place.
X_train.drop(columns=features_to_drop, inplace=True)

In [None]:
# Display the training feature matrix that remains after the correlated features were dropped.
X_train

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,...,Cluster_D1,Cluster_F1,Cluster_D2,Cluster_F2,Cluster_D3,Cluster_F3,Cluster_FF3,Cluster_C,Cluster_P,Cluster_Z
0,0,1,482,-0.395838,0.786319,1.040015,-0.019298,0.621097,0.010008,-0.277852,...,2,0,3,0,2,0,0,1,1,2
1,1,0,240,-0.395838,-1.271748,-0.961577,0.728603,-0.983422,0.733251,0.335719,...,1,1,2,1,1,1,1,2,1,0
2,0,3,599,1.140752,0.786319,1.040015,0.167677,0.771520,0.117155,1.685575,...,2,0,3,0,2,0,0,1,1,2
3,0,0,393,-0.395838,0.786319,1.040015,-0.286406,0.320250,-0.284646,-0.492602,...,4,0,0,0,0,0,0,2,1,2
4,0,0,277,-0.395838,0.786319,-0.961577,1.877165,1.874627,1.885082,1.256075,...,4,0,0,0,0,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11223,0,1,603,3.189539,0.786319,1.040015,0.167677,0.771520,0.117155,1.685575,...,2,0,3,0,2,0,0,1,1,2
21843,1,0,390,-0.395838,-1.271748,1.040015,-0.286406,0.370391,-0.284646,-0.492602,...,1,1,1,1,1,1,1,2,1,2
10976,1,0,199,-0.395838,-1.271748,-0.961577,0.728603,-0.983422,0.733251,0.335719,...,1,1,2,1,1,1,1,2,1,0
4108,1,0,230,-0.395838,-1.271748,-0.961577,0.728603,-0.983422,0.733251,0.335719,...,1,1,2,1,1,1,1,2,1,0


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score
import optuna

# Short aliases read by the Optuna objective functions below.
X, y = X_train, y_train

def objective_decision(trial):
    """Optuna objective for CatBoost: mean cross-validated F1 of one trial.

    Draws a CatBoostClassifier configuration from ``trial``, evaluates it with
    5-fold cross-validation on the notebook-level ``X`` / ``y`` and returns the
    mean F1 score, with 'AbNormal' treated as the positive class.
    """
    # The suggest_* calls are evaluated in the same order as before, so the
    # seeded sampler is consulted in the same sequence.
    sampled_params = {
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 10.0, log=True),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        "depth": trial.suggest_int('depth', 2, 11, step=1),
        "learning_rate": trial.suggest_float('learning_rate', 0.0001, 0.1, log=True),
        "iterations": trial.suggest_int('iterations', 100, 1000, step=20),
        "random_strength": trial.suggest_float("random_strength", 1e-5, 10.0, log=True),
        "min_data_in_leaf": trial.suggest_int('min_data_in_leaf', 1, 30),
        "bagging_temperature": trial.suggest_float('bagging_temperature', 1e-5, 10.0, log=True),
    }

    model = CatBoostClassifier(
        **sampled_params,
        od_wait=50,
        # cat_features=cat_features,  # categorical columns (currently disabled)
        verbose=0,  # suppress per-iteration training output
    )

    # F1 scorer with 'AbNormal' as the positive class.
    abnormal_f1 = make_scorer(f1_score, pos_label='AbNormal')

    # error_score='raise' makes a failing fold abort the trial instead of
    # being scored silently.
    fold_scores = cross_val_score(
        model, X, y, cv=5, n_jobs=-1, scoring=abnormal_f1, error_score='raise'
    )
    return fold_scores.mean()

# Run the CatBoost search: 50 trials of a seeded TPE sampler, maximizing mean F1.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=100),
)
study.optimize(objective_decision, n_trials=50)

# Report the best score and the parameters that produced it.
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)



[I 2024-08-28 16:56:42,039] A new study created in memory with name: no-name-95d833af-1d60-4747-9033-b069c6750276
[I 2024-08-28 16:57:09,749] Trial 0 finished with value: 0.14268736164998624 and parameters: {'l2_leaf_reg': 0.018215019929305177, 'bootstrap_type': 'Bayesian', 'depth': 4, 'learning_rate': 0.0018773825301295225, 'iterations': 860, 'random_strength': 1.067365440933585e-05, 'min_data_in_leaf': 4, 'bagging_temperature': 0.10580215372643757}. Best is trial 0 with value: 0.14268736164998624.
[I 2024-08-28 16:57:29,934] Trial 1 finished with value: 0.1471983417627942 and parameters: {'l2_leaf_reg': 0.9018130811571128, 'bootstrap_type': 'Bayesian', 'depth': 3, 'learning_rate': 0.005312268141071225, 'iterations': 920, 'random_strength': 0.00017997522811796088, 'min_data_in_leaf': 6, 'bagging_temperature': 4.469524711497964e-05}. Best is trial 1 with value: 0.1471983417627942.
[I 2024-08-28 16:58:46,858] Trial 2 finished with value: 0.2534855285208809 and parameters: {'l2_leaf_reg'

Best score: 0.31123844785247246
Best parameters: {'l2_leaf_reg': 0.0337544297525961, 'bootstrap_type': 'Bayesian', 'depth': 8, 'learning_rate': 0.06859568747956753, 'iterations': 540, 'random_strength': 0.0008363623167617604, 'min_data_in_leaf': 30, 'bagging_temperature': 0.006519292631500432}


In [None]:
# Build a CatBoost model from the best parameters of the CatBoost study and fit
# it on the full training data.
# NOTE: `study` is rebound by the RandomForest search further down, so this cell
# must run before that search (re-running it afterwards would pass RandomForest
# parameters to CatBoost).
cat = CatBoostClassifier(**study.best_params)
cat.fit(X,y)

0:	learn: 0.6626375	total: 72ms	remaining: 38.8s
1:	learn: 0.6365366	total: 92.5ms	remaining: 24.9s
2:	learn: 0.6146658	total: 108ms	remaining: 19.3s
3:	learn: 0.5965431	total: 122ms	remaining: 16.3s
4:	learn: 0.5808999	total: 134ms	remaining: 14.4s
5:	learn: 0.5675051	total: 146ms	remaining: 13s
6:	learn: 0.5553779	total: 159ms	remaining: 12.1s
7:	learn: 0.5453776	total: 171ms	remaining: 11.4s
8:	learn: 0.5351514	total: 184ms	remaining: 10.9s
9:	learn: 0.5276232	total: 195ms	remaining: 10.4s
10:	learn: 0.5203702	total: 208ms	remaining: 9.98s
11:	learn: 0.5139717	total: 219ms	remaining: 9.63s
12:	learn: 0.5078809	total: 231ms	remaining: 9.37s
13:	learn: 0.5032962	total: 242ms	remaining: 9.1s
14:	learn: 0.4977135	total: 255ms	remaining: 8.91s
15:	learn: 0.4932593	total: 266ms	remaining: 8.72s
16:	learn: 0.4896808	total: 280ms	remaining: 8.6s
17:	learn: 0.4862503	total: 292ms	remaining: 8.48s
18:	learn: 0.4828571	total: 304ms	remaining: 8.34s
19:	learn: 0.4801751	total: 316ms	remaining: 

<catboost.core.CatBoostClassifier at 0x7f3a882e4be0>

In [None]:
# Objective function for the RandomForest hyper-parameter search.
def objective(trial):
    """Optuna objective for RandomForest: mean cross-validated F1 of one trial.

    Draws a RandomForestClassifier configuration from ``trial``, evaluates it
    with 5-fold cross-validation on the notebook-level ``X`` / ``y`` and returns
    the mean F1 score, with 'AbNormal' treated as the positive class.

    The forest is seeded with ``RANDOM_STATE`` so that the same parameters
    always produce the same score. Without a seed the objective is noisy and
    the seeded TPE search cannot be reproduced.
    """
    rf_n_estimators = trial.suggest_int("n_estimators", 10, 1000, step=20)
    rf_max_features = trial.suggest_categorical("max_features", ['sqrt', 'log2'])
    rf_max_depth = trial.suggest_int("max_depth", 1, 11, step=1)
    rf_bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    rf_class_weight = trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None])
    rf_min_samples_split = trial.suggest_int("min_samples_split", 2, 100, step=2)
    rf_min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 50, step=1)

    classifier = RandomForestClassifier(
        n_estimators=rf_n_estimators,
        bootstrap=rf_bootstrap,
        class_weight=rf_class_weight,
        max_features=rf_max_features,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        min_samples_leaf=rf_min_samples_leaf,
        random_state=RANDOM_STATE,  # fixed seed -> deterministic trial score
        verbose=0,
    )
    # F1 scorer with 'AbNormal' as the positive class.
    f1_scorer = make_scorer(f1_score, pos_label='AbNormal')

    # error_score='raise' makes a failing fold abort the trial instead of
    # being scored silently.
    score = cross_val_score(classifier, X, y, cv=5, n_jobs=-1, scoring=f1_scorer, error_score='raise')

    return score.mean()

# Run the RandomForest search: 100 trials of a seeded TPE sampler, maximizing mean F1.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=100),
)
study.optimize(objective, n_trials=100)

# Report the best score and the parameters that produced it.
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-28 17:53:16,964] A new study created in memory with name: no-name-6b8d681c-ba0f-478c-bc9a-5fcfb060eea9
[I 2024-08-28 17:53:28,541] Trial 0 finished with value: 0.4221031199281785 and parameters: {'n_estimators': 550, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': False, 'class_weight': 'balanced_subsample', 'min_samples_split': 58, 'min_samples_leaf': 45}. Best is trial 0 with value: 0.4221031199281785.
[I 2024-08-28 17:53:31,530] Trial 1 finished with value: 0.38118256904085845 and parameters: {'n_estimators': 210, 'max_features': 'sqrt', 'max_depth': 3, 'bootstrap': True, 'class_weight': 'balanced_subsample', 'min_samples_split': 44, 'min_samples_leaf': 48}. Best is trial 0 with value: 0.4221031199281785.
[I 2024-08-28 17:53:47,944] Trial 2 finished with value: 0.4084067548279816 and parameters: {'n_estimators': 810, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': False, 'class_weight': 'balanced', 'min_samples_split': 62, 'min_samples_leaf': 6}. Best is trial 

Best score: 0.4334789017756629
Best parameters: {'n_estimators': 510, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': False, 'class_weight': 'balanced_subsample', 'min_samples_split': 52, 'min_samples_leaf': 17}


In [None]:
# Build the RandomForest from the best parameters of the RandomForest study and
# fit it on the full training data. The forest is seeded with RANDOM_STATE so
# the final model (and the submission derived from it) is reproducible.
rf = RandomForestClassifier(**study.best_params, random_state=RANDOM_STATE)
rf.fit(X, y)

In [None]:
# from xgboost import XGBClassifier

# # 하이퍼파라미터 튜닝을 위한 목적 함수 정의
# y_en = y.map({'AbNormal': 1, 'Normal': 0})

# def objective(trial):
#     xgb_max_depth = trial.suggest_int("max_depth", 2,11, step=1)
#     xgb_learning_rate = trial.suggest_float("learning_rate",0.0001,0.1,log=True) 
#     xgb_n_estimators = trial.suggest_int("n_estimators", 10,1000, step=20)
#     xgb_reg_lambda = trial.suggest_float("reg_lambda", 1e-5, 10,log=True)
#     xgb_reg_alpha = trial.suggest_float("reg_alpha", 1e-5, 10,log=True)
#     xgb_subsample = trial.suggest_float('subsample', 0.2, 1.0,step=0.01)
#     xgb_min_child_weight =  trial.suggest_int('min_child_weight', 1, 15,step=1)
#     xgb_gamma = trial.suggest_float("gamma", 0, 1.0, step=0.005)
#     xgb_colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.01)
#     xgb_colsample_bylevel = trial.suggest_float('colsample_bylevel', 0.5, 1.0, step=0.01)
#     xgb_colsample_bynode = trial.suggest_float('colsample_bynode', 0.5, 1.0, step=0.01)
    
#     regressor_obj = XGBClassifier(
#         objective = 'binary:logistic',
#         eval_metric='logloss',
#         booster = 'gbtree',
#         tree_method = 'hist',
#         max_depth = xgb_max_depth,
#         learning_rate = xgb_learning_rate,
#         n_estimators = xgb_n_estimators,
#         reg_lambda = xgb_reg_lambda,
#         reg_alpha = xgb_reg_alpha,
#         subsample = xgb_subsample,
#         min_child_weight = xgb_min_child_weight,
#         colsample_bytree = xgb_colsample_bytree,
#         colsample_bylevel = xgb_colsample_bylevel,
#         colsample_bynode = xgb_colsample_bynode,
#         gamma = xgb_gamma
    
#     )
#     # F1 스코어 계산 시 'AbNormal=1'을 양성 클래스로 설정
#     f1_scorer = make_scorer(f1_score, pos_label=1)
    
#     # cross_val_score 호출 시 'scoring' 매개변수에 f1_scorer 전달
#     score = cross_val_score(regressor_obj, X, y_en, cv=5, n_jobs=-1, scoring=f1_scorer, error_score='raise')
    
#     mean_f1 = score.mean()
#     return mean_f1

# # 최적화 실행
# study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
# study.optimize(objective, n_trials=300)

# # 최적화 결과 보기
# print("Best score:", study.best_value)
# print("Best parameters:", study.best_params)

In [None]:
# xgb = XGBClassifier(**study.best_params)
# xgb.fit(X,y_en)

In [None]:
from sklearn.ensemble import VotingClassifier
# Members of the ensemble: the tuned CatBoost and RandomForest models.
# The XGBoost member ('xgb', xgb) is currently disabled.
ensemble_members = [
    ('cat', cat),
    ('rf', rf),
]
# voting may be 'hard' or 'soft'; 'soft' is used here.
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
# Fit the voting ensemble on the training features and labels.
voting_clf.fit(X, y)

0:	learn: 0.6626375	total: 15.5ms	remaining: 8.36s
1:	learn: 0.6365366	total: 28ms	remaining: 7.53s
2:	learn: 0.6146658	total: 39.4ms	remaining: 7.05s
3:	learn: 0.5965431	total: 51.7ms	remaining: 6.93s
4:	learn: 0.5808999	total: 63.6ms	remaining: 6.8s
5:	learn: 0.5675051	total: 74.9ms	remaining: 6.66s
6:	learn: 0.5553779	total: 86.4ms	remaining: 6.58s
7:	learn: 0.5453776	total: 98.3ms	remaining: 6.54s
8:	learn: 0.5351514	total: 111ms	remaining: 6.53s
9:	learn: 0.5276232	total: 122ms	remaining: 6.46s
10:	learn: 0.5203702	total: 134ms	remaining: 6.44s
11:	learn: 0.5139717	total: 145ms	remaining: 6.39s
12:	learn: 0.5078809	total: 157ms	remaining: 6.38s
13:	learn: 0.5032962	total: 169ms	remaining: 6.33s
14:	learn: 0.4977135	total: 181ms	remaining: 6.33s
15:	learn: 0.4932593	total: 193ms	remaining: 6.31s
16:	learn: 0.4896808	total: 204ms	remaining: 6.29s
17:	learn: 0.4862503	total: 216ms	remaining: 6.27s
18:	learn: 0.4828571	total: 229ms	remaining: 6.27s
19:	learn: 0.4801751	total: 240ms	re

In [None]:
# Remove the 'Set ID' column from the test frame in place (mirrors the training frame).
test_data.drop(columns=['Set ID'], inplace=True)

In [None]:
# Remove the 'target' column from the test frame in place so only features remain.
test_data.drop(columns=['target'], inplace=True)

In [None]:
# 학습된 Isolation Forest로 테스트 데이터 예측
# test_data['iso_forest_anomaly'] = iso_forest.predict(test_data)

In [None]:
# Drop from the test frame the same correlated features that were removed from the training matrix.
test_data.drop(columns=features_to_drop, inplace=True)

In [None]:
# Predict a class label for every test row with the fitted voting ensemble.
y_pred = voting_clf.predict(test_data)
# y_pred_prob = voting_clf.predict_proba(test_data)

In [None]:
# y_pred_score = pd.DataFrame({
#     'Prediction': y_pred,
#     'Probability_Class_0': [prob[0] for prob in y_pred_prob],
#     'Probability_Class_1': [prob[1] for prob in y_pred_prob]
# })

In [None]:
# # 우선 'Probability_Class_0' 값이 상위 6% 이상인 부분을 찾기 위해 기준을 계산합니다.
# threshold = y_pred_score['Probability_Class_0'].quantile(0.94)

# # 'Probability_Class_0' 값이 threshold 이하인 부분을 'normal'로 변경
# y_pred_score['Prediction'] = y_pred_score.apply(
#     lambda row: 'Normal' if row['Probability_Class_0'] <= threshold else row['Prediction'], axis=1
# )


### 최종 제출

In [None]:
# Load the submission template and fill its 'target' column with the predictions.
# The predictions are assigned by position.
# NOTE(review): this assumes the template rows are in the same order as the
# test rows -- confirm against the data source.
df_sub = pd.read_csv("submission.csv").assign(target=y_pred)

# Write the submission file (this overwrites the template that was just read).
df_sub.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame that was written to disk.
df_sub

Unnamed: 0,Set ID,target
0,0001be084fbc4aaa9d921f39e595961b,AbNormal
1,0005bbd180064abd99e63f9ed3e1ac80,Normal
2,000948934c4140d883d670adcb609584,Normal
3,000a6bfd02874c6296dc7b2e9c5678a7,AbNormal
4,0018e78ce91343678716e2ea27a51c95,Normal
...,...,...
17356,ffea508b59934d689b540f95eb3fa730,Normal
17357,ffed8923c8a448a98afc641b770be153,Normal
17358,fff1e73734da40adbe805359b3efb462,Normal
17359,fff8e38bdd09470baf95f71e92075dec,Normal


In [None]:
# Show how many test rows were predicted as each class.
df_sub['target'].value_counts()

target
Normal      15709
AbNormal     1652
Name: count, dtype: int64

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
