# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [19]:
import os
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import xgboost as xgb
import shap

pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)

In [20]:
ROOT_DIR = "data"
RANDOM_STATE = 110
# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

In [21]:
drop_cols = []
for column in train_data.columns:
    if (train_data[column].notnull().sum() // 2) < train_data[column].isnull().sum():
        drop_cols.append(column)
train_data = train_data.drop(drop_cols, axis=1)

### 전처리 함수

In [22]:
#전처리 함수
import pandas as pd
import numpy as np

# 파트 1: 전처리 초기 단계
def preprocess_initial(df):
    # 고유값이 하나인 컬럼 찾기
    unique_value_1_columns = [column for column in df.columns if df[column].nunique() == 1]

    # 모든 행의 값이 다른 컬럼 찾기
    row_count = len(df)
    matching_row_columns = [column for column in df.columns if df[column].value_counts().size == row_count]

    # 고유값, 혼합값 피쳐 제거
    df.drop(columns=unique_value_1_columns, inplace=True)
    df.drop(columns=matching_row_columns, inplace=True)
    return df

# 파트 2: 유사한 피쳐 제거
def reduce_dataframe(df):
    def get_value_counts_ratio(series):
        value_counts = series.value_counts(normalize=True)
        return value_counts.sort_values().values

    # 고유값 매핑 확인 함수
    def check_value_mapping(df, col1, col2):
        unique_values_1 = df[col1].unique()
        unique_values_2 = df[col2].unique()

        if len(unique_values_1) != len(unique_values_2):
            return False

        value_mapping = {}
        for val1 in unique_values_1:
            corresponding_values = df[df[col1] == val1][col2].unique()
            if len(corresponding_values) != 1:
                return False
            value_mapping[val1] = corresponding_values[0]

        for val1 in unique_values_1:
            ratio_1 = (df[col1] == val1).mean()
            ratio_2 = (df[col2] == value_mapping[val1]).mean()
            if ratio_1 != ratio_2:
                return False

        return True

    def compare_all_features(df):
        ratios = {column: get_value_counts_ratio(df[column]) for column in df.columns}
        similar_columns_dict = {column: [] for column in df.columns}

        # 고유값 비율이 같은 열들 찾기
        columns = list(ratios.keys())
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                if np.array_equal(ratios[columns[i]], ratios[columns[j]]):
                    similar_columns_dict[columns[i]].append(columns[j])

        # 고유값 비율이 같고 매핑도 동일한 피쳐들 찾기
        comparisons = []
        for key, values in similar_columns_dict.items():
            for value in values:
                if check_value_mapping(df, key, value):
                    comparisons.append((key, value))

        if comparisons:
            print("다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:")
            for base, compare in comparisons:
                print(f"'{base}'와(과) '{compare}'")
        else:
            print("모든 피쳐의 고유값 비율과 양상이 동일하지 않습니다.")

        return comparisons

    comparisons = compare_all_features(df)
    columns_to_remove = set()
    for _, col_to_remove in comparisons:
        columns_to_remove.add(col_to_remove)

    df = df.drop(columns=columns_to_remove)
    global df_columns
    df_columns = df.columns
    return df

# 파트 3: 통계 계산 및 추가 전처리
def finalize_preprocessing(df):

    # 장비별 Collect Result 컬럼들 식별
    collect_result_columns_dam = [col for col in df.columns if 'Collect Result' in col and 'Dam' in col]
    collect_result_columns_autoclave = [col for col in df.columns if 'Collect Result' in col and 'AutoClave' in col]
    collect_result_columns_fill1 = [col for col in df.columns if 'Collect Result' in col and 'Fill1' in col]
    collect_result_columns_fill2 = [col for col in df.columns if 'Collect Result' in col and 'Fill2' in col]

    # Dam 장비의 평균, 표준편차, 합계 계산
    df['Collect_Result_Mean_Dam'] = df[collect_result_columns_dam].astype(float).mean(axis=1)
    df['Collect_Result_Std_Dam'] = df[collect_result_columns_dam].astype(float).std(axis=1)
    df['Collect_Result_Sum_Dam'] = df[collect_result_columns_dam].astype(float).sum(axis=1)

    # AutoClave 장비의 평균, 표준편차, 합계 계산
    df['Collect_Result_Mean_AutoClave'] = df[collect_result_columns_autoclave].astype(float).mean(axis=1)
    df['Collect_Result_Std_AutoClave'] = df[collect_result_columns_autoclave].astype(float).std(axis=1)
    df['Collect_Result_Sum_AutoClave'] = df[collect_result_columns_autoclave].astype(float).sum(axis=1)

    # Fill1 장비의 평균, 표준편차, 합계 계산
    df['Collect_Result_Mean_Fill1'] = df[collect_result_columns_fill1].astype(float).mean(axis=1)
    df['Collect_Result_Std_Fill1'] = df[collect_result_columns_fill1].astype(float).std(axis=1)
    df['Collect_Result_Sum_Fill1'] = df[collect_result_columns_fill1].astype(float).sum(axis=1)

    # Fill2 장비의 평균, 표준편차, 합계 계산
    df['Collect_Result_Mean_Fill2'] = df[collect_result_columns_fill2].astype(float).mean(axis=1)
    df['Collect_Result_Std_Fill2'] = df[collect_result_columns_fill2].astype(float).std(axis=1)
    df['Collect_Result_Sum_Fill2'] = df[collect_result_columns_fill2].astype(float).sum(axis=1)

    return df

# train 전처리 함수
def preprocess_train_dataframe(df):
    df = preprocess_initial(df)
    df = reduce_dataframe(df)
    return df

# test 전처리 함수
def preprocess_test_dataframe(df):
    df = preprocess_initial(df)
    return df

In [23]:
df_train_pre = preprocess_train_dataframe(train_data)

다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:
'Equipment_Dam'와(과) 'CURE END POSITION X Collect Result_Dam'
'Equipment_Dam'와(과) 'CURE END POSITION Z Collect Result_Dam'
'Equipment_Dam'와(과) 'CURE END POSITION Θ Collect Result_Dam'
'Equipment_Dam'와(과) 'CURE START POSITION X Collect Result_Dam'
'Equipment_Dam'와(과) 'CURE START POSITION Θ Collect Result_Dam'
'Model.Suffix_Dam'와(과) 'Model.Suffix_AutoClave'
'Model.Suffix_Dam'와(과) 'Model.Suffix_Fill1'
'Model.Suffix_Dam'와(과) 'Model.Suffix_Fill2'
'Workorder_Dam'와(과) 'Workorder_AutoClave'
'Workorder_Dam'와(과) 'Workorder_Fill1'
'Workorder_Dam'와(과) 'Workorder_Fill2'
'CURE END POSITION X Collect Result_Dam'와(과) 'CURE END POSITION Z Collect Result_Dam'
'CURE END POSITION X Collect Result_Dam'와(과) 'CURE END POSITION Θ Collect Result_Dam'
'CURE END POSITION X Collect Result_Dam'와(과) 'CURE START POSITION X Collect Result_Dam'
'CURE END POSITION X Collect Result_Dam'와(과) 'CURE START POSITION Θ Collect Result_Dam'
'CURE END POSITION Z Collect Result_Dam'와(과) 'CURE END POSITI

In [70]:
features = df_train_pre.columns

In [24]:
# # 컬럼 목록을 사용자가 제공한 순서에 맞춰 재정렬
# columns_in_order = [
#     # 시간 관련(Time-related) 변수
#     'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
#     'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
#     'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
#     'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
#     'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
#     'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',
#     '1st Pressure 1st Pressure Unit Time_AutoClave',
#     '2nd Pressure Unit Time_AutoClave',
#     '3rd Pressure Unit Time_AutoClave',
#     'Machine Tact time Collect Result_Dam',
#     'Machine Tact time Collect Result_Fill1',
#     'Machine Tact time Collect Result_Fill2',

#     # 속도 관련(Speed-related) 변수
#     'CURE SPEED Collect Result_Dam',
#     'CURE SPEED Collect Result_Fill2',
#     'Stage1 Circle1 Distance Speed Collect Result_Dam',
#     'Stage1 Circle2 Distance Speed Collect Result_Dam',
#     'Stage1 Line1 Distance Speed Collect Result_Dam',
#     'Stage1 Line2 Distance Speed Collect Result_Dam',
#     'Stage1 Line4 Distance Speed Collect Result_Dam',
#     'Stage2 Circle1 Distance Speed Collect Result_Dam',
#     'Stage2 Circle2 Distance Speed Collect Result_Dam',
#     'Stage2 Line2 Distance Speed Collect Result_Dam',
#     'Stage2 Line3 Distance Speed Collect Result_Dam',
#     'Stage2 Line4 Distance Speed Collect Result_Dam',
#     'Stage3 Circle1 Distance Speed Collect Result_Dam',
#     'Stage3 Circle2 Distance Speed Collect Result_Dam',
#     'Stage3 Line1 Distance Speed Collect Result_Dam',
#     'Stage3 Line2 Distance Speed Collect Result_Dam',
#     'Stage3 Line4 Distance Speed Collect Result_Dam',
#     'DISCHARGED SPEED OF RESIN Collect Result_Dam',
#     'DISCHARGED SPEED OF RESIN Collect Result_Fill1',

#     # 볼륨 관련(Volume-related) 변수
#     'Dispense Volume(Stage1) Collect Result_Dam',
#     'Dispense Volume(Stage2) Collect Result_Dam',
#     'Dispense Volume(Stage3) Collect Result_Dam',
#     'Dispense Volume(Stage2) Collect Result_Fill1',

#     # 좌표 관련(Coordinate-related) 변수
#     'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
#     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',

#     # 포지션 관련(Position-related) 변수
#     'HEAD Standby Position X Collect Result_Dam',
#     'HEAD Standby Position Y Collect Result_Dam',
#     'Head Clean Position Z Collect Result_Dam',
#     'Head Purge Position X Collect Result_Dam',
#     'Head Purge Position Z Collect Result_Dam',
#     'Head Zero Position Y Collect Result_Dam',
#     'Head Zero Position Z Collect Result_Dam',
#     'HEAD Standby Position X Collect Result_Fill1',
#     'Head Purge Position Z Collect Result_Fill1',
#     'CURE END POSITION X Collect Result_Fill2',
#     'CURE END POSITION Z Collect Result_Fill2',
#     'CURE STANDBY POSITION Z Collect Result_Fill2',
#     'HEAD Standby Position X Collect Result_Fill2',

#     # 압력 관련(Pressure-related) 변수
#     '1st Pressure Collect Result_AutoClave',
#     '2nd Pressure Collect Result_AutoClave',
#     '3rd Pressure Collect Result_AutoClave',

#     # 온도 관련(Temperature-related) 변수
#     'Chamber Temp. Collect Result_AutoClave',
#     'Chamber Temp. Unit Time_AutoClave',
#     'Chamber Temp. Judge Value_AutoClave',

#     # 두께 관련(Thickness-related) 변수
#     'THICKNESS 1 Collect Result_Dam',
#     'THICKNESS 2 Collect Result_Dam',
#     'THICKNESS 3 Collect Result_Dam',

#     # 기타 변수(Miscellaneous Variables)
#     'PalletID Collect Result_Dam',
#     'Production Qty Collect Result_Dam',
#     'Receip No Collect Result_Dam',
#     'WorkMode Collect Result_Dam',
#     'PalletID Collect Result_Fill1',
#     'Production Qty Collect Result_Fill1',
#     'Receip No Collect Result_Fill1',
#     'WorkMode Collect Result_Fill1',
#     'PalletID Collect Result_Fill2',
#     'Production Qty Collect Result_Fill2',
#     'Receip No Collect Result_Fill2',
#     'WorkMode Collect Result_Fill2',
#     'Equipment_Dam',
#     'Model.Suffix_Dam',
#     'Workorder_Dam',
#     'Equipment_Fill1',
#     'Equipment_Fill2',
#     'target'
# ]

# df_reordered = df_train_pre[columns_in_order]

In [25]:
df_order = df_train_pre.copy()

### 이상치 탐지

In [26]:
# # 이상치 탐지, 새로운 피쳐 생성
# def detect_outliers_iqr(df, column):
#     Q1 = df[column].quantile(0.25)
#     Q3 = df[column].quantile(0.75)
#     IQR = Q3 - Q1
    
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
    
#     df[f'{column}_outlier'] = ((df[column] < lower_bound) | (df[column] > upper_bound)).astype(int)
#     return df

# # 모든 컬럼에 대해 이상치 피쳐 생성
# for column in df.columns:
#     # 숫자형 컬럼에 대해서만 이상치 탐지
#     if df[column].dtype in ['int64', 'float64']:
#         df = detect_outliers_iqr(df, column)

# 결과 확인

In [27]:
# # target이 'abnormal'인 경우만 필터링
# Normal_df = df_reordered[df_reordered['target'] == 'Normal']

# # 각 컬럼별로 value_counts() 적용
# value_counts_dict = {}

# for column in Normal_df.columns:
#     if column != 'target':  # target 컬럼 제외
#         value_counts_dict[column] = Normal_df[column].value_counts()

# # 결과 확인 (예: 특정 컬럼의 value_counts)
# for column, value_counts in value_counts_dict.items():
#     print(f"Value counts for {column}:\n{value_counts}\n")

## 2. 데이터 전처리

### SMOTE-Tokmek

In [28]:
# import pandas as pd
# from sklearn.preprocessing import OneHotEncoder
# from imblearn.combine import SMOTETomek
# from sklearn.compose import ColumnTransformer

# # 특징(feature)와 타겟(target)을 분리
# X = df_order.drop(columns=['target','Workorder_Dam'],axis=1)
# y = df_order["target"]

# # 범주형 데이터 인코딩
# categorical_features = X.select_dtypes(include=['object']).columns
# encoder = OneHotEncoder()
# X_encoded = encoder.fit_transform(X[categorical_features]).toarray()
# X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_features))

# # 인코딩된 범주형 데이터를 원래 숫자형 데이터와 결합
# X_numeric = X.drop(categorical_features, axis=1).reset_index(drop=True)
# X_combined = pd.concat([X_numeric, X_encoded_df], axis=1)

# # SMOTE-Tomek 적용
# smote_tomek = SMOTETomek(sampling_strategy= {"AbNormal": 10000},random_state=42)
# X_resampled, y_resampled = smote_tomek.fit_resample(X_combined, y)

# # 다시 DataFrame으로 변환
# df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X_combined.columns), pd.Series(y_resampled, name='target')], axis=1)

# # 결과 확인
# print(f"Resampled target distribution:\n{df_resampled['target'].value_counts()}")

# # 원래의 DataFrame과 동일한 형식으로 인덱스를 초기화
# df_train_preprocessed = df_resampled.reset_index(drop=True)

### 언더 샘플링 

In [29]:
# from sklearn.preprocessing import OneHotEncoder
# # 범주형 데이터 인코딩
# categorical_features = df_order.select_dtypes(include=['object']).columns
# categorical_features = categorical_features.drop('target','Workorder_Dam')
# encoder = OneHotEncoder()
# X_encoded = encoder.fit_transform(df_order[categorical_features]).toarray()
# X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_features))

# # 인코딩된 범주형 데이터를 원래 숫자형 데이터와 결합
# X_numeric = df_order.drop(categorical_features, axis=1).reset_index(drop=True)
# df_order_encoded = pd.concat([X_numeric, X_encoded_df], axis=1)

In [49]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = df_order[df_order["target"] == "Normal"]
df_abnormal = df_order[df_order["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(
    n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
dtype: int64

In [50]:
df_concat['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace('OK', 500, inplace=True)
df_concat['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].replace('OK', 800, inplace=True)
df_concat['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].replace('OK', 800, inplace=True)

# 숫자로 변환할 수 없는 데이터는 NaN으로 변환
df_concat['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = pd.to_numeric(df_concat['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'], errors='coerce')
df_concat['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] = pd.to_numeric(df_concat['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'], errors='coerce')
df_concat['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'] = pd.to_numeric(df_concat['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'], errors='coerce')

df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'] = pd.to_numeric(df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'], errors='coerce')
df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] = pd.to_numeric(df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'], errors='coerce')
df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'] = pd.to_numeric(df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'], errors='coerce')

df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'] = pd.to_numeric(df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'], errors='coerce')
df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] = pd.to_numeric(df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'], errors='coerce')

# 결측치를 처리하기 위한 최빈값 계산 및 채우기
columns_to_fill = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
]

for column in columns_to_fill:
    mode_value = df_concat[column].mode()[0]
    df_concat[column].fillna(mode_value, inplace=True)

# 유클리드 거리 피쳐 생성 (좌표 관련)
df_concat['Head_Coord_Distance_Stage1_2_Dam'] = np.sqrt(
    (df_concat['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] - df_concat['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam']) ** 2 +
    (df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam']) ** 2 +
    (df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] - df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']) ** 2
)


### 전처리 함수

In [51]:
# 1. 시간 관련 피쳐 생성
df_concat['Resin_Time_Diff_Stage1_2_Dam'] = df_concat['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] - df_concat['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_concat['Resin_Time_Diff_Stage1_3_Dam'] = df_concat['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'] - df_concat['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_concat['Resin_Time_Diff_Stage2_3_Dam'] = df_concat['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'] - df_concat['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
df_concat['Resin_Time_Diff_Stage1_2_Fill1'] = df_concat['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'] - df_concat['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_concat['Resin_Time_Diff_Stage1_3_Fill1'] = df_concat['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'] - df_concat['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_concat['Resin_Time_Diff_Stage2_3_Fill1'] = df_concat['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'] - df_concat['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']

df_concat['Machine_Tact_Time_Dam_Fill1_Diff'] = df_concat['Machine Tact time Collect Result_Dam'] - df_concat['Machine Tact time Collect Result_Fill1']
df_concat['Machine_Tact_Time_Fill1_Fill2_Diff'] = df_concat['Machine Tact time Collect Result_Fill1'] - df_concat['Machine Tact time Collect Result_Fill2']

# 2. 속도 관련 피쳐 생성
df_concat['CURE_SPEED_Dam_Fill_Diff'] = df_concat['CURE SPEED Collect Result_Dam'] - df_concat['CURE SPEED Collect Result_Fill2']
df_concat['Stage1_Circle_Speed_Diff_Dam'] = df_concat['Stage1 Circle2 Distance Speed Collect Result_Dam'] - df_concat['Stage1 Circle1 Distance Speed Collect Result_Dam']
df_concat['Stage2_Circle_Speed_Diff_Dam'] = df_concat['Stage2 Circle2 Distance Speed Collect Result_Dam'] - df_concat['Stage2 Circle1 Distance Speed Collect Result_Dam']
df_concat['Stage3_Circle_Speed_Diff_Dam'] = df_concat['Stage3 Circle2 Distance Speed Collect Result_Dam'] - df_concat['Stage3 Circle1 Distance Speed Collect Result_Dam']

df_concat['Stage1_Line21_Speed_Diff_Dam'] = df_concat['Stage1 Line2 Distance Speed Collect Result_Dam'] - df_concat['Stage1 Line1 Distance Speed Collect Result_Dam']
df_concat['Stage1_Line41_Speed_Diff_Dam'] = df_concat['Stage1 Line4 Distance Speed Collect Result_Dam'] - df_concat['Stage1 Line1 Distance Speed Collect Result_Dam']

df_concat['Stage2_Line_Speed_Diff_Dam'] = df_concat['Stage2 Line3 Distance Speed Collect Result_Dam'] - df_concat['Stage2 Line2 Distance Speed Collect Result_Dam']
df_concat['Stage3_Line_Speed_Diff_Dam'] = df_concat['Stage3 Line2 Distance Speed Collect Result_Dam'] - df_concat['Stage3 Line1 Distance Speed Collect Result_Dam']

# 3. 볼륨 관련 피쳐 생성
df_concat['Dispense_Volume_Ratio_Stage1_2_Dam'] = df_concat['Dispense Volume(Stage1) Collect Result_Dam'] / df_concat['Dispense Volume(Stage2) Collect Result_Dam']
df_concat['Dispense_Volume_Ratio_Stage2_3_Dam'] = df_concat['Dispense Volume(Stage2) Collect Result_Dam'] / df_concat['Dispense Volume(Stage3) Collect Result_Dam']

# 4. 좌표 관련 피쳐 생성
df_concat['Head_X_Coord_Diff_Stage2_3_Dam'] = df_concat['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'] - df_concat['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam']
df_concat['Head_Y_Coord_Diff_Stage1_2_Dam'] = df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam']
df_concat['Head_Z_Coord_Diff_Stage1_2_Dam'] = df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] - df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']

df_concat['Head_X_Coord_Diff_Stage2_3_Fill1'] = df_concat['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'] - df_concat['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1']
df_concat['Head_Y_Coord_Diff_Stage1_2_Fill1'] = df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'] - df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1']
df_concat['Head_Z_Coord_Diff_Stage1_2_Fill1'] = df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'] - df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1']

# 5. 포지션 관련 피쳐 생성
df_concat['Head_Z_Clean_Purge_Diff_Dam'] = df_concat['Head Clean Position Z Collect Result_Dam'] - df_concat['Head Purge Position Z Collect Result_Dam']
df_concat['Head_X_Standby_Diff_Dam_Fill1'] = df_concat['HEAD Standby Position X Collect Result_Dam'] - df_concat['HEAD Standby Position X Collect Result_Fill1']
df_concat['Cure_End_Position_Diff_Fill2'] = df_concat['CURE END POSITION X Collect Result_Fill2'] - df_concat['CURE END POSITION Z Collect Result_Fill2']

# 6. 압력 관련 피쳐 생성
df_concat['Pressure_Diff_1st_2nd_AutoClave'] = df_concat['1st Pressure Collect Result_AutoClave'] - df_concat['2nd Pressure Collect Result_AutoClave']
df_concat['Pressure_Diff_2nd_3rd_AutoClave'] = df_concat['2nd Pressure Collect Result_AutoClave'] - df_concat['3rd Pressure Collect Result_AutoClave']

# 8. 두께 관련 피쳐 생성
df_concat['Thickness_Diff_1_2'] = df_concat['THICKNESS 1 Collect Result_Dam'] - df_concat['THICKNESS 2 Collect Result_Dam']
df_concat['Thickness_Diff_2_3'] = df_concat['THICKNESS 2 Collect Result_Dam'] - df_concat['THICKNESS 3 Collect Result_Dam']
df_concat['Thickness_Diff_1_3'] = df_concat['THICKNESS 1 Collect Result_Dam'] - df_concat['THICKNESS 3 Collect Result_Dam']

# 비선형 조합 피쳐 생성
df_concat['Resin_Speed_Product_Dam'] = df_concat['Resin_Time_Diff_Stage1_2_Dam'] * df_concat['CURE_SPEED_Dam_Fill_Diff']
df_concat['Thickness_Diff_1_2_Squared'] = df_concat['Thickness_Diff_1_2'] ** 2

# 유클리드 거리 피쳐 생성 (좌표 관련)
df_concat['Head_Coord_Distance_Stage1_2_Dam'] = np.sqrt(
    (df_concat['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] - df_concat['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam']) ** 2 +
    (df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_concat['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam']) ** 2 +
    (df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] - df_concat['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']) ** 2
)
from sklearn.preprocessing import StandardScaler
# 표준화 (정규화도 가능)
scaler = StandardScaler()
scaled_columns = ['Resin_Time_Diff_Stage1_2_Dam', 'CURE_SPEED_Dam_Fill_Diff', 'Thickness_Diff_1_2']
df_concat[scaled_columns] = scaler.fit_transform(df_concat[scaled_columns])

In [53]:
train_x = df_concat.copy()

In [54]:
y = train_x.target
train_x.drop(columns = ['target'],inplace =True)
train_x = train_x.select_dtypes(exclude=['object'])

In [56]:
# 타겟 변수 인코딩
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [57]:
# 학습 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(train_x, y_encoded, test_size=0.2, random_state=42)

# XGBoost 분류기 모델 초기화
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# F1 스코어 계산
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# SHAP 값 계산
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# SHAP 값의 평균 절대값 계산
shap_values_df = pd.DataFrame(shap_values, columns=X_test.columns)
shap_importance = shap_values_df.abs().mean().sort_values(ascending=False)
shap_importance
# # SHAP 값 시각화
# shap.summary_plot(shap_values, X_test)

F1 Score: 0.5752




Production Qty Collect Result_Dam                             0.209966
PalletID Collect Result_Dam                                   0.166902
Production Qty Collect Result_Fill2                           0.162120
Receip No Collect Result_Dam                                  0.161293
Production Qty Collect Result_Fill1                           0.128366
PalletID Collect Result_Fill1                                 0.122195
Pressure_Diff_2nd_3rd_AutoClave                               0.117669
1st Pressure Collect Result_AutoClave                         0.115036
Chamber Temp. Collect Result_AutoClave                        0.111539
PalletID Collect Result_Fill2                                 0.110915
Pressure_Diff_1st_2nd_AutoClave                               0.094460
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam      0.088513
3rd Pressure Collect Result_AutoClave                         0.088298
Receip No Collect Result_Fill1                                0.083266
Machin

In [58]:
# SHAP 값이 0.01 이상인 피처만 선택
important_features = shap_importance[shap_importance >= 0.01].index.tolist()

In [59]:
train_x_shap = train_x[important_features]

In [60]:
from pycaret.classification import *
setup_clf = setup(data=train_x_shap, target=y, n_jobs=1)

Unnamed: 0,Description,Value
0,Session id,2400
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(4700, 72)"
5,Transformed data shape,"(4700, 72)"
6,Transformed train set shape,"(3290, 72)"
7,Transformed test set shape,"(1410, 72)"
8,Numeric features,71
9,Preprocess,True


In [61]:
top3 = compare_models(sort='F1', n_select=3, fold=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.6076,0.6407,0.6076,0.6078,0.6074,0.2152,0.2154,8.672
gbc,Gradient Boosting Classifier,0.6006,0.6336,0.6006,0.6011,0.6001,0.2012,0.2017,1.008
rf,Random Forest Classifier,0.597,0.6341,0.597,0.5973,0.5966,0.1939,0.1943,0.528
ada,Ada Boost Classifier,0.5945,0.6267,0.5945,0.5951,0.5939,0.1891,0.1896,0.302
lightgbm,Light Gradient Boosting Machine,0.5894,0.6257,0.5894,0.5896,0.5891,0.1787,0.1789,0.174
et,Extra Trees Classifier,0.5836,0.6183,0.5836,0.5836,0.5836,0.1672,0.1672,0.412
ridge,Ridge Classifier,0.5833,0.6112,0.5833,0.5835,0.5831,0.1666,0.1667,0.044
lda,Linear Discriminant Analysis,0.5818,0.6115,0.5818,0.582,0.5814,0.1635,0.1638,0.072
lr,Logistic Regression,0.5775,0.6022,0.5775,0.5778,0.5772,0.155,0.1553,0.46
xgboost,Extreme Gradient Boosting,0.5742,0.6146,0.5742,0.5743,0.574,0.1483,0.1485,0.246


Processing:   0%|          | 0/71 [00:00<?, ?it/s]

In [None]:
# tuned_top3 = [tune_model(i) for i in top3]

In [62]:
blender_top3 = blend_models(estimator_list=top3, verbose=True, method='auto')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6353,0.7006,0.6353,0.6358,0.635,0.2706,0.2711
1,0.6444,0.6688,0.6444,0.6447,0.6442,0.2888,0.2891
2,0.6109,0.6475,0.6109,0.6113,0.6107,0.222,0.2223
3,0.6079,0.6411,0.6079,0.6087,0.6073,0.216,0.2167
4,0.6079,0.616,0.6079,0.6087,0.6073,0.216,0.2167
5,0.614,0.613,0.614,0.6151,0.6129,0.2277,0.2289
6,0.6049,0.6466,0.6049,0.6049,0.6048,0.2097,0.2097
7,0.6079,0.654,0.6079,0.608,0.6078,0.2159,0.216
8,0.6322,0.6724,0.6322,0.6325,0.6321,0.2645,0.2648
9,0.5532,0.6034,0.5532,0.5532,0.5531,0.1063,0.1063


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# blender_tune3 = tune_model(blender_top3)

In [64]:
final_model = finalize_model(blender_top3)

## 4. 제출하기

### 테스트 데이터 예측

In [68]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [69]:
df_test = preprocess_test_dataframe(test_data)
df_test_pre = df_test.copy()

In [71]:
df_test_pre = df_test_pre[features]

In [76]:
df_test_pre.drop(columns ='target',inplace =  True)

In [77]:
df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace('OK', 500, inplace=True)
df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].replace('OK', 800, inplace=True)
df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].replace('OK', 800, inplace=True)

# 숫자로 변환할 수 없는 데이터는 NaN으로 변환
df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = pd.to_numeric(df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'], errors='coerce')
df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] = pd.to_numeric(df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'], errors='coerce')
df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'] = pd.to_numeric(df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'], errors='coerce')

df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'] = pd.to_numeric(df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'], errors='coerce')
df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] = pd.to_numeric(df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'], errors='coerce')
df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'] = pd.to_numeric(df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'], errors='coerce')

df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'] = pd.to_numeric(df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'], errors='coerce')
df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] = pd.to_numeric(df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'], errors='coerce')

# 결측치를 처리하기 위한 최빈값 계산 및 채우기
columns_to_fill = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
]

for column in columns_to_fill:
    mode_value = df_test_pre[column].mode()[0]
    df_test_pre[column].fillna(mode_value, inplace=True)

# 유클리드 거리 피쳐 생성 (좌표 관련)
df_test_pre['Head_Coord_Distance_Stage1_2_Dam'] = np.sqrt(
    (df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] - df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam']) ** 2 +
    (df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam']) ** 2 +
    (df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] - df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']) ** 2
)


# 1. 시간 관련 피쳐 생성
df_test_pre['Resin_Time_Diff_Stage1_2_Dam'] = df_test_pre['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] - df_test_pre['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_test_pre['Resin_Time_Diff_Stage1_3_Dam'] = df_test_pre['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'] - df_test_pre['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_test_pre['Resin_Time_Diff_Stage2_3_Dam'] = df_test_pre['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'] - df_test_pre['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
df_test_pre['Resin_Time_Diff_Stage1_2_Fill1'] = df_test_pre['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'] - df_test_pre['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_test_pre['Resin_Time_Diff_Stage1_3_Fill1'] = df_test_pre['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'] - df_test_pre['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_test_pre['Resin_Time_Diff_Stage2_3_Fill1'] = df_test_pre['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'] - df_test_pre['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']

df_test_pre['Machine_Tact_Time_Dam_Fill1_Diff'] = df_test_pre['Machine Tact time Collect Result_Dam'] - df_test_pre['Machine Tact time Collect Result_Fill1']
df_test_pre['Machine_Tact_Time_Fill1_Fill2_Diff'] = df_test_pre['Machine Tact time Collect Result_Fill1'] - df_test_pre['Machine Tact time Collect Result_Fill2']

# 2. 속도 관련 피쳐 생성
df_test_pre['CURE_SPEED_Dam_Fill_Diff'] = df_test_pre['CURE SPEED Collect Result_Dam'] - df_test_pre['CURE SPEED Collect Result_Fill2']
df_test_pre['Stage1_Circle_Speed_Diff_Dam'] = df_test_pre['Stage1 Circle2 Distance Speed Collect Result_Dam'] - df_test_pre['Stage1 Circle1 Distance Speed Collect Result_Dam']
df_test_pre['Stage2_Circle_Speed_Diff_Dam'] = df_test_pre['Stage2 Circle2 Distance Speed Collect Result_Dam'] - df_test_pre['Stage2 Circle1 Distance Speed Collect Result_Dam']
df_test_pre['Stage3_Circle_Speed_Diff_Dam'] = df_test_pre['Stage3 Circle2 Distance Speed Collect Result_Dam'] - df_test_pre['Stage3 Circle1 Distance Speed Collect Result_Dam']

df_test_pre['Stage1_Line21_Speed_Diff_Dam'] = df_test_pre['Stage1 Line2 Distance Speed Collect Result_Dam'] - df_test_pre['Stage1 Line1 Distance Speed Collect Result_Dam']
df_test_pre['Stage1_Line41_Speed_Diff_Dam'] = df_test_pre['Stage1 Line4 Distance Speed Collect Result_Dam'] - df_test_pre['Stage1 Line1 Distance Speed Collect Result_Dam']

df_test_pre['Stage2_Line_Speed_Diff_Dam'] = df_test_pre['Stage2 Line3 Distance Speed Collect Result_Dam'] - df_test_pre['Stage2 Line2 Distance Speed Collect Result_Dam']
df_test_pre['Stage3_Line_Speed_Diff_Dam'] = df_test_pre['Stage3 Line2 Distance Speed Collect Result_Dam'] - df_test_pre['Stage3 Line1 Distance Speed Collect Result_Dam']

# 3. 볼륨 관련 피쳐 생성
df_test_pre['Dispense_Volume_Ratio_Stage1_2_Dam'] = df_test_pre['Dispense Volume(Stage1) Collect Result_Dam'] / df_test_pre['Dispense Volume(Stage2) Collect Result_Dam']
df_test_pre['Dispense_Volume_Ratio_Stage2_3_Dam'] = df_test_pre['Dispense Volume(Stage2) Collect Result_Dam'] / df_test_pre['Dispense Volume(Stage3) Collect Result_Dam']

# 4. 좌표 관련 피쳐 생성
df_test_pre['Head_X_Coord_Diff_Stage2_3_Dam'] = df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'] - df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam']
df_test_pre['Head_Y_Coord_Diff_Stage1_2_Dam'] = df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam']
df_test_pre['Head_Z_Coord_Diff_Stage1_2_Dam'] = df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] - df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']

df_test_pre['Head_X_Coord_Diff_Stage2_3_Fill1'] = df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'] - df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1']
df_test_pre['Head_Y_Coord_Diff_Stage1_2_Fill1'] = df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'] - df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1']
df_test_pre['Head_Z_Coord_Diff_Stage1_2_Fill1'] = df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'] - df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1']

# 5. 포지션 관련 피쳐 생성
df_test_pre['Head_Z_Clean_Purge_Diff_Dam'] = df_test_pre['Head Clean Position Z Collect Result_Dam'] - df_test_pre['Head Purge Position Z Collect Result_Dam']
df_test_pre['Head_X_Standby_Diff_Dam_Fill1'] = df_test_pre['HEAD Standby Position X Collect Result_Dam'] - df_test_pre['HEAD Standby Position X Collect Result_Fill1']
df_test_pre['Cure_End_Position_Diff_Fill2'] = df_test_pre['CURE END POSITION X Collect Result_Fill2'] - df_test_pre['CURE END POSITION Z Collect Result_Fill2']

# 6. 압력 관련 피쳐 생성
df_test_pre['Pressure_Diff_1st_2nd_AutoClave'] = df_test_pre['1st Pressure Collect Result_AutoClave'] - df_test_pre['2nd Pressure Collect Result_AutoClave']
df_test_pre['Pressure_Diff_2nd_3rd_AutoClave'] = df_test_pre['2nd Pressure Collect Result_AutoClave'] - df_test_pre['3rd Pressure Collect Result_AutoClave']

# 8. 두께 관련 피쳐 생성
df_test_pre['Thickness_Diff_1_2'] = df_test_pre['THICKNESS 1 Collect Result_Dam'] - df_test_pre['THICKNESS 2 Collect Result_Dam']
df_test_pre['Thickness_Diff_2_3'] = df_test_pre['THICKNESS 2 Collect Result_Dam'] - df_test_pre['THICKNESS 3 Collect Result_Dam']
df_test_pre['Thickness_Diff_1_3'] = df_test_pre['THICKNESS 1 Collect Result_Dam'] - df_test_pre['THICKNESS 3 Collect Result_Dam']

# 비선형 조합 피쳐 생성
df_test_pre['Resin_Speed_Product_Dam'] = df_test_pre['Resin_Time_Diff_Stage1_2_Dam'] * df_test_pre['CURE_SPEED_Dam_Fill_Diff']
df_test_pre['Thickness_Diff_1_2_Squared'] = df_test_pre['Thickness_Diff_1_2'] ** 2

# 유클리드 거리 피쳐 생성 (좌표 관련)
df_test_pre['Head_Coord_Distance_Stage1_2_Dam'] = np.sqrt(
    (df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] - df_test_pre['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam']) ** 2 +
    (df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_test_pre['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam']) ** 2 +
    (df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] - df_test_pre['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']) ** 2
)
from sklearn.preprocessing import StandardScaler
# 표준화 (정규화도 가능)
scaler = StandardScaler()
scaled_columns = ['Resin_Time_Diff_Stage1_2_Dam', 'CURE_SPEED_Dam_Fill_Diff', 'Thickness_Diff_1_2']
df_test_pre[scaled_columns] = scaler.fit_transform(df_test_pre[scaled_columns])

In [79]:
prediction = predict_model(final_model, data=df_test_pre)

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = prediction['prediction_label']

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**