# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import xgboost as xgb
import shap
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA


pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)

ROOT_DIR = "data"
RANDOM_STATE = 110

In [2]:
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

drop_cols = []
for column in train_data.columns:
    if (train_data[column].notnull().sum() // 2) < train_data[column].isnull().sum():
        drop_cols.append(column)
train_data = train_data.drop(drop_cols, axis=1)

## 2. 데이터 전처리

### 전처리 함수

In [3]:
#전처리 함수

# 파트 1: 전처리 초기 단계
def preprocess_initial(df):
    # 고유값이 하나인 컬럼 찾기
    unique_value_1_columns = [column for column in df.columns if df[column].nunique() == 1]

    # 모든 행의 값이 다른 컬럼 찾기
    row_count = len(df)
    matching_row_columns = [column for column in df.columns if df[column].value_counts().size == row_count]

    # 고유값, 혼합값 피쳐 제거
    df.drop(columns=unique_value_1_columns, inplace=True)
    df.drop(columns=matching_row_columns, inplace=True)
    return df

# 파트 2: 유사한 피쳐 제거
def reduce_dataframe(df):
    def get_value_counts_ratio(series):
        value_counts = series.value_counts(normalize=True)
        return value_counts.sort_values().values

    # 고유값 매핑 확인 함수
    def check_value_mapping(df, col1, col2):
        unique_values_1 = df[col1].unique()
        unique_values_2 = df[col2].unique()

        if len(unique_values_1) != len(unique_values_2):
            return False

        value_mapping = {}
        for val1 in unique_values_1:
            corresponding_values = df[df[col1] == val1][col2].unique()
            if len(corresponding_values) != 1:
                return False
            value_mapping[val1] = corresponding_values[0]

        for val1 in unique_values_1:
            ratio_1 = (df[col1] == val1).mean()
            ratio_2 = (df[col2] == value_mapping[val1]).mean()
            if ratio_1 != ratio_2:
                return False

        return True

    def compare_all_features(df):
        ratios = {column: get_value_counts_ratio(df[column]) for column in df.columns}
        similar_columns_dict = {column: [] for column in df.columns}

        # 고유값 비율이 같은 열들 찾기
        columns = list(ratios.keys())
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                if np.array_equal(ratios[columns[i]], ratios[columns[j]]):
                    similar_columns_dict[columns[i]].append(columns[j])

        # 고유값 비율이 같고 매핑도 동일한 피쳐들 찾기
        comparisons = []
        for key, values in similar_columns_dict.items():
            for value in values:
                if check_value_mapping(df, key, value):
                    comparisons.append((key, value))

        # if comparisons:
        #     print("다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:")
        #     for base, compare in comparisons:
        #         print(f"'{base}'와(과) '{compare}'")
        # else:
        #     print("모든 피쳐의 고유값 비율과 양상이 동일하지 않습니다.")

        return comparisons

    comparisons = compare_all_features(df)
    columns_to_remove = set()
    for _, col_to_remove in comparisons:
        columns_to_remove.add(col_to_remove)

    df = df.drop(columns=columns_to_remove)
    global df_columns
    df_columns = df.columns
    return df

# train 전처리 함수
def preprocess_train_dataframe(df):
    df = preprocess_initial(df)
    df = reduce_dataframe(df)
    return df

# test 전처리 함수
def preprocess_test_dataframe(df):
    df = preprocess_initial(df)
    return df

In [4]:
df_train_pre = preprocess_train_dataframe(train_data)
features = df_train_pre.columns
df_order = df_train_pre.copy()

In [5]:
# # 컬럼 목록을 사용자가 제공한 순서에 맞춰 재정렬
# columns_in_order = [
#     # 시간 관련(Time-related) 변수
#     'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
#     'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
#     'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',

#     'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
#     'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
#     'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',
    
#     '1st Pressure 1st Pressure Unit Time_AutoClave',
#     '2nd Pressure Unit Time_AutoClave',
#     '3rd Pressure Unit Time_AutoClave',
#     'Machine Tact time Collect Result_Dam',
#     'Machine Tact time Collect Result_Fill1',
#     'Machine Tact time Collect Result_Fill2',

#     # 속도 관련(Speed-related) 변수
#     'CURE SPEED Collect Result_Dam',
#     'CURE SPEED Collect Result_Fill2',
#     'Stage1 Circle1 Distance Speed Collect Result_Dam',
#     'Stage1 Circle2 Distance Speed Collect Result_Dam',
#     'Stage1 Line1 Distance Speed Collect Result_Dam',
#     'Stage1 Line2 Distance Speed Collect Result_Dam',
#     'Stage1 Line4 Distance Speed Collect Result_Dam',
#     'Stage2 Circle1 Distance Speed Collect Result_Dam',
#     'Stage2 Circle2 Distance Speed Collect Result_Dam',
#     'Stage2 Line2 Distance Speed Collect Result_Dam',
#     'Stage2 Line3 Distance Speed Collect Result_Dam',
#     'Stage2 Line4 Distance Speed Collect Result_Dam',
#     'Stage3 Circle1 Distance Speed Collect Result_Dam',
#     'Stage3 Circle2 Distance Speed Collect Result_Dam',
#     'Stage3 Line1 Distance Speed Collect Result_Dam',
#     'Stage3 Line2 Distance Speed Collect Result_Dam',
#     'Stage3 Line4 Distance Speed Collect Result_Dam',
#     'DISCHARGED SPEED OF RESIN Collect Result_Dam',
#     'DISCHARGED SPEED OF RESIN Collect Result_Fill1',

#     # 볼륨 관련(Volume-related) 변수
#     'Dispense Volume(Stage1) Collect Result_Dam',
#     'Dispense Volume(Stage2) Collect Result_Dam',
#     'Dispense Volume(Stage3) Collect Result_Dam',
#     'Dispense Volume(Stage2) Collect Result_Fill1',

#     # 좌표 관련(Coordinate-related) 변수
#     'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',

#     'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',

#     'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
#     'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',

#     'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',

#     'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',

#     'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
#     'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
    
#     'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
#     'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
#     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
#     'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
    

#     # 포지션 관련(Position-related) 변수
#     'HEAD Standby Position X Collect Result_Dam',
#     'HEAD Standby Position Y Collect Result_Dam',
#     'Head Clean Position Z Collect Result_Dam',
#     'Head Purge Position X Collect Result_Dam',
#     'Head Purge Position Z Collect Result_Dam',
#     'Head Zero Position Y Collect Result_Dam',
#     'Head Zero Position Z Collect Result_Dam',
#     'HEAD Standby Position X Collect Result_Fill1',
#     'Head Purge Position Z Collect Result_Fill1',
#     'CURE END POSITION X Collect Result_Fill2',
#     'CURE END POSITION Z Collect Result_Fill2',
#     'CURE STANDBY POSITION Z Collect Result_Fill2',
#     'HEAD Standby Position X Collect Result_Fill2',

#     # 압력 관련(Pressure-related) 변수
#     '1st Pressure Collect Result_AutoClave',
#     '2nd Pressure Collect Result_AutoClave',
#     '3rd Pressure Collect Result_AutoClave',

#     # 온도 관련(Temperature-related) 변수
#     'Chamber Temp. Collect Result_AutoClave',
#     'Chamber Temp. Unit Time_AutoClave',
#     'Chamber Temp. Judge Value_AutoClave',

#     # 두께 관련(Thickness-related) 변수
#     'THICKNESS 1 Collect Result_Dam',
#     'THICKNESS 2 Collect Result_Dam',
#     'THICKNESS 3 Collect Result_Dam',

#     # 기타 변수(Miscellaneous Variables)
#     'PalletID Collect Result_Dam',
#     'Production Qty Collect Result_Dam',
#     'Receip No Collect Result_Dam',
#     'WorkMode Collect Result_Dam',
#     'PalletID Collect Result_Fill1',
#     'Production Qty Collect Result_Fill1',
#     'Receip No Collect Result_Fill1',
#     'WorkMode Collect Result_Fill1',
#     'PalletID Collect Result_Fill2',
#     'Production Qty Collect Result_Fill2',
#     'Receip No Collect Result_Fill2',
#     'WorkMode Collect Result_Fill2',
#     'Equipment_Dam',
#     'Model.Suffix_Dam',
#     'Workorder_Dam',
#     'Equipment_Fill1',
#     'Equipment_Fill2',
#     'target'
# ]

# df_reordered = df_train_pre[columns_in_order]

### 결측치 처리

In [6]:
def preprocess_coordinates(df, columns_to_replace, columns_to_fill):
    # 1. 특정 값('OK')을 NaN으로 변환
    for column in columns_to_replace:
        df[column].replace('OK', np.nan, inplace=True)
    
    # 2. 숫자로 변환할 수 없는 데이터를 NaN으로 변환
    for column in columns_to_replace:
        df[column] = pd.to_numeric(df[column], errors='coerce')
    
    # 3. 결측치를 처리하기 위한 최빈값 계산 및 채우기
    for column in columns_to_fill:
        mode_value = df[column].mode()[0]
        df[column].fillna(mode_value, inplace=True)
    
    return df

In [7]:
# 결측치 열
# 'OK' 값을 NaN으로 변환하고 결측치 처리할 열 리스트
columns_to_replace = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'
]
# 결측치를 처리할 열 리스트
columns_to_fill = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'
]

In [8]:
# 결측치 함수 적용
df_concat = preprocess_coordinates(df_order, columns_to_replace, columns_to_fill)

In [9]:
df_concat.head(2)

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE SPEED Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,Dispense Volume(Stage2) Collect Result_Dam,Dispense Volume(Stage3) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam,HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam,HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam,HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam,HEAD Standby Position X Collect Result_Dam,HEAD Standby Position Y Collect Result_Dam,Head Clean Position Z Collect Result_Dam,Head Purge Position X Collect Result_Dam,Head Purge Position Z Collect Result_Dam,Head Zero Position Y Collect Result_Dam,Head Zero Position Z Collect Result_Dam,Machine Tact time Collect Result_Dam,PalletID Collect Result_Dam,Production Qty Collect Result_Dam,Receip No Collect Result_Dam,Stage1 Circle1 Distance Speed Collect Result_Dam,Stage1 Circle2 Distance Speed Collect Result_Dam,Stage1 Line1 Distance Speed Collect Result_Dam,Stage1 Line2 Distance Speed Collect Result_Dam,Stage1 Line4 Distance Speed Collect Result_Dam,Stage2 Circle1 Distance Speed Collect Result_Dam,Stage2 Circle2 Distance Speed Collect Result_Dam,Stage2 Line2 Distance Speed Collect Result_Dam,Stage2 Line3 Distance Speed Collect Result_Dam,Stage2 Line4 Distance Speed Collect Result_Dam,Stage3 Circle1 Distance Speed Collect Result_Dam,Stage3 Circle2 Distance Speed Collect Result_Dam,Stage3 Line1 Distance Speed Collect Result_Dam,Stage3 Line2 Distance Speed Collect Result_Dam,Stage3 Line4 Distance Speed Collect Result_Dam,THICKNESS 1 Collect Result_Dam,THICKNESS 2 Collect Result_Dam,THICKNESS 3 Collect Result_Dam,WorkMode Collect Result_Dam,1st Pressure Collect Result_AutoClave,1st Pressure 1st Pressure Unit Time_AutoClave,2nd Pressure Collect Result_AutoClave,2nd Pressure Unit Time_AutoClave,3rd Pressure Collect Result_AutoClave,3rd Pressure Unit Time_AutoClave,Chamber Temp. Collect Result_AutoClave,Chamber Temp. Unit Time_AutoClave,Chamber Temp. Judge Value_AutoClave,Equipment_Fill1,DISCHARGED SPEED OF RESIN Collect Result_Fill1,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1,Dispense Volume(Stage2) Collect Result_Fill1,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1,HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1,HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1,HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1,HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1,HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1,HEAD Standby Position X Collect Result_Fill1,Head Purge Position Z Collect Result_Fill1,Machine Tact time Collect Result_Fill1,PalletID Collect Result_Fill1,Production Qty Collect Result_Fill1,Receip No Collect Result_Fill1,WorkMode Collect Result_Fill1,Equipment_Fill2,CURE END POSITION X Collect Result_Fill2,CURE END POSITION Z Collect Result_Fill2,CURE SPEED Collect Result_Fill2,CURE STANDBY POSITION Z Collect Result_Fill2,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2,HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2,HEAD Standby Position X Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,Dam dispenser #1,AJX75334505,4F1XA938-1,100,16,14.9,8.4,14.7,1.04,0.58,1.02,549.0,550.5,464.3,161.4,384.0,383.1,384.0,274.51,274.51,257,66.0,124.0,66.0,505.0,300.0,265.0,58.5,7,127,1,5800,5800,5800,5600,5800,5300,5300,5300,5300,5300,5800,5800,5800,5800,5800,0.0,0.0,0.0,0.312,240,0.493,1,0.499,120,54,361,OK,Fill1 dispenser #1,10.6,12.7,3.6,12.6,3.42,838.4,838.4,458.7,157.0,430.0,429.8,430.3,244.52,244.52,244.52,50,128.0,55.7,7,127,1,Fill2 dispenser #1,240,33,48,33,835.5,835.5,458.0,428.0,243.7,114.612,19.9,7,127,1,Normal
1,Dam dispenser #1,AJX75334505,3KPM0016-2,70,10,21.3,4.9,21.3,1.49,0.34,1.49,550.3,463.8,160.8,377.3,377.3,377.3,282.15,282.15,257.0,66,130.85,257.0,130.85,300.0,265.0,65.1,7.0,185,1,4000,4000,4000,4000,4000,9000,9000,9000,9000,9000,4000,4000,4000,4000,4000,0,0.0,0.0,7.0,0.311,241,0.311,121,0.498,121,54,483,OK,Fill1 dispenser #1,10.6,13.5,3.6,13.5,3.42,838.4,458.5,157.0,430.5,430.5,430.8,244.4,244.4,244.4,289.0,145,56.5,7.0,185,1,7,Fill2 dispenser #1,240,33,50,33,835.5,458.0,156.0,427.9,270.0,19.6,7.0,185,1,0,Normal


### 오버 샘플링 후 target 비율 조정

In [10]:
# df_numeric = df_order.select_dtypes(exclude=['object'])
# X = df_numeric
# y = df_order['target'] 

In [13]:
# 원본 데이터에서 object 타입 열을 제외
df_numeric_only = df_concat.select_dtypes(include=['int64', 'float64'])
# 'target' 열 추가 (필요하다면 이 열의 타입도 확인)
df_numeric_only['target'] = df_concat['target']

In [None]:
import pandas as pd
from ctgan import CTGAN
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# 1. Metadata 생성
metadata = SingleTableMetadata()

metadata.detect_from_dataframe(data=df_numeric_only)

# 2. 특정 열의 타입 수동 설정
column_sdtypes = {
    'Stage1 Line1 Distance Speed Collect Result_Dam': 'numerical',
    'Stage1 Line2 Distance Speed Collect Result_Dam': 'numerical',
    'Stage2 Line2 Distance Speed Collect Result_Dam': 'numerical',
    'Stage3 Line1 Distance Speed Collect Result_Dam': 'numerical',
    'Stage3 Line2 Distance Speed Collect Result_Dam': 'numerical',
    'target': 'categorical'  # 예를 들어, target 열을 categorical로 설정
}

for column, sdtype in column_sdtypes.items():
    metadata.update_column(column_name=column, sdtype=sdtype)

# 2. CTGAN 모델 생성 (epochs를 모델 초기화 시에 설정)
ctgan = CTGANSynthesizer(metadata, epochs=300)  # epochs 설정

# 3. 비정상 클래스(AbNormal) 데이터 추출
df_abnormal = df_numeric_only[df_numeric_only['target'] == 'AbNormal']

# 4. CTGAN 학습
ctgan.fit(df_abnormal)  # fit 메소드에서 epochs 파라미터 없이 호출

# 5. 증강 데이터 생성 (AbNormal 데이터와 유사한 데이터 생성)
new_data = ctgan.sample(3000)  # Normal 데이터의 수에 맞춰 AbNormal 데이터를 증강

In [15]:
# 6. 기존 데이터와 증강 데이터를 결합
df_augmented = pd.concat([df_numeric_only, new_data])

# 7. 결합된 데이터프레임 확인
print(df_augmented['target'].value_counts())

Normal      38156
AbNormal     5350
Name: target, dtype: int64


In [None]:
# # 잡음 추가
# noise_factor = 0.001
# numeric_columns = X_resampled.select_dtypes(include=[np.number]).columns

# for col in numeric_columns:
#     noise = np.random.normal(0, X_resampled[col].std() * noise_factor, size=X_resampled[col].shape)
#     X_resampled[col] += noise

# df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
# df_resampled['target'] = y_resampled 

In [16]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = df_augmented[df_augmented["target"] == "Normal"]
df_abnormal = df_augmented[df_augmented["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(
    n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 5350


target
AbNormal    5350
Normal      5350
dtype: int64

### 피쳐 생성 함수

In [17]:
def feature_engineering(df):
    # 1. 시간 관련 피처 생성
    df['Resin_Time_Diff_Stage1_2_Dam'] = df['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] - df['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
    df['Resin_Time_Diff_Stage1_3_Dam'] = df['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'] - df['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
    df['Resin_Time_Diff_Stage2_3_Dam'] = df['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'] - df['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
    df['Resin_Time_Diff_Stage1_2_Fill1'] = df['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'] - df['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
    df['Resin_Time_Diff_Stage1_3_Fill1'] = df['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'] - df['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
    df['Resin_Time_Diff_Stage2_3_Fill1'] = df['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'] - df['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']

    df['Machine_Tact_Time_Dam_Fill1_Diff'] = df['Machine Tact time Collect Result_Dam'] - df['Machine Tact time Collect Result_Fill1']
    df['Machine_Tact_Time_Fill1_Fill2_Diff'] = df['Machine Tact time Collect Result_Fill1'] - df['Machine Tact time Collect Result_Fill2']

    # 2. 속도 관련 피처 생성
    df['CURE_SPEED_Dam_Fill_Diff'] = df['CURE SPEED Collect Result_Dam'] - df['CURE SPEED Collect Result_Fill2']
    df['Stage1_Circle_Speed_Diff_Dam'] = df['Stage1 Circle2 Distance Speed Collect Result_Dam'] - df['Stage1 Circle1 Distance Speed Collect Result_Dam']
    df['Stage2_Circle_Speed_Diff_Dam'] = df['Stage2 Circle2 Distance Speed Collect Result_Dam'] - df['Stage2 Circle1 Distance Speed Collect Result_Dam']
    df['Stage3_Circle_Speed_Diff_Dam'] = df['Stage3 Circle2 Distance Speed Collect Result_Dam'] - df['Stage3 Circle1 Distance Speed Collect Result_Dam']

    df['Stage1_Line21_Speed_Diff_Dam'] = df['Stage1 Line2 Distance Speed Collect Result_Dam'] - df['Stage1 Line1 Distance Speed Collect Result_Dam']
    df['Stage1_Line41_Speed_Diff_Dam'] = df['Stage1 Line4 Distance Speed Collect Result_Dam'] - df['Stage1 Line1 Distance Speed Collect Result_Dam']

    df['Stage2_Line_Speed_Diff_Dam'] = df['Stage2 Line3 Distance Speed Collect Result_Dam'] - df['Stage2 Line2 Distance Speed Collect Result_Dam']
    df['Stage3_Line_Speed_Diff_Dam'] = df['Stage3 Line2 Distance Speed Collect Result_Dam'] - df['Stage3 Line1 Distance Speed Collect Result_Dam']

    # 3. 볼륨 관련 피처 생성
    df['Dispense_Volume_Ratio_Stage1_2_Dam'] = df['Dispense Volume(Stage1) Collect Result_Dam'] / df['Dispense Volume(Stage2) Collect Result_Dam']
    df['Dispense_Volume_Ratio_Stage2_3_Dam'] = df['Dispense Volume(Stage2) Collect Result_Dam'] / df['Dispense Volume(Stage3) Collect Result_Dam']

    # 4. 좌표 관련 피처 생성
    df['Head_X_Coord_Diff_Stage2_3_Dam'] = df['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'] - df['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam']
    df['Head_Y_Coord_Diff_Stage1_2_Dam'] = df['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam']
    df['Head_Z_Coord_Diff_Stage1_2_Dam'] = df['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] - df['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']

    df['Head_X_Coord_Diff_Stage2_3_Fill1'] = df['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'] - df['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1']
    df['Head_Y_Coord_Diff_Stage1_2_Fill1'] = df['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'] - df['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1']
    df['Head_Z_Coord_Diff_Stage1_2_Fill1'] = df['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'] - df['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1']

    # 5. 포지션 관련 피처 생성
    df['Head_Z_Clean_Purge_Diff_Dam'] = df['Head Clean Position Z Collect Result_Dam'] - df['Head Purge Position Z Collect Result_Dam']
    df['Head_X_Standby_Diff_Dam_Fill1'] = df['HEAD Standby Position X Collect Result_Dam'] - df['HEAD Standby Position X Collect Result_Fill1']
    df['Cure_End_Position_Diff_Fill2'] = df['CURE END POSITION X Collect Result_Fill2'] - df['CURE END POSITION Z Collect Result_Fill2']

    # 6. 압력 관련 피처 생성
    df['Pressure_Diff_1st_2nd_AutoClave'] = df['1st Pressure Collect Result_AutoClave'] - df['2nd Pressure Collect Result_AutoClave']
    df['Pressure_Diff_2nd_3rd_AutoClave'] = df['2nd Pressure Collect Result_AutoClave'] - df['3rd Pressure Collect Result_AutoClave']

    # 7. 두께 관련 피처 생성
    df['Thickness_Diff_1_2'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 2 Collect Result_Dam']
    df['Thickness_Diff_2_3'] = df['THICKNESS 2 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']
    df['Thickness_Diff_1_3'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']

    # 비선형 조합 피처 생성
    df['Resin_Speed_Product_Dam'] = df['Resin_Time_Diff_Stage1_2_Dam'] * df['CURE_SPEED_Dam_Fill_Diff']
    df['Thickness_Diff_1_2_Squared'] = df['Thickness_Diff_1_2'] ** 2

    # 유클리드 거리 피처 생성 (좌표 관련)
    df['Head_Coord_Distance_Stage1_2_Dam'] = np.sqrt(
        (df['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] - df['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam']) ** 2 +
        (df['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam']) ** 2 +
        (df['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] - df['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']) ** 2
    )

    # 레진 방출 시간의 총합
    df['Total_Discharge_Time_Dam'] = (
        df['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] +
        df['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] +
        df['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']
    )

    df['Total_Discharge_Time_Fill1'] = (
        df['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1'] +
        df['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'] +
        df['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']
    )

    # 기계 택트 타임 비율
    df['Tact_to_Discharge_Time_Ratio_Dam'] = (
        df['Machine Tact time Collect Result_Dam'] /
        df['Total_Discharge_Time_Dam']
    )

    # 총 볼륨
    df['Total_Volume_Dam'] = (
        df['Dispense Volume(Stage1) Collect Result_Dam'] +
        df['Dispense Volume(Stage2) Collect Result_Dam'] +
        df['Dispense Volume(Stage3) Collect Result_Dam']
    )

    df['Stage1_to_Total_Volume_Ratio_Dam'] = (
        df['Dispense Volume(Stage1) Collect Result_Dam'] /
        df['Total_Volume_Dam']
    )

    df['Stage2_to_Total_Volume_Ratio_Dam'] = (
        df['Dispense Volume(Stage2) Collect Result_Dam'] /
        df['Total_Volume_Dam']
    )

    df['Stage3_to_Total_Volume_Ratio_Dam'] = (
        df['Dispense Volume(Stage3) Collect Result_Dam'] /
        df['Total_Volume_Dam']
    )

    # 압력 증가율
    df['Pressure_Increment_Ratio_1_2'] = (
        (df['2nd Pressure Collect Result_AutoClave'] - df['1st Pressure Collect Result_AutoClave']) /
        df['1st Pressure Collect Result_AutoClave']
    )

    df['Pressure_Increment_Ratio_2_3'] = (
        (df['3rd Pressure Collect Result_AutoClave'] - df['2nd Pressure Collect Result_AutoClave']) /
        df['2nd Pressure Collect Result_AutoClave']
    )

    # 총 속도
    df['Total_Speed_Dam'] = (
        df['Stage1 Circle1 Distance Speed Collect Result_Dam'] +
        df['Stage1 Circle2 Distance Speed Collect Result_Dam'] +
        df['Stage1 Line1 Distance Speed Collect Result_Dam'] +
        df['Stage1 Line2 Distance Speed Collect Result_Dam'] +
        df['Stage1 Line4 Distance Speed Collect Result_Dam'] +
        df['Stage2 Circle1 Distance Speed Collect Result_Dam'] +
        df['Stage2 Circle2 Distance Speed Collect Result_Dam'] +
        df['Stage2 Line2 Distance Speed Collect Result_Dam'] +
        df['Stage2 Line3 Distance Speed Collect Result_Dam'] +
        df['Stage2 Line4 Distance Speed Collect Result_Dam'] +
        df['Stage3 Circle1 Distance Speed Collect Result_Dam'] +
        df['Stage3 Circle2 Distance Speed Collect Result_Dam'] +
        df['Stage3 Line1 Distance Speed Collect Result_Dam'] +
        df['Stage3 Line2 Distance Speed Collect Result_Dam'] +
        df['Stage3 Line4 Distance Speed Collect Result_Dam']
    )

    return df

In [19]:
# 피쳐 생성 함수 적용
df_concat = feature_engineering(df_concat)

# Scaler
numeric_columns = df_concat.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
df_concat[numeric_columns] = scaler.fit_transform(df_concat[numeric_columns])

train_x = df_concat.copy()

In [20]:
y = train_x.target
train_x.drop(columns = ['target'],inplace =True)
train_x = train_x.select_dtypes(exclude=['object'])

### PCA

In [None]:
# # PCA 모델 생성
# pca = PCA(n_components=0.95)  # 설명된 분산이 95%가 될 때까지 성분 수를 결정
# train_x_pca = pca.fit_transform(train_x)

In [None]:
# print("원래 피처 수:", train_x.shape[1])
# print("PCA 후 성분 수:", train_x_pca.shape[1])
# print("각 주성분의 설명된 분산 비율:", pca.explained_variance_ratio_)
# print("전체 설명된 분산 비율:", pca.explained_variance_ratio_.sum())

In [None]:
# # 주성분 이름을 생성 (PC1, PC2, ...)
# pca_columns = [f'PC{i+1}' for i in range(train_x_pca.shape[1])]

# # PCA 데이터프레임 생성
# df_pca = pd.DataFrame(train_x_pca, columns=pca_columns)

# # # 타겟 변수 추가
# # df_pca['target'] = y

### SHAP

In [21]:
# 타겟 변수 인코딩
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [22]:
# 학습 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(train_x, y_encoded, test_size=0.2, random_state=42)

# XGBoost 분류기 모델 초기화
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# F1 스코어 계산
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# SHAP 값 계산
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# SHAP 값의 평균 절대값 계산
shap_values_df = pd.DataFrame(shap_values, columns=X_test.columns)
shap_importance = shap_values_df.abs().mean().sort_values(ascending=False)
shap_importance

F1 Score: 0.8048




HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam      0.571292
Receip No Collect Result_Fill1                                0.334242
Head_Z_Clean_Purge_Diff_Dam                                   0.287673
Head Zero Position Z Collect Result_Dam                       0.276252
Stage1_Line41_Speed_Diff_Dam                                  0.223404
Stage2_Line_Speed_Diff_Dam                                    0.182197
Production Qty Collect Result_Dam                             0.182060
Stage1_Line21_Speed_Diff_Dam                                  0.167238
Head_Coord_Distance_Stage1_2_Dam                              0.148802
Resin_Time_Diff_Stage1_3_Dam                                  0.148234
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1    0.147512
Production Qty Collect Result_Fill2                           0.137834
PalletID Collect Result_Dam                                   0.129659
Production Qty Collect Result_Fill1                           0.126526
Total_

In [42]:
# shap 피쳐 기반
important_features = shap_importance[shap_importance > 0.1].index.tolist()
X_train_shap = X_train[important_features]
X_test_shap = X_test[important_features]

model.fit(X_train_shap, y_train)
y_pred = model.predict(X_test_shap)

# F1 스코어 계산
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

F1 Score: 0.8152


In [43]:
# SHAP 값이 0.01 이상인 피처만 선택
important_features = shap_importance[shap_importance > 0.1].index.tolist()
train_x_shap = train_x[important_features]

## 3. 모델

In [32]:
catboost_model = CatBoostClassifier(verbose=0, random_state=42)
xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
randomforest_model = RandomForestClassifier(random_state=42)

# 앙상블 모델 정의 (VotingClassifier)
voting_clf = VotingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('xgboost', xgboost_model),
        ('randomforest', randomforest_model)
    ],
    voting='soft'
)

# 앙상블 모델 학습
voting_clf.fit(train_x_shap, y)

## 4. 제출하기

### 테스트 데이터 예측

In [33]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

df_test = preprocess_test_dataframe(test_data)

df_test_pre = df_test.copy()
df_test_pre = df_test_pre[features]

df_test_pre.drop(columns ='target',inplace =  True)

In [34]:
#결측값 함수
df_test_pre = preprocess_coordinates(df_test_pre, columns_to_replace, columns_to_fill)

In [35]:
# 피쳐 생성 함수
df_test_pre = feature_engineering(df_test_pre)
numeric_columns = df_test_pre.select_dtypes(include=['float64', 'int64']).columns

# 스케일러
scaler = StandardScaler()
df_test_pre[numeric_columns] = scaler.fit_transform(df_test_pre[numeric_columns])

# SHAP 피쳐 test 셋 적용
test_x_shap = df_test_pre[important_features]

In [36]:
y_pred = voting_clf.predict(test_x_shap)

### 제출 파일 작성

In [37]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**