# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [1]:
import os
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import xgboost as xgb
import shap

pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# CSV 파일 읽기
X_Dam = pd.read_csv(os.path.join(ROOT_DIR, "Dam dispensing.csv"))
X_AutoClave = pd.read_csv(os.path.join(ROOT_DIR, "Auto clave.csv"))
X_Fill1 = pd.read_csv(os.path.join(ROOT_DIR, "Fill1 dispensing.csv"))
X_Fill2 = pd.read_csv(os.path.join(ROOT_DIR, "Fill2 dispensing.csv"))

# y 데이터 읽기
y = pd.read_csv(os.path.join(ROOT_DIR, "train_y.csv"))


Columns (62,64) have mixed types. Specify dtype option on import or set low_memory=False.
Columns (23,25) have mixed types. Specify dtype option on import or set low_memory=False.
Columns (32,34) have mixed types. Specify dtype option on import or set low_memory=False.
Columns (62,64) have mixed types. Specify dtype option on import or set low_memory=False.


### 데이터 병합

x 데이터 병합

In [3]:
# Rename columns
X_Dam.columns = [i + " - Dam" for i in X_Dam.columns]
X_AutoClave.columns = [i + " - AutoClave" for i in X_AutoClave.columns]
X_Fill1.columns = [i + " - Fill1" for i in X_Fill1.columns]
X_Fill2.columns = [i + " - Fill2" for i in X_Fill2.columns]
X_Dam = X_Dam.rename(columns={"Set ID - Dam": "Set ID"})
X_AutoClave = X_AutoClave.rename(columns={"Set ID - AutoClave": "Set ID"})
X_Fill1 = X_Fill1.rename(columns={"Set ID - Fill1": "Set ID"})
X_Fill2 = X_Fill2.rename(columns={"Set ID - Fill2": "Set ID"})

# Merge X
X = pd.merge(X_Dam, X_AutoClave, on="Set ID")
X = pd.merge(X, X_Fill1, on="Set ID")
X = pd.merge(X, X_Fill2, on="Set ID")
X = X.drop(X[X.duplicated(subset="Set ID")].index).reset_index(drop=True)

x 데이터와 y 데이터 병합

In [4]:
# Merge X and y
df_merged = pd.merge(X, y, "inner", on="Set ID")

# Drop columns with more than half of the values missing
drop_cols = []
for column in df_merged.columns:
    if (df_merged[column].notnull().sum() // 2) < df_merged[
        column
    ].isnull().sum():
        drop_cols.append(column)
df_merged = df_merged.drop(drop_cols, axis=1)

# Drop Lot ID
df_merged = df_merged.drop("LOT ID - Dam", axis=1)

In [5]:
# 결측치 피쳐 세 개의 피쳐가 결측치 수가 동일함
na_list = []
for i in df_merged.columns:
    if df_merged[i].isna().sum() > 0:
        na_list.append(i)
df_merged[na_list] = df_merged[na_list].fillna(0)

## 2. 데이터 전처리

### @고유값(전부 다른 값, 단일값) 피쳐 제거@

In [6]:
# 고유값이 1개인 피쳐 찾기
unique_value_1_columns = [column for column in df_merged.columns if df_merged[column].nunique() == 1]

print("고유값이 1개인 피쳐들:")
print(unique_value_1_columns)

고유값이 1개인 피쳐들:
['Wip Line - Dam', 'Process Desc. - Dam', 'Insp. Seq No. - Dam', 'Insp Judge Code - Dam', 'Collect Result.4 - Dam', 'Collect Result.5 - Dam', 'Collect Result.6 - Dam', 'Collect Result.8 - Dam', 'Wip Line - AutoClave', 'Process Desc. - AutoClave', 'Equipment - AutoClave', 'Insp. Seq No. - AutoClave', 'Insp Judge Code - AutoClave', 'Judge Value - AutoClave', 'Judge Value.1 - AutoClave', 'Judge Value.2 - AutoClave', 'Wip Line - Fill1', 'Process Desc. - Fill1', 'Insp. Seq No. - Fill1', 'Insp Judge Code - Fill1', 'Wip Line - Fill2', 'Process Desc. - Fill2', 'Insp. Seq No. - Fill2', 'Insp Judge Code - Fill2', 'Collect Result.2 - Fill2', 'Collect Result.4 - Fill2', 'Collect Result.6 - Fill2', 'Collect Result.9 - Fill2', 'Collect Result.10 - Fill2', 'Collect Result.11 - Fill2', 'Collect Result.12 - Fill2', 'Collect Result.13 - Fill2', 'Collect Result.14 - Fill2', 'Collect Result.15 - Fill2', 'Collect Result.16 - Fill2']


In [7]:
# value_counts 길이가 데이터프레임의 행 수와 같은 피쳐 찾기
row_count = len(df_merged)
matching_row_columns = [column for column in df_merged.columns if df_merged[column].value_counts().size == row_count]

print("value_counts의 길이가 데이터프레임의 행 수와 같은 피쳐들:") 
matching_row_columns.remove('Collect Date - Dam') # 밑에서 이 피쳐로 정렬 후 train_test_split 진행하여 다시 넣어줌
matching_row_columns.remove('Collect Date - Fill1')
matching_row_columns.remove('Collect Date - Fill2')
print(matching_row_columns)

value_counts의 길이가 데이터프레임의 행 수와 같은 피쳐들:
['Set ID', 'LOT ID - AutoClave', 'LOT ID - Fill1', 'LOT ID - Fill2']


### @혼합값 대체@

In [8]:
# 각 열의 데이터 타입 확인
def check_mixed_types(column):
    types = set(map(type, column.dropna()))
    return len(types) > 1

mixed_type_columns = [column for column in df_merged.columns if check_mixed_types(df_merged[column])]

print("혼합된 타입이 있는 피처들:")
print(mixed_type_columns)

혼합된 타입이 있는 피처들:
['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']


혼합된 타입이 있는 피처들:  
['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']  
각 OK 값을 대체할 수 있는 값의 기준 필요  
일단 AbNormal과 Normal에 따른 큰 차이가 보이진 않음  
최빈값이나 평균으로 대체하기 보단 그에 상응하는, 구별되는 값으로 대체하는 것으로 먼저 조치

In [9]:
# print(df_merged[df_merged['target'] == 'AbNormal']['Collect Result.17 - Dam'].value_counts())
# print(df_merged[df_merged['target'] == 'Normal']['Collect Result.17 - Dam'].value_counts())

# print(df_merged[df_merged['target'] == 'AbNormal']['Collect Result.7 - Fill1'].value_counts())
# print(df_merged[df_merged['target'] == 'Normal']['Collect Result.7 - Fill1'].value_counts())

# print(df_merged[df_merged['target'] == 'AbNormal']['Collect Result.17 - Fill2'].value_counts())
# print(df_merged[df_merged['target'] == 'Normal']['Collect Result.17 - Fill2'].value_counts())

In [10]:
# 혼합값 대체   ## 이거 다음엔 안해보는걸로
df_merged.loc[df_merged['Collect Result.17 - Dam'] == 'OK', 'Collect Result.17 - Dam'] = 5000
df_merged.loc[df_merged['Collect Result.7 - Fill1'] == 'OK', 'Collect Result.7 - Fill1'] = 8000
df_merged.loc[df_merged['Collect Result.17 - Fill2'] == 'OK', 'Collect Result.17 - Fill2'] = 6000

# 고유값, 혼합값 피쳐 제거
df_merged.drop(columns = unique_value_1_columns, inplace = True)
df_merged.drop(columns = matching_row_columns, inplace = True)

# 컬럼 리스트
columns_to_convert = ['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']

# 문자열을 숫자형으로 변환
for column in columns_to_convert:
    df_merged[column] = pd.to_numeric(df_merged[column], errors='coerce')


### @이상치 탐지@ 

In [11]:
# # 숫자형 피쳐 목록
# numeric_features = [col for col in df_merged.columns if pd.api.types.is_numeric_dtype(df_merged[col])]

# # 서브플롯 설정
# num_features = len(numeric_features)
# num_cols = 5  # 열의 개수
# num_rows = (num_features + num_cols - 1) // num_cols  # 행의 개수

# fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5))

# # 박스플롯 생성
# for i, feature in enumerate(numeric_features):
#     row = i // num_cols
#     col = i % num_cols
#     sns.boxplot(x=df_merged[feature], ax=axes[row, col])
#     axes[row, col].set_title(f'Boxplot of {feature}')

# # 빈 서브플롯 숨기기
# for i in range(num_features, num_rows * num_cols):
#     fig.delaxes(axes.flatten()[i])

# plt.tight_layout()
# plt.show()

In [12]:
# # 이상치 탐지 함수 정의 (IQR 방법 사용)
# def detect_outliers(df, column):
#     Q1 = df[column].quantile(0.25)
#     Q3 = df[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
#     return outliers

# # 대상 컬럼 목록
# outlier_columns = [
#     'Collect Result.20 - Dam', 'Collect Result.24 - Dam', 'Collect Result.25 - Dam','Collect Result.39 - Dam',
#     'Collect Result.40 - Dam', 'Collect Result.41 - Dam', 'Collect Result.67 - Dam',
#     'Collect Result.68 - Dam', 'Unit Time - AutoClave', 'Collect Result.2 - AutoClave',
#     'Collect Result.3 - AutoClave', 'Collect Result.1 - Fill1', 'Collect Result.2 - Fill1',
#     'Collect Result.5 - Fill1', 'Collect Result.27 - Fill1', 'Collect Result.28 - Fill1',
#     'Collect Result.37 - Fill2', 'Collect Result.38 - Fill2'
# ]

# # 이상치 탐지 및 해당 행의 target 값 확인
# for column in outlier_columns:
#     outliers = detect_outliers(df_merged, column)
#     if not outliers.empty:
#         print(f"Outliers in {column}:")
#         print(outliers[['target']])
#         print("Target value counts:")
#         print(outliers['target'].value_counts())
#         print("\n")
#     else:
#         print(f"No outliers detected in {column}\n")

Collect Result.24 - Dam, Collect Result.25 - Dam 둘의 이상치 양상이 같음  
Collect Result.67 - Dam, Collect Result.68 - Dam 둘의 이상치 양상이 같음  

### 언더 샘플링 

In [13]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = df_merged[df_merged["target"] == "Normal"]
df_abnormal = df_merged[df_merged["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(
    n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
dtype: int64

### @피쳐별 동일 양상 제거@

In [14]:
# 날짜 데이터로 변환
df_concat['Collect Date - Dam'] = pd.to_datetime(df_concat['Collect Date - Dam'])
df_concat['Collect Date - AutoClave'] = pd.to_datetime(df_concat['Collect Date - AutoClave'])
df_concat['Collect Date - Fill1'] = pd.to_datetime(df_concat['Collect Date - Fill1'])
df_concat['Collect Date - Fill2'] = pd.to_datetime(df_concat['Collect Date - Fill2'])

# 고유값 비율 계산 함수
def get_value_counts_ratio(series):
    value_counts = series.value_counts(normalize=True)
    return value_counts.sort_values().values

# 고유값 매핑 확인 함수
def check_value_mapping(df, col1, col2):
    unique_values_1 = df[col1].unique()
    unique_values_2 = df[col2].unique()

    if len(unique_values_1) != len(unique_values_2):
        return False

    value_mapping = {}
    for val1 in unique_values_1:
        corresponding_values = df[df[col1] == val1][col2].unique()
        if len(corresponding_values) != 1:
            return False
        value_mapping[val1] = corresponding_values[0]

    for val1 in unique_values_1:
        if isinstance(val1, pd.Timestamp):
            ratio_1 = (df[col1].apply(lambda x: x.date() if isinstance(x, pd.Timestamp) else x) == val1.date()).mean()
            ratio_2 = (df[col2].apply(lambda x: x.date() if isinstance(x, pd.Timestamp) else x) == value_mapping[val1].date()).mean()
        else:
            ratio_1 = (df[col1] == val1).mean()
            ratio_2 = (df[col2] == value_mapping[val1]).mean()
        if ratio_1 != ratio_2:
            return False

    return True

def compare_all_features(df):
    ratios = {column: get_value_counts_ratio(df[column]) for column in df.columns if not pd.api.types.is_datetime64_any_dtype(df[column])}
    similar_columns_dict = {column: [] for column in df.columns if not pd.api.types.is_datetime64_any_dtype(df[column])}

    # 고유값 비율이 같은 열들 찾기
    columns = list(ratios.keys())
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            if np.array_equal(ratios[columns[i]], ratios[columns[j]]):
                similar_columns_dict[columns[i]].append(columns[j])

    # 고유값 비율이 같고 매핑도 동일한 피쳐들 찾기
    comparisons = []
    for key, values in similar_columns_dict.items():
        for value in values:
            if check_value_mapping(df, key, value):
                comparisons.append((key, value))

    if comparisons:
        print("다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:")
        for base, compare in comparisons:
            print(f"'{base}'와(과) '{compare}'")
    else:
        print("모든 피쳐의 고유값 비율과 양상이 동일하지 않습니다.")

    return comparisons

def reduce_dataframe(df):
    comparisons = compare_all_features(df)
    columns_to_remove = set()
    for _, col_to_remove in comparisons:
        columns_to_remove.add(col_to_remove)
    
    reduced_df = df.drop(columns=columns_to_remove)
    return reduced_df

# 함수 실행
df_reduced = reduce_dataframe(df_concat)


다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:
'Equipment - Dam'와(과) 'Collect Result - Dam'
'Equipment - Dam'와(과) 'Collect Result.1 - Dam'
'Equipment - Dam'와(과) 'Collect Result.2 - Dam'
'Equipment - Dam'와(과) 'Collect Result.7 - Dam'
'Equipment - Dam'와(과) 'Collect Result.9 - Dam'
'Model.Suffix - Dam'와(과) 'Model.Suffix - AutoClave'
'Model.Suffix - Dam'와(과) 'Model.Suffix - Fill1'
'Model.Suffix - Dam'와(과) 'Model.Suffix - Fill2'
'Workorder - Dam'와(과) 'Workorder - AutoClave'
'Workorder - Dam'와(과) 'Workorder - Fill1'
'Workorder - Dam'와(과) 'Workorder - Fill2'
'Collect Result - Dam'와(과) 'Collect Result.1 - Dam'
'Collect Result - Dam'와(과) 'Collect Result.2 - Dam'
'Collect Result - Dam'와(과) 'Collect Result.7 - Dam'
'Collect Result - Dam'와(과) 'Collect Result.9 - Dam'
'Collect Result.1 - Dam'와(과) 'Collect Result.2 - Dam'
'Collect Result.1 - Dam'와(과) 'Collect Result.7 - Dam'
'Collect Result.1 - Dam'와(과) 'Collect Result.9 - Dam'
'Collect Result.2 - Dam'와(과) 'Collect Result.7 - Dam'
'Collect Result.2 - Dam'와(과) 'Collect

In [15]:
print("최종적으로 남겨진 피처들:")
print(len(df_reduced.columns))

최종적으로 남겨진 피처들:
100


In [16]:
# import pandas as pd
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score
# from sklearn.preprocessing import LabelEncoder, OneHotEncod
# er
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer


# # 단계별 컬럼 분류
# dam_cols = [col for col in df_reduced.columns if 'Dam' in col]
# dam_cols.remove('Workorder - Dam')
# autoclave_cols = [col for col in df_reduced.columns if 'AutoClave' in col]
# autoclave_cols.remove('Collect Date - AutoClave')
# fill1_cols = [col for col in df_reduced.columns if 'Fill1' in col]
# fill2_cols = [col for col in df_reduced.columns if 'Fill2' in col]

# # 타겟 컬럼 추가
# target_col = 'target'  # 실제 타겟 컬럼명을 지정하세요

# # 단계별 데이터프레임 생성 (타겟 컬럼 포함)
# dam_df_reduced = df_reduced.loc[:, dam_cols + [target_col]].copy()
# autoclave_df_reduced = df_reduced.loc[:, autoclave_cols + [target_col]].copy()
# fill1_df_reduced = df_reduced.loc[:, fill1_cols + [target_col]].copy()
# fill2_df_reduced = df_reduced.loc[:, fill2_cols + [target_col]].copy()

# # 타겟 값 인코딩
# label_encoder = LabelEncoder()
# df_reduced.loc[:, target_col] = label_encoder.fit_transform(df_reduced[target_col])

# # 모델 학습 및 평가 함수 정의
# def evaluate_model(dataframe, target_col='target'):
#     X = dataframe.drop(columns=[target_col])
#     y = dataframe[target_col]
    
#     # OneHotEncoder를 사용하여 범주형 변수를 인코딩합니다.
#     categorical_features = X.select_dtypes(include=['object']).columns
#     numerical_features = X.select_dtypes(exclude=['object']).columns

#     # 파이프라인 정의
#     preprocessor = ColumnTransformer(
#         transformers=[
#             ('num', SimpleImputer(strategy='mean'), numerical_features),
#             ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
#         ])

#     model = Pipeline(steps=[('preprocessor', preprocessor),
#                             ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])

#     scores = cross_val_score(model, X, y, cv=5, scoring='f1_weighted')  # pos_label 문제 해결 위해 f1_weighted 사용
#     return scores.mean()

# # 조합 생성 및 평가를 위한 함수
# def create_combination_and_evaluate(df_list, target_col):
#     combined_df = pd.concat([df.drop(columns=[target_col]) for df in df_list], axis=1)
#     combined_df.loc[:, target_col] = df_list[0][target_col].values
#     return evaluate_model(combined_df)

# # 각 단계별 조합 생성 및 평가
# combinations = [
#     ([dam_df_reduced], 'Dam'),
#     ([autoclave_df_reduced], 'AutoClave'),
#     ([fill1_df_reduced], 'Fill1'),
#     ([fill2_df_reduced], 'Fill2'),
#     ([dam_df_reduced, autoclave_df_reduced], 'Dam + AutoClave'),
#     ([dam_df_reduced, fill1_df_reduced], 'Dam + Fill1'),
#     ([dam_df_reduced, fill2_df_reduced], 'Dam + Fill2'),
#     ([autoclave_df_reduced, fill1_df_reduced], 'AutoClave + Fill1'),
#     ([autoclave_df_reduced, fill2_df_reduced], 'AutoClave + Fill2'),
#     ([fill1_df_reduced, fill2_df_reduced], 'Fill1 + Fill2'),
#     ([dam_df_reduced, autoclave_df_reduced, fill1_df_reduced], 'Dam + AutoClave + Fill1'),
#     ([dam_df_reduced, autoclave_df_reduced, fill2_df_reduced], 'Dam + AutoClave + Fill2'),
#     ([autoclave_df_reduced, fill1_df_reduced, fill2_df_reduced], 'AutoClave + Fill1 + Fill2'),
# ]

# # 각 조합별 모델 평가
# for combination, name in combinations:
#     score = create_combination_and_evaluate(combination, target_col)
#     print(f"{name} 단계 모델 성능 (F1 score): {score}")


### 데이터 분할

In [17]:
df_train = df_reduced.sort_values(by=["Collect Date - Dam"])

In [18]:
print(df_train.shape)

(4700, 100)


In [19]:
# df_train_shap = df_train[['Collect Date - Dam','Collect Result.17 - Dam','Collect Result.38 - Dam', 'Collect Result.39 - Dam', 'Collect Result.40 - Dam',
# 'Collect Result - AutoClave', 'Collect Result.1 - AutoClave', 'Unit Time.1 - AutoClave', 'Collect Result.2 - AutoClave', 'Collect Result.3 - AutoClave',
# 'Collect Result.26 - Fill1', 'Collect Result.27 - Fill1', 
# 'Collect Result.36 - Fill2', 'Collect Result.37 - Fill2','target'
# ]]

In [20]:
# 날짜 데이터로 변환
df_train['Collect Date - Dam'] = pd.to_datetime(df_train['Collect Date - Dam'])
df_train['Collect Date - AutoClave'] = pd.to_datetime(df_train['Collect Date - AutoClave'])
df_train['Collect Date - Fill1'] = pd.to_datetime(df_train['Collect Date - Fill1'])
df_train['Collect Date - Fill2'] = pd.to_datetime(df_train['Collect Date - Fill2'])

# 장비별 Collect Result 컬럼들 식별
collect_result_columns_dam = [col for col in df_train.columns if 'Collect Result' in col and 'Dam' in col]
collect_result_columns_autoclave = [col for col in df_train.columns if 'Collect Result' in col and 'AutoClave' in col]
collect_result_columns_fill1 = [col for col in df_train.columns if 'Collect Result' in col and 'Fill1' in col]
collect_result_columns_fill2 = [col for col in df_train.columns if 'Collect Result' in col and 'Fill2' in col]

# Dam 장비의 평균, 표준편차, 합계 계산
df_train['Collect_Result_Mean_Dam'] = df_train[collect_result_columns_dam].astype(float).mean(axis=1)
df_train['Collect_Result_Std_Dam'] = df_train[collect_result_columns_dam].astype(float).std(axis=1)
df_train['Collect_Result_Sum_Dam'] = df_train[collect_result_columns_dam].astype(float).sum(axis=1)

# AutoClave 장비의 평균, 표준편차, 합계 계산
df_train['Collect_Result_Mean_AutoClave'] = df_train[collect_result_columns_autoclave].astype(float).mean(axis=1)
df_train['Collect_Result_Std_AutoClave'] = df_train[collect_result_columns_autoclave].astype(float).std(axis=1)
df_train['Collect_Result_Sum_AutoClave'] = df_train[collect_result_columns_autoclave].astype(float).sum(axis=1)

# Fill1 장비의 평균, 표준편차, 합계 계산
df_train['Collect_Result_Mean_Fill1'] = df_train[collect_result_columns_fill1].astype(float).mean(axis=1)
df_train['Collect_Result_Std_Fill1'] = df_train[collect_result_columns_fill1].astype(float).std(axis=1)
df_train['Collect_Result_Sum_Fill1'] = df_train[collect_result_columns_fill1].astype(float).sum(axis=1)

# Fill2 장비의 평균, 표준편차, 합계 계산
df_train['Collect_Result_Mean_Fill2'] = df_train[collect_result_columns_fill2].astype(float).mean(axis=1)
df_train['Collect_Result_Std_Fill2'] = df_train[collect_result_columns_fill2].astype(float).std(axis=1)
df_train['Collect_Result_Sum_Fill2'] = df_train[collect_result_columns_fill2].astype(float).sum(axis=1)

# 각 장비 간의 시간 차이 계산 (초 단위)
df_train['Time_Difference_Dam_Fill1'] = (df_train['Collect Date - Dam'] - df_train['Collect Date - Fill1']).dt.total_seconds()
df_train['Time_Difference_Dam_Fill2'] = (df_train['Collect Date - Dam'] - df_train['Collect Date - Fill2']).dt.total_seconds()
df_train['Time_Difference_Dam_AutoClave'] = (df_train['Collect Date - Dam'] - df_train['Collect Date - AutoClave']).dt.total_seconds()
df_train['Time_Difference_Fill1_Fill2'] = (df_train['Collect Date - Fill1'] - df_train['Collect Date - Fill2']).dt.total_seconds()
df_train['Time_Difference_Fill1_AutoClave'] = (df_train['Collect Date - Fill1'] - df_train['Collect Date - AutoClave']).dt.total_seconds()
df_train['Time_Difference_Fill2_AutoClave'] = (df_train['Collect Date - Fill2'] - df_train['Collect Date - AutoClave']).dt.total_seconds()

In [21]:
# import sweetviz as sv

# # Sweetviz 분석 리포트 생성
# report = sv.analyze(df_train_shap)

# # 리포트 저장 및 표시
# report.show_html('변동피쳐데이터.html')

In [22]:
df_train['Collect Date - Dam'] = pd.to_datetime(df_train['Collect Date - Dam']).apply(lambda x: x.timestamp())
df_train['Collect Date - Fill1'] = pd.to_datetime(df_train['Collect Date - Fill1']).apply(lambda x: x.timestamp())
df_train['Collect Date - Fill2'] = pd.to_datetime(df_train['Collect Date - Fill2']).apply(lambda x: x.timestamp())
df_train['Collect Date - AutoClave'] = pd.to_datetime(df_train['Collect Date - AutoClave']).apply(lambda x: x.timestamp())

In [23]:
y_train = df_train.target
df_train.drop(columns = ['target'],inplace =True)

In [24]:
print(df_train.dtypes)

Equipment - Dam                     object
Model.Suffix - Dam                  object
Workorder - Dam                     object
Collect Date - Dam                 float64
Collect Result.3 - Dam               int64
Collect Result.10 - Dam              int64
Collect Result.11 - Dam            float64
Collect Result.12 - Dam            float64
Collect Result.13 - Dam            float64
Collect Result.14 - Dam            float64
Collect Result.15 - Dam            float64
Collect Result.16 - Dam            float64
Collect Result.17 - Dam            float64
Collect Result.18 - Dam            float64
Collect Result.19 - Dam            float64
Collect Result.20 - Dam            float64
Collect Result.21 - Dam            float64
Collect Result.22 - Dam            float64
Collect Result.23 - Dam            float64
Collect Result.24 - Dam            float64
Collect Result.26 - Dam            float64
Collect Result.27 - Dam              int64
Collect Result.31 - Dam            float64
Collect Res

### SHAP

In [25]:
# # NaN 및 무한대 값 처리
# df_train_shap = df_train.replace([np.inf, -np.inf], np.nan).dropna()
# # df_encoded = df_encoded.applymap(lambda x: x if x > 1e-2 else 0)
# df_train_shap['target'] = df_train_shap['target'].map({'AbNormal': 1, 'Normal': 0})
# df_cleaned = df_train_shap.drop(columns=['Workorder - Dam'])

# # 문자열 데이터를 범주형으로 변환
# categorical_cols = df_cleaned.select_dtypes(include=['object']).columns
# df_cleaned[categorical_cols] = df_cleaned[categorical_cols].astype('category')

# # 범주형 데이터를 원-핫 인코딩
# df_encoded = pd.get_dummies(df_cleaned, drop_first=True)

# # 독립 변수와 종속 변수 분리
# X_shap = df_encoded.drop(columns=['target'])
# y_shap = df_encoded['target']

# # 학습 데이터와 테스트 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X_shap, y_shap, test_size=0.3, random_state=42)

In [26]:
# # XGBoost 모델 학습
# model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# model.fit(X_train, y_train)

# # 예측
# y_pred = model.predict(X_test)

# # F1 score 계산
# f1 = f1_score(y_test, y_pred)
# print(f'F1 Score: {f1}')

In [27]:
# # SHAP 값 계산
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(X_test)

# # SHAP 값을 데이터프레임으로 변환
# shap_values_df = pd.DataFrame(shap_values, columns=X_test.columns)

# # 각 피쳐의 평균 절대 SHAP 값 계산
# mean_abs_shap_values = shap_values_df.abs().mean()

# # 상위 N개의 피쳐 선택 (여기서는 상위 10개 피쳐를 예시로 선택)
# top_features = mean_abs_shap_values.sort_values(ascending=False).head(50)
# # 평균 절대 SHAP 값이 0.01 이상인 피처 선택
# selected_features = top_features[top_features >= 0.01]

# # 피처 이름만 추출
# selected_feature_names = selected_features.index.tolist()

# # 결과 출력
# print("평균 절대 SHAP 값이 0.01 이상인 피처들:\n", selected_feature_names)

In [28]:
# # SHAP 값 계산
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(X_test)

# # SHAP 값 시각화
# shap.summary_plot(shap_values, X_test, plot_type="bar")
# shap.summary_plot(shap_values, X_test)



In [29]:
# X_train_shap = X_shap[selected_feature_names]

## 3. 모델 학습

### 모델 정의 

In [30]:
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score
import optuna
import warnings
warnings.filterwarnings("ignore")

model = RandomForestClassifier(random_state=RANDOM_STATE)

### 모델 학습

In [31]:
features = []

for col in df_train.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue

if "Set ID" in features:
    features.remove("Set ID")

train_x = df_train[features]
train_y = y_train

In [32]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_y)

### Catboost

In [33]:
# 하이퍼파라미터 튜닝을 위한 목적 함수 정의 + L2 추가 
def objective_decision(trial):
    cat_depth = trial.suggest_int('depth', 2, 10, step=1)
    cat_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.01, step=0.001)
    cat_iterations = trial.suggest_int('iterations', 100, 300, step=10)
    cat_l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1e-5, 1e-1, log=True)

    classifier_obj = CatBoostClassifier(
        depth=cat_depth,
        learning_rate=cat_learning_rate,
        iterations=cat_iterations,
        l2_leaf_reg=cat_l2_leaf_reg,
        early_stopping_rounds=50,  # 조기 종료 설정
        verbose=0  # 훈련 중 출력 억제
    )

    # 교차 검증을 통해 F1 스코어 계산
    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1', error_score='raise')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective_decision, n_trials=10) 

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-05 20:58:54,577] A new study created in memory with name: no-name-9800cf03-e42a-4d1f-9bf1-aed742b5a72a
[I 2024-08-05 20:58:59,989] Trial 0 finished with value: 0.3244089084920887 and parameters: {'depth': 6, 'learning_rate': 0.003, 'iterations': 180, 'l2_leaf_reg': 0.023938918670661075}. Best is trial 0 with value: 0.3244089084920887.
[I 2024-08-05 20:59:03,336] Trial 1 finished with value: 0.3731035866605333 and parameters: {'depth': 2, 'learning_rate': 0.002, 'iterations': 240, 'l2_leaf_reg': 0.020109951371648067}. Best is trial 1 with value: 0.3731035866605333.
[I 2024-08-05 20:59:07,197] Trial 2 finished with value: 0.351775122383572 and parameters: {'depth': 3, 'learning_rate': 0.006, 'iterations': 280, 'l2_leaf_reg': 6.867655291394631e-05}. Best is trial 1 with value: 0.3731035866605333.
[I 2024-08-05 20:59:09,946] Trial 3 finished with value: 0.3238420295941064 and parameters: {'depth': 3, 'learning_rate': 0.002, 'iterations': 140, 'l2_leaf_reg': 0.08212873599011565}.

Best score: 0.3731035866605333
Best parameters: {'depth': 2, 'learning_rate': 0.002, 'iterations': 240, 'l2_leaf_reg': 0.020109951371648067}


In [34]:
cat = CatBoostClassifier(**study.best_params)
cat.fit(train_x, y_encoded)

0:	learn: 0.6929550	total: 137ms	remaining: 32.8s
1:	learn: 0.6927676	total: 139ms	remaining: 16.5s
2:	learn: 0.6925791	total: 141ms	remaining: 11.1s
3:	learn: 0.6923772	total: 142ms	remaining: 8.38s
4:	learn: 0.6919903	total: 144ms	remaining: 6.76s
5:	learn: 0.6918121	total: 145ms	remaining: 5.67s
6:	learn: 0.6916364	total: 147ms	remaining: 4.88s
7:	learn: 0.6914564	total: 148ms	remaining: 4.3s
8:	learn: 0.6912792	total: 150ms	remaining: 3.85s
9:	learn: 0.6911023	total: 152ms	remaining: 3.49s
10:	learn: 0.6909367	total: 153ms	remaining: 3.19s
11:	learn: 0.6905991	total: 156ms	remaining: 2.96s
12:	learn: 0.6904130	total: 157ms	remaining: 2.75s
13:	learn: 0.6900638	total: 159ms	remaining: 2.57s
14:	learn: 0.6898824	total: 161ms	remaining: 2.41s
15:	learn: 0.6896058	total: 162ms	remaining: 2.27s
16:	learn: 0.6894980	total: 164ms	remaining: 2.15s
17:	learn: 0.6892557	total: 166ms	remaining: 2.04s
18:	learn: 0.6890698	total: 167ms	remaining: 1.95s
19:	learn: 0.6889278	total: 169ms	remainin

<catboost.core.CatBoostClassifier at 0x21d07416450>

### xgboost

In [35]:
from xgboost import XGBClassifier

def objective_decision(trial):
    xgbm_n_estimators = trial.suggest_int('n_estimators', 300, 400, step=50)
    xgbm_learning_rate = trial.suggest_float('learning_rate', 0.02, 0.05, step=0.01)
    xgbm_max_depth = trial.suggest_int('max_depth', 3, 10, step=1)
    xgbm_gamma = trial.suggest_float('gamma', 0.0, 0.5, step=0.1)
    xgbm_min_child_weight = trial.suggest_int('min_child_weight', 1, 10, step=1)
    xgbm_subsample = trial.suggest_float('subsample', 0.5, 1.0, step=0.1)
    xgbm_colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1)

    classifier_obj = XGBClassifier(
        n_estimators=xgbm_n_estimators,
        learning_rate=xgbm_learning_rate,
        max_depth=xgbm_max_depth,
        gamma=xgbm_gamma,
        min_child_weight=xgbm_min_child_weight,
        subsample=xgbm_subsample,
        colsample_bytree=xgbm_colsample_bytree,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1', error_score='raise')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective_decision, n_trials=10) 

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-05 21:00:28,094] A new study created in memory with name: no-name-b5498029-1782-4321-a83e-a2277420c0cb
[I 2024-08-05 21:00:29,374] Trial 0 finished with value: 0.3435762668150716 and parameters: {'n_estimators': 350, 'learning_rate': 0.03, 'max_depth': 6, 'gamma': 0.5, 'min_child_weight': 1, 'subsample': 0.5, 'colsample_bytree': 0.9}. Best is trial 0 with value: 0.3435762668150716.
[I 2024-08-05 21:00:30,585] Trial 1 finished with value: 0.3445106864233857 and parameters: {'n_estimators': 400, 'learning_rate': 0.02, 'max_depth': 7, 'gamma': 0.5, 'min_child_weight': 3, 'subsample': 0.6, 'colsample_bytree': 0.5}. Best is trial 1 with value: 0.3445106864233857.
[I 2024-08-05 21:00:31,709] Trial 2 finished with value: 0.34293916697531157 and parameters: {'n_estimators': 300, 'learning_rate': 0.05, 'max_depth': 9, 'gamma': 0.1, 'min_child_weight': 9, 'subsample': 0.6, 'colsample_bytree': 0.7}. Best is trial 1 with value: 0.3445106864233857.
[I 2024-08-05 21:00:32,625] Trial 3 fin

Best score: 0.3616543924520876
Best parameters: {'n_estimators': 300, 'learning_rate': 0.02, 'max_depth': 3, 'gamma': 0.30000000000000004, 'min_child_weight': 4, 'subsample': 0.8, 'colsample_bytree': 0.8}


In [36]:
xgbm = XGBClassifier(**study.best_params)
xgbm.fit(train_x, y_encoded)

In [37]:
# y_encoded #0은 AbNormal, 1은 Normal

### RandomForest

In [38]:
def objective(trial):
    # 하이퍼파라미터 설정
    rf_n_estimators = trial.suggest_int('n_estimators', 200, 400, step=20)
    rf_max_depth = trial.suggest_int('max_depth', 2, 15, step=2)
    rf_min_samples_split = trial.suggest_int('min_samples_split', 5, 20, step=2)
    rf_max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 60, step=2)
    rf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10, step=1)
    rf_max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    classifier_obj = RandomForestClassifier(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        max_leaf_nodes=rf_max_leaf_nodes,
        min_samples_leaf=rf_min_samples_leaf,
        max_features=rf_max_features,
        random_state=100
    )

    # 교차 검증을 통해 F1 스코어 계산
    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective, n_trials=10)

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-05 21:00:39,540] A new study created in memory with name: no-name-20143f81-97b1-4e27-b820-075c3ab0dff7
[I 2024-08-05 21:00:40,486] Trial 0 finished with value: 0.3251591173226765 and parameters: {'n_estimators': 300, 'max_depth': 4, 'min_samples_split': 11, 'max_leaf_nodes': 52, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.3251591173226765.
[I 2024-08-05 21:00:41,505] Trial 1 finished with value: 0.37569688663813167 and parameters: {'n_estimators': 380, 'max_depth': 2, 'min_samples_split': 13, 'max_leaf_nodes': 54, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.37569688663813167.
[I 2024-08-05 21:00:42,368] Trial 2 finished with value: 0.32629326926338614 and parameters: {'n_estimators': 240, 'max_depth': 14, 'min_samples_split': 17, 'max_leaf_nodes': 12, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 1 with value: 0.37569688663813167.
[I 2024-08-05 21:00:43,690] Trial 3 finished with value: 0.325

Best score: 0.37569688663813167
Best parameters: {'n_estimators': 380, 'max_depth': 2, 'min_samples_split': 13, 'max_leaf_nodes': 54, 'min_samples_leaf': 3, 'max_features': 'sqrt'}


In [39]:
rf = RandomForestClassifier(**study.best_params)
rf.fit(train_x, y_encoded)

## 4. 제출하기

### 테스트 데이터 예측

테스트 데이터 불러오기

In [40]:
df_test_y = pd.read_csv(os.path.join("submission.csv"))

In [41]:
df_test = pd.merge(X, df_test_y, "inner", on="Set ID")

In [42]:
# 'OK' 값을 포함하는 컬럼 찾기
ok_columns = df_test.columns[df_test.isin(['OK']).any()]

# 'OK' 값을 포함하는 컬럼 출력
print("OK 값을 포함하는 컬럼: ", ok_columns.tolist())

OK 값을 포함하는 컬럼:  ['Insp Judge Code - Dam', 'Collect Result.17 - Dam', 'Judge Value.17 - Dam', 'Insp Judge Code - AutoClave', 'Judge Value - AutoClave', 'Judge Value.1 - AutoClave', 'Judge Value.2 - AutoClave', 'Judge Value.3 - AutoClave', 'Collect Result.4 - AutoClave', 'Judge Value.4 - AutoClave', 'Insp Judge Code - Fill1', 'Collect Result.7 - Fill1', 'Judge Value.7 - Fill1', 'Insp Judge Code - Fill2', 'Collect Result.17 - Fill2', 'Judge Value.17 - Fill2']


In [43]:
df_test.replace('OK', np.nan, inplace=True)

In [44]:
# 테스트셋 적용
df_test.loc[df_test['Collect Result.17 - Dam'] == 'OK', 'Collect Result.17 - Dam'] = 5000
df_test.loc[df_test['Collect Result.7 - Fill1'] == 'OK', 'Collect Result.7 - Fill1'] = 8000
df_test.loc[df_test['Collect Result.17 - Fill2'] == 'OK', 'Collect Result.17 - Fill2'] = 6000

# 고유값, 혼합값 피쳐 제거
df_test.drop(columns = unique_value_1_columns, inplace = True)
df_test.drop(columns = matching_row_columns, inplace = True)

# 컬럼 리스트
columns_to_convert = ['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']
# 문자열을 숫자형으로 변환
for column in columns_to_convert:
    df_test[column] = pd.to_numeric(df_test[column], errors='coerce')

# 날짜 데이터로 변환
df_test['Collect Date - Dam'] = pd.to_datetime(df_test['Collect Date - Dam'])
df_test['Collect Date - AutoClave'] = pd.to_datetime(df_test['Collect Date - AutoClave'])
df_test['Collect Date - Fill1'] = pd.to_datetime(df_test['Collect Date - Fill1'])
df_test['Collect Date - Fill2'] = pd.to_datetime(df_test['Collect Date - Fill2'])

# # 동일 고유값 피쳐 제거 함수 실행
# df_test = reduce_dataframe(df_test)

# 장비별 Collect Result 컬럼들 식별
collect_result_columns_dam = [col for col in df_test.columns if 'Collect Result' in col and 'Dam' in col]
collect_result_columns_autoclave = [col for col in df_test.columns if 'Collect Result' in col and 'AutoClave' in col]
collect_result_columns_fill1 = [col for col in df_test.columns if 'Collect Result' in col and 'Fill1' in col]
collect_result_columns_fill2 = [col for col in df_test.columns if 'Collect Result' in col and 'Fill2' in col]

# Dam 장비의 평균, 표준편차, 합계 계산
df_test['Collect_Result_Mean_Dam'] = df_test[collect_result_columns_dam].astype(float).mean(axis=1)
df_test['Collect_Result_Std_Dam'] = df_test[collect_result_columns_dam].astype(float).std(axis=1)
df_test['Collect_Result_Sum_Dam'] = df_test[collect_result_columns_dam].astype(float).sum(axis=1)

# AutoClave 장비의 평균, 표준편차, 합계 계산
df_test['Collect_Result_Mean_AutoClave'] = df_test[collect_result_columns_autoclave].astype(float).mean(axis=1)
df_test['Collect_Result_Std_AutoClave'] = df_test[collect_result_columns_autoclave].astype(float).std(axis=1)
df_test['Collect_Result_Sum_AutoClave'] = df_test[collect_result_columns_autoclave].astype(float).sum(axis=1)

# Fill1 장비의 평균, 표준편차, 합계 계산
df_test['Collect_Result_Mean_Fill1'] = df_test[collect_result_columns_fill1].astype(float).mean(axis=1)
df_test['Collect_Result_Std_Fill1'] = df_test[collect_result_columns_fill1].astype(float).std(axis=1)
df_test['Collect_Result_Sum_Fill1'] = df_test[collect_result_columns_fill1].astype(float).sum(axis=1)

# Fill2 장비의 평균, 표준편차, 합계 계산
df_test['Collect_Result_Mean_Fill2'] = df_test[collect_result_columns_fill2].astype(float).mean(axis=1)
df_test['Collect_Result_Std_Fill2'] = df_test[collect_result_columns_fill2].astype(float).std(axis=1)
df_test['Collect_Result_Sum_Fill2'] = df_test[collect_result_columns_fill2].astype(float).sum(axis=1)

# 각 장비 간의 시간 차이 계산 (초 단위)
df_test['Time_Difference_Dam_Fill1'] = (df_test['Collect Date - Dam'] - df_test['Collect Date - Fill1']).dt.total_seconds()
df_test['Time_Difference_Dam_Fill2'] = (df_test['Collect Date - Dam'] - df_test['Collect Date - Fill2']).dt.total_seconds()
df_test['Time_Difference_Dam_AutoClave'] = (df_test['Collect Date - Dam'] - df_test['Collect Date - AutoClave']).dt.total_seconds()
df_test['Time_Difference_Fill1_Fill2'] = (df_test['Collect Date - Fill1'] - df_test['Collect Date - Fill2']).dt.total_seconds()
df_test['Time_Difference_Fill1_AutoClave'] = (df_test['Collect Date - Fill1'] - df_test['Collect Date - AutoClave']).dt.total_seconds()
df_test['Time_Difference_Fill2_AutoClave'] = (df_test['Collect Date - Fill2'] - df_test['Collect Date - AutoClave']).dt.total_seconds()


df_test['Collect Date - Dam'] = pd.to_datetime(df_test['Collect Date - Dam']).apply(lambda x: x.timestamp())
df_test['Collect Date - Fill1'] = pd.to_datetime(df_test['Collect Date - Fill1']).apply(lambda x: x.timestamp())
df_test['Collect Date - Fill2'] = pd.to_datetime(df_test['Collect Date - Fill2']).apply(lambda x: x.timestamp())
df_test['Collect Date - AutoClave'] = pd.to_datetime(df_test['Collect Date - AutoClave']).apply(lambda x: x.timestamp())

In [45]:
df_test_x = df_test[train_x.columns]

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except:
        continue

In [46]:
cat_test_pred = cat.predict(df_test_x)
xgbm_test_pred = xgbm.predict(df_test_x)
rf_test_pred = rf.predict(df_test_x)

### voting 전 평가

In [47]:
# 학습데이터 70%, 평가데이터 30%로 데이터 분할
X_tra, X_dev, y_tra, y_dev = train_test_split(train_x, y_encoded, test_size=0.3, stratify=train_y, random_state=0)

In [48]:
y_dev

array([1, 1, 1, ..., 0, 1, 0])

In [49]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score

averaging = VotingClassifier(
    estimators = [('xgbm', xgbm), ('cat', cat),  ('rf', rf)], voting='soft', weights=[4, 4, 3])

averaging.fit(X_tra, y_tra)

ensemble_pred = averaging.predict(X_dev)

print(f1_score(y_dev, ensemble_pred))

0:	learn: 0.6929467	total: 1.53ms	remaining: 367ms
1:	learn: 0.6927191	total: 2.95ms	remaining: 351ms
2:	learn: 0.6923609	total: 4.33ms	remaining: 342ms
3:	learn: 0.6921518	total: 5.75ms	remaining: 339ms
4:	learn: 0.6918632	total: 7.35ms	remaining: 345ms
5:	learn: 0.6916796	total: 8.66ms	remaining: 338ms
6:	learn: 0.6915032	total: 9.9ms	remaining: 329ms
7:	learn: 0.6911558	total: 11.4ms	remaining: 332ms
8:	learn: 0.6909663	total: 12.8ms	remaining: 328ms
9:	learn: 0.6908036	total: 14.1ms	remaining: 324ms
10:	learn: 0.6906177	total: 15.4ms	remaining: 322ms
11:	learn: 0.6904160	total: 16.8ms	remaining: 320ms
12:	learn: 0.6902622	total: 18.2ms	remaining: 317ms
13:	learn: 0.6899070	total: 19.5ms	remaining: 315ms
14:	learn: 0.6897682	total: 20.9ms	remaining: 313ms
15:	learn: 0.6895699	total: 22.3ms	remaining: 312ms
16:	learn: 0.6893911	total: 23.6ms	remaining: 310ms
17:	learn: 0.6892645	total: 24.9ms	remaining: 307ms
18:	learn: 0.6890769	total: 26ms	remaining: 303ms
19:	learn: 0.6889286	tota

### voting

In [50]:
from sklearn.ensemble import VotingClassifier

averaging_model = VotingClassifier(
    estimators = [('xgbm', xgbm), ('cat', cat),  ('rf', rf)], voting='soft', weights=[4, 4, 3])

averaging_model.fit(train_x, train_y)

ensemble_pred = averaging_model.predict(df_test_x)

0:	learn: 0.6929550	total: 1.78ms	remaining: 426ms
1:	learn: 0.6927676	total: 3.55ms	remaining: 423ms
2:	learn: 0.6925791	total: 5.05ms	remaining: 399ms
3:	learn: 0.6923772	total: 6.59ms	remaining: 389ms
4:	learn: 0.6919903	total: 8.28ms	remaining: 389ms
5:	learn: 0.6918121	total: 9.76ms	remaining: 381ms
6:	learn: 0.6916364	total: 11.3ms	remaining: 376ms
7:	learn: 0.6914564	total: 12.8ms	remaining: 372ms
8:	learn: 0.6912792	total: 14.5ms	remaining: 372ms
9:	learn: 0.6911023	total: 16ms	remaining: 368ms
10:	learn: 0.6909367	total: 17.6ms	remaining: 367ms
11:	learn: 0.6905991	total: 19.3ms	remaining: 367ms
12:	learn: 0.6904130	total: 20.9ms	remaining: 365ms
13:	learn: 0.6900638	total: 22.6ms	remaining: 365ms
14:	learn: 0.6898824	total: 24.2ms	remaining: 364ms
15:	learn: 0.6896058	total: 26ms	remaining: 364ms
16:	learn: 0.6894980	total: 27.8ms	remaining: 365ms
17:	learn: 0.6892557	total: 29.6ms	remaining: 366ms
18:	learn: 0.6890698	total: 31.5ms	remaining: 367ms
19:	learn: 0.6889278	total

In [51]:
ensemble_pred 

array(['AbNormal', 'AbNormal', 'AbNormal', ..., 'Normal', 'Normal',
       'Normal'], dtype=object)

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = ensemble_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**