# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [1]:
import os
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

pd.set_option('display.max_columns', None)

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# CSV 파일 읽기
X_Dam = pd.read_csv(os.path.join(ROOT_DIR, "Dam dispensing.csv"))
X_AutoClave = pd.read_csv(os.path.join(ROOT_DIR, "Auto clave.csv"))
X_Fill1 = pd.read_csv(os.path.join(ROOT_DIR, "Fill1 dispensing.csv"))
X_Fill2 = pd.read_csv(os.path.join(ROOT_DIR, "Fill2 dispensing.csv"))

# y 데이터 읽기
y = pd.read_csv(os.path.join(ROOT_DIR, "train_y.csv"))


  X_Dam = pd.read_csv(os.path.join(ROOT_DIR, "Dam dispensing.csv"))
  X_AutoClave = pd.read_csv(os.path.join(ROOT_DIR, "Auto clave.csv"))
  X_Fill1 = pd.read_csv(os.path.join(ROOT_DIR, "Fill1 dispensing.csv"))
  X_Fill2 = pd.read_csv(os.path.join(ROOT_DIR, "Fill2 dispensing.csv"))


### 데이터 병합

x 데이터 병합

In [3]:
# Rename columns
X_Dam.columns = [i + " - Dam" for i in X_Dam.columns]
X_AutoClave.columns = [i + " - AutoClave" for i in X_AutoClave.columns]
X_Fill1.columns = [i + " - Fill1" for i in X_Fill1.columns]
X_Fill2.columns = [i + " - Fill2" for i in X_Fill2.columns]
X_Dam = X_Dam.rename(columns={"Set ID - Dam": "Set ID"})
X_AutoClave = X_AutoClave.rename(columns={"Set ID - AutoClave": "Set ID"})
X_Fill1 = X_Fill1.rename(columns={"Set ID - Fill1": "Set ID"})
X_Fill2 = X_Fill2.rename(columns={"Set ID - Fill2": "Set ID"})

# Merge X
X = pd.merge(X_Dam, X_AutoClave, on="Set ID")
X = pd.merge(X, X_Fill1, on="Set ID")
X = pd.merge(X, X_Fill2, on="Set ID")
X = X.drop(X[X.duplicated(subset="Set ID")].index).reset_index(drop=True)

x 데이터와 y 데이터 병합

In [4]:
# Merge X and y
df_merged = pd.merge(X, y, "inner", on="Set ID")

# Drop columns with more than half of the values missing
drop_cols = []
for column in df_merged.columns:
    if (df_merged[column].notnull().sum() // 2) < df_merged[
        column
    ].isnull().sum():
        drop_cols.append(column)
df_merged = df_merged.drop(drop_cols, axis=1)

# Drop Lot ID
df_merged = df_merged.drop("LOT ID - Dam", axis=1)

In [5]:
# 결측치 피쳐 세 개의 피쳐가 결측치 수가 동일함
na_list = []
for i in df_merged.columns:
    if df_merged[i].isna().sum() > 0:
        na_list.append(i)
na_list

['Collect Result.17 - Dam',
 'Collect Result.7 - Fill1',
 'Collect Result.17 - Fill2']

## 2. 데이터 전처리

### @고유값(전부 다른 값, 단일값) 피쳐 제거@

In [6]:
# 고유값이 1개인 피쳐 찾기
unique_value_1_columns = [column for column in df_merged.columns if df_merged[column].nunique() == 1]

print("고유값이 1개인 피쳐들:")
print(unique_value_1_columns)

고유값이 1개인 피쳐들:
['Wip Line - Dam', 'Process Desc. - Dam', 'Insp. Seq No. - Dam', 'Insp Judge Code - Dam', 'Collect Result.4 - Dam', 'Collect Result.5 - Dam', 'Collect Result.6 - Dam', 'Collect Result.8 - Dam', 'Wip Line - AutoClave', 'Process Desc. - AutoClave', 'Equipment - AutoClave', 'Insp. Seq No. - AutoClave', 'Insp Judge Code - AutoClave', 'Judge Value - AutoClave', 'Judge Value.1 - AutoClave', 'Judge Value.2 - AutoClave', 'Wip Line - Fill1', 'Process Desc. - Fill1', 'Insp. Seq No. - Fill1', 'Insp Judge Code - Fill1', 'Wip Line - Fill2', 'Process Desc. - Fill2', 'Insp. Seq No. - Fill2', 'Insp Judge Code - Fill2', 'Collect Result.2 - Fill2', 'Collect Result.4 - Fill2', 'Collect Result.6 - Fill2', 'Collect Result.9 - Fill2', 'Collect Result.10 - Fill2', 'Collect Result.11 - Fill2', 'Collect Result.12 - Fill2', 'Collect Result.13 - Fill2', 'Collect Result.14 - Fill2', 'Collect Result.15 - Fill2', 'Collect Result.16 - Fill2']


In [7]:
# value_counts 길이가 데이터프레임의 행 수와 같은 피쳐 찾기
row_count = len(df_merged)
matching_row_columns = [column for column in df_merged.columns if df_merged[column].value_counts().size == row_count]

print("value_counts의 길이가 데이터프레임의 행 수와 같은 피쳐들:") 
matching_row_columns.remove('Collect Date - Dam') # 밑에서 이 피쳐로 정렬 후 train_test_split 진행하여 다시 넣어줌
print(matching_row_columns)

value_counts의 길이가 데이터프레임의 행 수와 같은 피쳐들:
['Set ID', 'LOT ID - AutoClave', 'LOT ID - Fill1', 'Collect Date - Fill1', 'LOT ID - Fill2', 'Collect Date - Fill2']


### @혼합값 대체@

In [8]:
# 각 열의 데이터 타입 확인
def check_mixed_types(column):
    types = set(map(type, column.dropna()))
    return len(types) > 1

mixed_type_columns = [column for column in df_merged.columns if check_mixed_types(df_merged[column])]

print("혼합된 타입이 있는 피처들:")
print(mixed_type_columns)

혼합된 타입이 있는 피처들:
['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']


혼합된 타입이 있는 피처들:  
['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']  
각 OK 값을 대체할 수 있는 값의 기준 필요  
일단 AbNormal과 Normal에 따른 큰 차이가 보이진 않음  
최빈값이나 평균으로 대체하기 보단 그에 상응하는, 구별되는 값으로 대체하는 것으로 먼저 조치

In [9]:
# print(df_merged[df_merged['target'] == 'AbNormal']['Collect Result.17 - Dam'].value_counts())
# print(df_merged[df_merged['target'] == 'Normal']['Collect Result.17 - Dam'].value_counts())

# print(df_merged[df_merged['target'] == 'AbNormal']['Collect Result.7 - Fill1'].value_counts())
# print(df_merged[df_merged['target'] == 'Normal']['Collect Result.7 - Fill1'].value_counts())

# print(df_merged[df_merged['target'] == 'AbNormal']['Collect Result.17 - Fill2'].value_counts())
# print(df_merged[df_merged['target'] == 'Normal']['Collect Result.17 - Fill2'].value_counts())

In [10]:
# 혼합값 대체   ## 이거 다음엔 안해보는걸로
df_merged.loc[df_merged['Collect Result.17 - Dam'] == 'OK', 'Collect Result.17 - Dam'] = 500
df_merged.loc[df_merged['Collect Result.7 - Fill1'] == 'OK', 'Collect Result.7 - Fill1'] = 800
df_merged.loc[df_merged['Collect Result.17 - Fill2'] == 'OK', 'Collect Result.17 - Fill2'] = 600

# 고유값, 혼합값 피쳐 제거
df_merged.drop(columns = unique_value_1_columns, inplace = True)
df_merged.drop(columns = matching_row_columns, inplace = True)

### @이상치 탐지@ 

In [11]:
# # 숫자형 피쳐 목록
# numeric_features = [col for col in df_merged.columns if pd.api.types.is_numeric_dtype(df_merged[col])]

# # 서브플롯 설정
# num_features = len(numeric_features)
# num_cols = 5  # 열의 개수
# num_rows = (num_features + num_cols - 1) // num_cols  # 행의 개수

# fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5))

# # 박스플롯 생성
# for i, feature in enumerate(numeric_features):
#     row = i // num_cols
#     col = i % num_cols
#     sns.boxplot(x=df_merged[feature], ax=axes[row, col])
#     axes[row, col].set_title(f'Boxplot of {feature}')

# # 빈 서브플롯 숨기기
# for i in range(num_features, num_rows * num_cols):
#     fig.delaxes(axes.flatten()[i])

# plt.tight_layout()
# plt.show()

In [12]:
# # 이상치 탐지 함수 정의 (IQR 방법 사용)
# def detect_outliers(df, column):
#     Q1 = df[column].quantile(0.25)
#     Q3 = df[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
#     return outliers

# # 대상 컬럼 목록
# outlier_columns = [
#     'Collect Result.20 - Dam', 'Collect Result.24 - Dam', 'Collect Result.25 - Dam','Collect Result.39 - Dam',
#     'Collect Result.40 - Dam', 'Collect Result.41 - Dam', 'Collect Result.67 - Dam',
#     'Collect Result.68 - Dam', 'Unit Time - AutoClave', 'Collect Result.2 - AutoClave',
#     'Collect Result.3 - AutoClave', 'Collect Result.1 - Fill1', 'Collect Result.2 - Fill1',
#     'Collect Result.5 - Fill1', 'Collect Result.27 - Fill1', 'Collect Result.28 - Fill1',
#     'Collect Result.37 - Fill2', 'Collect Result.38 - Fill2'
# ]

# # 이상치 탐지 및 해당 행의 target 값 확인
# for column in outlier_columns:
#     outliers = detect_outliers(df_merged, column)
#     if not outliers.empty:
#         print(f"Outliers in {column}:")
#         print(outliers[['target']])
#         print("Target value counts:")
#         print(outliers['target'].value_counts())
#         print("\n")
#     else:
#         print(f"No outliers detected in {column}\n")

Collect Result.24 - Dam, Collect Result.25 - Dam 둘의 이상치 양상이 같음  
Collect Result.67 - Dam, Collect Result.68 - Dam 둘의 이상치 양상이 같음  

### 언더 샘플링 

데이타 불균형을 해결하기 위해 언더 샘플링을 진행합니다.

In [13]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = df_merged[df_merged["target"] == "Normal"]
df_abnormal = df_merged[df_merged["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(
    n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
dtype: int64

### @피쳐별 동일 양상 제거@

In [14]:
# 고유값 비율 계산 함수
def get_value_counts_ratio(series):
    value_counts = series.value_counts(normalize=True)
    return value_counts.sort_values().values

# 비율 계산
ratios = {column: get_value_counts_ratio(df_concat[column]) for column in df_concat.columns}

# 각 피처별로 비율이 같은 열들을 저장
similar_columns_dict = {column: [] for column in df_concat.columns}

# 비율이 같은 열들 찾기
columns = list(ratios.keys())
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        if np.array_equal(ratios[columns[i]], ratios[columns[j]]):
            similar_columns_dict[columns[i]].append(columns[j])

# 중복 제거를 위한 처리
processed_columns = set()
similar_columns_filtered = {}

for key, value in similar_columns_dict.items():
    if value and key not in processed_columns:
        similar_columns_filtered[key] = value
        processed_columns.update(value)
        processed_columns.add(key)

# 첫 번째 컬럼과 두 번째 컬럼의 고유값 매핑 확인 함수
def check_value_mapping(df, col1, col2, tolerance=0.05):
    unique_values_1 = df[col1].unique()
    unique_values_2 = df[col2].unique()

    # 고유값의 개수가 다른 경우 False 반환
    if len(unique_values_1) != len(unique_values_2):
        return False

    value_mapping = {}
    for val1 in unique_values_1:
        corresponding_values = df[df[col1] == val1][col2].unique()
        if len(corresponding_values) != 1:
            return False
        value_mapping[val1] = corresponding_values[0]

    # 매핑이 일관된지 확인
    for val1 in unique_values_1:
        ratio_1 = (df[col1] == val1).mean()
        ratio_2 = (df[col2] == value_mapping[val1]).mean()
        if abs(ratio_1 - ratio_2) > tolerance:
            return False

    return True

# 각 피처 쌍에 대해 고유값 매핑 확인
for key, values in similar_columns_filtered.items():
    for value in values:
        is_mapping_correct = check_value_mapping(df_concat, key, value)
        print(f"{key}과(와) {value}의 고유값 매핑이 일관된가? -> {is_mapping_correct}")

# 기준열과 동일한 양상을 보이는 피처들을 제거하는 함수, 오차범위 5퍼센트
def reduce_dataframe(df, tolerance=0.01):
    columns_to_keep = list(df.columns)
    columns_to_remove = set()
    for i in range(len(columns_to_keep)):
        ref_col = columns_to_keep[i]
        for j in range(i + 1, len(columns_to_keep)):
            col = columns_to_keep[j]
            if check_value_mapping(df, ref_col, col, tolerance):
                columns_to_remove.add(col)
    for col in columns_to_remove:
        if col in columns_to_keep:
            columns_to_keep.remove(col)
    return df[columns_to_keep]

# 기준이 되는 피처들과 동일한 양상을 보이는 피처들을 제거하여 데이터프레임 축소
df_reduced = reduce_dataframe(df_concat)

Equipment - Dam과(와) Collect Result - Dam의 고유값 매핑이 일관된가? -> True
Equipment - Dam과(와) Collect Result.1 - Dam의 고유값 매핑이 일관된가? -> True
Equipment - Dam과(와) Collect Result.2 - Dam의 고유값 매핑이 일관된가? -> True
Equipment - Dam과(와) Collect Result.7 - Dam의 고유값 매핑이 일관된가? -> True
Equipment - Dam과(와) Collect Result.9 - Dam의 고유값 매핑이 일관된가? -> True
Model.Suffix - Dam과(와) Model.Suffix - AutoClave의 고유값 매핑이 일관된가? -> True
Model.Suffix - Dam과(와) Model.Suffix - Fill1의 고유값 매핑이 일관된가? -> True
Model.Suffix - Dam과(와) Model.Suffix - Fill2의 고유값 매핑이 일관된가? -> True
Workorder - Dam과(와) Workorder - AutoClave의 고유값 매핑이 일관된가? -> True
Workorder - Dam과(와) Workorder - Fill1의 고유값 매핑이 일관된가? -> True
Workorder - Dam과(와) Workorder - Fill2의 고유값 매핑이 일관된가? -> True
Collect Result.24 - Dam과(와) Collect Result.25 - Dam의 고유값 매핑이 일관된가? -> True
Collect Result.27 - Dam과(와) Collect Result.28 - Dam의 고유값 매핑이 일관된가? -> True
Collect Result.27 - Dam과(와) Collect Result.29 - Dam의 고유값 매핑이 일관된가? -> True
Collect Result.27 - Dam과(와) Collect Result.30 - Dam의 고유

In [22]:
print("최종적으로 남겨진 피처들:")
# print(df_reduced.columns)
print(len(df_reduced.columns))

최종적으로 남겨진 피처들:
98


In [23]:
# import pandas as pd
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score
# from sklearn.preprocessing import LabelEncoder, OneHotEncod
# er
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer


# # 단계별 컬럼 분류
# dam_cols = [col for col in df_reduced.columns if 'Dam' in col]
# dam_cols.remove('Workorder - Dam')
# autoclave_cols = [col for col in df_reduced.columns if 'AutoClave' in col]
# autoclave_cols.remove('Collect Date - AutoClave')
# fill1_cols = [col for col in df_reduced.columns if 'Fill1' in col]
# fill2_cols = [col for col in df_reduced.columns if 'Fill2' in col]

# # 타겟 컬럼 추가
# target_col = 'target'  # 실제 타겟 컬럼명을 지정하세요

# # 단계별 데이터프레임 생성 (타겟 컬럼 포함)
# dam_df_reduced = df_reduced.loc[:, dam_cols + [target_col]].copy()
# autoclave_df_reduced = df_reduced.loc[:, autoclave_cols + [target_col]].copy()
# fill1_df_reduced = df_reduced.loc[:, fill1_cols + [target_col]].copy()
# fill2_df_reduced = df_reduced.loc[:, fill2_cols + [target_col]].copy()

# # 타겟 값 인코딩
# label_encoder = LabelEncoder()
# df_reduced.loc[:, target_col] = label_encoder.fit_transform(df_reduced[target_col])

# # 모델 학습 및 평가 함수 정의
# def evaluate_model(dataframe, target_col='target'):
#     X = dataframe.drop(columns=[target_col])
#     y = dataframe[target_col]
    
#     # OneHotEncoder를 사용하여 범주형 변수를 인코딩합니다.
#     categorical_features = X.select_dtypes(include=['object']).columns
#     numerical_features = X.select_dtypes(exclude=['object']).columns

#     # 파이프라인 정의
#     preprocessor = ColumnTransformer(
#         transformers=[
#             ('num', SimpleImputer(strategy='mean'), numerical_features),
#             ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
#         ])

#     model = Pipeline(steps=[('preprocessor', preprocessor),
#                             ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])

#     scores = cross_val_score(model, X, y, cv=5, scoring='f1_weighted')  # pos_label 문제 해결 위해 f1_weighted 사용
#     return scores.mean()

# # 조합 생성 및 평가를 위한 함수
# def create_combination_and_evaluate(df_list, target_col):
#     combined_df = pd.concat([df.drop(columns=[target_col]) for df in df_list], axis=1)
#     combined_df.loc[:, target_col] = df_list[0][target_col].values
#     return evaluate_model(combined_df)

# # 각 단계별 조합 생성 및 평가
# combinations = [
#     ([dam_df_reduced], 'Dam'),
#     ([autoclave_df_reduced], 'AutoClave'),
#     ([fill1_df_reduced], 'Fill1'),
#     ([fill2_df_reduced], 'Fill2'),
#     ([dam_df_reduced, autoclave_df_reduced], 'Dam + AutoClave'),
#     ([dam_df_reduced, fill1_df_reduced], 'Dam + Fill1'),
#     ([dam_df_reduced, fill2_df_reduced], 'Dam + Fill2'),
#     ([autoclave_df_reduced, fill1_df_reduced], 'AutoClave + Fill1'),
#     ([autoclave_df_reduced, fill2_df_reduced], 'AutoClave + Fill2'),
#     ([fill1_df_reduced, fill2_df_reduced], 'Fill1 + Fill2'),
#     ([dam_df_reduced, autoclave_df_reduced, fill1_df_reduced], 'Dam + AutoClave + Fill1'),
#     ([dam_df_reduced, autoclave_df_reduced, fill2_df_reduced], 'Dam + AutoClave + Fill2'),
#     ([autoclave_df_reduced, fill1_df_reduced, fill2_df_reduced], 'AutoClave + Fill1 + Fill2'),
# ]

# # 각 조합별 모델 평가
# for combination, name in combinations:
#     score = create_combination_and_evaluate(combination, target_col)
#     print(f"{name} 단계 모델 성능 (F1 score): {score}")


### 데이터 분할

In [24]:
df_train = df_reduced
# df_reduced = df_reduced.sort_values(by=["Collect Date - Dam"])
# df_train, df_val = train_test_split(
#     df_reduced,
#     test_size=0.3,
#     stratify=df_reduced["target"],
#     random_state=RANDOM_STATE,
# )


# def print_stats(df: pd.DataFrame):
#     num_normal = len(df[df["target"] == "Normal"])
#     num_abnormal = len(df[df["target"] == "AbNormal"])

#     print(
#         f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}"
#         + f" ratio: {num_abnormal/num_normal}"
#     )


# # Print statistics
# print(f"  \tAbnormal\tNormal")
# print_stats(df_train)
# print_stats(df_val)

In [25]:
print(df_train.shape)
# print(df_val.shape)

(4700, 98)


## 3. 모델 학습

### 모델 정의 

In [26]:
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score
import optuna
import warnings
warnings.filterwarnings("ignore")

model = RandomForestClassifier(random_state=RANDOM_STATE)

### 모델 학습

In [27]:
features = []

for col in df_train.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue

if "Set ID" in features:
    features.remove("Set ID")

train_x = df_train[features]
train_y = df_train["target"]

In [28]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_y)

### Catboost

In [29]:
# 하이퍼파라미터 튜닝을 위한 목적 함수 정의 + L2 추가 
def objective_decision(trial):
    cat_depth = trial.suggest_int('depth', 2, 10, step=1)
    cat_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.01, step=0.001)
    cat_iterations = trial.suggest_int('iterations', 100, 300, step=10)
    cat_l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1e-5, 1e-1, log=True)

    classifier_obj = CatBoostClassifier(
        depth=cat_depth,
        learning_rate=cat_learning_rate,
        iterations=cat_iterations,
        l2_leaf_reg=cat_l2_leaf_reg,
        early_stopping_rounds=50,  # 조기 종료 설정
        verbose=0  # 훈련 중 출력 억제
    )

    # 교차 검증을 통해 F1 스코어 계산
    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1', error_score='raise')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=10), direction="maximize")
study.optimize(objective_decision, n_trials=10)  # 더 많은 트라이얼 수행

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-04 22:32:20,729] A new study created in memory with name: no-name-f5f49bc6-3517-4df0-b128-cd916a5e5972
[I 2024-08-04 22:32:26,595] Trial 0 finished with value: 0.4138859680832108 and parameters: {'depth': 8, 'learning_rate': 0.001, 'iterations': 230, 'l2_leaf_reg': 0.009890438121030055}. Best is trial 0 with value: 0.4138859680832108.
[I 2024-08-04 22:32:30,094] Trial 1 finished with value: 0.4137662539048438 and parameters: {'depth': 6, 'learning_rate': 0.003, 'iterations': 140, 'l2_leaf_reg': 0.011018509458263562}. Best is trial 0 with value: 0.4138859680832108.
[I 2024-08-04 22:32:34,206] Trial 2 finished with value: 0.41208478948963334 and parameters: {'depth': 3, 'learning_rate': 0.001, 'iterations': 240, 'l2_leaf_reg': 0.06509886021509587}. Best is trial 0 with value: 0.4138859680832108.
[I 2024-08-04 22:32:37,257] Trial 3 finished with value: 0.42629463380283494 and parameters: {'depth': 2, 'learning_rate': 0.006, 'iterations': 270, 'l2_leaf_reg': 0.002819059662190988

Best score: 0.4490644367842398
Best parameters: {'depth': 10, 'learning_rate': 0.009000000000000001, 'iterations': 250, 'l2_leaf_reg': 0.0019529912685950315}


In [30]:
cat = CatBoostClassifier(**study.best_params)
cat.fit(train_x, y_encoded)
# cat.score(X_test, y_test)

0:	learn: 0.6919851	total: 172ms	remaining: 42.8s
1:	learn: 0.6913673	total: 204ms	remaining: 25.3s
2:	learn: 0.6904301	total: 235ms	remaining: 19.4s
3:	learn: 0.6891504	total: 267ms	remaining: 16.4s
4:	learn: 0.6880464	total: 300ms	remaining: 14.7s
5:	learn: 0.6872942	total: 330ms	remaining: 13.4s
6:	learn: 0.6862174	total: 360ms	remaining: 12.5s
7:	learn: 0.6855005	total: 392ms	remaining: 11.9s
8:	learn: 0.6849573	total: 398ms	remaining: 10.7s
9:	learn: 0.6840239	total: 429ms	remaining: 10.3s
10:	learn: 0.6832444	total: 458ms	remaining: 9.94s
11:	learn: 0.6827567	total: 488ms	remaining: 9.67s
12:	learn: 0.6817247	total: 519ms	remaining: 9.47s
13:	learn: 0.6811026	total: 550ms	remaining: 9.27s
14:	learn: 0.6802805	total: 579ms	remaining: 9.06s
15:	learn: 0.6795335	total: 610ms	remaining: 8.91s
16:	learn: 0.6786606	total: 640ms	remaining: 8.78s
17:	learn: 0.6778316	total: 671ms	remaining: 8.65s
18:	learn: 0.6773828	total: 682ms	remaining: 8.3s
19:	learn: 0.6770442	total: 694ms	remainin

<catboost.core.CatBoostClassifier at 0x266a61500d0>

### xgboost

In [31]:
from xgboost import XGBClassifier

def objective_decision(trial):
    xgbm_n_estimators = trial.suggest_int('n_estimators', 300, 400, step=50)
    xgbm_learning_rate = trial.suggest_float('learning_rate', 0.02, 0.05, step=0.01)
    xgbm_max_depth = trial.suggest_int('max_depth', 3, 10, step=1)
    xgbm_gamma = trial.suggest_float('gamma', 0.0, 0.5, step=0.1)
    xgbm_min_child_weight = trial.suggest_int('min_child_weight', 1, 10, step=1)
    xgbm_subsample = trial.suggest_float('subsample', 0.5, 1.0, step=0.1)
    xgbm_colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1)

    classifier_obj = XGBClassifier(
        n_estimators=xgbm_n_estimators,
        learning_rate=xgbm_learning_rate,
        max_depth=xgbm_max_depth,
        gamma=xgbm_gamma,
        min_child_weight=xgbm_min_child_weight,
        subsample=xgbm_subsample,
        colsample_bytree=xgbm_colsample_bytree,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1', error_score='raise')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective_decision, n_trials=10)  # 더 많은 트라이얼 수행

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-04 22:37:43,367] A new study created in memory with name: no-name-267b5a7a-20a3-4ed6-8bd0-20f189f0f2ab
[I 2024-08-04 22:37:44,232] Trial 0 finished with value: 0.46574720278180637 and parameters: {'n_estimators': 350, 'learning_rate': 0.03, 'max_depth': 6, 'gamma': 0.5, 'min_child_weight': 1, 'subsample': 0.5, 'colsample_bytree': 0.9}. Best is trial 0 with value: 0.46574720278180637.
[I 2024-08-04 22:37:45,101] Trial 1 finished with value: 0.46298932463774073 and parameters: {'n_estimators': 400, 'learning_rate': 0.02, 'max_depth': 7, 'gamma': 0.5, 'min_child_weight': 3, 'subsample': 0.6, 'colsample_bytree': 0.5}. Best is trial 0 with value: 0.46574720278180637.
[I 2024-08-04 22:37:45,940] Trial 2 finished with value: 0.47103963147088745 and parameters: {'n_estimators': 300, 'learning_rate': 0.05, 'max_depth': 9, 'gamma': 0.1, 'min_child_weight': 9, 'subsample': 0.6, 'colsample_bytree': 0.7}. Best is trial 2 with value: 0.47103963147088745.
[I 2024-08-04 22:37:46,696] Trial 

Best score: 0.474303901120365
Best parameters: {'n_estimators': 300, 'learning_rate': 0.05, 'max_depth': 10, 'gamma': 0.30000000000000004, 'min_child_weight': 4, 'subsample': 0.7, 'colsample_bytree': 0.6}


In [32]:
xgbm = XGBClassifier(**study.best_params)
xgbm.fit(train_x, y_encoded)
# cat.score(X_test, y_test)

In [33]:
y_encoded #0은 AbNormal, 1은 Normal

array([1, 1, 1, ..., 0, 0, 0])

### RandomForest

In [35]:
def objective(trial):
    # 하이퍼파라미터 설정
    rf_n_estimators = trial.suggest_int('n_estimators', 200, 400, step=20)
    rf_max_depth = trial.suggest_int('max_depth', 2, 15, step=2)
    rf_min_samples_split = trial.suggest_int('min_samples_split', 5, 20, step=2)
    rf_max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 60, step=2)
    rf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10, step=1)
    rf_max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    classifier_obj = RandomForestClassifier(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        max_leaf_nodes=rf_max_leaf_nodes,
        min_samples_leaf=rf_min_samples_leaf,
        max_features=rf_max_features,
        random_state=100
    )

    # 교차 검증을 통해 F1 스코어 계산
    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective, n_trials=10)

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-04 22:38:07,322] A new study created in memory with name: no-name-9c9b8fd1-fbfb-4b02-a3b8-9645c01b9fb9
[I 2024-08-04 22:38:08,107] Trial 0 finished with value: 0.4162802921084913 and parameters: {'n_estimators': 300, 'max_depth': 4, 'min_samples_split': 11, 'max_leaf_nodes': 52, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.4162802921084913.
[I 2024-08-04 22:38:08,823] Trial 1 finished with value: 0.42953671291011997 and parameters: {'n_estimators': 380, 'max_depth': 2, 'min_samples_split': 13, 'max_leaf_nodes': 54, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.42953671291011997.
[I 2024-08-04 22:38:09,468] Trial 2 finished with value: 0.40385342293750204 and parameters: {'n_estimators': 240, 'max_depth': 14, 'min_samples_split': 17, 'max_leaf_nodes': 12, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 1 with value: 0.42953671291011997.
[I 2024-08-04 22:38:10,404] Trial 3 finished with value: 0.403

Best score: 0.43409921292626796
Best parameters: {'n_estimators': 320, 'max_depth': 2, 'min_samples_split': 7, 'max_leaf_nodes': 34, 'min_samples_leaf': 8, 'max_features': 'log2'}


In [36]:
rf = RandomForestClassifier(**study.best_params)
rf.fit(train_x, y_encoded)

## 4. 제출하기

### 테스트 데이터 예측

테스트 데이터 불러오기

In [37]:
df_test_y = pd.read_csv(os.path.join("submission.csv"))

In [38]:
df_test = pd.merge(X, df_test_y, "inner", on="Set ID")
df_test_x = df_test[features]

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except:
        continue

In [39]:
cat_test_pred = cat.predict(df_test_x)
xgbm_test_pred = xgbm.predict(df_test_x)
rf_test_pred = rf.predict(df_test_x)

### voting 전 평가

In [40]:
# 학습데이터 70%, 평가데이터 30%로 데이터 분할
X_tra, X_dev, y_tra, y_dev = train_test_split(train_x, y_encoded, test_size=0.3, stratify=train_y, random_state=0)

In [41]:
y_dev

array([1, 1, 1, ..., 0, 1, 0])

In [42]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score

averaging = VotingClassifier(
    estimators = [('xgbm', xgbm), ('cat', cat),  ('rf', rf)], voting='soft', weights=[4, 3, 3])

averaging.fit(X_tra, y_tra)

ensemble_pred = averaging.predict(X_dev)

print(f1_score(y_dev, ensemble_pred))

0:	learn: 0.6924294	total: 30.7ms	remaining: 7.65s
1:	learn: 0.6915544	total: 64.4ms	remaining: 7.99s
2:	learn: 0.6905203	total: 93.6ms	remaining: 7.71s
3:	learn: 0.6896074	total: 123ms	remaining: 7.59s
4:	learn: 0.6886966	total: 153ms	remaining: 7.51s
5:	learn: 0.6878048	total: 158ms	remaining: 6.42s
6:	learn: 0.6863589	total: 189ms	remaining: 6.58s
7:	learn: 0.6859683	total: 192ms	remaining: 5.79s
8:	learn: 0.6849288	total: 221ms	remaining: 5.91s
9:	learn: 0.6835939	total: 252ms	remaining: 6.05s
10:	learn: 0.6822040	total: 285ms	remaining: 6.19s
11:	learn: 0.6811027	total: 316ms	remaining: 6.27s
12:	learn: 0.6804446	total: 346ms	remaining: 6.31s
13:	learn: 0.6794381	total: 377ms	remaining: 6.36s
14:	learn: 0.6782310	total: 408ms	remaining: 6.39s
15:	learn: 0.6774391	total: 438ms	remaining: 6.41s
16:	learn: 0.6767270	total: 468ms	remaining: 6.42s
17:	learn: 0.6758385	total: 502ms	remaining: 6.47s
18:	learn: 0.6747875	total: 538ms	remaining: 6.54s
19:	learn: 0.6740286	total: 567ms	rema

### voting

In [43]:
from sklearn.ensemble import VotingClassifier

averaging_model = VotingClassifier(
    estimators = [('xgbm', xgbm), ('cat', cat),  ('rf', rf)], voting='soft', weights=[4, 3, 3])

averaging_model.fit(train_x, train_y)

ensemble_pred = averaging_model.predict(df_test_x)

0:	learn: 0.6919851	total: 30.4ms	remaining: 7.57s
1:	learn: 0.6913673	total: 59.9ms	remaining: 7.42s
2:	learn: 0.6904301	total: 89.8ms	remaining: 7.39s
3:	learn: 0.6891504	total: 120ms	remaining: 7.41s
4:	learn: 0.6880464	total: 149ms	remaining: 7.31s
5:	learn: 0.6872942	total: 178ms	remaining: 7.26s
6:	learn: 0.6862174	total: 209ms	remaining: 7.27s
7:	learn: 0.6855005	total: 240ms	remaining: 7.26s
8:	learn: 0.6849573	total: 245ms	remaining: 6.55s
9:	learn: 0.6840239	total: 277ms	remaining: 6.64s
10:	learn: 0.6832444	total: 307ms	remaining: 6.67s
11:	learn: 0.6827567	total: 340ms	remaining: 6.74s
12:	learn: 0.6817247	total: 380ms	remaining: 6.92s
13:	learn: 0.6811026	total: 411ms	remaining: 6.92s
14:	learn: 0.6802805	total: 440ms	remaining: 6.9s
15:	learn: 0.6795335	total: 472ms	remaining: 6.9s
16:	learn: 0.6786606	total: 503ms	remaining: 6.9s
17:	learn: 0.6778316	total: 533ms	remaining: 6.87s
18:	learn: 0.6773828	total: 545ms	remaining: 6.62s
19:	learn: 0.6770442	total: 557ms	remaini

In [44]:
ensemble_pred 

array(['AbNormal', 'AbNormal', 'AbNormal', ..., 'Normal', 'AbNormal',
       'AbNormal'], dtype=object)

### 제출 파일 작성

In [45]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = ensemble_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**