# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

pd.set_option('display.max_columns', None)

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# CSV 파일 읽기
X_Dam = pd.read_csv(os.path.join(ROOT_DIR, "Dam dispensing.csv"))
X_AutoClave = pd.read_csv(os.path.join(ROOT_DIR, "Auto clave.csv"))
X_Fill1 = pd.read_csv(os.path.join(ROOT_DIR, "Fill1 dispensing.csv"))
X_Fill2 = pd.read_csv(os.path.join(ROOT_DIR, "Fill2 dispensing.csv"))

# y 데이터 읽기
y = pd.read_csv(os.path.join(ROOT_DIR, "train_y.csv"))


  X_Dam = pd.read_csv(os.path.join(ROOT_DIR, "Dam dispensing.csv"))
  X_AutoClave = pd.read_csv(os.path.join(ROOT_DIR, "Auto clave.csv"))
  X_Fill1 = pd.read_csv(os.path.join(ROOT_DIR, "Fill1 dispensing.csv"))
  X_Fill2 = pd.read_csv(os.path.join(ROOT_DIR, "Fill2 dispensing.csv"))


### 데이터 병합

x 데이터 병합

In [3]:
# Rename columns
X_Dam.columns = [i + " - Dam" for i in X_Dam.columns]
X_AutoClave.columns = [i + " - AutoClave" for i in X_AutoClave.columns]
X_Fill1.columns = [i + " - Fill1" for i in X_Fill1.columns]
X_Fill2.columns = [i + " - Fill2" for i in X_Fill2.columns]
X_Dam = X_Dam.rename(columns={"Set ID - Dam": "Set ID"})
X_AutoClave = X_AutoClave.rename(columns={"Set ID - AutoClave": "Set ID"})
X_Fill1 = X_Fill1.rename(columns={"Set ID - Fill1": "Set ID"})
X_Fill2 = X_Fill2.rename(columns={"Set ID - Fill2": "Set ID"})

# Merge X
X = pd.merge(X_Dam, X_AutoClave, on="Set ID")
X = pd.merge(X, X_Fill1, on="Set ID")
X = pd.merge(X, X_Fill2, on="Set ID")
X = X.drop(X[X.duplicated(subset="Set ID")].index).reset_index(drop=True)

x 데이터와 y 데이터 병합

In [4]:
# Merge X and y
df_merged = pd.merge(X, y, "inner", on="Set ID")

# Drop columns with more than half of the values missing
drop_cols = []
for column in df_merged.columns:
    if (df_merged[column].notnull().sum() // 2) < df_merged[
        column
    ].isnull().sum():
        drop_cols.append(column)
df_merged = df_merged.drop(drop_cols, axis=1)

# Drop Lot ID
df_merged = df_merged.drop("LOT ID - Dam", axis=1)

In [5]:
# 결측치 피쳐 세 개의 피쳐가 결측치 수가 동일함
na_list = []
for i in df_merged.columns:
    if df_merged[i].isna().sum() > 0:
        na_list.append(i)
na_list

['Collect Result.17 - Dam',
 'Collect Result.7 - Fill1',
 'Collect Result.17 - Fill2']

## 2. 데이터 전처리

### @고유값(전부 다른 값, 단일값) 피쳐 제거@

In [6]:
# 고유값이 1개인 피쳐 찾기
unique_value_1_columns = [column for column in df_merged.columns if df_merged[column].nunique() == 1]

print("고유값이 1개인 피쳐들:")
print(unique_value_1_columns)

고유값이 1개인 피쳐들:
['Wip Line - Dam', 'Process Desc. - Dam', 'Insp. Seq No. - Dam', 'Insp Judge Code - Dam', 'Collect Result.4 - Dam', 'Collect Result.5 - Dam', 'Collect Result.6 - Dam', 'Collect Result.8 - Dam', 'Wip Line - AutoClave', 'Process Desc. - AutoClave', 'Equipment - AutoClave', 'Insp. Seq No. - AutoClave', 'Insp Judge Code - AutoClave', 'Judge Value - AutoClave', 'Judge Value.1 - AutoClave', 'Judge Value.2 - AutoClave', 'Wip Line - Fill1', 'Process Desc. - Fill1', 'Insp. Seq No. - Fill1', 'Insp Judge Code - Fill1', 'Wip Line - Fill2', 'Process Desc. - Fill2', 'Insp. Seq No. - Fill2', 'Insp Judge Code - Fill2', 'Collect Result.2 - Fill2', 'Collect Result.4 - Fill2', 'Collect Result.6 - Fill2', 'Collect Result.9 - Fill2', 'Collect Result.10 - Fill2', 'Collect Result.11 - Fill2', 'Collect Result.12 - Fill2', 'Collect Result.13 - Fill2', 'Collect Result.14 - Fill2', 'Collect Result.15 - Fill2', 'Collect Result.16 - Fill2']


In [7]:
# value_counts 길이가 데이터프레임의 행 수와 같은 피쳐 찾기
row_count = len(df_merged)
matching_row_columns = [column for column in df_merged.columns if df_merged[column].value_counts().size == row_count]

print("value_counts의 길이가 데이터프레임의 행 수와 같은 피쳐들:") 
matching_row_columns.remove('Collect Date - Dam') # 밑에서 이 피쳐로 정렬 후 train_test_split 진행하여 다시 넣어줌
print(matching_row_columns)

value_counts의 길이가 데이터프레임의 행 수와 같은 피쳐들:
['Set ID', 'LOT ID - AutoClave', 'LOT ID - Fill1', 'Collect Date - Fill1', 'LOT ID - Fill2', 'Collect Date - Fill2']


### @혼합값 대체@

In [8]:
# 각 열의 데이터 타입 확인
def check_mixed_types(column):
    types = set(map(type, column.dropna()))
    return len(types) > 1

mixed_type_columns = [column for column in df_merged.columns if check_mixed_types(df_merged[column])]

print("혼합된 타입이 있는 피처들:")
print(mixed_type_columns)

혼합된 타입이 있는 피처들:
['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']


혼합된 타입이 있는 피처들:  
['Collect Result.17 - Dam', 'Collect Result.7 - Fill1', 'Collect Result.17 - Fill2']  
각 OK 값을 대체할 수 있는 값의 기준 필요  
일단 AbNormal과 Normal에 따른 큰 차이가 보이진 않음  
최빈값이나 평균으로 대체하기 보단 그에 상응하는, 구별되는 값으로 대체하는 것으로 먼저 조치

In [9]:
# print(df_merged[df_merged['target'] == 'AbNormal']['Collect Result.17 - Dam'].value_counts())
# print(df_merged[df_merged['target'] == 'Normal']['Collect Result.17 - Dam'].value_counts())

# print(df_merged[df_merged['target'] == 'AbNormal']['Collect Result.7 - Fill1'].value_counts())
# print(df_merged[df_merged['target'] == 'Normal']['Collect Result.7 - Fill1'].value_counts())

# print(df_merged[df_merged['target'] == 'AbNormal']['Collect Result.17 - Fill2'].value_counts())
# print(df_merged[df_merged['target'] == 'Normal']['Collect Result.17 - Fill2'].value_counts())

In [10]:
# 혼합값 대체
df_merged.loc[df_merged['Collect Result.17 - Dam'] == 'OK', 'Collect Result.17 - Dam'] = 500
df_merged.loc[df_merged['Collect Result.7 - Fill1'] == 'OK', 'Collect Result.7 - Fill1'] = 800
df_merged.loc[df_merged['Collect Result.17 - Fill2'] == 'OK', 'Collect Result.17 - Fill2'] = 600

# 고유값, 혼합값 피쳐 제거
df_merged.drop(columns = unique_value_1_columns, inplace = True)
df_merged.drop(columns = matching_row_columns, inplace = True)

### 언더 샘플링 

데이타 불균형을 해결하기 위해 언더 샘플링을 진행합니다.

In [11]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = df_merged[df_merged["target"] == "Normal"]
df_abnormal = df_merged[df_merged["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(
    n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
dtype: int64

### @피쳐별 동일 양상 제거@

In [12]:
# 고유값 비율 계산 함수
def get_value_counts_ratio(series):
    value_counts = series.value_counts(normalize=True)
    return value_counts.sort_values().values

# 비율 계산
ratios = {column: get_value_counts_ratio(df_concat[column]) for column in df_concat.columns}

# 각 피처별로 비율이 같은 열들을 저장
similar_columns_dict = {column: [] for column in df_concat.columns}

# 비율이 같은 열들 찾기
columns = list(ratios.keys())
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        if np.array_equal(ratios[columns[i]], ratios[columns[j]]):
            similar_columns_dict[columns[i]].append(columns[j])

# 중복 제거를 위한 처리
processed_columns = set()
similar_columns_filtered = {}

for key, value in similar_columns_dict.items():
    if value and key not in processed_columns:
        similar_columns_filtered[key] = value
        processed_columns.update(value)
        processed_columns.add(key)

# 첫 번째 컬럼과 두 번째 컬럼의 고유값 매핑 확인 함수
def check_value_mapping(df, col1, col2, tolerance=0.05):
    unique_values_1 = df[col1].unique()
    unique_values_2 = df[col2].unique()

    # 고유값의 개수가 다른 경우 False 반환
    if len(unique_values_1) != len(unique_values_2):
        return False

    value_mapping = {}
    for val1 in unique_values_1:
        corresponding_values = df[df[col1] == val1][col2].unique()
        if len(corresponding_values) != 1:
            return False
        value_mapping[val1] = corresponding_values[0]

    # 매핑이 일관된지 확인
    for val1 in unique_values_1:
        ratio_1 = (df[col1] == val1).mean()
        ratio_2 = (df[col2] == value_mapping[val1]).mean()
        if abs(ratio_1 - ratio_2) > tolerance:
            return False

    return True

# 각 피처 쌍에 대해 고유값 매핑 확인
for key, values in similar_columns_filtered.items():
    for value in values:
        is_mapping_correct = check_value_mapping(df_concat, key, value)
        print(f"{key}과(와) {value}의 고유값 매핑이 일관된가? -> {is_mapping_correct}")

# 기준열과 동일한 양상을 보이는 피처들을 제거하는 함수, 오차범위 5퍼센트
def reduce_dataframe(df, tolerance=0.05):
    columns_to_keep = list(df.columns)
    columns_to_remove = set()
    for i in range(len(columns_to_keep)):
        ref_col = columns_to_keep[i]
        for j in range(i + 1, len(columns_to_keep)):
            col = columns_to_keep[j]
            if check_value_mapping(df, ref_col, col, tolerance):
                columns_to_remove.add(col)
    for col in columns_to_remove:
        if col in columns_to_keep:
            columns_to_keep.remove(col)
    return df[columns_to_keep]

# 기준이 되는 피처들과 동일한 양상을 보이는 피처들을 제거하여 데이터프레임 축소
df_reduced = reduce_dataframe(df_concat)

Equipment - Dam과(와) Collect Result - Dam의 고유값 매핑이 일관된가? -> True
Equipment - Dam과(와) Collect Result.1 - Dam의 고유값 매핑이 일관된가? -> True
Equipment - Dam과(와) Collect Result.2 - Dam의 고유값 매핑이 일관된가? -> True
Equipment - Dam과(와) Collect Result.7 - Dam의 고유값 매핑이 일관된가? -> True
Equipment - Dam과(와) Collect Result.9 - Dam의 고유값 매핑이 일관된가? -> True
Model.Suffix - Dam과(와) Model.Suffix - AutoClave의 고유값 매핑이 일관된가? -> True
Model.Suffix - Dam과(와) Model.Suffix - Fill1의 고유값 매핑이 일관된가? -> True
Model.Suffix - Dam과(와) Model.Suffix - Fill2의 고유값 매핑이 일관된가? -> True
Workorder - Dam과(와) Workorder - AutoClave의 고유값 매핑이 일관된가? -> True
Workorder - Dam과(와) Workorder - Fill1의 고유값 매핑이 일관된가? -> True
Workorder - Dam과(와) Workorder - Fill2의 고유값 매핑이 일관된가? -> True
Collect Result.24 - Dam과(와) Collect Result.25 - Dam의 고유값 매핑이 일관된가? -> True
Collect Result.27 - Dam과(와) Collect Result.28 - Dam의 고유값 매핑이 일관된가? -> True
Collect Result.27 - Dam과(와) Collect Result.29 - Dam의 고유값 매핑이 일관된가? -> True
Collect Result.27 - Dam과(와) Collect Result.30 - Dam의 고유

In [13]:
print("최종적으로 남겨진 피처들:")
print(df_reduced.columns)

최종적으로 남겨진 피처들:
Index(['Equipment - Dam', 'Model.Suffix - Dam', 'Workorder - Dam',
       'Collect Date - Dam', 'Collect Result.3 - Dam',
       'Collect Result.10 - Dam', 'Collect Result.11 - Dam',
       'Collect Result.12 - Dam', 'Collect Result.13 - Dam',
       'Collect Result.14 - Dam', 'Collect Result.15 - Dam',
       'Collect Result.16 - Dam', 'Collect Result.17 - Dam',
       'Collect Result.18 - Dam', 'Collect Result.19 - Dam',
       'Collect Result.20 - Dam', 'Collect Result.21 - Dam',
       'Collect Result.22 - Dam', 'Collect Result.23 - Dam',
       'Collect Result.24 - Dam', 'Collect Result.26 - Dam',
       'Collect Result.27 - Dam', 'Collect Result.31 - Dam',
       'Collect Result.32 - Dam', 'Collect Result.34 - Dam',
       'Collect Result.36 - Dam', 'Collect Result.37 - Dam',
       'Collect Result.38 - Dam', 'Collect Result.39 - Dam',
       'Collect Result.40 - Dam', 'Collect Result.41 - Dam',
       'Collect Result.42 - Dam', 'Collect Result.43 - Dam',
       'C

### 데이터 분할

In [14]:
df_concat = df_concat.sort_values(by=["Collect Date - Dam"])
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])

    print(
        f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}"
        + f" ratio: {num_abnormal/num_normal}"
    )


# Print statistics
print(f"  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 1645, AbNormal: 1645 ratio: 1.0
  Total: Normal: 705, AbNormal: 705 ratio: 1.0


In [15]:
print(df_train.shape)
print(df_val.shape)

(3290, 148)
(1410, 148)


## 3. 모델 학습

### 모델 정의 

In [16]:
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score
import optuna
import warnings
warnings.filterwarnings("ignore")

model = RandomForestClassifier(random_state=RANDOM_STATE)

### 모델 학습

In [17]:
features = []

for col in df_train.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue

if "Set ID" in features:
    features.remove("Set ID")

train_x = df_train[features]
train_y = df_train["target"]

### Catboost

In [18]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_y)

In [19]:
# 하이퍼파라미터 튜닝을 위한 목적 함수 정의
def objective_decision(trial): 
    cat_depth = trial.suggest_int('depth', 2, 10, step=1)
    cat_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.01, step=0.01)
    cat_iterations = trial.suggest_int('iterations', 100, 300, step=10)

    regressor_obj = CatBoostClassifier(
        depth=cat_depth, 
        learning_rate=cat_learning_rate,
        iterations=cat_iterations,
        verbose=0  # 훈련 중 출력 억제
    )

    
    # f1 = make_scorer(f1_score, pos_label='AbNormal')
    score = cross_val_score(regressor_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1', error_score='raise')
    mean_f1 = score.mean()
    return mean_f1

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective_decision, n_trials=10)

# 최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-02 16:01:06,916] A new study created in memory with name: no-name-1800a05f-1621-4e76-ab2f-8f166edb5c65
[I 2024-08-02 16:01:11,501] Trial 0 finished with value: 0.5997391889544941 and parameters: {'depth': 6, 'learning_rate': 0.001, 'iterations': 150}. Best is trial 0 with value: 0.5997391889544941.
[I 2024-08-02 16:01:15,475] Trial 1 finished with value: 0.5982836400274476 and parameters: {'depth': 5, 'learning_rate': 0.001, 'iterations': 270}. Best is trial 0 with value: 0.5997391889544941.
[I 2024-08-02 16:01:18,566] Trial 2 finished with value: 0.5958148836341283 and parameters: {'depth': 2, 'learning_rate': 0.001, 'iterations': 120}. Best is trial 0 with value: 0.5997391889544941.
[I 2024-08-02 16:01:24,044] Trial 3 finished with value: 0.5962056697528492 and parameters: {'depth': 8, 'learning_rate': 0.001, 'iterations': 270}. Best is trial 0 with value: 0.5997391889544941.
[I 2024-08-02 16:01:25,124] Trial 4 finished with value: 0.5945926950782259 and parameters: {'dept

Best score: 0.5997391889544941
Best parameters: {'depth': 6, 'learning_rate': 0.001, 'iterations': 150}


In [20]:
cat = CatBoostClassifier(**study.best_params)
cat.fit(train_x, y_encoded)
# cat.score(X_test, y_test)

0:	learn: 0.6930920	total: 140ms	remaining: 20.9s
1:	learn: 0.6930398	total: 143ms	remaining: 10.6s
2:	learn: 0.6929930	total: 146ms	remaining: 7.17s
3:	learn: 0.6929382	total: 149ms	remaining: 5.45s
4:	learn: 0.6928991	total: 152ms	remaining: 4.39s
5:	learn: 0.6928469	total: 155ms	remaining: 3.71s
6:	learn: 0.6928096	total: 157ms	remaining: 3.22s
7:	learn: 0.6927525	total: 160ms	remaining: 2.84s
8:	learn: 0.6926986	total: 163ms	remaining: 2.55s
9:	learn: 0.6926489	total: 166ms	remaining: 2.32s
10:	learn: 0.6925993	total: 168ms	remaining: 2.13s
11:	learn: 0.6925597	total: 171ms	remaining: 1.97s
12:	learn: 0.6925093	total: 174ms	remaining: 1.83s
13:	learn: 0.6924727	total: 176ms	remaining: 1.71s
14:	learn: 0.6924159	total: 179ms	remaining: 1.61s
15:	learn: 0.6923571	total: 182ms	remaining: 1.52s
16:	learn: 0.6923119	total: 184ms	remaining: 1.44s
17:	learn: 0.6922574	total: 187ms	remaining: 1.37s
18:	learn: 0.6921957	total: 189ms	remaining: 1.3s
19:	learn: 0.6921290	total: 192ms	remainin

<catboost.core.CatBoostClassifier at 0x1d0ae900650>

### xgboost

In [21]:
from xgboost import XGBClassifier

# 조절할 하이퍼 파라미터와 그 범위를 지정하는 함수 정의
def objective_decision(trial): 
    # optuna.trial.Trial.suggest_categorical() for categorical parameters
    # optuna.trial.Trial.suggest_int() for integer parameters
    # optuna.trial.Trial.suggest_float() for floating point parameters    
    #찾고자 하는 하이퍼파라미트의 값의 종류에 따라 suggest_int나 다른것을 사용
    xgbm_n_estimators = trial.suggest_int('n_estimators', 300, 400, step = 50)
    xgbm_learning_rate = trial.suggest_float('learning_rate', 0.02, 0.05, step = 0.01)
    xgbm_max_depth = trial.suggest_int('max_depth', 3, 10, step = 1)

    regressor_obj = XGBClassifier(
        n_estimators = xgbm_n_estimators, 
        learning_rate = xgbm_learning_rate,
        max_depth = xgbm_max_depth
    )

    score = cross_val_score(regressor_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring='f1', error_score='raise')
    mean_squared = score.mean()
    return mean_squared

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective_decision, n_trials = 10)# n_trials은 실험 횟수

#최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-02 16:01:46,246] A new study created in memory with name: no-name-247b3d4d-ca47-4033-8b4b-c1110bb92faf
[I 2024-08-02 16:01:47,059] Trial 0 finished with value: 0.6033442147473325 and parameters: {'n_estimators': 350, 'learning_rate': 0.03, 'max_depth': 6}. Best is trial 0 with value: 0.6033442147473325.
[I 2024-08-02 16:01:47,617] Trial 1 finished with value: 0.5853183662139164 and parameters: {'n_estimators': 400, 'learning_rate': 0.02, 'max_depth': 3}. Best is trial 0 with value: 0.6033442147473325.
[I 2024-08-02 16:01:48,216] Trial 2 finished with value: 0.5967084090819867 and parameters: {'n_estimators': 400, 'learning_rate': 0.05, 'max_depth': 4}. Best is trial 0 with value: 0.6033442147473325.
[I 2024-08-02 16:01:48,765] Trial 3 finished with value: 0.6002606992460018 and parameters: {'n_estimators': 350, 'learning_rate': 0.05, 'max_depth': 4}. Best is trial 0 with value: 0.6033442147473325.
[I 2024-08-02 16:01:49,238] Trial 4 finished with value: 0.5933001351477994 an

Best score: 0.6033442147473325
Best parameters: {'n_estimators': 350, 'learning_rate': 0.03, 'max_depth': 6}


In [22]:
xgbm = XGBClassifier(**study.best_params)
xgbm.fit(train_x, y_encoded)
# cat.score(X_test, y_test)

In [23]:
y_encoded #0은 AbNormal, 1은 Normal

array([0, 1, 1, ..., 0, 1, 0])

In [24]:
train_y

4417    AbNormal
984       Normal
650       Normal
522       Normal
802       Normal
          ...   
3021    AbNormal
2973    AbNormal
4040    AbNormal
1933      Normal
2789    AbNormal
Name: target, Length: 3290, dtype: object

### RandomForest

In [25]:
# 조절할 하이퍼 파라미터와 그 범위를 지정하는 함수 정의
def objective(trial): 
    # optuna.trial.Trial.suggest_categorical() for categorical parameters
    # optuna.trial.Trial.suggest_int() for integer parameters
    # optuna.trial.Trial.suggest_float() for floating point parameters    
    rf_n_estimators = trial.suggest_int('n_estimators', 200, 400, step = 20) #찾고자 하는 하이퍼파라미트의 값의 종류에 따라 suggest_int나 다른것을 사용
    rf_max_depth = trial.suggest_int('max_depth', 2, 15, step = 2)
    rf_min_samples_split = trial.suggest_int('min_samples_split', 5, 20, step = 2)
    rf_max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 60, step = 2)

    classifier_obj = RandomForestClassifier(
        n_estimators = rf_n_estimators, 
        max_depth = rf_max_depth,
        min_samples_split = rf_min_samples_split,
        max_leaf_nodes = rf_max_leaf_nodes,
    )

   # f1 = make_scorer(f1_score, pos_label='AbNormal')
    score = cross_val_score(classifier_obj, train_x, y_encoded, cv=5, n_jobs=-1, scoring = 'f1')
    mean_squared = score.mean()
    return mean_squared

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective, n_trials = 7) # n_trials은 실험 횟수

#최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-08-02 16:01:52,475] A new study created in memory with name: no-name-1869a2a5-3d27-4d88-b161-64065c948ddd
[I 2024-08-02 16:01:53,308] Trial 0 finished with value: 0.5887681327114718 and parameters: {'n_estimators': 300, 'max_depth': 4, 'min_samples_split': 11, 'max_leaf_nodes': 52}. Best is trial 0 with value: 0.5887681327114718.
[I 2024-08-02 16:01:53,856] Trial 1 finished with value: 0.6004386114381178 and parameters: {'n_estimators': 200, 'max_depth': 2, 'min_samples_split': 15, 'max_leaf_nodes': 50}. Best is trial 1 with value: 0.6004386114381178.
[I 2024-08-02 16:01:54,586] Trial 2 finished with value: 0.5813625049906335 and parameters: {'n_estimators': 220, 'max_depth': 10, 'min_samples_split': 19, 'max_leaf_nodes': 14}. Best is trial 1 with value: 0.6004386114381178.
[I 2024-08-02 16:01:55,162] Trial 3 finished with value: 0.6012833269941804 and parameters: {'n_estimators': 240, 'max_depth': 2, 'min_samples_split': 7, 'max_leaf_nodes': 60}. Best is trial 3 with value: 0.

Best score: 0.6012833269941804
Best parameters: {'n_estimators': 240, 'max_depth': 2, 'min_samples_split': 7, 'max_leaf_nodes': 60}


In [26]:
rf = RandomForestClassifier(**study.best_params)
rf.fit(train_x, y_encoded)

## 4. 제출하기

### 테스트 데이터 예측

테스트 데이터 불러오기

In [27]:
df_test_y = pd.read_csv(os.path.join("submission.csv"))

In [28]:
df_test = pd.merge(X, df_test_y, "inner", on="Set ID")
df_test_x = df_test[features]

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except:
        continue

In [29]:
cat_test_pred = cat.predict(df_test_x)
cat_test_pred

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [30]:
xgbm_test_pred = xgbm.predict(df_test_x)
xgbm_test_pred

array([1, 0, 0, ..., 1, 1, 1])

In [31]:
rf_test_pred = rf.predict(df_test_x)
rf_test_pred

array([0, 0, 0, ..., 1, 1, 1])

### voting 전 평가

In [32]:
# 학습데이터 70%, 평가데이터 30%로 데이터 분할
X_tra, X_dev, y_tra, y_dev = train_test_split(train_x, y_encoded, test_size=0.3, stratify=train_y, random_state=0)

In [33]:
y_dev

array([0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,

In [35]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score

averaging = VotingClassifier(
    estimators = [('xgbm', xgbm), ('cat', cat),  ('rf', rf)], voting='soft', weights=[4, 3, 3])

averaging.fit(X_tra, y_tra)

ensemble_pred = averaging.predict(X_dev)

print(f1_score(y_dev, ensemble_pred))

0:	learn: 0.6930855	total: 3.02ms	remaining: 449ms
1:	learn: 0.6930263	total: 5.68ms	remaining: 421ms
2:	learn: 0.6929656	total: 8.26ms	remaining: 405ms
3:	learn: 0.6929194	total: 10.6ms	remaining: 387ms
4:	learn: 0.6928740	total: 13ms	remaining: 376ms
5:	learn: 0.6928253	total: 15.5ms	remaining: 372ms
6:	learn: 0.6927674	total: 17.9ms	remaining: 366ms
7:	learn: 0.6927222	total: 20.2ms	remaining: 359ms
8:	learn: 0.6926626	total: 22.8ms	remaining: 358ms
9:	learn: 0.6926086	total: 25.3ms	remaining: 355ms
10:	learn: 0.6925543	total: 27.7ms	remaining: 350ms
11:	learn: 0.6925124	total: 30.3ms	remaining: 349ms
12:	learn: 0.6924417	total: 32.7ms	remaining: 344ms
13:	learn: 0.6924164	total: 33.9ms	remaining: 330ms
14:	learn: 0.6923476	total: 36.2ms	remaining: 326ms
15:	learn: 0.6923031	total: 38.6ms	remaining: 323ms
16:	learn: 0.6922382	total: 41.2ms	remaining: 322ms
17:	learn: 0.6921925	total: 43.6ms	remaining: 320ms
18:	learn: 0.6921225	total: 46.1ms	remaining: 318ms
19:	learn: 0.6920624	tot

In [36]:
from sklearn.ensemble import VotingClassifier

averaging_model = VotingClassifier(
    estimators = [('xgbm', xgbm), ('cat', cat),  ('rf', rf)], voting='soft', weights=[4, 3, 3])

averaging_model.fit(train_x, train_y)

ensemble_pred = averaging_model.predict(df_test_x)

0:	learn: 0.6930920	total: 3.08ms	remaining: 459ms
1:	learn: 0.6930398	total: 5.87ms	remaining: 434ms
2:	learn: 0.6929930	total: 8.4ms	remaining: 412ms
3:	learn: 0.6929382	total: 11.1ms	remaining: 405ms
4:	learn: 0.6928991	total: 13.2ms	remaining: 384ms
5:	learn: 0.6928469	total: 15.8ms	remaining: 379ms
6:	learn: 0.6928096	total: 18.5ms	remaining: 379ms
7:	learn: 0.6927525	total: 21.4ms	remaining: 380ms
8:	learn: 0.6926986	total: 24ms	remaining: 376ms
9:	learn: 0.6926489	total: 26.8ms	remaining: 375ms
10:	learn: 0.6925993	total: 29.3ms	remaining: 371ms
11:	learn: 0.6925597	total: 32ms	remaining: 368ms
12:	learn: 0.6925093	total: 34.5ms	remaining: 364ms
13:	learn: 0.6924727	total: 37.1ms	remaining: 360ms
14:	learn: 0.6924159	total: 39.7ms	remaining: 357ms
15:	learn: 0.6923571	total: 42.1ms	remaining: 353ms
16:	learn: 0.6923119	total: 44.9ms	remaining: 352ms
17:	learn: 0.6922574	total: 47.3ms	remaining: 347ms
18:	learn: 0.6921957	total: 49.7ms	remaining: 342ms
19:	learn: 0.6921290	total:

In [37]:
ensemble_pred 

array(['AbNormal', 'AbNormal', 'AbNormal', ..., 'Normal', 'Normal',
       'Normal'], dtype=object)

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = ensemble_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [None]:
data = pd.read_csv('submission.csv')

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**