# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from pandas.core import (


In [None]:
# Directory holding the input CSV files, and the seed reused by the sampling
# and splitting cells further down.
ROOT_DIR = "data"
RANDOM_STATE = 110

# Read the four feature tables, in the order of the names on the left.
X_Dam, X_AutoClave, X_Fill1, X_Fill2 = (
    pd.read_csv(os.path.join(ROOT_DIR, csv_name))
    for csv_name in (
        "Dam dispensing.csv",
        "Auto clave.csv",
        "Fill1 dispensing.csv",
        "Fill2 dispensing.csv",
    )
)

# Read the label table.
y = pd.read_csv(os.path.join(ROOT_DIR, "train_y.csv"))


### 데이터 병합

x 데이터 병합

In [3]:
# Tag every column with the table it came from, except the shared join key
# "Set ID", which keeps its plain name so the tables can be merged on it.
X_Dam.columns = [
    "Set ID" if name == "Set ID" else name + " - Dam" for name in X_Dam.columns
]
X_AutoClave.columns = [
    "Set ID" if name == "Set ID" else name + " - AutoClave"
    for name in X_AutoClave.columns
]
X_Fill1.columns = [
    "Set ID" if name == "Set ID" else name + " - Fill1"
    for name in X_Fill1.columns
]
X_Fill2.columns = [
    "Set ID" if name == "Set ID" else name + " - Fill2"
    for name in X_Fill2.columns
]

# Join the four tables on "Set ID", then keep only the first row of each
# "Set ID" and renumber the index from 0.
X = (
    X_Dam.merge(X_AutoClave, on="Set ID")
    .merge(X_Fill1, on="Set ID")
    .merge(X_Fill2, on="Set ID")
    .drop_duplicates(subset="Set ID", ignore_index=True)
)

x 데이터와 y 데이터 병합

In [4]:
# Attach the labels to the features; only "Set ID"s present in both survive.
df_merged = X.merge(y, how="inner", on="Set ID")

# Drop sparse columns.
# NOTE(review): a column is dropped when floor(non-null count / 2) is smaller
# than its null count, i.e. when roughly more than one third of its values are
# missing -- stricter than "more than half missing". Confirm which threshold
# is intended before changing it; the feature set downstream depends on it.
non_null_counts = df_merged.notnull().sum()
null_counts = df_merged.isnull().sum()
is_sparse = (non_null_counts // 2) < null_counts
drop_cols = list(df_merged.columns[is_sparse.to_numpy()])
df_merged = df_merged.drop(columns=drop_cols)

# Drop Lot ID
df_merged = df_merged.drop(columns="LOT ID - Dam")

### 언더 샘플링

데이터 불균형을 해결하기 위해 언더 샘플링을 진행합니다.

In [5]:
# Number of "Normal" rows kept per "AbNormal" row (1.0 -> 1:1).
normal_ratio = 1.0

# Split the merged frame by label.
df_normal = df_merged.loc[df_merged["target"] == "Normal"]
df_abnormal = df_merged.loc[df_merged["target"] == "AbNormal"]

num_normal = df_normal.shape[0]
num_abnormal = df_abnormal.shape[0]
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# Down-sample the "Normal" rows without replacement, seeded for
# reproducibility, then stack them on top of all "AbNormal" rows.
df_normal = df_normal.sample(
    n=int(num_abnormal * normal_ratio),
    replace=False,
    random_state=RANDOM_STATE,
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0, ignore_index=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
Name: count, dtype: int64

### 데이터 분할

In [6]:
# Order rows by the Dam collection date before splitting.
# NOTE(review): train_test_split shuffles by default (per the scikit-learn
# docs), so this ordering only fixes the input order fed to the seeded
# shuffle; it does not produce a chronological split -- confirm intent.
df_concat = df_concat.sort_values("Collect Date - Dam")

# 70/30 split that keeps the label proportions equal in both parts.
df_train, df_val = train_test_split(
    df_concat,
    stratify=df_concat["target"],
    test_size=0.3,
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    """Print the Normal / AbNormal row counts of ``df`` and their ratio.

    Args:
        df: Frame with a ``target`` column whose values include the labels
            ``"Normal"`` and ``"AbNormal"``.

    The printed ratio is the AbNormal count divided by the Normal count. When
    ``df`` holds no Normal rows the ratio is reported as ``nan`` instead of
    raising ``ZeroDivisionError``.
    """
    labels = df["target"]
    # Plain ints keep the printed counts and the division free of numpy types.
    num_normal = int((labels == "Normal").sum())
    num_abnormal = int((labels == "AbNormal").sum())
    ratio = num_abnormal / num_normal if num_normal else float("nan")

    print(
        f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}"
        + f" ratio: {ratio}"
    )


# Report the class balance of the training split, then the validation split.
# NOTE(review): the header names Abnormal before Normal, while print_stats
# prints the Normal count first.
print("  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 1645, AbNormal: 1645 ratio: 1.0
  Total: Normal: 705, AbNormal: 705 ratio: 1.0


In [7]:
# Show (rows, columns) of the training split and of the validation split,
# one per line.
print(df_train.shape, df_val.shape, sep="\n")

(3290, 189)
(1410, 189)


## 3. 모델 학습

### 모델 정의 

In [8]:
# model = RandomForestClassifier(random_state=RANDOM_STATE)

### 모델 학습

In [9]:
# Work on an independent copy so the column assignments below never write
# into a slice of another frame (avoids pandas chained-assignment warnings).
df_train = df_train.copy()

# Keep every column that can be cast to int; anything else (text columns,
# columns containing NaN, ...) is skipped and left untouched.
# NOTE: astype(int) truncates fractional values, so float-valued columns
# lose their decimals here.
features = []
for col in df_train.columns:
    try:
        as_int = df_train[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Not integer-castable: not usable as a numeric feature.
        continue
    df_train[col] = as_int
    features.append(col)

# The join key is an identifier, not a feature.
if "Set ID" in features:
    features.remove("Set ID")

train_x = df_train[features]
train_y = df_train["target"]

# model.fit(train_x, train_y)

In [10]:
# Import only the PyCaret functions this notebook uses, instead of a wildcard
# import that floods the namespace and can silently shadow existing names.
# tune_model is used by the next cell.
from pycaret.classification import compare_models, setup, tune_model

# Initialise the PyCaret experiment on the selected features and labels,
# with a fixed session seed.
setup_clf = setup(data=train_x, target=train_y, session_id=42)

# Cross-validate the candidate models with 5 folds and keep the one that
# ranks first by F1.
model = compare_models(sort="F1", fold=5)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(3290, 150)"
5,Transformed data shape,"(3290, 150)"
6,Transformed train set shape,"(2303, 150)"
7,Transformed test set shape,"(987, 150)"
8,Numeric features,149
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6118,0.6361,0.6118,0.6119,0.6117,0.2236,0.2238,0.316
catboost,CatBoost Classifier,0.6114,0.6402,0.6114,0.6117,0.6111,0.2228,0.2231,3.446
ridge,Ridge Classifier,0.594,0.6288,0.594,0.5942,0.5938,0.188,0.1882,0.046
lda,Linear Discriminant Analysis,0.5918,0.6287,0.5918,0.592,0.5917,0.1837,0.1838,0.058
ada,Ada Boost Classifier,0.5875,0.6207,0.5875,0.5877,0.5873,0.175,0.1752,0.152
lr,Logistic Regression,0.5857,0.6198,0.5857,0.5862,0.5853,0.1715,0.172,1.956
xgboost,Extreme Gradient Boosting,0.5797,0.6149,0.5797,0.5798,0.5795,0.1594,0.1595,0.166
lightgbm,Light Gradient Boosting Machine,0.5771,0.6167,0.5771,0.5773,0.5768,0.1542,0.1544,0.272
et,Extra Trees Classifier,0.5762,0.6088,0.5762,0.5765,0.5759,0.1525,0.1527,0.188
rf,Random Forest Classifier,0.5753,0.6148,0.5753,0.5754,0.5752,0.1507,0.1508,0.192


In [11]:
# Tune the selected model on the same metric it was selected by (F1);
# tune_model optimises Accuracy unless told otherwise.
best_tune = tune_model(model, optimize="F1")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5801,0.6047,0.5801,0.5812,0.5782,0.1597,0.1611
1,0.619,0.6315,0.619,0.6198,0.6183,0.2378,0.2387
2,0.5931,0.6527,0.5931,0.5943,0.5921,0.1865,0.1875
3,0.6261,0.6726,0.6261,0.6262,0.626,0.2522,0.2523
4,0.5783,0.6123,0.5783,0.5793,0.5769,0.1565,0.1575
5,0.6435,0.6628,0.6435,0.6471,0.6413,0.287,0.2905
6,0.5478,0.5682,0.5478,0.5478,0.5478,0.0957,0.0957
7,0.6261,0.6525,0.6261,0.63,0.6232,0.2522,0.2561
8,0.5783,0.6362,0.5783,0.5805,0.5754,0.1565,0.1587
9,0.6391,0.687,0.6391,0.6404,0.6383,0.2783,0.2795


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


## 4. 제출하기

### 테스트 데이터 예측

테스트 데이터 불러오기

In [15]:
# Load the submission template from the working directory; it is merged with
# X on "Set ID" in the next cell.
df_test_y = pd.read_csv("submission.csv")

In [16]:
# Pick the feature rows of X whose "Set ID" appears in the submission template.
df_test = pd.merge(X, df_test_y, "inner", on="Set ID")

# Take an explicit copy so the casts below modify an independent frame rather
# than a slice of df_test.
df_test_x = df_test[features].copy()

# Apply the same integer cast used for the training features. Columns that
# cannot be cast (e.g. because they contain NaN) are left unchanged.
for col in df_test_x.columns:
    try:
        df_test_x[col] = df_test_x[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        continue

# SHAP

---

In [17]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# import xgboost as xgb
# from sklearn.datasets import load_breast_cancer
# from sklearn.metrics import f1_score
# import shap
# xgb_train_x, xgb_test_x, xgb_train_y, xgb_test_y = train_test_split(
#     train_x,  # 특성 데이터
#     train_y,  # 타겟 데이터
#     test_size=0.3,
#     stratify=train_y,
#     random_state=RANDOM_STATE
# )

# xgb_train_y = xgb_train_y.replace({'AbNormal': 1, 'Normal': 0})
# xgb_test_y = xgb_test_y.replace({'AbNormal': 1, 'Normal': 0})

# SHAP_THRESHOLD = 0.1

# # feature_names dimension 조정
# xgb_train_x_col = xgb_train_x.columns
# feature_names = xgb_train_x_col.to_numpy()

# # 모델 학습
# model = xgb.XGBClassifier().fit(xgb_train_x, xgb_train_y)

# # 모델 예측 및 평가
# y_pred = model.predict(xgb_test_x)
# f1 = f1_score(xgb_test_y, y_pred, average='weighted')
# print("Validation F1 Score:", f1)

# # SHAP 값 계산
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(xgb_test_x)

# # SHAP 값 요약
# if isinstance(shap_values, list):  # shap_values가 리스트일 경우 (XGBoost >= 1.0.0)
#     shap_values = shap_values[1]

# shap_sum = np.abs(shap_values).mean(axis=0)
# importance_df = pd.DataFrame({'column_name': feature_names, 'shap_importance': shap_sum})
# importance_df = importance_df.sort_values('shap_importance', ascending=False)

# # 중요도 임계값 적용 (선택 사항)
# importance_df_filtered = importance_df[importance_df['shap_importance'] > SHAP_THRESHOLD]
# print("Filtered SHAP Importances:\n", importance_df_filtered)

# # 지정된(SHAP_THRESHOLD) Shap feature 중요도 이상인 것만 선택
# features_selected = importance_df.query('shap_importance > @SHAP_THRESHOLD').column_name.tolist()
# shap_xgb_train_x = xgb_train_x[features_selected]
# shap_xgb_test_x = xgb_test_x[features_selected]

# model = xgb.XGBClassifier().fit(shap_xgb_train_x, xgb_train_y)

# # 모델 예측 및 평가
# y_pred = model.predict(shap_xgb_test_x)
# f1 = f1_score(xgb_test_y, y_pred, average='weighted')
# print("Validation F1 Score:", f1)

# train_x = train_x[features_selected]
# df_test_x = df_test_x[features_selected]

# train_y = train_y.replace({'AbNormal': 1, 'Normal': 0})

---

In [None]:
# Encode the string labels as integers (AbNormal -> 1, Normal -> 0).
# Only the training labels are encoded: the notebook never defines `test_y`,
# so encoding it raised NameError. The explicit astype(int) guarantees an
# integer dtype, and re-running the cell on already-encoded labels is a no-op.
train_y = train_y.replace({"AbNormal": 1, "Normal": 0}).astype(int)

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
# Estimator used for the submission, seeded with 42.
# NOTE(review): this seed differs from RANDOM_STATE (110) used for sampling
# and splitting -- confirm whether that is intentional.
model = GradientBoostingClassifier(random_state=42)

In [26]:
# Fit `model` on the training features (`train_x`) and labels (`train_y`).
model.fit(train_x, train_y)

In [27]:
# Predict a label for every row of the prepared test features.
test_pred = model.predict(df_test_x)
# Map predictions back to the string labels: 1 -> "AbNormal", anything else
# -> "Normal". This assumes the model was fitted on integer-encoded labels.
test_pred_converted = np.where(test_pred == 1, 'AbNormal', 'Normal')

print(test_pred_converted)

['AbNormal' 'AbNormal' 'AbNormal' ... 'Normal' 'Normal' 'Normal']


### 제출 파일 작성

In [28]:
# Read the submission template (df_test holds the preprocessed test data).
df_sub = pd.read_csv("submission.csv")

# df_test comes from an inner merge with X on the left, so its row order and
# row count follow X rather than the submission file. Align the predictions
# to the template by "Set ID" instead of assigning them positionally.
pred_by_set_id = pd.Series(
    test_pred_converted, index=df_test["Set ID"].to_numpy()
)
# X holds one row per "Set ID", so repeated ids carry identical predictions;
# keep the first so the lookup index is unique.
pred_by_set_id = pred_by_set_id[~pred_by_set_id.index.duplicated()]

aligned_pred = df_sub["Set ID"].map(pred_by_set_id)
if aligned_pred.isna().any():
    raise ValueError(
        "submission.csv contains Set IDs that have no prediction"
    )
df_sub["target"] = aligned_pred

# Save the submission file.
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**