# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [1]:
import os
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import xgboost as xgb
import shap

pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110
# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

In [3]:
drop_cols = []
for column in train_data.columns:
    if (train_data[column].notnull().sum() // 2) < train_data[column].isnull().sum():
        drop_cols.append(column)
train_data = train_data.drop(drop_cols, axis=1)

In [4]:
# 결측치 피쳐 세 개의 피쳐가 결측치 수가 동일함
na_list = []
for i in train_data.columns:
    if train_data[i].isna().sum() > 0:
        na_list.append(i)
train_data.drop(columns = na_list, inplace= True)

### 전처리 함수

In [5]:
#전처리 함수
import pandas as pd
import numpy as np

# 파트 1: 전처리 초기 단계
def preprocess_initial(df):
    # 고유값이 하나인 컬럼 찾기
    unique_value_1_columns = [column for column in df.columns if df[column].nunique() == 1]

    # 모든 행의 값이 다른 컬럼 찾기
    row_count = len(df)
    matching_row_columns = [column for column in df.columns if df[column].value_counts().size == row_count]

    # 고유값, 혼합값 피쳐 제거
    df.drop(columns=unique_value_1_columns, inplace=True)
    df.drop(columns=matching_row_columns, inplace=True)
    return df

# 파트 2: 유사한 피쳐 제거
def reduce_dataframe(df):
    def get_value_counts_ratio(series):
        value_counts = series.value_counts(normalize=True)
        return value_counts.sort_values().values

    # 고유값 매핑 확인 함수
    def check_value_mapping(df, col1, col2):
        unique_values_1 = df[col1].unique()
        unique_values_2 = df[col2].unique()

        if len(unique_values_1) != len(unique_values_2):
            return False

        value_mapping = {}
        for val1 in unique_values_1:
            corresponding_values = df[df[col1] == val1][col2].unique()
            if len(corresponding_values) != 1:
                return False
            value_mapping[val1] = corresponding_values[0]

        for val1 in unique_values_1:
            ratio_1 = (df[col1] == val1).mean()
            ratio_2 = (df[col2] == value_mapping[val1]).mean()
            if ratio_1 != ratio_2:
                return False

        return True

    def compare_all_features(df):
        ratios = {column: get_value_counts_ratio(df[column]) for column in df.columns}
        similar_columns_dict = {column: [] for column in df.columns}

        # 고유값 비율이 같은 열들 찾기
        columns = list(ratios.keys())
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                if np.array_equal(ratios[columns[i]], ratios[columns[j]]):
                    similar_columns_dict[columns[i]].append(columns[j])

        # 고유값 비율이 같고 매핑도 동일한 피쳐들 찾기
        comparisons = []
        for key, values in similar_columns_dict.items():
            for value in values:
                if check_value_mapping(df, key, value):
                    comparisons.append((key, value))

        if comparisons:
            print("다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:")
            for base, compare in comparisons:
                print(f"'{base}'와(과) '{compare}'")
        else:
            print("모든 피쳐의 고유값 비율과 양상이 동일하지 않습니다.")

        return comparisons

    comparisons = compare_all_features(df)
    columns_to_remove = set()
    for _, col_to_remove in comparisons:
        columns_to_remove.add(col_to_remove)

    df = df.drop(columns=columns_to_remove)
    global df_columns
    df_columns = df.columns
    return df


# 파트 3: 통계 계산 및 추가 전처리
def finalize_preprocessing(df):
    # 날짜 기준 정렬

    # 장비별 Collect Result 컬럼들 식별
    collect_result_columns_dam = [col for col in df.columns if 'Collect Result' in col and 'Dam' in col]
    collect_result_columns_autoclave = [col for col in df.columns if 'Collect Result' in col and 'AutoClave' in col]
    collect_result_columns_fill1 = [col for col in df.columns if 'Collect Result' in col and 'Fill1' in col]
    collect_result_columns_fill2 = [col for col in df.columns if 'Collect Result' in col and 'Fill2' in col]

    # Dam 장비의 평균, 표준편차, 합계 계산
    df['Collect_Result_Mean_Dam'] = df[collect_result_columns_dam].astype(float).mean(axis=1)
    df['Collect_Result_Std_Dam'] = df[collect_result_columns_dam].astype(float).std(axis=1)
    df['Collect_Result_Sum_Dam'] = df[collect_result_columns_dam].astype(float).sum(axis=1)

    # AutoClave 장비의 평균, 표준편차, 합계 계산
    df['Collect_Result_Mean_AutoClave'] = df[collect_result_columns_autoclave].astype(float).mean(axis=1)
    df['Collect_Result_Std_AutoClave'] = df[collect_result_columns_autoclave].astype(float).std(axis=1)
    df['Collect_Result_Sum_AutoClave'] = df[collect_result_columns_autoclave].astype(float).sum(axis=1)

    # Fill1 장비의 평균, 표준편차, 합계 계산
    df['Collect_Result_Mean_Fill1'] = df[collect_result_columns_fill1].astype(float).mean(axis=1)
    df['Collect_Result_Std_Fill1'] = df[collect_result_columns_fill1].astype(float).std(axis=1)
    df['Collect_Result_Sum_Fill1'] = df[collect_result_columns_fill1].astype(float).sum(axis=1)

    # Fill2 장비의 평균, 표준편차, 합계 계산
    df['Collect_Result_Mean_Fill2'] = df[collect_result_columns_fill2].astype(float).mean(axis=1)
    df['Collect_Result_Std_Fill2'] = df[collect_result_columns_fill2].astype(float).std(axis=1)
    df['Collect_Result_Sum_Fill2'] = df[collect_result_columns_fill2].astype(float).sum(axis=1)

    return df

# 전체 전처리 함수
def preprocess_train_dataframe(df):
    df = preprocess_initial(df)
    df = reduce_dataframe(df)
    return df

# 전체 전처리 함수
def preprocess_test_dataframe(df):
    df = preprocess_initial(df)
    return df

In [6]:
df_train_pre = preprocess_train_dataframe(train_data)

다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:
'Equipment_Dam'와(과) 'CURE END POSITION X Collect Result_Dam'
'Equipment_Dam'와(과) 'CURE END POSITION Z Collect Result_Dam'
'Equipment_Dam'와(과) 'CURE END POSITION Θ Collect Result_Dam'
'Equipment_Dam'와(과) 'CURE START POSITION X Collect Result_Dam'
'Equipment_Dam'와(과) 'CURE START POSITION Θ Collect Result_Dam'
'Model.Suffix_Dam'와(과) 'Model.Suffix_AutoClave'
'Model.Suffix_Dam'와(과) 'Model.Suffix_Fill1'
'Model.Suffix_Dam'와(과) 'Model.Suffix_Fill2'
'Workorder_Dam'와(과) 'Workorder_AutoClave'
'Workorder_Dam'와(과) 'Workorder_Fill1'
'Workorder_Dam'와(과) 'Workorder_Fill2'
'CURE END POSITION X Collect Result_Dam'와(과) 'CURE END POSITION Z Collect Result_Dam'
'CURE END POSITION X Collect Result_Dam'와(과) 'CURE END POSITION Θ Collect Result_Dam'
'CURE END POSITION X Collect Result_Dam'와(과) 'CURE START POSITION X Collect Result_Dam'
'CURE END POSITION X Collect Result_Dam'와(과) 'CURE START POSITION Θ Collect Result_Dam'
'CURE END POSITION Z Collect Result_Dam'와(과) 'CURE END POSITI

## 2. 데이터 전처리

### 언더 샘플링 

In [7]:
# normal_ratio = 1.0  # 1.0 means 1:1 ratio

# df_normal = train_data[train_data["target"] == "Normal"]
# df_abnormal = train_data[train_data["target"] == "AbNormal"]

# num_normal = len(df_normal)
# num_abnormal = len(df_abnormal)
# print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# df_normal = df_normal.sample(
#     n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE
# )
# df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
# df_concat.value_counts("target")

### SMOTE-Tokmek

In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from imblearn.combine import SMOTETomek
from sklearn.compose import ColumnTransformer

# 특징(feature)와 타겟(target)을 분리
X = df_train_pre.drop(columns=['target','Workorder_Dam'],axis=1)
y = df_train_pre["target"]

# 범주형 데이터 인코딩
categorical_features = X.select_dtypes(include=['object']).columns
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[categorical_features]).toarray()
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_features))

# 인코딩된 범주형 데이터를 원래 숫자형 데이터와 결합
X_numeric = X.drop(categorical_features, axis=1).reset_index(drop=True)
X_combined = pd.concat([X_numeric, X_encoded_df], axis=1)

# SMOTE-Tomek 적용
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_combined, y)

# 다시 DataFrame으로 변환
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X_combined.columns), pd.Series(y_resampled, name='target')], axis=1)

# 결과 확인
print(f"Resampled target distribution:\n{df_resampled['target'].value_counts()}")

# 원래의 DataFrame과 동일한 형식으로 인덱스를 초기화
df_train_preprocessed = df_resampled.reset_index(drop=True)

Resampled target distribution:
Normal      37079
AbNormal    37079
Name: target, dtype: int64


In [9]:
df_train_preprocessed.shape

(74158, 102)

### 전처리 함수

In [10]:
train_x = df_train_preprocessed.copy()

In [11]:
y_train = train_x.target
train_x.drop(columns = ['target'],inplace =True)

In [12]:
features = train_x.columns

In [19]:
from pycaret.classification import *
setup_clf = setup(data=train_x, target=y_train, n_jobs=1)

Unnamed: 0,Description,Value
0,Session id,3664
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(74158, 102)"
5,Transformed data shape,"(74158, 102)"
6,Transformed train set shape,"(51910, 102)"
7,Transformed test set shape,"(22248, 102)"
8,Numeric features,101
9,Preprocess,True


In [20]:
top5 = compare_models(sort='F1', n_select=5, fold=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9743,0.9946,0.9743,0.9745,0.9743,0.9485,0.9487,6.62
rf,Random Forest Classifier,0.9692,0.9908,0.9692,0.9702,0.9692,0.9385,0.9395,7.508
catboost,CatBoost Classifier,0.9683,0.9815,0.9683,0.9701,0.9683,0.9367,0.9385,50.39
xgboost,Extreme Gradient Boosting,0.9665,0.9816,0.9665,0.9682,0.9664,0.9329,0.9347,1.796
lightgbm,Light Gradient Boosting Machine,0.9642,0.9808,0.9642,0.9665,0.9641,0.9284,0.9306,2.058
dt,Decision Tree Classifier,0.9236,0.9236,0.9236,0.9237,0.9236,0.8472,0.8473,1.172
gbc,Gradient Boosting Classifier,0.9221,0.9623,0.9221,0.9318,0.9216,0.8441,0.8538,21.03
ada,Ada Boost Classifier,0.8539,0.9175,0.8539,0.8598,0.8533,0.7077,0.7137,5.674
knn,K Neighbors Classifier,0.8236,0.9075,0.8236,0.8367,0.8219,0.6473,0.6602,1.922
ridge,Ridge Classifier,0.6419,0.695,0.6419,0.6421,0.6417,0.2838,0.284,0.43


Processing:   0%|          | 0/73 [00:00<?, ?it/s]

In [None]:
tuned_top5 = [tune_model(i) for i in top5]

In [None]:
blender_top5 = blend_models(estimator_list=tuned_top5, verbose=True, method='auto')

In [None]:
blender_tune5 = tune_model(blender_top5)

In [None]:
final_model = finalize_model(blender_tune5)

## 4. 제출하기

### 테스트 데이터 예측

In [13]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [14]:
df_test = preprocess_test_dataframe(test_data)
df_test_pre = df_test.copy()

In [15]:
# 특징(feature)와 타겟(target)을 분리
X_test = df_test_pre.drop(columns=['Workorder_Dam'],axis=1)

# 범주형 데이터 인코딩
categorical_features = X_test.select_dtypes(include=['object']).columns
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X_test[categorical_features]).toarray()
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_features))

# 인코딩된 범주형 데이터를 원래 숫자형 데이터와 결합
X_numeric = X_test.drop(categorical_features, axis=1).reset_index(drop=True)
df_test_preprocessed = pd.concat([X_numeric, X_encoded_df], axis=1)

In [16]:
test_x = df_test_preprocessed[features]

In [None]:
prediction = predict_model(final_model, data=test_x)

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = prediction['prediction_label']

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**