In [60]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [61]:
RANDOM_STATE = 110

#데이터 불러오기
train_data = pd.read_csv('C:/Users/정주연/LGAimers_5/train.csv') # 모델 학습 파일 
test_data = pd.read_csv('C:/Users/정주연/LGAimers_5/test.csv') # 모델 테스트 파일 
df_sub = pd.read_csv('C:/Users/정주연/LGAimers_5/submission.csv') # 제출 파일 

In [62]:
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


In [63]:
missing_values = train_data.isnull().sum()
# 결측값이 있는 열만 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

CURE END POSITION X Unit Time_Dam      40506
CURE END POSITION X Judge Value_Dam    40506
CURE END POSITION Z Unit Time_Dam      40506
CURE END POSITION Z Judge Value_Dam    40506
CURE END POSITION Θ Unit Time_Dam      40506
                                       ...  
Production Qty Judge Value_Fill2       40506
Receip No Unit Time_Fill2              40506
Receip No Judge Value_Fill2            40506
WorkMode Unit Time_Fill2               40506
WorkMode Judge Value_Fill2             40506
Length: 286, dtype: int64


In [64]:
# 필요 없는 열 정의
columns_to_drop = [
    "Workorder_Dam",          # 작업 주문 번호 (고유 식별자)
    "Insp Judge Code_Dam",    # 타겟과 중복되는 정보일 가능성이 있는 열
    "Receip No Collect Result_Fill2",  # 메타데이터로 판단되는 열
    # 필요한 다른 열들도 추가할 수 있습니다.
]

# 열 삭제
train_data = train_data.drop(columns=columns_to_drop)

# 범주형 데이터를 LabelEncoder를 사용해 숫자형으로 변환
label_encoders = {}
for column in train_data.columns:
    if train_data[column].dtype == 'object':  # 열이 문자열(범주형) 데이터인 경우
        le = LabelEncoder()
        train_data[column] = le.fit_transform(train_data[column])
        label_encoders[column] = le  # 각 열에 대한 LabelEncoder를 저장해둡니다.

# 변환된 데이터 확인
print(train_data.head())

   Wip Line_Dam  Process Desc._Dam  Equipment_Dam  Model.Suffix_Dam  \
0             0                  0              0                 3   
1             0                  0              0                 3   
2             0                  0              1                 0   
3             0                  0              1                 0   
4             0                  0              0                 0   

   Insp. Seq No._Dam  CURE END POSITION X Collect Result_Dam  \
0                  1                                   240.0   
1                  1                                   240.0   
2                  1                                  1000.0   
3                  1                                  1000.0   
4                  1                                   240.0   

   CURE END POSITION X Unit Time_Dam  CURE END POSITION X Judge Value_Dam  \
0                                NaN                                  NaN   
1                                N

In [65]:
train_data.isna()

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Insp. Seq No._Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Z Unit Time_Dam,...,PalletID Judge Value_Fill2,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,False,False,False,False,False,False,True,True,False,True,...,True,False,True,True,True,True,False,True,True,False
1,False,False,False,False,False,False,True,True,False,True,...,True,False,True,True,True,True,False,True,True,False
2,False,False,False,False,False,False,True,True,False,True,...,True,False,True,True,True,True,False,True,True,False
3,False,False,False,False,False,False,True,True,False,True,...,True,False,True,True,True,True,False,True,True,False
4,False,False,False,False,False,False,True,True,False,True,...,True,False,True,True,True,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,False,False,False,False,False,False,True,True,False,True,...,True,False,True,True,True,True,False,True,True,False
40502,False,False,False,False,False,False,True,True,False,True,...,True,False,True,True,True,True,False,True,True,False
40503,False,False,False,False,False,False,True,True,False,True,...,True,False,True,True,True,True,False,True,True,False
40504,False,False,False,False,False,False,True,True,False,True,...,True,False,True,True,True,True,False,True,True,False


In [72]:
# Normal과 AbNormal 데이터 분리
df_normal = train_data[train_data["target"] == 1]
df_abnormal = train_data[train_data["target"] == 0]

# Normal 데이터 언더 샘플링
#df_normal = df_normal.sample(n=len(df_abnormal), random_state=RANDOM_STATE)

print(f"Total: Normal: {len(df_normal)}, AbNormal: {len(df_abnormal)}")
normal_ratio = 1.0  # Normal : AbNormal = 1 : 1 비율
df_normal = df_normal.sample(n=int(len(df_abnormal) * normal_ratio), replace=False, random_state=RANDOM_STATE)

# 샘플링된 데이터 결합
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

Total: Normal: 38156, AbNormal: 2350


In [73]:
# df_concat이 비어있는지 확인
print(df_concat.shape)
print(df_concat.head())

(4700, 461)
   Wip Line_Dam  Process Desc._Dam  Equipment_Dam  Model.Suffix_Dam  \
0             0                  0              0                 0   
1             0                  0              1                 0   
2             0                  0              0                 0   
3             0                  0              1                 0   
4             0                  0              0                 0   

   Insp. Seq No._Dam  CURE END POSITION X Collect Result_Dam  \
0                  1                                   240.0   
1                  1                                  1000.0   
2                  1                                   240.0   
3                  1                                  1000.0   
4                  1                                   240.0   

   CURE END POSITION X Unit Time_Dam  CURE END POSITION X Judge Value_Dam  \
0                                NaN                                  NaN   
1                     

In [74]:
#데이터 분할
df_train, df_val = train_test_split(
    df_concat, test_size=0.3, stratify=df_concat["target"], random_state=RANDOM_STATE
)

# 분할된 데이터의 비율 확인
def print_stats(df):
    num_normal = len(df[df["target"] == 1])
    num_abnormal = len(df[df["target"] == 0])
    print(f"Total: Normal: {num_normal}, AbNormal: {num_abnormal}, ratio: {num_abnormal/num_normal:.2f}")

print("Train Data:")
print_stats(df_train)
print("Validation Data:")
print_stats(df_val)

Train Data:
Total: Normal: 1645, AbNormal: 1645, ratio: 1.00
Validation Data:
Total: Normal: 705, AbNormal: 705, ratio: 1.00


In [77]:
# 사용할 피처(특성) 추출
features = df_train.columns[:-1]  # target 칼럼을 제외한 모든 칼럼

# 학습 데이터 준비
#train_x = df_train[features]
#train_y = df_train["target"]

# 모델 정의 및 학습
model = RandomForestClassifier(random_state=RANDOM_STATE)

# 학습용 데이터 준비
train_x = df_train.drop(columns=["target"])
train_y = df_train["target"]

model.fit(train_x, train_y)

In [80]:
# 검증용 데이터 준비
val_x = df_val.drop(columns=["target"])
val_y = df_val["target"]

# 예측 수행
val_pred = model.predict(val_x)

# 성능 지표 계산
accuracy = accuracy_score(val_y, val_pred)
precision = precision_score(val_y, val_pred, pos_label= 0)
recall = recall_score(val_y, val_pred, pos_label= 0)
f1 = f1_score(val_y, val_pred, pos_label= 0)

# 성능 출력
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# 자세한 분류 리포트 출력
print(classification_report(val_y, val_pred, target_names=["Normal", "AbNormal"]))

Accuracy: 0.5780141843971631
Precision: 0.5774647887323944
Recall: 0.5815602836879432
F1 Score: 0.5795053003533569
              precision    recall  f1-score   support

      Normal       0.58      0.58      0.58       705
    AbNormal       0.58      0.57      0.58       705

    accuracy                           0.58      1410
   macro avg       0.58      0.58      0.58      1410
weighted avg       0.58      0.58      0.58      1410

