# Anomalous Financial Transaction Detection

본 대회의 과제는 금융 거래 데이터에서 **이상 거래를 탐지하는 기능**을 개선하고 활용도를 높이는 분류 AI 모델을 개발하는 것입니다.

특히, 클래스 불균형 문제를 해결하기 위해 오픈소스 생성형 AI 모델을 활용하여 부족한 클래스의 데이터를 보완하고, 이를 통해 분류 모델의 성능을 향상시키는 것이 핵심 목표입니다. 

이러한 접근을 통해 금융보안에 특화된 데이터 분석 및 활용 역량을 강화하여 전문 인력을 양성하고, 금융권의 AI 활용 어려움에 따른 해결 방안을 함께 모색하며 금융 산업의 AI 활용 활성화를 지원하는 것을 목표로 합니다.

# Import Library

In [1]:
# pip install sdv

In [2]:
# 제출 파일 생성 관련
import os
import zipfile

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# 머신러닝 전처리
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# 머신러닝 모델
import xgboost as xgb

# 합성 데이터 생성
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

# 생성 🏭

# Load Data

In [3]:
# Read the raw train and test tables from the working directory.
train_all, test_all = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [4]:
# Working copy of the training table without the ID column.
train = train_all.drop("ID", axis=1)

In [5]:
# Display the (rows, columns) of the training table after dropping ID.
train.shape

(120000, 63)

In [6]:
'''
(*) 리더보드 산식 중 생성데이터의 익명성(TCAP)채점을 위해 각 클래스 별로 1000개의 생성데이터가 반드시 필요합니다.
(*) 본 베이스 라인에서는 "Fraud_Type" 13종류에 대해 1000개씩 , 총 13,000개의 데이터를 생성할 예정입니다.
(*) 분류 모델 성능 개선을 위해 생성 데이터를 활용하는 것에는 생성 데이터의 Row 개수에 제한이 없습니다. 단, 리더보드 평가를 위해 제출을 하는 생성 데이터 프레임은 익명성(TCAP) 평가를 위함이며, 위의 조건을 갖춘 생성 데이터를 제출해야합니다.
'''
# Rows to synthesize per Fraud_Type for the submission dataset
# (the note above requires 1000 rows for each of the 13 classes).
N_CLS_PER_GEN = 1000
# Rows to synthesize per Fraud_Type in the training-augmentation CTGAN run,
# where it is passed to synthesizer.sample(num_rows=...).
N_CLS_PER_GEN_2 = 1

In [7]:
# pd.set_option('display.max_rows', None)  # 모든 행 표시
# cond_all = pd.read_excel("데이터_명세_및_생성조건.xlsx", header=1)
# cond_all.iloc[:,2:-1]

## 제출용 합성데이터

In [8]:
# from ctgan import CTGAN 
# import pandas as pd
# from tqdm import tqdm
# import numpy as np
# from scipy import stats
# from sdv.metadata import SingleTableMetadata

# # 이상치 처리 함수
# def handle_outliers(series, n_std=3):
#     mean = series.mean()
#     std = series.std()
#     z_scores = np.abs(stats.zscore(series))
#     return series.mask(z_scores > n_std, mean)

# # 범주형 데이터 조건 강제 함수
# def enforce_categorical_conditions(df):
#     # Customer_Gender: 'male', 'female'만 허용
#     df['Customer_Gender'] = df['Customer_Gender'].apply(lambda x: 'male' if x == 'male' else 'female')
    
#     # Customer_credit_rating: 'S', 'A', 'B', 'C', 'D', 'E'만 허용
#     valid_ratings = ['S', 'A', 'B', 'C', 'D', 'E']
#     df['Customer_credit_rating'] = df['Customer_credit_rating'].apply(lambda x: x if x in valid_ratings else 'B')
    
#     # Customer_loan_type: 'a', 'b', 'c', 'd', 'e'만 허용
#     valid_loan_types = ['a', 'b', 'c', 'd', 'e']
#     df['Customer_loan_type'] = df['Customer_loan_type'].apply(lambda x: x if x in valid_loan_types else 'c')
    
#     # 기타 범주형 변수들 0, 1만 허용
#     binary_columns = ['Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
#                       'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4',
#                       'Customer_rooting_jailbreak_indicator', 'Customer_mobile_roaming_indicator', 
#                       'Customer_VPN_Indicator', 'Customer_flag_terminal_malicious_behavior_1',
#                       'Customer_flag_terminal_malicious_behavior_2', 'Customer_flag_terminal_malicious_behavior_3',
#                       'Customer_flag_terminal_malicious_behavior_4', 'Customer_flag_terminal_malicious_behavior_5',
#                       'Customer_flag_terminal_malicious_behavior_6', 'Customer_inquery_atm_limit',
#                       'Customer_increase_atm_limit', 'Account_indicator_release_limit_excess',
#                       'Account_indicator_Openbanking', 'Account_release_suspention', 'Transaction_Failure_Status',
#                       'Another_Person_Account', 'Unused_terminal_status', 'Flag_deposit_more_than_tenMillion',
#                       'Unused_account_status', 'Recipient_account_suspend_status', 'First_time_iOS_by_vulnerable_user']
#     for col in binary_columns:
#         df[col] = df[col].apply(lambda x: 1 if x == 1 else 0)
    
#     return df

# # 수치형 데이터 조건 강제 함수
# def enforce_numerical_conditions(df):
#     # Customer_Birthyear: 1950 ~ 2004 범위 제한
#     df['Customer_Birthyear'] = df['Customer_Birthyear'].clip(1950, 2004)
    
#     # Account_initial_balance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Account_initial_balance'] = df['Account_initial_balance'].clip(lower=0)
    
#     # Account_balance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Account_balance'] = df['Account_balance'].clip(lower=0)
    
#     # Account_amount_daily_limit: 0 이상의 값으로 설정 (예시로 최대값도 설정 가능)
#     df['Account_amount_daily_limit'] = df['Account_amount_daily_limit'].clip(lower=0)
    
#     # Account_remaining_amount_daily_limit_exceeded: 0 이상의 값으로 설정
#     df['Account_remaining_amount_daily_limit_exceeded'] = df['Account_remaining_amount_daily_limit_exceeded'].clip(lower=0)
    
#     # Account_one_month_max_amount, Account_dawn_one_month_max_amount: 음수일 수 있으므로, 필요시 범위 설정
#     # 이 항목들은 특정 조건이 있다면 적용
#     # 예: 최댓값을 특정 범위로 제한
#     df['Account_one_month_max_amount'] = df['Account_one_month_max_amount'].clip(lower=-1000000, upper=1000000)
#     df['Account_dawn_one_month_max_amount'] = df['Account_dawn_one_month_max_amount'].clip(lower=-1000000, upper=1000000)
    
#     # Account_one_month_std_dev, Account_dawn_one_month_std_dev: 표준편차는 음수가 될 수 없으므로 0으로 제한
#     df['Account_one_month_std_dev'] = df['Account_one_month_std_dev'].clip(lower=0)
#     df['Account_dawn_one_month_std_dev'] = df['Account_dawn_one_month_std_dev'].clip(lower=0)
    
#     # Transaction_Amount: 특정 범위 내로 설정 (예: 음수도 허용, 최대값 제한)
#     df['Transaction_Amount'] = df['Transaction_Amount'].clip(lower=-100000000, upper=100000000)
    
#     # Distance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Distance'] = df['Distance'].clip(lower=0)
    
#     # Number_of_transaction_with_the_account, Transaction_history_with_the_account: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Number_of_transaction_with_the_account'] = df['Number_of_transaction_with_the_account'].clip(lower=0)
#     df['Transaction_history_with_the_account'] = df['Transaction_history_with_the_account'].clip(lower=0)
    
#     return df

# # Time_difference 컬럼을 총 초로 변환 및 이상치 처리
# train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()
# train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

# # 모든 Fraud_Type 목록 생성
# fraud_types = train['Fraud_Type'].unique()  # Fraud_Type로 변경

# # 모든 합성 데이터를 저장할 DataFrame 초기화
# all_synthetic_data = pd.DataFrame()

# N_SAMPLE = 100

# # 각 Fraud_Type에 대해 합성 데이터 생성 및 저장
# for fraud_type in tqdm(fraud_types):
    
#     # 해당 Fraud_Type에 대한 서브셋 생성
#     subset = train[train["Fraud_Type"] == fraud_type]

#     # 모든 Fraud_Type에 대해 100개씩 샘플링
#     subset = subset.sample(n=N_SAMPLE, random_state=42)
    
#     # Time_difference 열 제외 (초 단위로 변환된 컬럼만 사용)
#     subset = subset.drop('Time_difference', axis=1)
    
#     # 메타데이터 생성 및 모델 학습
#     metadata = SingleTableMetadata()

#     metadata.detect_from_dataframe(subset)
#     metadata.set_primary_key(None)

#     # 데이터 타입 설정
#     column_sdtypes = {
#         'Customer_Birthyear': 'numerical',
#         # 'Customer_Gender': 'categorical',
#         'Customer_personal_identifier': 'categorical',
#         'Customer_identification_number': 'categorical',
#         # 'Customer_registration_datetime': 'datetime',
#         'Customer_credit_rating': 'categorical',
#         # 'Customer_flag_change_of_authentication_1': 'categorical',
#         # 'Customer_flag_change_of_authentication_2': 'categorical',
#         # 'Customer_flag_change_of_authentication_3': 'categorical',
#         # 'Customer_flag_change_of_authentication_4': 'categorical',
#         # 'Customer_rooting_jailbreak_indicator': 'categorical',
#         # 'Customer_mobile_roaming_indicator': 'categorical',
#         # 'Customer_VPN_Indicator': 'categorical',
#         # 'Customer_loan_type': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_1': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_2': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_3': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_4': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_5': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_6': 'categorical',
#         # 'Customer_inquery_atm_limit': 'categorical',
#         # 'Customer_increase_atm_limit': 'categorical',
#         'Account_account_number': 'categorical',
#         # 'Account_account_type': 'categorical',
#         # 'Account_creation_datetime': 'datetime',
#         'Account_initial_balance': 'numerical',
#         'Account_balance': 'numerical',
#         # 'Account_indicator_release_limit_excess': 'categorical',
#         'Account_amount_daily_limit': 'numerical',
#         'Account_indicator_Openbanking': 'categorical',
#         'Account_remaining_amount_daily_limit_exceeded': 'numerical',
#         # 'Account_release_suspention': 'categorical',
#         'Account_one_month_max_amount': 'numerical',
#         'Account_one_month_std_dev': 'numerical',
#         'Account_dawn_one_month_max_amount': 'numerical',
#         'Account_dawn_one_month_std_dev': 'numerical',
#         # 'Transaction_Datetime': 'datetime',
#         'Transaction_Amount': 'numerical',
#         # 'Channel': 'categorical',
#         # 'Operating_System': 'categorical',
#         # 'Error_Code': 'categorical',
#         # 'Transaction_Failure_Status': 'categorical',
#         # 'Type_General_Automatic': 'categorical',
#         'IP_Address': 'ipv4_address',
#         # 'Access_Medium': 'categorical',
#         'Location': 'categorical',
#         'Recipient_Account_Number': 'categorical',
#         'Transaction_num_connection_failure': 'numerical',
#         # 'Another_Person_Account': 'categorical',
#         'Distance': 'numerical',
#         'Time_difference_seconds': 'numerical',
#         # 'Unused_terminal_status': 'categorical',
#         # 'Last_atm_transaction_datetime': 'datetime',
#         # 'Last_bank_branch_transaction_datetime': 'datetime',
#         # 'Flag_deposit_more_than_tenMillion': 'categorical',
#         # 'Unused_account_status': 'categorical',
#         # 'Recipient_account_suspend_status': 'categorical',
#         'Number_of_transaction_with_the_account': 'numerical',
#         'Transaction_history_with_the_account': 'numerical',
#         # 'First_time_iOS_by_vulnerable_user': 'categorical',
#         # 'Transaction_resumed_date': 'datetime',
#         'Fraud_Type': 'categorical'
#     }

#     # 각 컬럼에 대해 데이터 타입 설정
#     for column, sdtype in column_sdtypes.items():
#         metadata.update_column(
#             column_name=column,
#             sdtype=sdtype
#         )
        
#     synthesizer = CTGANSynthesizer(
#                             metadata,
#                             epochs= 2000
#                         )
#     synthesizer.fit(subset)

#     synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)  # 합성 데이터 생성 수 설정
    
#     # 생성된 Time_difference_seconds의 이상치 처리
#     synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])
    
#     # Time_difference_seconds를 다시 timedelta로 변환
#     synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    
#     # Time_difference_seconds 컬럼 제거
#     synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)
    
#     # 생성 조건 반영 (범주형, 수치형, 형식 조건)
#     synthetic_subset = enforce_categorical_conditions(synthetic_subset)
#     synthetic_subset = enforce_numerical_conditions(synthetic_subset)
    
#     # 생성된 데이터를 all_synthetic_data에 추가
#     all_synthetic_data = pd.concat([all_synthetic_data, synthetic_subset], ignore_index=True)

# # 최종 결과 확인
# print("\nFinal All Synthetic Data Shape:", all_synthetic_data.shape)


In [9]:
# all_synthetic_data.to_csv('submission/syn_submission.csv', encoding='UTF-8-sig', index=False)

## 성능용 합성데이터

CTGAN

In [10]:
from ctgan import CTGAN 
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
from sdv.metadata import SingleTableMetadata

# 이상치 처리 함수
def handle_outliers(series, n_std=3):
    """Replace values more than ``n_std`` standard deviations from the mean.

    Z-scores use the population standard deviation (ddof=0), the same
    convention as ``scipy.stats.zscore``. Unlike that function, missing
    values are skipped when computing the mean and standard deviation, so a
    single NaN no longer turns every z-score into NaN and disables the
    outlier replacement.

    Args:
        series: Numeric pandas Series.
        n_std: Z-score threshold above which a value counts as an outlier.

    Returns:
        A new Series in which outliers are replaced by the series mean.
        NaN entries stay NaN. The input is not modified.
    """
    mean = series.mean()
    std = series.std(ddof=0)

    # Constant, empty or all-NaN input: z-scores are undefined, nothing to mask.
    if not np.isfinite(std) or std == 0:
        return series.copy()

    z_scores = (series - mean).abs() / std
    return series.mask(z_scores > n_std, mean)

# 범주형 데이터 조건 강제 함수
def enforce_categorical_conditions(df):
    """Coerce categorical columns of ``df`` to their allowed value sets.

    The frame is modified in place and also returned.

    - Customer_Gender: anything other than 'male' becomes 'female'.
    - Customer_credit_rating: values outside S/A/B/C/D/E become 'B'.
    - Customer_loan_type: values outside a/b/c/d/e become 'c'.
    - Binary indicator columns: anything not equal to 1 becomes 0.
    """

    def _normalize_gender(value):
        if value == 'male':
            return 'male'
        return 'female'

    def _to_binary_flag(value):
        return 1 if value == 1 else 0

    def _keep_or_default(allowed, fallback):
        # Build a mapper that passes allowed values through unchanged.
        def _mapper(value):
            return value if value in allowed else fallback
        return _mapper

    df['Customer_Gender'] = df['Customer_Gender'].apply(_normalize_gender)

    allowed_ratings = ['S', 'A', 'B', 'C', 'D', 'E']
    df['Customer_credit_rating'] = df['Customer_credit_rating'].apply(
        _keep_or_default(allowed_ratings, 'B')
    )

    allowed_loan_types = ['a', 'b', 'c', 'd', 'e']
    df['Customer_loan_type'] = df['Customer_loan_type'].apply(
        _keep_or_default(allowed_loan_types, 'c')
    )

    # Indicator columns that may only hold 0 or 1.
    flag_columns = (
        'Customer_flag_change_of_authentication_1',
        'Customer_flag_change_of_authentication_2',
        'Customer_flag_change_of_authentication_3',
        'Customer_flag_change_of_authentication_4',
        'Customer_rooting_jailbreak_indicator',
        'Customer_mobile_roaming_indicator',
        'Customer_VPN_Indicator',
        'Customer_flag_terminal_malicious_behavior_1',
        'Customer_flag_terminal_malicious_behavior_2',
        'Customer_flag_terminal_malicious_behavior_3',
        'Customer_flag_terminal_malicious_behavior_4',
        'Customer_flag_terminal_malicious_behavior_5',
        'Customer_flag_terminal_malicious_behavior_6',
        'Customer_inquery_atm_limit',
        'Customer_increase_atm_limit',
        'Account_indicator_release_limit_excess',
        'Account_indicator_Openbanking',
        'Account_release_suspention',
        'Transaction_Failure_Status',
        'Another_Person_Account',
        'Unused_terminal_status',
        'Flag_deposit_more_than_tenMillion',
        'Unused_account_status',
        'Recipient_account_suspend_status',
        'First_time_iOS_by_vulnerable_user',
    )
    for flag_column in flag_columns:
        df[flag_column] = df[flag_column].apply(_to_binary_flag)

    return df

# 수치형 데이터 조건 강제 함수
def enforce_numerical_conditions(df):
    """Clip numeric columns of ``df`` to their permitted ranges.

    The frame is modified in place and also returned. Each entry in the
    table below is ``column: (lower, upper)``; ``None`` means unbounded on
    that side.
    """
    column_bounds = {
        # Birth year is restricted to 1950..2004.
        'Customer_Birthyear': (1950, 2004),
        # Balances and limits are floored at zero.
        'Account_initial_balance': (0, None),
        'Account_balance': (0, None),
        'Account_amount_daily_limit': (0, None),
        'Account_remaining_amount_daily_limit_exceeded': (0, None),
        # Monthly maxima may be negative; both are bounded symmetrically.
        'Account_one_month_max_amount': (-1000000, 1000000),
        'Account_dawn_one_month_max_amount': (-1000000, 1000000),
        # Standard deviations are floored at zero.
        'Account_one_month_std_dev': (0, None),
        'Account_dawn_one_month_std_dev': (0, None),
        # Transaction amount may be negative but is bounded on both sides.
        'Transaction_Amount': (-100000000, 100000000),
        # Distance and counts are floored at zero.
        'Distance': (0, None),
        'Number_of_transaction_with_the_account': (0, None),
        'Transaction_history_with_the_account': (0, None),
    }

    for column_name, (lower_bound, upper_bound) in column_bounds.items():
        df[column_name] = df[column_name].clip(lower=lower_bound, upper=upper_bound)

    return df



# Convert Time_difference to total seconds and replace outliers with the mean.
train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()
train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

# Every Fraud_Type class gets its own synthesizer.
fraud_types = train['Fraud_Type'].unique()

# Maximum number of real rows per class used to fit each synthesizer.
N_SAMPLE = 100

# Explicit SDV sdtypes that override auto-detection. Columns not listed here
# keep whatever metadata.detect_from_dataframe() infers. The mapping does not
# depend on the class, so it is built once outside the loop.
column_sdtypes = {
    'Customer_Birthyear': 'numerical',
    'Customer_personal_identifier': 'categorical',
    'Customer_identification_number': 'categorical',
    'Customer_credit_rating': 'categorical',
    'Account_account_number': 'categorical',
    'Account_initial_balance': 'numerical',
    'Account_balance': 'numerical',
    'Account_amount_daily_limit': 'numerical',
    'Account_indicator_Openbanking': 'categorical',
    'Account_remaining_amount_daily_limit_exceeded': 'numerical',
    'Account_one_month_max_amount': 'numerical',
    'Account_one_month_std_dev': 'numerical',
    'Account_dawn_one_month_max_amount': 'numerical',
    'Account_dawn_one_month_std_dev': 'numerical',
    'Transaction_Amount': 'numerical',
    'IP_Address': 'ipv4_address',
    'Location': 'categorical',
    'Recipient_Account_Number': 'categorical',
    'Transaction_num_connection_failure': 'numerical',
    'Distance': 'numerical',
    'Time_difference_seconds': 'numerical',
    'Number_of_transaction_with_the_account': 'numerical',
    'Transaction_history_with_the_account': 'numerical',
    'Fraud_Type': 'categorical',
}

# Per-class synthetic frames, concatenated once after the loop.
synthetic_parts = []

for fraud_type in tqdm(fraud_types):

    # Real rows of this class, capped at N_SAMPLE. The min() guard keeps
    # sample() from raising when a class has fewer than N_SAMPLE rows.
    subset = train[train["Fraud_Type"] == fraud_type]
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # Only the seconds representation of Time_difference is modelled.
    subset = subset.drop('Time_difference', axis=1)

    # Detect metadata from the subset, then apply the explicit overrides.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)
    metadata.set_primary_key(None)
    for column, sdtype in column_sdtypes.items():
        metadata.update_column(
            column_name=column,
            sdtype=sdtype
        )

    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=200
    )
    synthesizer.fit(subset)

    # Number of synthetic rows per class for the augmentation set.
    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN_2)

    # Smooth outliers in the generated seconds, convert back to a timedelta
    # column, and drop the helper seconds column.
    synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])
    synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)

    # Enforce the categorical and numerical generation constraints.
    synthetic_subset = enforce_categorical_conditions(synthetic_subset)
    synthetic_subset = enforce_numerical_conditions(synthetic_subset)

    synthetic_parts.append(synthetic_subset)

# Single concat instead of growing a DataFrame inside the loop.
all_synthetic_data_ctgan = (
    pd.concat(synthetic_parts, ignore_index=True) if synthetic_parts else pd.DataFrame()
)

# Final result check.
print("\nFinal All Synthetic Data ctgan Shape:", all_synthetic_data_ctgan.shape)


100%|██████████| 13/13 [10:55<00:00, 50.41s/it]


Final All Synthetic Data ctgan Shape: (13, 63)





## 원본 데이터와 concat

In [34]:
# Append the CTGAN rows to the original training table (ID column removed).
origin_train = train_all.drop(columns="ID")
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the synthetic rows reuse labels that already exist in origin_train, and the
# duplicate labels are a hazard for any later label-aligned operation.
train_total = pd.concat([origin_train, all_synthetic_data_ctgan], ignore_index=True)
train_total.shape

(120013, 63)

# Data Preprocessing 1 : Select x, y

In [35]:
# Target column and feature matrix from the combined training table.
train_y = train_total['Fraud_Type']
train_x = train_total.drop('Fraud_Type', axis=1)

# Test features: everything except the row identifier.
test_x = test_all.drop('ID', axis=1)

# Data Preprocessing 2 : 범주형 변수 인코딩

In [36]:
# Fit the label encoder on the target, then map the labels to integer codes.
le_subclass = LabelEncoder().fit(train_y)
train_y_encoded = le_subclass.transform(train_y)

# Show the label -> integer mapping.
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

원래 레이블: a, 변환된 숫자: 0
원래 레이블: b, 변환된 숫자: 1
원래 레이블: c, 변환된 숫자: 2
원래 레이블: d, 변환된 숫자: 3
원래 레이블: e, 변환된 숫자: 4
원래 레이블: f, 변환된 숫자: 5
원래 레이블: g, 변환된 숫자: 6
원래 레이블: h, 변환된 숫자: 7
원래 레이블: i, 변환된 숫자: 8
원래 레이블: j, 변환된 숫자: 9
원래 레이블: k, 변환된 숫자: 10
원래 레이블: l, 변환된 숫자: 11
원래 레이블: m, 변환된 숫자: 12


In [37]:
# train_x
# Cast 'Time_difference' to str so it is picked up as a categorical column.
train_x = train_x.astype({'Time_difference': str})

# Unseen categories at transform time are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical_columns = train_x.select_dtypes(include=['object', 'category']).columns

# Fit on the training categoricals and write the codes into a copy.
encoded_train_values = ordinal_encoder.fit_transform(train_x[categorical_columns])
train_x_encoded = train_x.copy()
train_x_encoded[categorical_columns] = encoded_train_values

In [38]:
# Engineered features for the training set. Columns are added in a fixed
# order because the resulting column order is captured later as the feature order.
auth_change_cols = [
    'Customer_flag_change_of_authentication_1',
    'Customer_flag_change_of_authentication_2',
    'Customer_flag_change_of_authentication_3',
    'Customer_flag_change_of_authentication_4',
]
malicious_flag_cols = [
    'Customer_flag_terminal_malicious_behavior_1',
    'Customer_flag_terminal_malicious_behavior_2',
    'Customer_flag_terminal_malicious_behavior_3',
    'Customer_flag_terminal_malicious_behavior_4',
    'Customer_flag_terminal_malicious_behavior_5',
    'Customer_flag_terminal_malicious_behavior_6',
]

# Row-wise totals of the flag groups.
train_x_encoded['Customer_Total_Authentication_Changes'] = train_x_encoded[auth_change_cols].sum(axis=1)
train_x_encoded['Customer_Total_Malicious_Behavior_Flags'] = train_x_encoded[malicious_flag_cols].sum(axis=1)

# Ratios; a zero denominator yields inf/NaN, which is cleaned up at the end.
train_x_encoded['Daily_Usage_Ratio'] = train_x_encoded['Account_amount_daily_limit'].div(
    train_x_encoded['Account_remaining_amount_daily_limit_exceeded']
)
train_x_encoded['Transaction_Amount_per_Distance'] = train_x_encoded['Transaction_Amount'].div(
    train_x_encoded['Distance']
)

train_x_encoded['Channel_Distance_Interaction'] = train_x_encoded['Channel'].mul(train_x_encoded['Distance'])
train_x_encoded['Abs_Transaction_Amount'] = train_x_encoded['Transaction_Amount'].abs()
train_x_encoded['Transaction_Amount_per_Transaction_Count'] = train_x_encoded['Transaction_Amount'].div(
    train_x_encoded['Number_of_transaction_with_the_account'] + 1
)

train_x_encoded['Channel_Transaction_Count_Interaction'] = train_x_encoded['Channel'].mul(
    train_x_encoded['Number_of_transaction_with_the_account']
)
train_x_encoded['Flag_Transaction_Interaction'] = train_x_encoded['Flag_deposit_more_than_tenMillion'].mul(
    train_x_encoded['Transaction_Amount']
)

# NOTE: .mean() is a single scalar, so this column holds the same value in every row.
train_x_encoded['Transaction_Failure_Rate'] = train_x_encoded['Transaction_Failure_Status'].mean()

train_x_encoded['ATM_Limit_Increased'] = train_x_encoded['Customer_increase_atm_limit'].gt(0).astype(int)

# Turn +/-inf from the divisions into NaN, then fill NaN with column means.
train_x_encoded.replace({np.inf: np.nan, -np.inf: np.nan}, inplace=True)
train_x_encoded.fillna(train_x_encoded.mean(), inplace=True)


In [39]:
# Remember the training column order so the test frame can be aligned to it.
feature_order = list(train_x_encoded.columns)

### test

In [40]:
# Encode the test categoricals with the encoder fitted on the training data.
encoded_test_values = ordinal_encoder.transform(test_x[categorical_columns])
test_x_encoded = test_x.copy()
test_x_encoded[categorical_columns] = encoded_test_values

In [41]:
# Engineered features for the test set, using the same formulas as the
# training set.
test_x_encoded['Customer_Total_Authentication_Changes'] = test_x_encoded[
    ['Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
     'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4']
].sum(axis=1)

test_x_encoded['Customer_Total_Malicious_Behavior_Flags'] = test_x_encoded[
    ['Customer_flag_terminal_malicious_behavior_1', 'Customer_flag_terminal_malicious_behavior_2',
     'Customer_flag_terminal_malicious_behavior_3', 'Customer_flag_terminal_malicious_behavior_4',
     'Customer_flag_terminal_malicious_behavior_5', 'Customer_flag_terminal_malicious_behavior_6']
].sum(axis=1)

# A zero denominator yields inf/NaN, which is cleaned up at the end.
test_x_encoded['Daily_Usage_Ratio'] = test_x_encoded['Account_amount_daily_limit'] / test_x_encoded['Account_remaining_amount_daily_limit_exceeded']

test_x_encoded['Transaction_Amount_per_Distance'] = test_x_encoded['Transaction_Amount'] / test_x_encoded['Distance']
test_x_encoded['Channel_Distance_Interaction'] = test_x_encoded['Channel'] * test_x_encoded['Distance']
test_x_encoded['Abs_Transaction_Amount'] = test_x_encoded['Transaction_Amount'].abs()
test_x_encoded['Transaction_Amount_per_Transaction_Count'] = test_x_encoded['Transaction_Amount'] / (test_x_encoded['Number_of_transaction_with_the_account'] + 1)

test_x_encoded['Channel_Transaction_Count_Interaction'] = test_x_encoded['Channel'] * test_x_encoded['Number_of_transaction_with_the_account']
test_x_encoded['Flag_Transaction_Interaction'] = test_x_encoded['Flag_deposit_more_than_tenMillion'] * test_x_encoded['Transaction_Amount']

# This feature is one scalar broadcast to every row. Use the training-set
# statistic so the constant seen at inference matches the one seen in training.
test_x_encoded['Transaction_Failure_Rate'] = train_x_encoded['Transaction_Failure_Status'].mean()

test_x_encoded['ATM_Limit_Increased'] = (test_x_encoded['Customer_increase_atm_limit'] > 0).astype(int)

# test_x_encoded['Transaction_Weekday'] = pd.to_datetime(test_x_encoded['Transaction_Datetime']).dt.weekday

# Turn +/-inf from the divisions into NaN, then impute with the TRAINING
# column means: test data must go through the transformation fitted on train,
# not one derived from its own statistics.
test_x_encoded.replace([np.inf, -np.inf], np.nan, inplace=True)
test_x_encoded.fillna(train_x_encoded.mean(), inplace=True)

In [42]:
# Reorder the test columns to the training feature order and cast every
# column to the dtype of its training counterpart in one pass.
train_dtypes_by_column = train_x_encoded[feature_order].dtypes.to_dict()
test_x_encoded = test_x_encoded[feature_order].astype(train_dtypes_by_column)

In [43]:
# Display the (rows, columns) of the encoded training features.
train_x_encoded.shape

(120013, 73)

In [44]:
# Display the (rows, columns) of the encoded test features.
test_x_encoded.shape

(120000, 73)

In [45]:
# Attach the integer-encoded target so rows can be sampled per class.
train_x_encoded = train_x_encoded.assign(Fraud_Type=train_y_encoded)

In [46]:
# Fraction of each encoded class to keep: classes 0-11 are kept in full,
# class 12 is down-sampled to 0.25% of its rows.
target_ratios = {class_id: 1.0 for class_id in range(12)}
target_ratios[12] = 0.0025


def _sample_class(frame, class_id, keep_ratio):
    """Return a reproducible sample of ``keep_ratio`` of the rows of one class."""
    class_rows = frame[frame['Fraud_Type'] == class_id]
    return class_rows.sample(
        n=int(len(class_rows) * keep_ratio), replace=False, random_state=42
    )


# One sampled frame per class, in the order of target_ratios.
df_list = [
    _sample_class(train_x_encoded, class_id, keep_ratio)
    for class_id, keep_ratio in target_ratios.items()
]

# Combine the per-class samples into one frame with a fresh index.
df_concat = pd.concat(df_list, axis=0).reset_index(drop=True)

# Show the class counts after sampling.
print(df_concat.value_counts('Fraud_Type'))

Fraud_Type
12    297
0     101
1     101
2     101
3     101
4     101
5     101
6     101
7     101
8     101
9     101
10    101
11    101
dtype: int64


In [47]:
# Split the down-sampled frame back into target and features.
train_y_encoded_down = df_concat['Fraud_Type']
train_x_encoded_down = df_concat.drop('Fraud_Type', axis=1)

In [48]:
import xgboost as xgb
import shap
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 25% of the down-sampled data for the SHAP computation.
X_train, X_test, y_train, y_test = train_test_split(train_x_encoded_down, train_y_encoded_down, test_size=0.25, random_state=42)

# Train the XGBoost model.
model = xgb.XGBClassifier(objective="multi:softprob", eval_metric="mlogloss")
model.fit(X_train, y_train)

# Compute SHAP values on the hold-out rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Older shap releases return a list with one (n_samples, n_features) array per
# class; newer releases return one (n_samples, n_features, n_classes) array.
# Normalize to the list form so the per-class code below iterates over classes
# rather than over samples.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, class_idx] for class_idx in range(shap_values.shape[2])]

# Top features per class, ranked by mean absolute SHAP value.
for i, class_shap_values in enumerate(shap_values):
    print(f"클래스 {i}에 대한 피처 중요도 순위:")
    shap_importance = np.abs(class_shap_values).mean(axis=0)
    feature_importance_df = pd.DataFrame({
        'Feature': X_test.columns,
        'Importance': shap_importance
    }).sort_values(by='Importance', ascending=False)
    print(feature_importance_df.head())

# Overall importance: mean over classes of the per-class mean absolute SHAP value.
shap_values_mean = np.mean([np.abs(s).mean(axis=0) for s in shap_values], axis=0)

# Build and sort the overall importance table.
overall_importance_df = pd.DataFrame({
    'Feature': X_test.columns,
    'Importance': shap_values_mean
}).sort_values(by='Importance', ascending=False)

print("전체 모델의 피처 중요도:")
overall_importance_df.head()
# # 전체 모델의 피처 중요도 시각화
# shap.summary_plot(np.mean(shap_values, axis=0), X_test)


클래스 0에 대한 피처 중요도 순위:
                                     Feature  Importance
50                                  Distance    2.906083
51                           Time_difference    1.128925
65           Transaction_Amount_per_Distance    0.528109
68  Transaction_Amount_per_Transaction_Count    0.383415
66              Channel_Distance_Interaction    0.238472
클래스 1에 대한 피처 중요도 순위:
                                          Feature  Importance
5                          Customer_credit_rating    1.898991
10           Customer_rooting_jailbreak_indicator    1.191622
12                         Customer_VPN_Indicator    1.083813
48             Transaction_num_connection_failure    0.357401
30  Account_remaining_amount_daily_limit_exceeded    0.336353
클래스 2에 대한 피처 중요도 순위:
                                        Feature  Importance
63      Customer_Total_Malicious_Behavior_Flags    2.324272
19  Customer_flag_terminal_malicious_behavior_6    0.401205
38                                      Cha

Unnamed: 0,Feature,Importance
70,Flag_Transaction_Interaction,0.526477
38,Channel,0.381518
50,Distance,0.345588
37,Transaction_Amount,0.303288
58,Number_of_transaction_with_the_account,0.260288


In [49]:
# Importance threshold; only features strictly above it are kept.
threshold = 0

# Boolean mask over the importance table, then filter with it.
importance_mask = overall_importance_df['Importance'].gt(threshold)
selected_features = overall_importance_df.loc[importance_mask]

# Print the retained features.
print("임계값 이상인 피처들:")
print(selected_features)

# Names of the retained features as a plain list.
selected_feature_names = list(selected_features['Feature'])

임계값 이상인 피처들:
                                        Feature  Importance
70                 Flag_Transaction_Interaction    0.526477
38                                      Channel    0.381518
50                                     Distance    0.345588
37                           Transaction_Amount    0.303288
58       Number_of_transaction_with_the_account    0.260288
..                                          ...         ...
9      Customer_flag_change_of_authentication_4    0.003651
18  Customer_flag_terminal_malicious_behavior_5    0.001339
14  Customer_flag_terminal_malicious_behavior_1    0.001170
20                   Customer_inquery_atm_limit    0.001141
40                                   Error_Code    0.000536

[67 rows x 2 columns]


In [50]:
# Restrict both feature matrices to the SHAP-selected columns.
train_x_shap = train_x_encoded_down.loc[:, selected_feature_names]
test_x_shap = test_x_encoded.loc[:, selected_feature_names]

In [55]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

# Optuna objective for tuning CatBoost
def optimize_catboost(trial):
    """Score one Optuna trial of CatBoost by 3-fold macro-F1.

    Args:
        trial: Optuna trial used to sample ``depth``, ``learning_rate`` and
            ``iterations``.

    Returns:
        Mean of the ``f1_macro`` scores returned by 3-fold
        ``cross_val_score`` on ``train_x_shap`` / ``train_y_encoded_down``.
    """
    # Hyperparameters to search
    depth = trial.suggest_int('depth', 4, 8)
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform; the log-scale search range is unchanged.
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.1, log=True)
    iterations = trial.suggest_int('iterations', 100, 500)

    # Build the candidate CatBoost model
    model = CatBoostClassifier(
        depth=depth,
        learning_rate=learning_rate,
        iterations=iterations,
        verbose=200,
        random_state=42
    )

    # Evaluate the candidate with cross-validation (F1 macro)
    score = cross_val_score(model, train_x_shap, train_y_encoded_down, cv=3, scoring='f1_macro').mean()
    return score

# Create the Optuna study and run the search.
# The sampler is seeded so the sequence of sampled hyperparameters is
# repeatable across runs, matching the fixed random_state=42 used for the
# models themselves.
catboost_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
catboost_study.optimize(optimize_catboost, n_trials=30)

# Read the best hyperparameters once and print them
best_catboost_params = catboost_study.best_params
print("Best CatBoost hyperparameters: ", best_catboost_params)

# Build the CatBoost model with the best hyperparameters (not fitted here)
best_catboost_model = CatBoostClassifier(
    depth=best_catboost_params['depth'],
    learning_rate=best_catboost_params['learning_rate'],
    iterations=best_catboost_params['iterations'],
    verbose=0,
    random_state=42
)


[I 2024-08-26 16:16:55,108] A new study created in memory with name: no-name-80dccb94-e476-4d45-b390-a91fbcec6327


0:	learn: 2.5606110	total: 27.2ms	remaining: 2.96s
109:	learn: 2.1208500	total: 3.01s	remaining: 0us
0:	learn: 2.5594850	total: 27.6ms	remaining: 3.01s
109:	learn: 2.1240059	total: 3.06s	remaining: 0us
0:	learn: 2.5600497	total: 32.3ms	remaining: 3.52s


[I 2024-08-26 16:17:04,882] Trial 0 finished with value: 0.45370946355489195 and parameters: {'depth': 6, 'learning_rate': 0.005840013881257101, 'iterations': 110}. Best is trial 0 with value: 0.45370946355489195.


109:	learn: 2.1246122	total: 3.09s	remaining: 0us
0:	learn: 2.5642164	total: 112ms	remaining: 17.9s
160:	learn: 2.3937162	total: 21.1s	remaining: 0us
0:	learn: 2.5639184	total: 138ms	remaining: 22.1s
160:	learn: 2.3985382	total: 22s	remaining: 0us
0:	learn: 2.5640863	total: 133ms	remaining: 21.3s


[I 2024-08-26 16:18:09,904] Trial 1 finished with value: 0.4766375406592383 and parameters: {'depth': 8, 'learning_rate': 0.0015129193809402409, 'iterations': 161}. Best is trial 1 with value: 0.4766375406592383.


160:	learn: 2.3978535	total: 21.3s	remaining: 0us
0:	learn: 2.5616919	total: 102ms	remaining: 49.3s
200:	learn: 1.8625999	total: 22.3s	remaining: 31.4s
400:	learn: 1.4650070	total: 44.8s	remaining: 9.39s
484:	learn: 1.3386893	total: 56s	remaining: 0us
0:	learn: 2.5603687	total: 125ms	remaining: 1m
200:	learn: 1.8692226	total: 25.3s	remaining: 35.7s
400:	learn: 1.4663580	total: 49.4s	remaining: 10.3s
484:	learn: 1.3382113	total: 1m	remaining: 0us
0:	learn: 2.5611143	total: 158ms	remaining: 1m 16s
200:	learn: 1.8685063	total: 27.2s	remaining: 38.4s
400:	learn: 1.4700768	total: 54s	remaining: 11.3s


[I 2024-08-26 16:21:13,567] Trial 2 finished with value: 0.49897236702478737 and parameters: {'depth': 8, 'learning_rate': 0.006728059401022302, 'iterations': 485}. Best is trial 2 with value: 0.49897236702478737.


484:	learn: 1.3453702	total: 1m 5s	remaining: 0us
0:	learn: 2.5473954	total: 32.8ms	remaining: 8.96s
200:	learn: 1.1400762	total: 5.98s	remaining: 2.17s
273:	learn: 0.9383205	total: 7.99s	remaining: 0us
0:	learn: 2.5428656	total: 25.2ms	remaining: 6.89s
200:	learn: 1.1497375	total: 5.43s	remaining: 1.97s
273:	learn: 0.9494164	total: 7.4s	remaining: 0us
0:	learn: 2.5451343	total: 25.9ms	remaining: 7.08s
200:	learn: 1.1556115	total: 5.65s	remaining: 2.05s


[I 2024-08-26 16:21:37,709] Trial 3 finished with value: 0.5335241454335815 and parameters: {'depth': 6, 'learning_rate': 0.02371279347609934, 'iterations': 274}. Best is trial 3 with value: 0.5335241454335815.


273:	learn: 0.9645440	total: 7.76s	remaining: 0us
0:	learn: 2.5406838	total: 111ms	remaining: 24.1s
200:	learn: 0.5287072	total: 23.7s	remaining: 2.12s
218:	learn: 0.4746103	total: 25.9s	remaining: 0us
0:	learn: 2.5308988	total: 122ms	remaining: 26.6s
200:	learn: 0.5250776	total: 24.7s	remaining: 2.21s
218:	learn: 0.4709643	total: 27.2s	remaining: 0us
0:	learn: 2.5364249	total: 130ms	remaining: 28.3s
200:	learn: 0.5406062	total: 28.3s	remaining: 2.53s


[I 2024-08-26 16:23:02,454] Trial 4 finished with value: 0.5304457920343758 and parameters: {'depth': 8, 'learning_rate': 0.050379306416952305, 'iterations': 219}. Best is trial 3 with value: 0.5335241454335815.


218:	learn: 0.4864711	total: 30.7s	remaining: 0us
0:	learn: 2.5636040	total: 33ms	remaining: 9.69s
200:	learn: 2.2791553	total: 6.4s	remaining: 2.99s
294:	learn: 2.1806816	total: 9.03s	remaining: 0us
0:	learn: 2.5632543	total: 24.7ms	remaining: 7.26s
200:	learn: 2.2745111	total: 5.78s	remaining: 2.71s
294:	learn: 2.1738567	total: 8.53s	remaining: 0us
0:	learn: 2.5634297	total: 28.1ms	remaining: 8.25s
200:	learn: 2.2760303	total: 5.88s	remaining: 2.75s


[I 2024-08-26 16:23:29,778] Trial 5 finished with value: 0.46572834778796685 and parameters: {'depth': 6, 'learning_rate': 0.0018097037527625333, 'iterations': 295}. Best is trial 3 with value: 0.5335241454335815.


294:	learn: 2.1763579	total: 8.6s	remaining: 0us
0:	learn: 2.5617908	total: 55ms	remaining: 18.4s
200:	learn: 1.9612848	total: 11.5s	remaining: 7.65s
334:	learn: 1.7119239	total: 19s	remaining: 0us
0:	learn: 2.5607434	total: 51.3ms	remaining: 17.1s
200:	learn: 1.9488699	total: 11.4s	remaining: 7.6s
334:	learn: 1.7021874	total: 18.9s	remaining: 0us
0:	learn: 2.5612575	total: 52.4ms	remaining: 17.5s
200:	learn: 1.9554535	total: 11.5s	remaining: 7.67s


[I 2024-08-26 16:24:28,798] Trial 6 finished with value: 0.4816204218987859 and parameters: {'depth': 7, 'learning_rate': 0.00525913230086408, 'iterations': 335}. Best is trial 3 with value: 0.5335241454335815.


334:	learn: 1.7060843	total: 19.7s	remaining: 0us
0:	learn: 2.5642087	total: 129ms	remaining: 57.7s
200:	learn: 2.3530930	total: 26.9s	remaining: 33.1s
400:	learn: 2.1806830	total: 53.9s	remaining: 6.45s
448:	learn: 2.1446897	total: 1m	remaining: 0us
0:	learn: 2.5639076	total: 137ms	remaining: 1m 1s
200:	learn: 2.3582714	total: 26.8s	remaining: 33.1s
400:	learn: 2.1843821	total: 54.1s	remaining: 6.47s
448:	learn: 2.1477387	total: 1m	remaining: 0us
0:	learn: 2.5640772	total: 143ms	remaining: 1m 4s
200:	learn: 2.3557427	total: 29.1s	remaining: 36s
400:	learn: 2.1829206	total: 56.7s	remaining: 6.79s


[I 2024-08-26 16:27:34,567] Trial 7 finished with value: 0.48577015639986215 and parameters: {'depth': 8, 'learning_rate': 0.0015288258279379811, 'iterations': 449}. Best is trial 3 with value: 0.5335241454335815.


448:	learn: 2.1473423	total: 1m 3s	remaining: 0us
0:	learn: 2.5636906	total: 31.7ms	remaining: 6.73s
200:	learn: 2.2936651	total: 6.1s	remaining: 364ms
212:	learn: 2.2804569	total: 6.44s	remaining: 0us
0:	learn: 2.5633634	total: 26.8ms	remaining: 5.67s
200:	learn: 2.2892657	total: 5.62s	remaining: 335ms
212:	learn: 2.2762610	total: 6s	remaining: 0us
0:	learn: 2.5635275	total: 29.8ms	remaining: 6.33s
200:	learn: 2.2908685	total: 6s	remaining: 358ms


[I 2024-08-26 16:27:54,252] Trial 8 finished with value: 0.46382033894262215 and parameters: {'depth': 6, 'learning_rate': 0.001693165871401662, 'iterations': 213}. Best is trial 3 with value: 0.5335241454335815.


212:	learn: 2.2786830	total: 6.38s	remaining: 0us
0:	learn: 2.5630017	total: 56.3ms	remaining: 18.3s
200:	learn: 2.1434316	total: 11.7s	remaining: 7.28s
325:	learn: 1.9585113	total: 18.7s	remaining: 0us
0:	learn: 2.5623555	total: 49.9ms	remaining: 16.2s
200:	learn: 2.1343012	total: 11.8s	remaining: 7.31s
325:	learn: 1.9507597	total: 19.2s	remaining: 0us
0:	learn: 2.5626728	total: 52.3ms	remaining: 17s
200:	learn: 2.1378950	total: 11.4s	remaining: 7.08s


[I 2024-08-26 16:28:51,851] Trial 9 finished with value: 0.4721697703583148 and parameters: {'depth': 7, 'learning_rate': 0.00324188818896466, 'iterations': 326}. Best is trial 3 with value: 0.5335241454335815.


325:	learn: 1.9535343	total: 18.4s	remaining: 0us
0:	learn: 2.5404163	total: 7.68ms	remaining: 3.09s
200:	learn: 1.2590900	total: 1.17s	remaining: 1.19s
400:	learn: 0.8692751	total: 2.37s	remaining: 17.8ms
403:	learn: 0.8663819	total: 2.39s	remaining: 0us
0:	learn: 2.5335286	total: 7.05ms	remaining: 2.84s
200:	learn: 1.2491629	total: 1.19s	remaining: 1.2s
400:	learn: 0.8623535	total: 2.41s	remaining: 18ms
403:	learn: 0.8587602	total: 2.43s	remaining: 0us
0:	learn: 2.5379556	total: 7.55ms	remaining: 3.04s
200:	learn: 1.2729253	total: 1.21s	remaining: 1.22s


[I 2024-08-26 16:29:00,279] Trial 10 finished with value: 0.5953201449583966 and parameters: {'depth': 4, 'learning_rate': 0.025603370916199606, 'iterations': 404}. Best is trial 10 with value: 0.5953201449583966.


400:	learn: 0.8875594	total: 2.39s	remaining: 17.9ms
403:	learn: 0.8844075	total: 2.41s	remaining: 0us
0:	learn: 2.5410441	total: 8.01ms	remaining: 3.23s
200:	learn: 1.2753037	total: 1.19s	remaining: 1.21s
400:	learn: 0.8836093	total: 2.35s	remaining: 23.4ms
404:	learn: 0.8788570	total: 2.37s	remaining: 0us
0:	learn: 2.5343304	total: 7.16ms	remaining: 2.89s
200:	learn: 1.2627426	total: 1.19s	remaining: 1.2s
400:	learn: 0.8745340	total: 2.32s	remaining: 23.1ms
404:	learn: 0.8701796	total: 2.34s	remaining: 0us
0:	learn: 2.5386454	total: 7.48ms	remaining: 3.02s
200:	learn: 1.2835689	total: 1.17s	remaining: 1.18s


[I 2024-08-26 16:29:08,540] Trial 11 finished with value: 0.5912556369163181 and parameters: {'depth': 4, 'learning_rate': 0.02494314457576134, 'iterations': 405}. Best is trial 10 with value: 0.5953201449583966.


400:	learn: 0.8972036	total: 2.36s	remaining: 23.5ms
404:	learn: 0.8921491	total: 2.38s	remaining: 0us
0:	learn: 2.5468867	total: 7.07ms	remaining: 2.86s
200:	learn: 1.4101224	total: 1.18s	remaining: 1.2s
400:	learn: 1.0513437	total: 2.32s	remaining: 23.1ms
404:	learn: 1.0453733	total: 2.34s	remaining: 0us
0:	learn: 2.5417985	total: 6.74ms	remaining: 2.72s
200:	learn: 1.4115280	total: 1.21s	remaining: 1.23s
400:	learn: 1.0468286	total: 2.4s	remaining: 24ms
404:	learn: 1.0420803	total: 2.43s	remaining: 0us
0:	learn: 2.5450678	total: 7.28ms	remaining: 2.94s
200:	learn: 1.4247944	total: 1.2s	remaining: 1.22s


[I 2024-08-26 16:29:16,854] Trial 12 finished with value: 0.5475553369492759 and parameters: {'depth': 4, 'learning_rate': 0.01881203106173267, 'iterations': 405}. Best is trial 10 with value: 0.5953201449583966.


400:	learn: 1.0584058	total: 2.34s	remaining: 23.4ms
404:	learn: 1.0537266	total: 2.37s	remaining: 0us
0:	learn: 2.4850646	total: 7.91ms	remaining: 3.15s
200:	learn: 0.6350426	total: 1.18s	remaining: 1.16s
398:	learn: 0.3957926	total: 2.35s	remaining: 0us
0:	learn: 2.4634509	total: 7.06ms	remaining: 2.81s
200:	learn: 0.6552668	total: 1.14s	remaining: 1.12s
398:	learn: 0.3975062	total: 2.29s	remaining: 0us
0:	learn: 2.4773695	total: 7.45ms	remaining: 2.96s
200:	learn: 0.6730559	total: 1.17s	remaining: 1.15s


[I 2024-08-26 16:29:24,959] Trial 13 finished with value: 0.6226701378019351 and parameters: {'depth': 4, 'learning_rate': 0.08505417251747852, 'iterations': 399}. Best is trial 13 with value: 0.6226701378019351.


398:	learn: 0.4217107	total: 2.3s	remaining: 0us
0:	learn: 2.4714466	total: 14.4ms	remaining: 5.52s
200:	learn: 0.4501889	total: 2.42s	remaining: 2.19s
382:	learn: 0.2434815	total: 4.64s	remaining: 0us
0:	learn: 2.4560480	total: 13.8ms	remaining: 5.27s
200:	learn: 0.4547749	total: 2.47s	remaining: 2.24s
382:	learn: 0.2413382	total: 4.6s	remaining: 0us
0:	learn: 2.4599253	total: 13.2ms	remaining: 5.05s
200:	learn: 0.4820036	total: 2.48s	remaining: 2.24s


[I 2024-08-26 16:29:39,979] Trial 14 finished with value: 0.6016018691488907 and parameters: {'depth': 5, 'learning_rate': 0.09777201516344487, 'iterations': 383}. Best is trial 13 with value: 0.6226701378019351.


382:	learn: 0.2598120	total: 4.66s	remaining: 0us
0:	learn: 2.4743527	total: 14.1ms	remaining: 5.24s
200:	learn: 0.4712819	total: 2.44s	remaining: 2.09s
372:	learn: 0.2669656	total: 4.48s	remaining: 0us
0:	learn: 2.4593969	total: 13.3ms	remaining: 4.96s
200:	learn: 0.4643606	total: 2.41s	remaining: 2.06s
372:	learn: 0.2600364	total: 4.46s	remaining: 0us
0:	learn: 2.4631690	total: 12.9ms	remaining: 4.79s
200:	learn: 0.4935117	total: 2.46s	remaining: 2.1s


[I 2024-08-26 16:29:54,517] Trial 15 finished with value: 0.6046065621788349 and parameters: {'depth': 5, 'learning_rate': 0.09463633946283768, 'iterations': 373}. Best is trial 13 with value: 0.6226701378019351.


372:	learn: 0.2804832	total: 4.49s	remaining: 0us
0:	learn: 2.4742077	total: 13.8ms	remaining: 4.83s
200:	learn: 0.4664212	total: 2.55s	remaining: 1.89s
349:	learn: 0.2832921	total: 4.48s	remaining: 0us
0:	learn: 2.4592297	total: 14.3ms	remaining: 4.98s
200:	learn: 0.4584853	total: 2.7s	remaining: 2s
349:	learn: 0.2771352	total: 4.72s	remaining: 0us
0:	learn: 2.4630071	total: 15.7ms	remaining: 5.48s
200:	learn: 0.4876047	total: 2.68s	remaining: 1.99s


[I 2024-08-26 16:30:09,480] Trial 16 finished with value: 0.6095121707863419 and parameters: {'depth': 5, 'learning_rate': 0.09479259207756056, 'iterations': 350}. Best is trial 13 with value: 0.6226701378019351.


349:	learn: 0.2949411	total: 4.64s	remaining: 0us
0:	learn: 2.5206033	total: 13.9ms	remaining: 6.29s
200:	learn: 0.8112024	total: 2.56s	remaining: 3.21s
400:	learn: 0.4823390	total: 5.09s	remaining: 660ms
452:	learn: 0.4373134	total: 5.75s	remaining: 0us
0:	learn: 2.5130311	total: 14.8ms	remaining: 6.67s
200:	learn: 0.8142487	total: 2.52s	remaining: 3.16s
400:	learn: 0.4748870	total: 5.08s	remaining: 658ms
452:	learn: 0.4307929	total: 5.69s	remaining: 0us
0:	learn: 2.5149947	total: 13.7ms	remaining: 6.19s
200:	learn: 0.8484096	total: 2.61s	remaining: 3.27s
400:	learn: 0.5055995	total: 5.02s	remaining: 651ms


[I 2024-08-26 16:30:27,905] Trial 17 finished with value: 0.5986548781093165 and parameters: {'depth': 5, 'learning_rate': 0.045643517566522, 'iterations': 453}. Best is trial 13 with value: 0.6226701378019351.


452:	learn: 0.4597412	total: 5.65s	remaining: 0us
0:	learn: 2.5136947	total: 13.4ms	remaining: 3.26s
200:	learn: 0.7220241	total: 2.55s	remaining: 559ms
244:	learn: 0.6272064	total: 3.08s	remaining: 0us
0:	learn: 2.5049828	total: 13.3ms	remaining: 3.24s
200:	learn: 0.7187417	total: 2.53s	remaining: 554ms
244:	learn: 0.6206521	total: 3.15s	remaining: 0us
0:	learn: 2.5072323	total: 14.9ms	remaining: 3.64s
200:	learn: 0.7592208	total: 2.59s	remaining: 567ms


[I 2024-08-26 16:30:38,142] Trial 18 finished with value: 0.5902352397520744 and parameters: {'depth': 5, 'learning_rate': 0.05286212419629532, 'iterations': 245}. Best is trial 13 with value: 0.6226701378019351.


244:	learn: 0.6519093	total: 3.16s	remaining: 0us
0:	learn: 2.5520139	total: 7.57ms	remaining: 2.58s
200:	learn: 1.5720254	total: 1.23s	remaining: 864ms
341:	learn: 1.3188762	total: 2.07s	remaining: 0us
0:	learn: 2.5483607	total: 7.41ms	remaining: 2.53s
200:	learn: 1.5664054	total: 1.27s	remaining: 895ms
341:	learn: 1.3153661	total: 2.16s	remaining: 0us
0:	learn: 2.5507073	total: 6.81ms	remaining: 2.32s
200:	learn: 1.5716889	total: 1.21s	remaining: 849ms


[I 2024-08-26 16:30:45,558] Trial 19 finished with value: 0.521309503315758 and parameters: {'depth': 4, 'learning_rate': 0.01345067353861414, 'iterations': 342}. Best is trial 13 with value: 0.6226701378019351.


341:	learn: 1.3168336	total: 2.12s	remaining: 0us
0:	learn: 2.4981310	total: 13.3ms	remaining: 6.63s
200:	learn: 0.5870571	total: 2.54s	remaining: 3.78s
400:	learn: 0.3388843	total: 4.94s	remaining: 1.22s
499:	learn: 0.2710536	total: 6.16s	remaining: 0us
0:	learn: 2.4868966	total: 14.1ms	remaining: 7.05s
200:	learn: 0.5784000	total: 2.46s	remaining: 3.66s
400:	learn: 0.3242129	total: 5.06s	remaining: 1.25s
499:	learn: 0.2591403	total: 6.28s	remaining: 0us
0:	learn: 2.4897702	total: 14.3ms	remaining: 7.12s
200:	learn: 0.6221121	total: 2.65s	remaining: 3.95s
400:	learn: 0.3548474	total: 5.25s	remaining: 1.3s


[I 2024-08-26 16:31:06,101] Trial 20 finished with value: 0.6104373163146105 and parameters: {'depth': 5, 'learning_rate': 0.06924587822322016, 'iterations': 500}. Best is trial 13 with value: 0.6226701378019351.


499:	learn: 0.2864854	total: 6.65s	remaining: 0us
0:	learn: 2.4929613	total: 14.3ms	remaining: 6.94s
200:	learn: 0.5605531	total: 2.81s	remaining: 3.99s
400:	learn: 0.3148217	total: 5.51s	remaining: 1.17s
485:	learn: 0.2592977	total: 6.7s	remaining: 0us
0:	learn: 2.4809035	total: 15ms	remaining: 7.29s
200:	learn: 0.5564875	total: 2.99s	remaining: 4.24s
400:	learn: 0.3033020	total: 5.75s	remaining: 1.22s
485:	learn: 0.2507085	total: 6.87s	remaining: 0us
0:	learn: 2.4839782	total: 35.3ms	remaining: 17.1s
200:	learn: 0.5843738	total: 2.86s	remaining: 4.06s
400:	learn: 0.3278414	total: 5.35s	remaining: 1.13s


[I 2024-08-26 16:31:27,699] Trial 21 finished with value: 0.6058113432724722 and parameters: {'depth': 5, 'learning_rate': 0.07472753841757156, 'iterations': 486}. Best is trial 13 with value: 0.6226701378019351.


485:	learn: 0.2721478	total: 6.5s	remaining: 0us
0:	learn: 2.5267374	total: 8.13ms	remaining: 3.71s
200:	learn: 1.0059477	total: 1.38s	remaining: 1.76s
400:	learn: 0.6632340	total: 2.76s	remaining: 386ms
456:	learn: 0.6127757	total: 3.15s	remaining: 0us
0:	learn: 2.5160915	total: 7.99ms	remaining: 3.64s
200:	learn: 1.0057019	total: 1.43s	remaining: 1.82s
400:	learn: 0.6588259	total: 2.84s	remaining: 396ms
456:	learn: 0.6077309	total: 3.21s	remaining: 0us
0:	learn: 2.5229384	total: 7.31ms	remaining: 3.33s
200:	learn: 1.0300792	total: 1.4s	remaining: 1.78s
400:	learn: 0.6858378	total: 2.73s	remaining: 382ms


[I 2024-08-26 16:31:38,676] Trial 22 finished with value: 0.6105923689327097 and parameters: {'depth': 4, 'learning_rate': 0.040059725901033724, 'iterations': 457}. Best is trial 13 with value: 0.6226701378019351.


456:	learn: 0.6373726	total: 3.1s	remaining: 0us
0:	learn: 2.5300664	total: 6.54ms	remaining: 2.96s
200:	learn: 1.0617939	total: 1.37s	remaining: 1.73s
400:	learn: 0.6965585	total: 2.57s	remaining: 340ms
453:	learn: 0.6489868	total: 2.88s	remaining: 0us
0:	learn: 2.5203289	total: 6.59ms	remaining: 2.98s
200:	learn: 1.0584646	total: 1.24s	remaining: 1.56s
400:	learn: 0.6879398	total: 2.42s	remaining: 320ms
453:	learn: 0.6366352	total: 2.75s	remaining: 0us
0:	learn: 2.5265906	total: 7.69ms	remaining: 3.48s
200:	learn: 1.0813126	total: 1.32s	remaining: 1.66s
400:	learn: 0.7229794	total: 2.5s	remaining: 330ms


[I 2024-08-26 16:31:48,469] Trial 23 finished with value: 0.6089941542942103 and parameters: {'depth': 4, 'learning_rate': 0.0365287757531698, 'iterations': 454}. Best is trial 13 with value: 0.6226701378019351.


453:	learn: 0.6761834	total: 2.83s	remaining: 0us
0:	learn: 2.5055938	total: 8.34ms	remaining: 4.09s
200:	learn: 0.7749963	total: 1.5s	remaining: 2.17s
400:	learn: 0.4994545	total: 2.85s	remaining: 647ms
491:	learn: 0.4267009	total: 3.44s	remaining: 0us
0:	learn: 2.4892813	total: 7.13ms	remaining: 3.5s
200:	learn: 0.7681328	total: 1.35s	remaining: 1.95s
400:	learn: 0.5043443	total: 2.63s	remaining: 596ms
491:	learn: 0.4327632	total: 3.23s	remaining: 0us
0:	learn: 2.4997810	total: 7.77ms	remaining: 3.81s
200:	learn: 0.7876019	total: 1.33s	remaining: 1.93s
400:	learn: 0.5247356	total: 2.69s	remaining: 611ms


[I 2024-08-26 16:31:59,979] Trial 24 finished with value: 0.6230594069668877 and parameters: {'depth': 4, 'learning_rate': 0.06269481768518484, 'iterations': 492}. Best is trial 24 with value: 0.6230594069668877.


491:	learn: 0.4522970	total: 3.3s	remaining: 0us
0:	learn: 2.5358866	total: 7.78ms	remaining: 3.38s
200:	learn: 1.1706969	total: 1.3s	remaining: 1.52s
400:	learn: 0.7801808	total: 2.62s	remaining: 229ms
435:	learn: 0.7442143	total: 2.86s	remaining: 0us
0:	learn: 2.5277471	total: 7.97ms	remaining: 3.47s
200:	learn: 1.1569764	total: 1.32s	remaining: 1.55s
400:	learn: 0.7832201	total: 2.63s	remaining: 230ms
435:	learn: 0.7426583	total: 2.87s	remaining: 0us
0:	learn: 2.5329798	total: 7.63ms	remaining: 3.32s
200:	learn: 1.1820851	total: 1.39s	remaining: 1.63s
400:	learn: 0.8016458	total: 2.68s	remaining: 234ms


[I 2024-08-26 16:32:09,979] Trial 25 finished with value: 0.6103339032616032 and parameters: {'depth': 4, 'learning_rate': 0.030375335296869702, 'iterations': 436}. Best is trial 24 with value: 0.6230594069668877.


435:	learn: 0.7667889	total: 2.91s	remaining: 0us
0:	learn: 2.5492354	total: 9.03ms	remaining: 3.88s
200:	learn: 1.4797915	total: 1.41s	remaining: 1.62s
400:	learn: 1.1260204	total: 2.74s	remaining: 205ms
430:	learn: 1.0865382	total: 2.95s	remaining: 0us
0:	learn: 2.5448036	total: 7.84ms	remaining: 3.37s
200:	learn: 1.4761528	total: 1.32s	remaining: 1.51s
400:	learn: 1.1314496	total: 2.51s	remaining: 188ms
430:	learn: 1.0896352	total: 2.68s	remaining: 0us
0:	learn: 2.5476508	total: 7.51ms	remaining: 3.23s
200:	learn: 1.4907348	total: 1.27s	remaining: 1.45s
400:	learn: 1.1405630	total: 2.5s	remaining: 187ms


[I 2024-08-26 16:32:19,598] Trial 26 finished with value: 0.544844082483312 and parameters: {'depth': 4, 'learning_rate': 0.01635387044252012, 'iterations': 431}. Best is trial 24 with value: 0.6230594069668877.


430:	learn: 1.1039843	total: 2.69s	remaining: 0us
0:	learn: 2.5094110	total: 9.34ms	remaining: 4.36s
200:	learn: 0.7984649	total: 1.39s	remaining: 1.84s
400:	learn: 0.5171844	total: 2.65s	remaining: 444ms
467:	learn: 0.4644508	total: 3.08s	remaining: 0us
0:	learn: 2.4941073	total: 7.3ms	remaining: 3.41s
200:	learn: 0.7977290	total: 1.32s	remaining: 1.75s
400:	learn: 0.5227172	total: 2.64s	remaining: 441ms
467:	learn: 0.4676601	total: 3.09s	remaining: 0us
0:	learn: 2.5039564	total: 7.2ms	remaining: 3.36s
200:	learn: 0.8152265	total: 1.31s	remaining: 1.75s
400:	learn: 0.5464446	total: 2.62s	remaining: 437ms


[I 2024-08-26 16:32:30,191] Trial 27 finished with value: 0.6160462646166712 and parameters: {'depth': 4, 'learning_rate': 0.05858051971745916, 'iterations': 468}. Best is trial 24 with value: 0.6230594069668877.


467:	learn: 0.4913081	total: 3.02s	remaining: 0us
0:	learn: 2.4990877	total: 7.07ms	remaining: 3.36s
200:	learn: 0.7239148	total: 1.21s	remaining: 1.65s
400:	learn: 0.4620775	total: 2.4s	remaining: 450ms
475:	learn: 0.4029198	total: 2.85s	remaining: 0us
0:	learn: 2.4810717	total: 7.31ms	remaining: 3.47s
200:	learn: 0.7279818	total: 1.2s	remaining: 1.64s
400:	learn: 0.4612472	total: 2.37s	remaining: 444ms
475:	learn: 0.4062456	total: 2.8s	remaining: 0us
0:	learn: 2.4926701	total: 6.34ms	remaining: 3.01s
200:	learn: 0.7460716	total: 1.25s	remaining: 1.71s
400:	learn: 0.4899285	total: 2.42s	remaining: 452ms


[I 2024-08-26 16:32:40,053] Trial 28 finished with value: 0.6132372939220648 and parameters: {'depth': 4, 'learning_rate': 0.06973756196847353, 'iterations': 476}. Best is trial 24 with value: 0.6230594069668877.


475:	learn: 0.4296687	total: 2.85s	remaining: 0us
0:	learn: 2.5578992	total: 53ms	remaining: 22.4s
200:	learn: 1.5414848	total: 12.8s	remaining: 14.2s
400:	learn: 1.1048419	total: 26.7s	remaining: 1.46s
422:	learn: 1.0704985	total: 28.2s	remaining: 0us
0:	learn: 2.5555656	total: 65.7ms	remaining: 27.7s
200:	learn: 1.5378984	total: 13.6s	remaining: 15.1s
400:	learn: 1.1050269	total: 25.2s	remaining: 1.38s
422:	learn: 1.0706052	total: 26.7s	remaining: 0us
0:	learn: 2.5567108	total: 60.4ms	remaining: 25.5s
200:	learn: 1.5415506	total: 13.5s	remaining: 14.9s
400:	learn: 1.1157785	total: 26.6s	remaining: 1.46s


[I 2024-08-26 16:34:04,683] Trial 29 finished with value: 0.5146539772502631 and parameters: {'depth': 7, 'learning_rate': 0.011750325608195302, 'iterations': 423}. Best is trial 24 with value: 0.6230594069668877.


422:	learn: 1.0808537	total: 28.1s	remaining: 0us
Best CatBoost hyperparameters:  {'depth': 4, 'learning_rate': 0.06269481768518484, 'iterations': 492}


In [58]:
import optuna
from xgboost import XGBClassifier

# Optuna objective for tuning XGBoost
def optimize_xgboost(trial):
    """Score one Optuna trial of XGBoost by 3-fold macro-F1.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``max_depth``,
            ``learning_rate``, ``subsample`` and ``colsample_bytree``.

    Returns:
        Mean of the ``f1_macro`` scores returned by 3-fold
        ``cross_val_score`` on ``train_x_shap`` / ``train_y_encoded_down``.
    """
    # Hyperparameters to search
    n_estimators = trial.suggest_int('n_estimators', 100, 600)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform; the log-scale search range is unchanged.
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.2, log=True)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)

    # Build the candidate XGBoost model
    # NOTE(review): 'logloss' is XGBoost's binary log-loss metric; for this
    # multi-class target 'mlogloss' is probably what is intended. No eval_set
    # is passed here, so the metric looks unused during fitting -- confirm.
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        eval_metric='logloss',
        random_state=42
    )

    # Evaluate the candidate with cross-validation (F1 macro)
    score = cross_val_score(model, train_x_shap, train_y_encoded_down, cv=3, scoring='f1_macro').mean()
    return score

# Create the Optuna study and run the search.
# The sampler is seeded so the sequence of sampled hyperparameters is
# repeatable across runs, matching the fixed random_state=42 used for the
# models themselves.
xgboost_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgboost_study.optimize(optimize_xgboost, n_trials=50)

# Read the best hyperparameters once and print them
best_xgboost_params = xgboost_study.best_params
print("Best XGBoost hyperparameters: ", best_xgboost_params)

# Build the XGBoost model with the best hyperparameters (not fitted here)
best_xgboost_model = XGBClassifier(
    n_estimators=best_xgboost_params['n_estimators'],
    max_depth=best_xgboost_params['max_depth'],
    learning_rate=best_xgboost_params['learning_rate'],
    subsample=best_xgboost_params['subsample'],
    colsample_bytree=best_xgboost_params['colsample_bytree'],
    eval_metric='logloss',
    random_state=42
)


[I 2024-08-26 16:36:17,132] A new study created in memory with name: no-name-e9746f45-3fe7-4bdd-b1ff-fa9155cfd486
[I 2024-08-26 16:36:31,702] Trial 0 finished with value: 0.6370618035576044 and parameters: {'n_estimators': 424, 'max_depth': 6, 'learning_rate': 0.0012612494605261661, 'subsample': 0.6037809566339694, 'colsample_bytree': 0.9259811083204342}. Best is trial 0 with value: 0.6370618035576044.
[I 2024-08-26 16:36:47,488] Trial 1 finished with value: 0.6539656219923217 and parameters: {'n_estimators': 451, 'max_depth': 10, 'learning_rate': 0.019108407187565936, 'subsample': 0.7444836587653201, 'colsample_bytree': 0.6258687545041202}. Best is trial 1 with value: 0.6539656219923217.
[I 2024-08-26 16:36:56,067] Trial 2 finished with value: 0.6246700493139242 and parameters: {'n_estimators': 257, 'max_depth': 8, 'learning_rate': 0.011984571590995398, 'subsample': 0.5207604927837892, 'colsample_bytree': 0.5803739281412161}. Best is trial 1 with value: 0.6539656219923217.
[I 2024-08-

Best XGBoost hyperparameters:  {'n_estimators': 563, 'max_depth': 8, 'learning_rate': 0.023580377486484377, 'subsample': 0.9568332869281627, 'colsample_bytree': 0.9602188779018576}


In [59]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier

# Random forest with default hyperparameters, used as the third ensemble member.
randomforest_model = RandomForestClassifier(random_state=42)

# (name, estimator) pairs combined by the ensemble: the CatBoost and XGBoost
# models built from the Optuna best parameters, plus the random forest.
ensemble_members = [
    ('catboost', best_catboost_model),
    ('xgboost', best_xgboost_model),
    ('randomforest', randomforest_model),
]

# Soft-voting ensemble over the three members.
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# Fit the ensemble on the selected features; the bare expression is kept so
# the notebook still displays the fitted estimator.
voting_clf.fit(train_x_shap, train_y_encoded_down)


In [60]:
# Predict class labels for the test features with the fitted voting ensemble
predictions = voting_clf.predict(test_x_shap)
# Map the predictions back through the le_subclass encoder to recover the
# label values that are written to the Fraud_Type column of the submission.
predictions_label = le_subclass.inverse_transform(predictions)

# Submission

In [61]:
# Classification submission DataFrame: load the sample submission as a
# template and fill its Fraud_Type column with the predicted labels.
# The classification result file must be named clf_submission.csv.
clf_submission = pd.read_csv("sample_submission.csv").assign(
    Fraud_Type=predictions_label
)
clf_submission.head()

Unnamed: 0,ID,Fraud_Type
0,TEST_000000,b
1,TEST_000001,m
2,TEST_000002,m
3,TEST_000003,m
4,TEST_000004,h


In [None]:
# 합성 데이터 생성 결과 제출 데이터프레임(DataFrame)
# 합성 데이터 생성 결과 데이터프레임 파일명을 반드시 syn_submission.csv 로 지정해야합니다.
# all_synthetic_data.head()

In [62]:
# (*) Double-check each file name before saving.
#     1. Classification prediction DataFrame file name = clf_submission.csv
#     2. Synthetic data DataFrame file name            = syn_submission.csv
#
# (*) The submission zip must contain both DataFrames under exactly these names.
# (*) Submissions whose file names do not match cannot be scored.
from datetime import datetime

today_datetime = datetime.today().strftime('%y%m%d_%H%M')

# Work from the project root. Everything below uses explicit paths instead of
# a second chdir into ./submission, so a failure part-way through cannot leave
# the notebook in the wrong working directory.
project_dir = 'G:/내 드라이브/DACON_proj/DACON/2024_FSI_AIxData_Challenge'
os.chdir(project_dir)

# Create the output folder if needed
submission_dir = os.path.join(project_dir, 'submission')
os.makedirs(submission_dir, exist_ok=True)

clf_path = os.path.join(submission_dir, 'clf_submission.csv')
syn_path = os.path.join(submission_dir, 'syn_submission.csv')
zip_path = os.path.join(submission_dir, f'submission_{today_datetime}.zip')

# Save the classification predictions as CSV
clf_submission.to_csv(clf_path, encoding='UTF-8-sig', index=False)
# all_synthetic_data.to_csv(syn_path, encoding='UTF-8-sig', index=False)

# The synthetic-data export above is disabled, so syn_submission.csv must
# already exist on disk. Check before opening the archive so a missing file
# does not leave a partial zip behind.
if not os.path.isfile(syn_path):
    raise FileNotFoundError(
        f"{syn_path} not found: generate syn_submission.csv before building "
        "the submission zip."
    )

# Create the zip; arcname keeps both entries at the archive root under the
# exact file names the scorer requires.
with zipfile.ZipFile(zip_path, 'w') as submission:
    submission.write(clf_path, arcname='clf_submission.csv')
    submission.write(syn_path, arcname='syn_submission.csv')

print('Done.')

Done.


In [None]:
# Audible notification for the end of the run.
# winsound ships only with Windows builds of Python, so guard the import to
# keep this cell from raising on other platforms.
try:
    import winsound
except ImportError:
    winsound = None

# Tone settings (frequency in hertz, duration in milliseconds)
frequency = 1000  # frequency (Hertz)
duration = 300    # duration (Milliseconds)

# Play the sound
if winsound is not None:
    winsound.Beep(frequency, duration)
else:
    # Best-effort fallback: emit the ASCII bell character.
    print('\a', end='', flush=True)