# Anomalous Financial Transaction Detection

본 대회의 과제는 금융 거래 데이터에서 **이상 거래를 탐지하는 기능**을 개선하고 활용도를 높이는 분류 AI모델을 개발하는 것입니다. 

특히, 클래스 불균형 문제를 해결하기 위해 오픈소스 생성형 AI 모델을 활용하여 부족한 클래스의 데이터를 보완하고, 이를 통해 분류 모델의 성능을 향상시키는 것이 핵심 목표입니다. 

이러한 접근을 통해 금융보안에 특화된 데이터 분석 및 활용 역량을 강화하여 전문 인력을 양성하고, 금융권의 AI 활용 어려움에 따른 해결 방안을 함께 모색하며 금융 산업의 AI 활용 활성화를 지원하는 것을 목표로 합니다.

# Import Library

In [1]:
# pip install sdv

In [1]:
# 제출 파일 생성 관련
import os
import zipfile

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# 머신러닝 전처리
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# 머신러닝 모델
import xgboost as xgb

# 합성 데이터 생성
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

# 생성 🏭

# Load Data

In [2]:
# Load the train and test tables from CSV files in the current working directory.
train_all = pd.read_csv("train.csv")
test_all = pd.read_csv("test.csv")

In [3]:
# Working copy of the training table without the "ID" column.
train = train_all.drop(columns="ID")

In [4]:
# Display the (rows, columns) shape of the working training table.
train.shape

(120000, 63)

In [5]:
'''
(*) 리더보드 산식 중 생성데이터의 익명성(TCAP)채점을 위해 각 클래스 별로 1000개의 생성데이터가 반드시 필요합니다.
(*) 본 베이스 라인에서는 "Fraud_Type" 13종류에 대해 1000개씩 , 총 13,000개의 데이터를 생성할 예정입니다.
(*) 분류 모델 성능 개선을 위해 생성 데이터를 활용하는 것에는 생성 데이터의 Row 개수에 제한이 없습니다. 단, 리더보드 평가를 위해 제출을 하는 생성 데이터 프레임은 익명성(TCAP) 평가를 위함이며, 위의 조건을 갖춘 생성 데이터를 제출해야합니다.
'''
# Rows generated per Fraud_Type for the submission set (1000 per class, see the
# note above); only referenced by the commented-out submission generation code.
N_CLS_PER_GEN = 1000
# Rows sampled per Fraud_Type from each CTGAN synthesizer in the
# training-augmentation loop.
N_CLS_PER_GEN_2 = 5

In [7]:
# pd.set_option('display.max_rows', None)  # 모든 행 표시
# cond_all = pd.read_excel("데이터_명세_및_생성조건.xlsx", header=1)
# cond_all.iloc[:,2:-1]

## 제출용 합성데이터

In [8]:
# from ctgan import CTGAN 
# import pandas as pd
# from tqdm import tqdm
# import numpy as np
# from scipy import stats
# from sdv.metadata import SingleTableMetadata

# # 이상치 처리 함수
# def handle_outliers(series, n_std=3):
#     mean = series.mean()
#     std = series.std()
#     z_scores = np.abs(stats.zscore(series))
#     return series.mask(z_scores > n_std, mean)

# # 범주형 데이터 조건 강제 함수
# def enforce_categorical_conditions(df):
#     # Customer_Gender: 'male', 'female'만 허용
#     df['Customer_Gender'] = df['Customer_Gender'].apply(lambda x: 'male' if x == 'male' else 'female')
    
#     # Customer_credit_rating: 'S', 'A', 'B', 'C', 'D', 'E'만 허용
#     valid_ratings = ['S', 'A', 'B', 'C', 'D', 'E']
#     df['Customer_credit_rating'] = df['Customer_credit_rating'].apply(lambda x: x if x in valid_ratings else 'B')
    
#     # Customer_loan_type: 'a', 'b', 'c', 'd', 'e'만 허용
#     valid_loan_types = ['a', 'b', 'c', 'd', 'e']
#     df['Customer_loan_type'] = df['Customer_loan_type'].apply(lambda x: x if x in valid_loan_types else 'c')
    
#     # 기타 범주형 변수들 0, 1만 허용
#     binary_columns = ['Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
#                       'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4',
#                       'Customer_rooting_jailbreak_indicator', 'Customer_mobile_roaming_indicator', 
#                       'Customer_VPN_Indicator', 'Customer_flag_terminal_malicious_behavior_1',
#                       'Customer_flag_terminal_malicious_behavior_2', 'Customer_flag_terminal_malicious_behavior_3',
#                       'Customer_flag_terminal_malicious_behavior_4', 'Customer_flag_terminal_malicious_behavior_5',
#                       'Customer_flag_terminal_malicious_behavior_6', 'Customer_inquery_atm_limit',
#                       'Customer_increase_atm_limit', 'Account_indicator_release_limit_excess',
#                       'Account_indicator_Openbanking', 'Account_release_suspention', 'Transaction_Failure_Status',
#                       'Another_Person_Account', 'Unused_terminal_status', 'Flag_deposit_more_than_tenMillion',
#                       'Unused_account_status', 'Recipient_account_suspend_status', 'First_time_iOS_by_vulnerable_user']
#     for col in binary_columns:
#         df[col] = df[col].apply(lambda x: 1 if x == 1 else 0)
    
#     return df

# # 수치형 데이터 조건 강제 함수
# def enforce_numerical_conditions(df):
#     # Customer_Birthyear: 1950 ~ 2004 범위 제한
#     df['Customer_Birthyear'] = df['Customer_Birthyear'].clip(1950, 2004)
    
#     # Account_initial_balance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Account_initial_balance'] = df['Account_initial_balance'].clip(lower=0)
    
#     # Account_balance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Account_balance'] = df['Account_balance'].clip(lower=0)
    
#     # Account_amount_daily_limit: 0 이상의 값으로 설정 (예시로 최대값도 설정 가능)
#     df['Account_amount_daily_limit'] = df['Account_amount_daily_limit'].clip(lower=0)
    
#     # Account_remaining_amount_daily_limit_exceeded: 0 이상의 값으로 설정
#     df['Account_remaining_amount_daily_limit_exceeded'] = df['Account_remaining_amount_daily_limit_exceeded'].clip(lower=0)
    
#     # Account_one_month_max_amount, Account_dawn_one_month_max_amount: 음수일 수 있으므로, 필요시 범위 설정
#     # 이 항목들은 특정 조건이 있다면 적용
#     # 예: 최댓값을 특정 범위로 제한
#     df['Account_one_month_max_amount'] = df['Account_one_month_max_amount'].clip(lower=-1000000, upper=1000000)
#     df['Account_dawn_one_month_max_amount'] = df['Account_dawn_one_month_max_amount'].clip(lower=-1000000, upper=1000000)
    
#     # Account_one_month_std_dev, Account_dawn_one_month_std_dev: 표준편차는 음수가 될 수 없으므로 0으로 제한
#     df['Account_one_month_std_dev'] = df['Account_one_month_std_dev'].clip(lower=0)
#     df['Account_dawn_one_month_std_dev'] = df['Account_dawn_one_month_std_dev'].clip(lower=0)
    
#     # Transaction_Amount: 특정 범위 내로 설정 (예: 음수도 허용, 최대값 제한)
#     df['Transaction_Amount'] = df['Transaction_Amount'].clip(lower=-100000000, upper=100000000)
    
#     # Distance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Distance'] = df['Distance'].clip(lower=0)
    
#     # Number_of_transaction_with_the_account, Transaction_history_with_the_account: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Number_of_transaction_with_the_account'] = df['Number_of_transaction_with_the_account'].clip(lower=0)
#     df['Transaction_history_with_the_account'] = df['Transaction_history_with_the_account'].clip(lower=0)
    
#     return df

# # Time_difference 컬럼을 총 초로 변환 및 이상치 처리
# train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()
# train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

# # 모든 Fraud_Type 목록 생성
# fraud_types = train['Fraud_Type'].unique()  # Fraud_Type로 변경

# # 모든 합성 데이터를 저장할 DataFrame 초기화
# all_synthetic_data = pd.DataFrame()

# N_SAMPLE = 100

# # 각 Fraud_Type에 대해 합성 데이터 생성 및 저장
# for fraud_type in tqdm(fraud_types):
    
#     # 해당 Fraud_Type에 대한 서브셋 생성
#     subset = train[train["Fraud_Type"] == fraud_type]

#     # 모든 Fraud_Type에 대해 100개씩 샘플링
#     subset = subset.sample(n=N_SAMPLE, random_state=42)
    
#     # Time_difference 열 제외 (초 단위로 변환된 컬럼만 사용)
#     subset = subset.drop('Time_difference', axis=1)
    
#     # 메타데이터 생성 및 모델 학습
#     metadata = SingleTableMetadata()

#     metadata.detect_from_dataframe(subset)
#     metadata.set_primary_key(None)

#     # 데이터 타입 설정
#     column_sdtypes = {
#         'Customer_Birthyear': 'numerical',
#         # 'Customer_Gender': 'categorical',
#         'Customer_personal_identifier': 'categorical',
#         'Customer_identification_number': 'categorical',
#         # 'Customer_registration_datetime': 'datetime',
#         'Customer_credit_rating': 'categorical',
#         # 'Customer_flag_change_of_authentication_1': 'categorical',
#         # 'Customer_flag_change_of_authentication_2': 'categorical',
#         # 'Customer_flag_change_of_authentication_3': 'categorical',
#         # 'Customer_flag_change_of_authentication_4': 'categorical',
#         # 'Customer_rooting_jailbreak_indicator': 'categorical',
#         # 'Customer_mobile_roaming_indicator': 'categorical',
#         # 'Customer_VPN_Indicator': 'categorical',
#         # 'Customer_loan_type': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_1': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_2': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_3': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_4': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_5': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_6': 'categorical',
#         # 'Customer_inquery_atm_limit': 'categorical',
#         # 'Customer_increase_atm_limit': 'categorical',
#         'Account_account_number': 'categorical',
#         # 'Account_account_type': 'categorical',
#         # 'Account_creation_datetime': 'datetime',
#         'Account_initial_balance': 'numerical',
#         'Account_balance': 'numerical',
#         # 'Account_indicator_release_limit_excess': 'categorical',
#         'Account_amount_daily_limit': 'numerical',
#         'Account_indicator_Openbanking': 'categorical',
#         'Account_remaining_amount_daily_limit_exceeded': 'numerical',
#         # 'Account_release_suspention': 'categorical',
#         'Account_one_month_max_amount': 'numerical',
#         'Account_one_month_std_dev': 'numerical',
#         'Account_dawn_one_month_max_amount': 'numerical',
#         'Account_dawn_one_month_std_dev': 'numerical',
#         # 'Transaction_Datetime': 'datetime',
#         'Transaction_Amount': 'numerical',
#         # 'Channel': 'categorical',
#         # 'Operating_System': 'categorical',
#         # 'Error_Code': 'categorical',
#         # 'Transaction_Failure_Status': 'categorical',
#         # 'Type_General_Automatic': 'categorical',
#         'IP_Address': 'ipv4_address',
#         # 'Access_Medium': 'categorical',
#         'Location': 'categorical',
#         'Recipient_Account_Number': 'categorical',
#         'Transaction_num_connection_failure': 'numerical',
#         # 'Another_Person_Account': 'categorical',
#         'Distance': 'numerical',
#         'Time_difference_seconds': 'numerical',
#         # 'Unused_terminal_status': 'categorical',
#         # 'Last_atm_transaction_datetime': 'datetime',
#         # 'Last_bank_branch_transaction_datetime': 'datetime',
#         # 'Flag_deposit_more_than_tenMillion': 'categorical',
#         # 'Unused_account_status': 'categorical',
#         # 'Recipient_account_suspend_status': 'categorical',
#         'Number_of_transaction_with_the_account': 'numerical',
#         'Transaction_history_with_the_account': 'numerical',
#         # 'First_time_iOS_by_vulnerable_user': 'categorical',
#         # 'Transaction_resumed_date': 'datetime',
#         'Fraud_Type': 'categorical'
#     }

#     # 각 컬럼에 대해 데이터 타입 설정
#     for column, sdtype in column_sdtypes.items():
#         metadata.update_column(
#             column_name=column,
#             sdtype=sdtype
#         )
        
#     synthesizer = CTGANSynthesizer(
#                             metadata,
#                             epochs= 2000
#                         )
#     synthesizer.fit(subset)

#     synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)  # 합성 데이터 생성 수 설정
    
#     # 생성된 Time_difference_seconds의 이상치 처리
#     synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])
    
#     # Time_difference_seconds를 다시 timedelta로 변환
#     synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    
#     # Time_difference_seconds 컬럼 제거
#     synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)
    
#     # 생성 조건 반영 (범주형, 수치형, 형식 조건)
#     synthetic_subset = enforce_categorical_conditions(synthetic_subset)
#     synthetic_subset = enforce_numerical_conditions(synthetic_subset)
    
#     # 생성된 데이터를 all_synthetic_data에 추가
#     all_synthetic_data = pd.concat([all_synthetic_data, synthetic_subset], ignore_index=True)

# # 최종 결과 확인
# print("\nFinal All Synthetic Data Shape:", all_synthetic_data.shape)


In [9]:
# all_synthetic_data.to_csv('submission/syn_submission.csv', encoding='UTF-8-sig', index=False)

## 성능용 합성데이터

CTGAN

In [6]:
from ctgan import CTGAN 
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
from sdv.metadata import SingleTableMetadata

# Outlier handling
def handle_outliers(series, n_std=3):
    """Replace values lying more than ``n_std`` standard deviations from the mean.

    Z-scores use the population standard deviation (``ddof=0``), which is the
    default of ``scipy.stats.zscore`` that this function previously relied on.
    Missing values are skipped when computing the mean and standard deviation,
    so a single NaN no longer turns every z-score into NaN and disables the
    masking.

    Args:
        series: Numeric pandas Series.
        n_std: Z-score magnitude above which a value counts as an outlier.

    Returns:
        A new Series in which outliers are replaced by the mean of ``series``
        (the mean is computed with the outliers included). NaN entries stay NaN.
        The input is not modified.
    """
    mean = series.mean()
    std = series.std(ddof=0)

    # Zero spread (constant / single-element input) or undefined spread
    # (empty / all-NaN input): no value can be an outlier.
    if not std > 0:
        return series.copy()

    z_scores = (series - mean).abs() / std
    return series.mask(z_scores > n_std, mean)

# Force categorical columns onto their allowed value sets
def enforce_categorical_conditions(df):
    """Coerce the categorical columns of ``df`` to their permitted values.

    The frame is modified in place and also returned.

    * ``Customer_Gender``: anything other than 'male' becomes 'female'.
    * ``Customer_credit_rating``: values outside S/A/B/C/D/E become 'B'.
    * ``Customer_loan_type``: values outside a/b/c/d/e become 'c'.
    * Binary flag columns: values equal to 1 stay 1, everything else becomes 0.
    """
    def _keep_or_default(allowed, fallback):
        # Build an element-wise mapper: keep allowed values, replace the rest.
        return lambda value: value if value in allowed else fallback

    def _to_gender(value):
        return 'male' if value == 'male' else 'female'

    def _to_flag(value):
        return 1 if value == 1 else 0

    df['Customer_Gender'] = df['Customer_Gender'].apply(_to_gender)

    rating_mapper = _keep_or_default(('S', 'A', 'B', 'C', 'D', 'E'), 'B')
    df['Customer_credit_rating'] = df['Customer_credit_rating'].apply(rating_mapper)

    loan_mapper = _keep_or_default(('a', 'b', 'c', 'd', 'e'), 'c')
    df['Customer_loan_type'] = df['Customer_loan_type'].apply(loan_mapper)

    # Columns that may only hold 0 or 1.
    flag_columns = (
        'Customer_flag_change_of_authentication_1',
        'Customer_flag_change_of_authentication_2',
        'Customer_flag_change_of_authentication_3',
        'Customer_flag_change_of_authentication_4',
        'Customer_rooting_jailbreak_indicator',
        'Customer_mobile_roaming_indicator',
        'Customer_VPN_Indicator',
        'Customer_flag_terminal_malicious_behavior_1',
        'Customer_flag_terminal_malicious_behavior_2',
        'Customer_flag_terminal_malicious_behavior_3',
        'Customer_flag_terminal_malicious_behavior_4',
        'Customer_flag_terminal_malicious_behavior_5',
        'Customer_flag_terminal_malicious_behavior_6',
        'Customer_inquery_atm_limit',
        'Customer_increase_atm_limit',
        'Account_indicator_release_limit_excess',
        'Account_indicator_Openbanking',
        'Account_release_suspention',
        'Transaction_Failure_Status',
        'Another_Person_Account',
        'Unused_terminal_status',
        'Flag_deposit_more_than_tenMillion',
        'Unused_account_status',
        'Recipient_account_suspend_status',
        'First_time_iOS_by_vulnerable_user',
    )
    for flag_column in flag_columns:
        df[flag_column] = df[flag_column].apply(_to_flag)

    return df

# Force numerical columns into their allowed ranges
def enforce_numerical_conditions(df):
    """Clip the numerical columns of ``df`` to fixed ranges.

    The frame is modified in place and also returned. Each column listed in
    the table below is clipped to ``(lower, upper)``; ``None`` leaves that
    side unbounded.
    """
    clip_bounds = {
        # Birth year restricted to 1950-2004.
        'Customer_Birthyear': (1950, 2004),
        # Balances and limits are floored at zero.
        'Account_initial_balance': (0, None),
        'Account_balance': (0, None),
        'Account_amount_daily_limit': (0, None),
        'Account_remaining_amount_daily_limit_exceeded': (0, None),
        # Monthly maxima may be negative; limited to +/- 1,000,000.
        'Account_one_month_max_amount': (-1000000, 1000000),
        'Account_dawn_one_month_max_amount': (-1000000, 1000000),
        # Standard deviations are floored at zero.
        'Account_one_month_std_dev': (0, None),
        'Account_dawn_one_month_std_dev': (0, None),
        # Transaction amount may be negative; limited to +/- 100,000,000.
        'Transaction_Amount': (-100000000, 100000000),
        # Distance and transaction counters are floored at zero.
        'Distance': (0, None),
        'Number_of_transaction_with_the_account': (0, None),
        'Transaction_history_with_the_account': (0, None),
    }

    for column_name, (low, high) in clip_bounds.items():
        df[column_name] = df[column_name].clip(lower=low, upper=high)

    return df



# Convert Time_difference to total seconds and replace outliers with the mean.
train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()
train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

# Every distinct Fraud_Type label; one synthesizer is trained per label.
fraud_types = train['Fraud_Type'].unique()

# Holds the combined synthetic rows; filled after the loop.
all_synthetic_data_ctgan = pd.DataFrame()

# Maximum number of real rows per class used to fit each synthesizer.
N_SAMPLE = 100

# Explicit SDV sdtypes. This mapping does not depend on the class, so it is
# built once instead of on every iteration. Columns not listed here keep the
# sdtype chosen by metadata.detect_from_dataframe().
column_sdtypes = {
    'Customer_Birthyear': 'numerical',
    'Customer_personal_identifier': 'categorical',
    'Customer_identification_number': 'categorical',
    'Customer_credit_rating': 'categorical',
    'Account_account_number': 'categorical',
    'Account_initial_balance': 'numerical',
    'Account_balance': 'numerical',
    'Account_amount_daily_limit': 'numerical',
    'Account_indicator_Openbanking': 'categorical',
    'Account_remaining_amount_daily_limit_exceeded': 'numerical',
    'Account_one_month_max_amount': 'numerical',
    'Account_one_month_std_dev': 'numerical',
    'Account_dawn_one_month_max_amount': 'numerical',
    'Account_dawn_one_month_std_dev': 'numerical',
    'Transaction_Amount': 'numerical',
    'IP_Address': 'ipv4_address',
    'Location': 'categorical',
    'Recipient_Account_Number': 'categorical',
    'Transaction_num_connection_failure': 'numerical',
    'Distance': 'numerical',
    'Time_difference_seconds': 'numerical',
    'Number_of_transaction_with_the_account': 'numerical',
    'Transaction_history_with_the_account': 'numerical',
    'Fraud_Type': 'categorical',
}

# Per-class synthetic frames are collected here and concatenated once at the
# end instead of re-copying the growing frame on every iteration.
synthetic_parts = []

# Train one CTGAN per Fraud_Type and sample synthetic rows from it.
for fraud_type in tqdm(fraud_types):

    # Real rows belonging to this class.
    subset = train[train["Fraud_Type"] == fraud_type]

    # Use at most N_SAMPLE rows; capping at the class size keeps sample()
    # from raising ValueError when a class has fewer than N_SAMPLE rows.
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # Only the seconds representation of Time_difference is modelled.
    subset = subset.drop('Time_difference', axis=1)

    # Detect metadata from the subset, then override selected column types.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)
    metadata.set_primary_key(None)
    for column, sdtype in column_sdtypes.items():
        metadata.update_column(
            column_name=column,
            sdtype=sdtype
        )

    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=300
    )
    synthesizer.fit(subset)

    # Number of synthetic rows drawn per class.
    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN_2)

    # Replace outliers in the generated seconds column.
    # NOTE(review): with only N_CLS_PER_GEN_2 = 5 rows the largest attainable
    # population z-score is sqrt(5 - 1) = 2, below the default threshold of 3,
    # so this call looks like a no-op at the current sample size - confirm.
    synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])

    # Convert the seconds back to a timedelta column and drop the helper column.
    synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)

    # Apply the generation constraints (categorical values, numerical ranges).
    synthetic_subset = enforce_categorical_conditions(synthetic_subset)
    synthetic_subset = enforce_numerical_conditions(synthetic_subset)

    synthetic_parts.append(synthetic_subset)

# Combine all per-class frames (stays an empty frame when there were no classes).
if synthetic_parts:
    all_synthetic_data_ctgan = pd.concat(synthetic_parts, ignore_index=True)

# Report the final shape.
print("\nFinal All Synthetic Data ctgan Shape:", all_synthetic_data_ctgan.shape)


100%|██████████| 13/13 [15:36<00:00, 72.06s/it]


Final All Synthetic Data ctgan Shape: (65, 63)





## 원본 데이터와 concat

In [57]:
# Original training table (without "ID") followed by the CTGAN-generated rows.
origin_train = train_all.drop(columns="ID")
# ignore_index=True gives the combined table a fresh 0..n-1 index. Both inputs
# are numbered from 0, so without it the synthetic rows would share index
# labels with the first original rows and label-based lookups would be ambiguous.
train_total = pd.concat([origin_train, all_synthetic_data_ctgan], ignore_index=True)
train_total.shape

(120065, 63)

# Data Preprocessing 1 : Select x, y

In [58]:
# Split the combined training table into target and predictors; the test
# table only loses its "ID" column.
train_y = train_total['Fraud_Type']
train_x = train_total.drop('Fraud_Type', axis=1)

test_x = test_all.drop('ID', axis=1)

# Data Preprocessing 2 : 범주형 변수 인코딩

In [59]:
# Fit a label encoder on the Fraud_Type values and keep the integer-coded target.
le_subclass = LabelEncoder()
train_y_encoded = le_subclass.fit_transform(train_y)

# Print the mapping from each original label to its integer code.
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

원래 레이블: a, 변환된 숫자: 0
원래 레이블: b, 변환된 숫자: 1
원래 레이블: c, 변환된 숫자: 2
원래 레이블: d, 변환된 숫자: 3
원래 레이블: e, 변환된 숫자: 4
원래 레이블: f, 변환된 숫자: 5
원래 레이블: g, 변환된 숫자: 6
원래 레이블: h, 변환된 숫자: 7
원래 레이블: i, 변환된 숫자: 8
원래 레이블: j, 변환된 숫자: 9
원래 레이블: k, 변환된 숫자: 10
원래 레이블: l, 변환된 숫자: 11
원래 레이블: m, 변환된 숫자: 12


In [60]:
# train_x
# Cast 'Time_difference' to string so it is picked up as an object column below.
# NOTE(review): the synthetic rows carry Timedelta values (created with
# pd.to_timedelta during generation) while the CSV rows are presumably plain
# strings, so the two sources may stringify differently - confirm.
train_x['Time_difference'] = train_x['Time_difference'].astype(str)

# Every object/category column is treated as categorical and ordinal-encoded;
# categories not seen during fitting are encoded as -1.
categorical_columns = train_x.select_dtypes(include=['object', 'category']).columns
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit the encoder on the training predictors and write the codes into a copy.
train_x_encoded = train_x.copy()
train_x_encoded[categorical_columns] = ordinal_encoder.fit_transform(train_x[categorical_columns])

In [61]:
# Row-wise sum of the four authentication-change flags.
train_x_encoded['Customer_Total_Authentication_Changes'] = train_x_encoded[[
    'Customer_flag_change_of_authentication_1',
    'Customer_flag_change_of_authentication_2',
    'Customer_flag_change_of_authentication_3',
    'Customer_flag_change_of_authentication_4',
]].sum(axis='columns')

# Row-wise sum of the six terminal malicious-behaviour flags.
train_x_encoded['Customer_Total_Malicious_Behavior_Flags'] = train_x_encoded[[
    'Customer_flag_terminal_malicious_behavior_1',
    'Customer_flag_terminal_malicious_behavior_2',
    'Customer_flag_terminal_malicious_behavior_3',
    'Customer_flag_terminal_malicious_behavior_4',
    'Customer_flag_terminal_malicious_behavior_5',
    'Customer_flag_terminal_malicious_behavior_6',
]].sum(axis='columns')

# Ratio features; a zero denominator yields +/-inf, which is converted to NaN
# and imputed at the end of this cell.
train_x_encoded['Daily_Usage_Ratio'] = train_x_encoded['Account_amount_daily_limit'].div(
    train_x_encoded['Account_remaining_amount_daily_limit_exceeded']
)
train_x_encoded['Transaction_Amount_per_Distance'] = train_x_encoded['Transaction_Amount'].div(
    train_x_encoded['Distance']
)

# Interaction and magnitude features.
train_x_encoded['Channel_Distance_Interaction'] = train_x_encoded['Channel'].mul(train_x_encoded['Distance'])
train_x_encoded['Abs_Transaction_Amount'] = train_x_encoded['Transaction_Amount'].abs()
train_x_encoded['Transaction_Amount_per_Transaction_Count'] = train_x_encoded['Transaction_Amount'].div(
    train_x_encoded['Number_of_transaction_with_the_account'].add(1)
)
train_x_encoded['Channel_Transaction_Count_Interaction'] = train_x_encoded['Channel'].mul(
    train_x_encoded['Number_of_transaction_with_the_account']
)
train_x_encoded['Flag_Transaction_Interaction'] = train_x_encoded['Flag_deposit_more_than_tenMillion'].mul(
    train_x_encoded['Transaction_Amount']
)

# NOTE(review): .mean() is a single scalar, so this column holds the same value
# in every row and carries no per-row signal - confirm whether a grouped
# failure rate was intended.
train_x_encoded['Transaction_Failure_Rate'] = train_x_encoded['Transaction_Failure_Status'].mean()

# 1 where the ATM-limit-increase column is positive, else 0.
train_x_encoded['ATM_Limit_Increased'] = train_x_encoded['Customer_increase_atm_limit'].gt(0).astype(int)

# train_x_encoded['Transaction_Weekday'] = pd.to_datetime(train_x_encoded['Transaction_Datetime']).dt.weekday

# Turn infinities into NaN, then impute every NaN with its column mean.
train_x_encoded = train_x_encoded.replace([np.inf, -np.inf], np.nan)
train_x_encoded = train_x_encoded.fillna(train_x_encoded.mean())


In [62]:
# Remember the training column order so the test frame can be aligned to it.
feature_order = list(train_x_encoded.columns)

### test

In [63]:
# Encode the test predictors with the encoder fitted on the training data;
# categories not seen during fitting become -1 (the encoder's unknown_value).
test_x_encoded = test_x.copy()
test_x_encoded[categorical_columns] = ordinal_encoder.transform(test_x[categorical_columns])

In [64]:
# Derived features for the test set. Each feature mirrors the column of the
# same name built for the training frame.

# Row-wise sum of the four authentication-change flags.
test_x_encoded['Customer_Total_Authentication_Changes'] = test_x_encoded[
    ['Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
     'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4']
].sum(axis=1)

# Row-wise sum of the six terminal malicious-behaviour flags.
test_x_encoded['Customer_Total_Malicious_Behavior_Flags'] = test_x_encoded[
    ['Customer_flag_terminal_malicious_behavior_1', 'Customer_flag_terminal_malicious_behavior_2',
     'Customer_flag_terminal_malicious_behavior_3', 'Customer_flag_terminal_malicious_behavior_4',
     'Customer_flag_terminal_malicious_behavior_5', 'Customer_flag_terminal_malicious_behavior_6']
].sum(axis=1)

# Ratio features; a zero denominator yields +/-inf, which is converted to NaN
# and imputed at the end of this cell.
test_x_encoded['Daily_Usage_Ratio'] = test_x_encoded['Account_amount_daily_limit'] / test_x_encoded['Account_remaining_amount_daily_limit_exceeded']

test_x_encoded['Transaction_Amount_per_Distance'] = test_x_encoded['Transaction_Amount'] / test_x_encoded['Distance']
test_x_encoded['Channel_Distance_Interaction'] = test_x_encoded['Channel'] * test_x_encoded['Distance']
test_x_encoded['Abs_Transaction_Amount'] = test_x_encoded['Transaction_Amount'].abs()
test_x_encoded['Transaction_Amount_per_Transaction_Count'] = test_x_encoded['Transaction_Amount'] / (test_x_encoded['Number_of_transaction_with_the_account'] + 1)

test_x_encoded['Channel_Transaction_Count_Interaction'] = test_x_encoded['Channel'] * test_x_encoded['Number_of_transaction_with_the_account']
test_x_encoded['Flag_Transaction_Interaction'] = test_x_encoded['Flag_deposit_more_than_tenMillion'] * test_x_encoded['Transaction_Amount']

# This feature is one constant for the whole frame. Use the constant derived
# from the training data so the test frame holds the value the model was
# fitted with, rather than a second constant computed from the test set.
test_x_encoded['Transaction_Failure_Rate'] = train_x_encoded['Transaction_Failure_Status'].mean()

# 1 where the ATM-limit-increase column is positive, else 0.
test_x_encoded['ATM_Limit_Increased'] = (test_x_encoded['Customer_increase_atm_limit'] > 0).astype(int)

# test_x_encoded['Transaction_Weekday'] = pd.to_datetime(test_x_encoded['Transaction_Datetime']).dt.weekday

# Turn infinities into NaN, then impute with the TRAINING column means: test
# data must be preprocessed with statistics learned from the training data,
# not with statistics of the test set itself. fillna aligns on column names
# and ignores training columns that the test frame does not have.
test_x_encoded.replace([np.inf, -np.inf], np.nan, inplace=True)
test_x_encoded.fillna(train_x_encoded.mean(), inplace=True)

In [65]:
# Put the test columns in the training order, then cast every column to the
# dtype of the matching training column in a single astype call.
test_x_encoded = test_x_encoded[feature_order]
# test_x_encoded = test_x_encoded[selected_features]
test_x_encoded = test_x_encoded.astype(
    {name: train_x_encoded[name].dtype for name in feature_order}
)

In [66]:
# Display the (rows, columns) shape of the engineered training features.
train_x_encoded.shape

(120065, 73)

In [67]:
# Display the (rows, columns) shape of the engineered test features.
test_x_encoded.shape

(120000, 73)

In [68]:
# Attach the integer-coded target so rows can be sampled per class below.
train_x_encoded['Fraud_Type'] = train_y_encoded

In [69]:
# Fraction of rows to keep per encoded class: classes 0-11 are kept in full,
# class 12 is cut down to 0.4% of its rows.
target_ratios = {class_id: 1.0 for class_id in range(12)}
target_ratios[12] = 0.004

# Sample each class according to its fraction (without replacement, fixed seed).
df_list = []
for target_class, ratio in target_ratios.items():
    df_class = train_x_encoded.loc[train_x_encoded['Fraud_Type'] == target_class]
    num_class = df_class.shape[0]
    df_sampled = df_class.sample(n=int(num_class * ratio), replace=False, random_state=42)
    df_list.append(df_sampled)

# Stack the per-class samples into one frame with a fresh 0..n-1 index.
df_concat = pd.concat(df_list, axis=0, ignore_index=True)

# Show the resulting row count per class.
print(df_concat.value_counts('Fraud_Type'))

Fraud_Type
12    475
0     105
1     105
2     105
3     105
4     105
5     105
6     105
7     105
8     105
9     105
10    105
11    105
dtype: int64


In [70]:
# Split the down-sampled frame back into target and predictors.
train_y_encoded_down = df_concat['Fraud_Type']
train_x_encoded_down = df_concat.drop('Fraud_Type', axis=1)

In [71]:
import xgboost as xgb
import shap
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 25% of the down-sampled rows; SHAP values are computed on this part.
X_train, X_test, y_train, y_test = train_test_split(
    train_x_encoded_down, train_y_encoded_down, test_size=0.25, random_state=42
)

# Fit a multi-class XGBoost model used only to rank features.
model = xgb.XGBClassifier(objective="multi:softprob", eval_metric="mlogloss")
model.fit(X_train, y_train)

# Compute SHAP values on the held-out rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Overall importance per feature = mean |SHAP| over samples and classes.
# The container returned for a multi-class model depends on the SHAP version:
# older releases return a list with one (n_samples, n_features) array per
# class, newer releases return one ndarray. Iterating the ndarray as if it were
# that list would average over the wrong axes and give a vector whose length
# does not match the number of features.
if isinstance(shap_values, list):
    shap_values_mean = np.mean([np.abs(s).mean(axis=0) for s in shap_values], axis=0)
else:
    shap_abs = np.abs(np.asarray(shap_values))
    if shap_abs.ndim == 3:
        # assumes the layout (n_samples, n_features, n_classes) - TODO confirm
        # against the installed SHAP version
        shap_values_mean = shap_abs.mean(axis=(0, 2))
    else:
        # Single-output case: (n_samples, n_features).
        shap_values_mean = shap_abs.mean(axis=0)

# Feature-importance table, most important first.
overall_importance_df = pd.DataFrame({
    'Feature': X_test.columns,
    'Importance': shap_values_mean
}).sort_values(by='Importance', ascending=False)

print("전체 모델의 피처 중요도:")
overall_importance_df.head()
# # 전체 모델의 피처 중요도 시각화
# shap.summary_plot(np.mean(shap_values, axis=0), X_test)


전체 모델의 피처 중요도:


Unnamed: 0,Feature,Importance
70,Flag_Transaction_Interaction,0.513241
38,Channel,0.411058
50,Distance,0.380328
37,Transaction_Amount,0.325015
58,Number_of_transaction_with_the_account,0.279957


In [72]:
# Importance cut-off.
threshold = 0

# Keep the features whose importance is strictly greater than the cut-off
# (with threshold = 0 this drops only the zero-importance features).
importance_mask = overall_importance_df['Importance'].gt(threshold)
selected_features = overall_importance_df.loc[importance_mask]

# Show the surviving features.
print("임계값 이상인 피처들:")
print(selected_features)

# Names of the surviving features, most important first.
selected_feature_names = list(selected_features['Feature'])

임계값 이상인 피처들:
                                        Feature  Importance
70                 Flag_Transaction_Interaction    0.513241
38                                      Channel    0.411058
50                                     Distance    0.380328
37                           Transaction_Amount    0.325015
58       Number_of_transaction_with_the_account    0.279957
..                                          ...         ...
7      Customer_flag_change_of_authentication_2    0.005232
16  Customer_flag_terminal_malicious_behavior_3    0.004120
14  Customer_flag_terminal_malicious_behavior_1    0.000857
17  Customer_flag_terminal_malicious_behavior_4    0.000629
18  Customer_flag_terminal_malicious_behavior_5    0.000543

[66 rows x 2 columns]


In [73]:
# Restrict the train and test matrices to the SHAP-selected features.
train_x_shap = train_x_encoded_down.loc[:, selected_feature_names]
test_x_shap = test_x_encoded.loc[:, selected_feature_names]

In [74]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

# Optuna objective for CatBoost hyperparameter tuning
def optimize_catboost(trial):
    """Score one Optuna trial of CatBoost hyperparameters.

    Args:
        trial: Optuna trial used to sample ``depth``, ``learning_rate`` and
            ``iterations``.

    Returns:
        Mean ``f1_macro`` over 3-fold cross-validation of a
        ``CatBoostClassifier`` on ``train_x_shap`` / ``train_y_encoded_down``.
    """
    # Search space
    depth = trial.suggest_int('depth', 4, 8)
    # suggest_loguniform is deprecated since Optuna 3.0; suggest_float with
    # log=True samples the same log-uniform range under the same name.
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.2, log=True)
    iterations = trial.suggest_int('iterations', 100, 600)

    # Candidate model for this trial
    model = CatBoostClassifier(
        depth=depth,
        learning_rate=learning_rate,
        iterations=iterations,
        verbose=0,
        random_state=42
    )

    # Evaluate with cross-validation (macro F1) and report the fold average
    score = cross_val_score(model, train_x_shap, train_y_encoded_down, cv=3, scoring='f1_macro').mean()
    return score

# Create the Optuna study and run the search. The sampler is seeded so the
# sequence of sampled hyperparameters is repeatable from run to run, in line
# with the fixed random_state already used for the models.
catboost_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
catboost_study.optimize(optimize_catboost, n_trials=50)

# Report the best hyperparameters found
print("Best CatBoost hyperparameters: ", catboost_study.best_params)

# Build the final (not yet fitted) CatBoost model from the best trial.
# best_params is keyed by the names passed to trial.suggest_* in
# optimize_catboost ('depth', 'learning_rate', 'iterations'), which are
# CatBoostClassifier argument names, so unpacking keeps the two in sync.
best_catboost_model = CatBoostClassifier(
    **catboost_study.best_params,
    verbose=0,
    random_state=42,
)


[I 2024-08-28 18:24:28,065] A new study created in memory with name: no-name-9ebabff7-20dd-45f5-8c80-a5b8a24d725d


0:	learn: 2.5578441	total: 15.9ms	remaining: 1.68s
106:	learn: 2.1326242	total: 1.59s	remaining: 0us
0:	learn: 2.5577627	total: 14.7ms	remaining: 1.56s
106:	learn: 2.1409439	total: 1.36s	remaining: 0us
0:	learn: 2.5570448	total: 14.1ms	remaining: 1.5s


[I 2024-08-28 18:24:32,958] Trial 0 finished with value: 0.41137774272763816 and parameters: {'depth': 5, 'learning_rate': 0.0043581132654134675, 'iterations': 107}. Best is trial 0 with value: 0.41137774272763816.


106:	learn: 2.1417791	total: 1.41s	remaining: 0us
0:	learn: 2.4007353	total: 13.7ms	remaining: 1.5s
109:	learn: 0.6820183	total: 1.38s	remaining: 0us
0:	learn: 2.3982330	total: 12.7ms	remaining: 1.39s
109:	learn: 0.6982717	total: 1.36s	remaining: 0us
0:	learn: 2.3832010	total: 14.3ms	remaining: 1.56s


[I 2024-08-28 18:24:37,540] Trial 1 finished with value: 0.5745801813712608 and parameters: {'depth': 5, 'learning_rate': 0.10534046874092955, 'iterations': 110}. Best is trial 1 with value: 0.5745801813712608.


109:	learn: 0.6871041	total: 1.36s	remaining: 0us
0:	learn: 2.5635882	total: 109ms	remaining: 35s
200:	learn: 2.3728945	total: 25.3s	remaining: 15.4s
322:	learn: 2.2778684	total: 41.2s	remaining: 0us
0:	learn: 2.5637925	total: 115ms	remaining: 37.2s
200:	learn: 2.3747537	total: 26.7s	remaining: 16.2s
322:	learn: 2.2807222	total: 43.2s	remaining: 0us
0:	learn: 2.5630381	total: 112ms	remaining: 36s
200:	learn: 2.3756925	total: 26.9s	remaining: 16.3s


[I 2024-08-28 18:26:46,279] Trial 2 finished with value: 0.44382467214242904 and parameters: {'depth': 8, 'learning_rate': 0.0010847344161811357, 'iterations': 323}. Best is trial 1 with value: 0.5745801813712608.


322:	learn: 2.2819819	total: 43.1s	remaining: 0us
0:	learn: 2.5617685	total: 6.95ms	remaining: 1.99s
200:	learn: 2.1901586	total: 1.21s	remaining: 517ms
286:	learn: 2.0948387	total: 1.72s	remaining: 0us
0:	learn: 2.5621429	total: 7.11ms	remaining: 2.03s
200:	learn: 2.1900125	total: 1.21s	remaining: 518ms
286:	learn: 2.0992787	total: 1.73s	remaining: 0us
0:	learn: 2.5614227	total: 6.65ms	remaining: 1.9s
200:	learn: 2.1902462	total: 1.22s	remaining: 522ms


[I 2024-08-28 18:26:52,387] Trial 3 finished with value: 0.4008472396970902 and parameters: {'depth': 4, 'learning_rate': 0.0019212607592431063, 'iterations': 287}. Best is trial 1 with value: 0.5745801813712608.


286:	learn: 2.0970170	total: 1.74s	remaining: 0us
0:	learn: 2.5632516	total: 6.63ms	remaining: 988ms
149:	learn: 2.3709461	total: 894ms	remaining: 0us
0:	learn: 2.5634515	total: 6.74ms	remaining: 1s
149:	learn: 2.3712127	total: 925ms	remaining: 0us
0:	learn: 2.5630670	total: 7.2ms	remaining: 1.07s


[I 2024-08-28 18:26:55,741] Trial 4 finished with value: 0.3470019246708505 and parameters: {'depth': 4, 'learning_rate': 0.0010251154341790463, 'iterations': 150}. Best is trial 1 with value: 0.5745801813712608.


149:	learn: 2.3728903	total: 939ms	remaining: 0us
0:	learn: 2.5609280	total: 7.01ms	remaining: 1.31s
187:	learn: 2.1476709	total: 1.17s	remaining: 0us
0:	learn: 2.5614013	total: 6.59ms	remaining: 1.23s
187:	learn: 2.1470019	total: 1.15s	remaining: 0us
0:	learn: 2.5604909	total: 7.58ms	remaining: 1.42s


[I 2024-08-28 18:26:59,860] Trial 5 finished with value: 0.3766693316854776 and parameters: {'depth': 4, 'learning_rate': 0.0024293805878051354, 'iterations': 188}. Best is trial 1 with value: 0.5745801813712608.


187:	learn: 2.1479548	total: 1.12s	remaining: 0us
0:	learn: 2.5171503	total: 116ms	remaining: 1m 8s
200:	learn: 0.6953954	total: 27.3s	remaining: 52.8s
400:	learn: 0.3065912	total: 54.6s	remaining: 25.7s
589:	learn: 0.1824314	total: 1m 20s	remaining: 0us
0:	learn: 2.5242552	total: 112ms	remaining: 1m 6s
200:	learn: 0.7003905	total: 26.7s	remaining: 51.7s
400:	learn: 0.3060017	total: 53.7s	remaining: 25.3s
589:	learn: 0.1814465	total: 1m 18s	remaining: 0us
0:	learn: 2.4979983	total: 108ms	remaining: 1m 3s
200:	learn: 0.6975415	total: 26.5s	remaining: 51.2s
400:	learn: 0.3067441	total: 52.7s	remaining: 24.8s


[I 2024-08-28 18:30:59,161] Trial 6 finished with value: 0.5598144959599215 and parameters: {'depth': 8, 'learning_rate': 0.03844909938854531, 'iterations': 590}. Best is trial 1 with value: 0.5745801813712608.


589:	learn: 0.1830700	total: 1m 18s	remaining: 0us
0:	learn: 2.5303517	total: 29.3ms	remaining: 9.63s
200:	learn: 1.1820618	total: 5.76s	remaining: 3.69s
329:	learn: 0.8930635	total: 9.56s	remaining: 0us
0:	learn: 2.5363919	total: 28.5ms	remaining: 9.38s
200:	learn: 1.2036320	total: 5.7s	remaining: 3.65s
329:	learn: 0.9122810	total: 9.41s	remaining: 0us
0:	learn: 2.5230150	total: 27.7ms	remaining: 9.1s
200:	learn: 1.1930630	total: 5.72s	remaining: 3.67s


[I 2024-08-28 18:31:28,817] Trial 7 finished with value: 0.5308745271170038 and parameters: {'depth': 6, 'learning_rate': 0.02058070599603535, 'iterations': 330}. Best is trial 1 with value: 0.5745801813712608.


329:	learn: 0.9101861	total: 9.47s	remaining: 0us
0:	learn: 2.5630585	total: 14.9ms	remaining: 8.56s
200:	learn: 2.3002513	total: 2.61s	remaining: 4.84s
400:	learn: 2.1379985	total: 5.17s	remaining: 2.23s
573:	learn: 2.0292979	total: 7.54s	remaining: 0us
0:	learn: 2.5630371	total: 16.1ms	remaining: 9.22s
200:	learn: 2.3063133	total: 2.56s	remaining: 4.76s
400:	learn: 2.1475492	total: 5.18s	remaining: 2.23s
573:	learn: 2.0419647	total: 7.35s	remaining: 0us
0:	learn: 2.5628456	total: 13.2ms	remaining: 7.56s
200:	learn: 2.3045879	total: 2.59s	remaining: 4.81s
400:	learn: 2.1433930	total: 5.09s	remaining: 2.2s


[I 2024-08-28 18:31:52,597] Trial 8 finished with value: 0.4345079149689152 and parameters: {'depth': 5, 'learning_rate': 0.0011584730033533403, 'iterations': 574}. Best is trial 1 with value: 0.5745801813712608.


573:	learn: 2.0371763	total: 7.25s	remaining: 0us
0:	learn: 2.5554968	total: 15.4ms	remaining: 4.4s
200:	learn: 1.8167606	total: 2.58s	remaining: 1.09s
285:	learn: 1.6694235	total: 3.66s	remaining: 0us
0:	learn: 2.5553881	total: 13ms	remaining: 3.7s
200:	learn: 1.8321494	total: 2.59s	remaining: 1.09s
285:	learn: 1.6948218	total: 3.68s	remaining: 0us
0:	learn: 2.5544339	total: 16ms	remaining: 4.57s
200:	learn: 1.8267009	total: 2.57s	remaining: 1.08s


[I 2024-08-28 18:32:04,496] Trial 9 finished with value: 0.447361444273273 and parameters: {'depth': 5, 'learning_rate': 0.005800838810001111, 'iterations': 286}. Best is trial 1 with value: 0.5745801813712608.


285:	learn: 1.6839823	total: 3.63s	remaining: 0us
0:	learn: 2.3289405	total: 57.8ms	remaining: 26.5s
200:	learn: 0.1302108	total: 13s	remaining: 16.7s
400:	learn: 0.0503428	total: 26.5s	remaining: 3.83s
458:	learn: 0.0418249	total: 29.9s	remaining: 0us
0:	learn: 2.3575692	total: 58.6ms	remaining: 26.8s
200:	learn: 0.1352338	total: 13.2s	remaining: 16.9s
400:	learn: 0.0509284	total: 26.1s	remaining: 3.77s
458:	learn: 0.0424586	total: 30.2s	remaining: 0us
0:	learn: 2.2624850	total: 58.2ms	remaining: 26.7s
200:	learn: 0.1332964	total: 13.8s	remaining: 17.7s
400:	learn: 0.0514489	total: 26.5s	remaining: 3.83s


[I 2024-08-28 18:33:36,319] Trial 10 finished with value: 0.5657770720084152 and parameters: {'depth': 7, 'learning_rate': 0.17327249912689377, 'iterations': 459}. Best is trial 1 with value: 0.5745801813712608.


458:	learn: 0.0424749	total: 30s	remaining: 0us
0:	learn: 2.3229037	total: 61.4ms	remaining: 29.5s
200:	learn: 0.1259525	total: 12.6s	remaining: 17.7s
400:	learn: 0.0483861	total: 25.7s	remaining: 5.18s
481:	learn: 0.0376030	total: 30.5s	remaining: 0us
0:	learn: 2.3521446	total: 55.2ms	remaining: 26.6s
200:	learn: 0.1302016	total: 11.9s	remaining: 16.6s
400:	learn: 0.0494618	total: 24.3s	remaining: 4.91s
481:	learn: 0.0383891	total: 29.2s	remaining: 0us
0:	learn: 2.2551143	total: 53.1ms	remaining: 25.6s
200:	learn: 0.1276983	total: 12s	remaining: 16.8s
400:	learn: 0.0497330	total: 24s	remaining: 4.85s


[I 2024-08-28 18:35:06,588] Trial 11 finished with value: 0.5767434267071355 and parameters: {'depth': 7, 'learning_rate': 0.17818192759530643, 'iterations': 482}. Best is trial 11 with value: 0.5767434267071355.


481:	learn: 0.0386635	total: 28.8s	remaining: 0us
0:	learn: 2.3010863	total: 56.1ms	remaining: 24.5s
200:	learn: 0.1096524	total: 12.2s	remaining: 14.4s
400:	learn: 0.0418127	total: 24s	remaining: 2.21s
437:	learn: 0.0369574	total: 26.2s	remaining: 0us
0:	learn: 2.3324541	total: 57ms	remaining: 24.9s
200:	learn: 0.1167776	total: 12.2s	remaining: 14.4s
400:	learn: 0.0439433	total: 24.3s	remaining: 2.24s
437:	learn: 0.0388564	total: 26.6s	remaining: 0us
0:	learn: 2.2287382	total: 55.5ms	remaining: 24.2s
200:	learn: 0.1143760	total: 12.3s	remaining: 14.5s
400:	learn: 0.0442814	total: 24.8s	remaining: 2.29s


[I 2024-08-28 18:36:27,880] Trial 12 finished with value: 0.5697751470929671 and parameters: {'depth': 7, 'learning_rate': 0.19624712737915948, 'iterations': 438}. Best is trial 11 with value: 0.5767434267071355.


437:	learn: 0.0392252	total: 27s	remaining: 0us
0:	learn: 2.4490521	total: 29.3ms	remaining: 13.6s
200:	learn: 0.4900743	total: 5.7s	remaining: 7.51s
400:	learn: 0.2399080	total: 11.7s	remaining: 1.9s
465:	learn: 0.2033077	total: 13.6s	remaining: 0us
0:	learn: 2.4689025	total: 27.9ms	remaining: 13s
200:	learn: 0.4946578	total: 5.86s	remaining: 7.72s
400:	learn: 0.2446446	total: 11.6s	remaining: 1.88s
465:	learn: 0.2028865	total: 13.4s	remaining: 0us
0:	learn: 2.4249056	total: 27.1ms	remaining: 12.6s
200:	learn: 0.4825484	total: 5.76s	remaining: 7.59s
400:	learn: 0.2445287	total: 11.8s	remaining: 1.91s


[I 2024-08-28 18:37:10,265] Trial 13 finished with value: 0.581000769624026 and parameters: {'depth': 6, 'learning_rate': 0.07031576722740382, 'iterations': 466}. Best is trial 13 with value: 0.581000769624026.


465:	learn: 0.2061353	total: 13.7s	remaining: 0us
0:	learn: 2.4864814	total: 60.3ms	remaining: 28.2s
200:	learn: 0.5222336	total: 13s	remaining: 17.3s
400:	learn: 0.2358111	total: 24.6s	remaining: 4.17s
468:	learn: 0.1917755	total: 28.6s	remaining: 0us
0:	learn: 2.4966787	total: 56.9ms	remaining: 26.6s
200:	learn: 0.5410839	total: 12.1s	remaining: 16.1s
400:	learn: 0.2438988	total: 24.4s	remaining: 4.14s
468:	learn: 0.2006501	total: 28.6s	remaining: 0us
0:	learn: 2.4623940	total: 51.9ms	remaining: 24.3s
200:	learn: 0.5282376	total: 11.6s	remaining: 15.4s
400:	learn: 0.2423402	total: 22.9s	remaining: 3.87s


[I 2024-08-28 18:38:35,934] Trial 14 finished with value: 0.5716021770548295 and parameters: {'depth': 7, 'learning_rate': 0.05468553986038975, 'iterations': 469}. Best is trial 13 with value: 0.581000769624026.


468:	learn: 0.2001367	total: 26.7s	remaining: 0us
0:	learn: 2.4359475	total: 27.1ms	remaining: 14.4s
200:	learn: 0.4381927	total: 5.51s	remaining: 9.05s
400:	learn: 0.2088176	total: 11.1s	remaining: 3.6s
530:	learn: 0.1507579	total: 14.8s	remaining: 0us
0:	learn: 2.4579557	total: 28.1ms	remaining: 14.9s
200:	learn: 0.4471734	total: 5.57s	remaining: 9.14s
400:	learn: 0.2120902	total: 11.2s	remaining: 3.62s
530:	learn: 0.1509256	total: 14.8s	remaining: 0us
0:	learn: 2.4091734	total: 27.2ms	remaining: 14.4s
200:	learn: 0.4359839	total: 5.56s	remaining: 9.13s
400:	learn: 0.2131841	total: 11.1s	remaining: 3.6s


[I 2024-08-28 18:39:21,961] Trial 15 finished with value: 0.5928373973266553 and parameters: {'depth': 6, 'learning_rate': 0.07855589541710582, 'iterations': 531}. Best is trial 15 with value: 0.5928373973266553.


530:	learn: 0.1485750	total: 14.8s	remaining: 0us
0:	learn: 2.4661718	total: 28.4ms	remaining: 15s
200:	learn: 0.5655648	total: 5.55s	remaining: 9.09s
400:	learn: 0.2848995	total: 11.1s	remaining: 3.57s
529:	learn: 0.2101182	total: 14.6s	remaining: 0us
0:	learn: 2.4831704	total: 28.9ms	remaining: 15.3s
200:	learn: 0.5744757	total: 5.56s	remaining: 9.1s
400:	learn: 0.2936587	total: 11.1s	remaining: 3.58s
529:	learn: 0.2150138	total: 14.7s	remaining: 0us
0:	learn: 2.4454984	total: 29.7ms	remaining: 15.7s
200:	learn: 0.5643954	total: 5.55s	remaining: 9.08s
400:	learn: 0.2928205	total: 11.1s	remaining: 3.56s


[I 2024-08-28 18:40:07,632] Trial 16 finished with value: 0.5871891712291477 and parameters: {'depth': 6, 'learning_rate': 0.05965482495222645, 'iterations': 530}. Best is trial 15 with value: 0.5928373973266553.


529:	learn: 0.2172959	total: 14.6s	remaining: 0us
0:	learn: 2.5282592	total: 27.2ms	remaining: 14.5s
200:	learn: 1.1490733	total: 5.53s	remaining: 9.22s
400:	learn: 0.7353996	total: 11.1s	remaining: 3.73s
535:	learn: 0.5735166	total: 14.8s	remaining: 0us
0:	learn: 2.5346622	total: 29.3ms	remaining: 15.7s
200:	learn: 1.1662248	total: 5.63s	remaining: 9.38s
400:	learn: 0.7444105	total: 11.2s	remaining: 3.77s
535:	learn: 0.5816468	total: 14.9s	remaining: 0us
0:	learn: 2.5204815	total: 27.4ms	remaining: 14.7s
200:	learn: 1.1630204	total: 5.54s	remaining: 9.23s
400:	learn: 0.7501194	total: 11.2s	remaining: 3.77s


[I 2024-08-28 18:40:53,878] Trial 17 finished with value: 0.5672871440235062 and parameters: {'depth': 6, 'learning_rate': 0.02183522234688035, 'iterations': 536}. Best is trial 15 with value: 0.5928373973266553.


535:	learn: 0.5802257	total: 14.9s	remaining: 0us
0:	learn: 2.5480661	total: 26.7ms	remaining: 10.6s
200:	learn: 1.5576008	total: 5.66s	remaining: 5.52s
396:	learn: 1.1999639	total: 11.1s	remaining: 0us
0:	learn: 2.5510231	total: 27ms	remaining: 10.7s
200:	learn: 1.5854673	total: 5.54s	remaining: 5.4s
396:	learn: 1.2276280	total: 11s	remaining: 0us
0:	learn: 2.5444761	total: 28.4ms	remaining: 11.2s
200:	learn: 1.5782794	total: 5.54s	remaining: 5.4s


[I 2024-08-28 18:41:28,262] Trial 18 finished with value: 0.501863342932719 and parameters: {'depth': 6, 'learning_rate': 0.010006227953661538, 'iterations': 397}. Best is trial 15 with value: 0.5928373973266553.


396:	learn: 1.2197800	total: 11s	remaining: 0us
0:	learn: 2.5103565	total: 56.8ms	remaining: 30.3s
200:	learn: 0.7536897	total: 11.8s	remaining: 19.5s
400:	learn: 0.3680169	total: 23.3s	remaining: 7.71s
533:	learn: 0.2622857	total: 31s	remaining: 0us
0:	learn: 2.5174939	total: 59.7ms	remaining: 31.8s
200:	learn: 0.7686574	total: 11.9s	remaining: 19.7s
400:	learn: 0.3785467	total: 23.6s	remaining: 7.82s
533:	learn: 0.2718748	total: 31.3s	remaining: 0us
0:	learn: 2.4934844	total: 56.5ms	remaining: 30.1s
200:	learn: 0.7581061	total: 11.6s	remaining: 19.2s
400:	learn: 0.3684463	total: 23.2s	remaining: 7.69s


[I 2024-08-28 18:43:03,228] Trial 19 finished with value: 0.5691184064352695 and parameters: {'depth': 7, 'learning_rate': 0.03783144781815623, 'iterations': 534}. Best is trial 15 with value: 0.5928373973266553.


533:	learn: 0.2673365	total: 30.8s	remaining: 0us
0:	learn: 2.4368733	total: 13.2ms	remaining: 4.99s
200:	learn: 0.5293184	total: 2.55s	remaining: 2.25s
377:	learn: 0.3140740	total: 4.78s	remaining: 0us
0:	learn: 2.4350409	total: 13.8ms	remaining: 5.19s
200:	learn: 0.5559318	total: 2.51s	remaining: 2.21s
377:	learn: 0.3251679	total: 4.68s	remaining: 0us
0:	learn: 2.4229449	total: 13.5ms	remaining: 5.09s
200:	learn: 0.5449588	total: 2.58s	remaining: 2.27s


[I 2024-08-28 18:43:18,582] Trial 20 finished with value: 0.6006285417220419 and parameters: {'depth': 5, 'learning_rate': 0.081137674853323, 'iterations': 378}. Best is trial 20 with value: 0.6006285417220419.


377:	learn: 0.3249664	total: 4.76s	remaining: 0us
0:	learn: 2.4175675	total: 13.8ms	remaining: 5.52s
200:	learn: 0.4666316	total: 2.48s	remaining: 2.46s
399:	learn: 0.2494700	total: 4.98s	remaining: 0us
0:	learn: 2.4153863	total: 12.6ms	remaining: 5.04s
200:	learn: 0.4920636	total: 2.52s	remaining: 2.5s
399:	learn: 0.2623098	total: 5.04s	remaining: 0us
0:	learn: 2.4016857	total: 12.9ms	remaining: 5.15s
200:	learn: 0.4840787	total: 2.5s	remaining: 2.47s


[I 2024-08-28 18:43:34,705] Trial 21 finished with value: 0.5955715690216354 and parameters: {'depth': 5, 'learning_rate': 0.09397485636564548, 'iterations': 400}. Best is trial 20 with value: 0.6006285417220419.


399:	learn: 0.2591644	total: 4.95s	remaining: 0us
0:	learn: 2.4375591	total: 13.9ms	remaining: 5.34s
200:	learn: 0.5326701	total: 2.64s	remaining: 2.43s
385:	learn: 0.3102074	total: 5.06s	remaining: 0us
0:	learn: 2.4357387	total: 14.4ms	remaining: 5.53s
200:	learn: 0.5591693	total: 2.63s	remaining: 2.42s
385:	learn: 0.3157344	total: 5.1s	remaining: 0us
0:	learn: 2.4237010	total: 13.7ms	remaining: 5.26s
200:	learn: 0.5554487	total: 2.64s	remaining: 2.43s


[I 2024-08-28 18:43:51,205] Trial 22 finished with value: 0.6007593389674017 and parameters: {'depth': 5, 'learning_rate': 0.08068531569302105, 'iterations': 386}. Best is trial 22 with value: 0.6007593389674017.


385:	learn: 0.3229528	total: 5.12s	remaining: 0us
0:	learn: 2.3811592	total: 14.2ms	remaining: 5.41s
200:	learn: 0.3973501	total: 2.61s	remaining: 2.36s
382:	learn: 0.2054493	total: 5.09s	remaining: 0us
0:	learn: 2.3782626	total: 13.3ms	remaining: 5.1s
200:	learn: 0.4112690	total: 2.63s	remaining: 2.38s
382:	learn: 0.2120813	total: 4.97s	remaining: 0us
0:	learn: 2.3617730	total: 13.9ms	remaining: 5.31s
200:	learn: 0.4064694	total: 2.64s	remaining: 2.39s


[I 2024-08-28 18:44:07,378] Trial 23 finished with value: 0.6013751545613606 and parameters: {'depth': 5, 'learning_rate': 0.11878554424184241, 'iterations': 383}. Best is trial 23 with value: 0.6013751545613606.


382:	learn: 0.2135490	total: 4.91s	remaining: 0us
0:	learn: 2.3786829	total: 14.4ms	remaining: 5.5s
200:	learn: 0.3906393	total: 2.52s	remaining: 2.28s
382:	learn: 0.2017426	total: 4.83s	remaining: 0us
0:	learn: 2.3757348	total: 14ms	remaining: 5.33s
200:	learn: 0.4069670	total: 2.56s	remaining: 2.32s
382:	learn: 0.2075568	total: 4.8s	remaining: 0us
0:	learn: 2.3590684	total: 13ms	remaining: 4.95s
200:	learn: 0.4036866	total: 2.6s	remaining: 2.35s


[I 2024-08-28 18:44:22,999] Trial 24 finished with value: 0.5815348971072595 and parameters: {'depth': 5, 'learning_rate': 0.12050509771684866, 'iterations': 383}. Best is trial 23 with value: 0.6013751545613606.


382:	learn: 0.2156288	total: 4.83s	remaining: 0us
0:	learn: 2.5060987	total: 7.85ms	remaining: 1.88s
200:	learn: 1.0400472	total: 1.29s	remaining: 251ms
239:	learn: 0.9456329	total: 1.53s	remaining: 0us
0:	learn: 2.5130042	total: 7.66ms	remaining: 1.83s
200:	learn: 1.0461769	total: 1.26s	remaining: 245ms
239:	learn: 0.9517225	total: 1.53s	remaining: 0us
0:	learn: 2.4997713	total: 7.34ms	remaining: 1.75s
200:	learn: 1.0336727	total: 1.25s	remaining: 243ms


[I 2024-08-28 18:44:28,436] Trial 25 finished with value: 0.5624587354510083 and parameters: {'depth': 4, 'learning_rate': 0.03601417106165773, 'iterations': 240}. Best is trial 23 with value: 0.6013751545613606.


239:	learn: 0.9312788	total: 1.53s	remaining: 0us
0:	learn: 2.3802782	total: 16.7ms	remaining: 6.04s
200:	learn: 0.3891987	total: 2.67s	remaining: 2.14s
361:	learn: 0.2151152	total: 4.77s	remaining: 0us
0:	learn: 2.3773633	total: 13.5ms	remaining: 4.88s
200:	learn: 0.4136809	total: 2.54s	remaining: 2.03s
361:	learn: 0.2237238	total: 4.57s	remaining: 0us
0:	learn: 2.3608106	total: 13.4ms	remaining: 4.82s
200:	learn: 0.4074181	total: 2.63s	remaining: 2.1s


[I 2024-08-28 18:44:43,657] Trial 26 finished with value: 0.5874026774353913 and parameters: {'depth': 5, 'learning_rate': 0.1193968095241222, 'iterations': 362}. Best is trial 23 with value: 0.6013751545613606.


361:	learn: 0.2254017	total: 4.73s	remaining: 0us
0:	learn: 2.5221740	total: 7.14ms	remaining: 3.08s
200:	learn: 1.1999257	total: 1.25s	remaining: 1.44s
400:	learn: 0.8446907	total: 2.53s	remaining: 202ms
432:	learn: 0.8111106	total: 2.76s	remaining: 0us
0:	learn: 2.5271982	total: 8.2ms	remaining: 3.54s
200:	learn: 1.2094858	total: 1.36s	remaining: 1.57s
400:	learn: 0.8460711	total: 2.57s	remaining: 205ms
432:	learn: 0.8085433	total: 2.76s	remaining: 0us
0:	learn: 2.5175575	total: 7.96ms	remaining: 3.44s
200:	learn: 1.1966246	total: 1.22s	remaining: 1.41s
400:	learn: 0.8414101	total: 2.44s	remaining: 195ms
432:	learn: 0.8046385	total: 2.63s	remaining: 0us


[I 2024-08-28 18:44:53,079] Trial 27 finished with value: 0.5788883888092942 and parameters: {'depth': 4, 'learning_rate': 0.026072065411697647, 'iterations': 433}. Best is trial 23 with value: 0.6013751545613606.


0:	learn: 2.5407206	total: 15ms	remaining: 4.06s
200:	learn: 1.3830963	total: 2.5s	remaining: 872ms
270:	learn: 1.2442261	total: 3.39s	remaining: 0us
0:	learn: 2.5404340	total: 13.2ms	remaining: 3.56s
200:	learn: 1.4116310	total: 2.56s	remaining: 893ms
270:	learn: 1.2715238	total: 3.44s	remaining: 0us
0:	learn: 2.5380030	total: 12.7ms	remaining: 3.44s
200:	learn: 1.3936344	total: 2.49s	remaining: 866ms


[I 2024-08-28 18:45:04,154] Trial 28 finished with value: 0.5006425863310543 and parameters: {'depth': 5, 'learning_rate': 0.014918493707280612, 'iterations': 271}. Best is trial 23 with value: 0.6013751545613606.


270:	learn: 1.2528859	total: 3.35s	remaining: 0us
0:	learn: 2.4767476	total: 14ms	remaining: 4.8s
200:	learn: 0.6965037	total: 2.52s	remaining: 1.79s
343:	learn: 0.4716175	total: 4.29s	remaining: 0us
0:	learn: 2.4755724	total: 13.3ms	remaining: 4.56s
200:	learn: 0.7210400	total: 2.58s	remaining: 1.83s
343:	learn: 0.4939827	total: 4.39s	remaining: 0us
0:	learn: 2.4670099	total: 13.5ms	remaining: 4.62s
200:	learn: 0.7069169	total: 2.48s	remaining: 1.76s


[I 2024-08-28 18:45:18,237] Trial 29 finished with value: 0.5939595173638026 and parameters: {'depth': 5, 'learning_rate': 0.05520578699051345, 'iterations': 344}. Best is trial 23 with value: 0.6013751545613606.


343:	learn: 0.4834370	total: 4.33s	remaining: 0us
0:	learn: 2.5525685	total: 8.09ms	remaining: 3.37s
200:	learn: 1.7441926	total: 1.39s	remaining: 1.49s
400:	learn: 1.4506989	total: 2.66s	remaining: 106ms
416:	learn: 1.4328263	total: 2.76s	remaining: 0us
0:	learn: 2.5540252	total: 8.25ms	remaining: 3.43s
200:	learn: 1.7605338	total: 1.21s	remaining: 1.3s
400:	learn: 1.4737377	total: 2.51s	remaining: 100ms
416:	learn: 1.4563110	total: 2.64s	remaining: 0us
0:	learn: 2.5512245	total: 8.1ms	remaining: 3.37s
200:	learn: 1.7524824	total: 1.26s	remaining: 1.35s


[I 2024-08-28 18:45:27,435] Trial 30 finished with value: 0.47882254150916453 and parameters: {'depth': 4, 'learning_rate': 0.00749316724347979, 'iterations': 417}. Best is trial 23 with value: 0.6013751545613606.


400:	learn: 1.4643132	total: 2.47s	remaining: 98.5ms
416:	learn: 1.4473661	total: 2.56s	remaining: 0us
0:	learn: 2.4121132	total: 14.4ms	remaining: 5.51s
200:	learn: 0.4535724	total: 2.5s	remaining: 2.29s
384:	learn: 0.2504308	total: 4.76s	remaining: 0us
0:	learn: 2.4098297	total: 13.2ms	remaining: 5.07s
200:	learn: 0.4860776	total: 2.52s	remaining: 2.31s
384:	learn: 0.2647905	total: 4.74s	remaining: 0us
0:	learn: 2.3956903	total: 13.7ms	remaining: 5.25s
200:	learn: 0.4755879	total: 2.52s	remaining: 2.3s


[I 2024-08-28 18:45:42,898] Trial 31 finished with value: 0.5961556557997997 and parameters: {'depth': 5, 'learning_rate': 0.09763923674414059, 'iterations': 385}. Best is trial 23 with value: 0.6013751545613606.


384:	learn: 0.2686369	total: 4.8s	remaining: 0us
0:	learn: 2.3701701	total: 17.3ms	remaining: 6.21s
200:	learn: 0.3679057	total: 2.56s	remaining: 2.02s
359:	learn: 0.2023223	total: 4.54s	remaining: 0us
0:	learn: 2.3670420	total: 13.2ms	remaining: 4.73s
200:	learn: 0.3994180	total: 2.54s	remaining: 2.01s
359:	learn: 0.2140237	total: 4.68s	remaining: 0us
0:	learn: 2.3497820	total: 14ms	remaining: 5.01s
200:	learn: 0.3848506	total: 2.59s	remaining: 2.04s


[I 2024-08-28 18:45:57,842] Trial 32 finished with value: 0.5950477736125274 and parameters: {'depth': 5, 'learning_rate': 0.12645062453450479, 'iterations': 360}. Best is trial 23 with value: 0.6013751545613606.


359:	learn: 0.2158799	total: 4.64s	remaining: 0us
0:	learn: 2.3484813	total: 15.4ms	remaining: 4.76s
200:	learn: 0.3339707	total: 2.61s	remaining: 1.43s
310:	learn: 0.2112744	total: 4s	remaining: 0us
0:	learn: 2.3448741	total: 13.1ms	remaining: 4.05s
200:	learn: 0.3431420	total: 2.53s	remaining: 1.38s
310:	learn: 0.2120169	total: 3.89s	remaining: 0us
0:	learn: 2.3262082	total: 12.7ms	remaining: 3.93s
200:	learn: 0.3515898	total: 2.5s	remaining: 1.37s


[I 2024-08-28 18:46:10,633] Trial 33 finished with value: 0.5867187217750991 and parameters: {'depth': 5, 'learning_rate': 0.1418560415800161, 'iterations': 311}. Best is trial 23 with value: 0.6013751545613606.


310:	learn: 0.2229139	total: 3.9s	remaining: 0us
0:	learn: 2.4246903	total: 14.2ms	remaining: 5.43s
200:	learn: 0.4929991	total: 2.59s	remaining: 2.35s
382:	learn: 0.2818424	total: 4.98s	remaining: 0us
0:	learn: 2.4226402	total: 13.2ms	remaining: 5.03s
200:	learn: 0.5099140	total: 2.56s	remaining: 2.32s
382:	learn: 0.2886343	total: 4.77s	remaining: 0us
0:	learn: 2.4095225	total: 12.9ms	remaining: 4.94s
200:	learn: 0.5065000	total: 2.53s	remaining: 2.29s


[I 2024-08-28 18:46:26,368] Trial 34 finished with value: 0.6084073510091716 and parameters: {'depth': 5, 'learning_rate': 0.0892150537935293, 'iterations': 383}. Best is trial 34 with value: 0.6084073510091716.


382:	learn: 0.2953347	total: 4.83s	remaining: 0us
0:	learn: 2.4911682	total: 7.05ms	remaining: 1.71s
200:	learn: 0.9167965	total: 1.2s	remaining: 257ms
243:	learn: 0.8253849	total: 1.45s	remaining: 0us
0:	learn: 2.4998171	total: 7.3ms	remaining: 1.77s
200:	learn: 0.9164596	total: 1.23s	remaining: 262ms
243:	learn: 0.8340078	total: 1.48s	remaining: 0us
0:	learn: 2.4832671	total: 7.03ms	remaining: 1.71s
200:	learn: 0.9029215	total: 1.2s	remaining: 258ms


[I 2024-08-28 18:46:31,602] Trial 35 finished with value: 0.5785193769549023 and parameters: {'depth': 4, 'learning_rate': 0.04532788878708002, 'iterations': 244}. Best is trial 34 with value: 0.6084073510091716.


243:	learn: 0.8128938	total: 1.47s	remaining: 0us
0:	learn: 2.4419800	total: 7.62ms	remaining: 2.36s
200:	learn: 0.6832981	total: 1.18s	remaining: 647ms
310:	learn: 0.5258363	total: 1.83s	remaining: 0us
0:	learn: 2.4563418	total: 6.9ms	remaining: 2.14s
200:	learn: 0.6915679	total: 1.28s	remaining: 704ms
310:	learn: 0.5387820	total: 1.96s	remaining: 0us
0:	learn: 2.4290350	total: 7.19ms	remaining: 2.23s
200:	learn: 0.6763614	total: 1.32s	remaining: 724ms


[I 2024-08-28 18:46:38,372] Trial 36 finished with value: 0.5964448513759687 and parameters: {'depth': 4, 'learning_rate': 0.0766365028141808, 'iterations': 311}. Best is trial 34 with value: 0.6084073510091716.


310:	learn: 0.5187964	total: 1.99s	remaining: 0us
0:	learn: 2.4242899	total: 13.5ms	remaining: 5.04s
200:	learn: 0.4935974	total: 2.46s	remaining: 2.13s
374:	learn: 0.2874563	total: 4.58s	remaining: 0us
0:	learn: 2.4222325	total: 13.7ms	remaining: 5.13s
200:	learn: 0.5090716	total: 2.47s	remaining: 2.14s
374:	learn: 0.3012710	total: 4.61s	remaining: 0us
0:	learn: 2.4090817	total: 12.7ms	remaining: 4.73s
200:	learn: 0.5088587	total: 2.42s	remaining: 2.09s


[I 2024-08-28 18:46:53,244] Trial 37 finished with value: 0.5995527208154058 and parameters: {'depth': 5, 'learning_rate': 0.0894818805769783, 'iterations': 375}. Best is trial 34 with value: 0.6084073510091716.


374:	learn: 0.3012198	total: 4.53s	remaining: 0us
0:	learn: 2.5599166	total: 28.8ms	remaining: 12.1s
200:	learn: 2.0674956	total: 5.57s	remaining: 6.07s
400:	learn: 1.8033397	total: 11.1s	remaining: 525ms
419:	learn: 1.7831367	total: 11.6s	remaining: 0us
0:	learn: 2.5607998	total: 28.2ms	remaining: 11.8s
200:	learn: 2.0785092	total: 5.54s	remaining: 6.04s
400:	learn: 1.8183656	total: 11.1s	remaining: 525ms
419:	learn: 1.7979538	total: 11.6s	remaining: 0us
0:	learn: 2.5588447	total: 26.1ms	remaining: 10.9s
200:	learn: 2.0792192	total: 5.51s	remaining: 6.01s
400:	learn: 1.8153906	total: 11.1s	remaining: 525ms


[I 2024-08-28 18:47:29,475] Trial 38 finished with value: 0.45087951847253804 and parameters: {'depth': 6, 'learning_rate': 0.0029757390397433118, 'iterations': 420}. Best is trial 34 with value: 0.6084073510091716.


419:	learn: 1.7951968	total: 11.6s	remaining: 0us
0:	learn: 2.3262263	total: 7.53ms	remaining: 3.69s
200:	learn: 0.4336528	total: 1.28s	remaining: 1.85s
400:	learn: 0.2276540	total: 2.45s	remaining: 550ms
490:	learn: 0.1782358	total: 2.98s	remaining: 0us
0:	learn: 2.3537517	total: 7.08ms	remaining: 3.47s
200:	learn: 0.4402536	total: 1.19s	remaining: 1.71s
400:	learn: 0.2242134	total: 2.36s	remaining: 530ms
490:	learn: 0.1761653	total: 2.88s	remaining: 0us
0:	learn: 2.3029566	total: 6.88ms	remaining: 3.37s
200:	learn: 0.4462462	total: 1.24s	remaining: 1.78s
400:	learn: 0.2438486	total: 2.41s	remaining: 540ms


[I 2024-08-28 18:47:39,675] Trial 39 finished with value: 0.594524827882144 and parameters: {'depth': 4, 'learning_rate': 0.1557173654883564, 'iterations': 491}. Best is trial 34 with value: 0.6084073510091716.


490:	learn: 0.1943792	total: 2.94s	remaining: 0us
0:	learn: 2.5175887	total: 13.3ms	remaining: 4.48s
200:	learn: 1.0484920	total: 2.44s	remaining: 1.65s
336:	learn: 0.7576667	total: 4.14s	remaining: 0us
0:	learn: 2.5170034	total: 15.2ms	remaining: 5.09s
200:	learn: 1.0658596	total: 2.53s	remaining: 1.71s
336:	learn: 0.7777447	total: 4.19s	remaining: 0us
0:	learn: 2.5123016	total: 14.9ms	remaining: 5s
200:	learn: 1.0541458	total: 2.47s	remaining: 1.67s


[I 2024-08-28 18:47:53,201] Trial 40 finished with value: 0.569539087786521 and parameters: {'depth': 5, 'learning_rate': 0.02932380116308545, 'iterations': 337}. Best is trial 34 with value: 0.6084073510091716.


336:	learn: 0.7689446	total: 4.13s	remaining: 0us
0:	learn: 2.4237248	total: 13.6ms	remaining: 5s
200:	learn: 0.4894099	total: 2.5s	remaining: 2.09s
368:	learn: 0.2855033	total: 4.6s	remaining: 0us
0:	learn: 2.4216571	total: 12.8ms	remaining: 4.73s
200:	learn: 0.5113546	total: 2.5s	remaining: 2.09s
368:	learn: 0.2988029	total: 4.68s	remaining: 0us
0:	learn: 2.4084597	total: 14ms	remaining: 5.15s
200:	learn: 0.5081197	total: 2.47s	remaining: 2.06s


[I 2024-08-28 18:48:08,137] Trial 41 finished with value: 0.5958857638400527 and parameters: {'depth': 5, 'learning_rate': 0.08985859352946922, 'iterations': 369}. Best is trial 34 with value: 0.6084073510091716.


368:	learn: 0.2989919	total: 4.52s	remaining: 0us
0:	learn: 2.4651433	total: 13.5ms	remaining: 5.36s
200:	learn: 0.6370431	total: 2.47s	remaining: 2.43s
398:	learn: 0.3875580	total: 4.96s	remaining: 0us
0:	learn: 2.4637853	total: 14ms	remaining: 5.56s
200:	learn: 0.6550222	total: 2.48s	remaining: 2.45s
398:	learn: 0.3879667	total: 4.91s	remaining: 0us
0:	learn: 2.4541673	total: 13.4ms	remaining: 5.32s
200:	learn: 0.6544794	total: 2.48s	remaining: 2.44s


[I 2024-08-28 18:48:24,159] Trial 42 finished with value: 0.6099489074230415 and parameters: {'depth': 5, 'learning_rate': 0.0626785956571374, 'iterations': 399}. Best is trial 42 with value: 0.6099489074230415.


398:	learn: 0.3916947	total: 4.94s	remaining: 0us
0:	learn: 2.4631624	total: 28.1ms	remaining: 12.5s
200:	learn: 0.5521845	total: 5.56s	remaining: 6.72s
400:	learn: 0.2760360	total: 11.1s	remaining: 1.19s
443:	learn: 0.2479503	total: 12.3s	remaining: 0us
0:	learn: 2.4806649	total: 26.8ms	remaining: 11.9s
200:	learn: 0.5499815	total: 5.53s	remaining: 6.68s
400:	learn: 0.2784945	total: 11.1s	remaining: 1.19s
443:	learn: 0.2496632	total: 12.3s	remaining: 0us
0:	learn: 2.4418754	total: 28.1ms	remaining: 12.5s
200:	learn: 0.5440680	total: 5.54s	remaining: 6.7s
400:	learn: 0.2816395	total: 11.1s	remaining: 1.19s


[I 2024-08-28 18:49:02,505] Trial 43 finished with value: 0.5902366937609388 and parameters: {'depth': 6, 'learning_rate': 0.061520723692222806, 'iterations': 444}. Best is trial 42 with value: 0.6099489074230415.


443:	learn: 0.2540877	total: 12.3s	remaining: 0us
0:	learn: 2.4929667	total: 13.8ms	remaining: 5.63s
200:	learn: 0.8113366	total: 2.45s	remaining: 2.52s
400:	learn: 0.4965247	total: 4.94s	remaining: 86.2ms
407:	learn: 0.4894408	total: 5.03s	remaining: 0us
0:	learn: 2.4920356	total: 13.8ms	remaining: 5.62s
200:	learn: 0.8255792	total: 2.48s	remaining: 2.56s
400:	learn: 0.5056679	total: 4.94s	remaining: 86.2ms
407:	learn: 0.4997994	total: 5.02s	remaining: 0us
0:	learn: 2.4849808	total: 13.1ms	remaining: 5.31s
200:	learn: 0.8218640	total: 2.47s	remaining: 2.54s


[I 2024-08-28 18:49:18,785] Trial 44 finished with value: 0.5951655831202173 and parameters: {'depth': 5, 'learning_rate': 0.04485312944109731, 'iterations': 408}. Best is trial 42 with value: 0.6099489074230415.


400:	learn: 0.5087454	total: 4.94s	remaining: 86.3ms
407:	learn: 0.5035401	total: 5.03s	remaining: 0us
0:	learn: 2.3956669	total: 13.9ms	remaining: 4.31s
200:	learn: 0.4249566	total: 2.45s	remaining: 1.33s
309:	learn: 0.2833869	total: 3.8s	remaining: 0us
0:	learn: 2.3930647	total: 14.2ms	remaining: 4.4s
200:	learn: 0.4371074	total: 2.48s	remaining: 1.34s
309:	learn: 0.2863652	total: 3.83s	remaining: 0us
0:	learn: 2.3776454	total: 12.9ms	remaining: 3.99s
200:	learn: 0.4314068	total: 2.46s	remaining: 1.33s


[I 2024-08-28 18:49:31,281] Trial 45 finished with value: 0.6034940128621965 and parameters: {'depth': 5, 'learning_rate': 0.10879708205680315, 'iterations': 310}. Best is trial 42 with value: 0.6099489074230415.


309:	learn: 0.2955823	total: 3.85s	remaining: 0us
0:	learn: 2.3911088	total: 7.11ms	remaining: 725ms
102:	learn: 0.8092829	total: 666ms	remaining: 0us
0:	learn: 2.4113156	total: 8.39ms	remaining: 856ms
102:	learn: 0.8299185	total: 653ms	remaining: 0us
0:	learn: 2.3732774	total: 6.55ms	remaining: 669ms


[I 2024-08-28 18:49:33,729] Trial 46 finished with value: 0.5784690255611766 and parameters: {'depth': 4, 'learning_rate': 0.1102656076666726, 'iterations': 103}. Best is trial 42 with value: 0.6099489074230415.


102:	learn: 0.8051497	total: 610ms	remaining: 0us
0:	learn: 2.3471899	total: 14.2ms	remaining: 4.24s
200:	learn: 0.3290781	total: 2.45s	remaining: 1.19s
298:	learn: 0.2208888	total: 3.65s	remaining: 0us
0:	learn: 2.3435533	total: 12.6ms	remaining: 3.75s
200:	learn: 0.3458891	total: 2.5s	remaining: 1.22s
298:	learn: 0.2283135	total: 3.75s	remaining: 0us
0:	learn: 2.3248089	total: 17.9ms	remaining: 5.34s
200:	learn: 0.3464973	total: 2.47s	remaining: 1.21s


[I 2024-08-28 18:49:45,784] Trial 47 finished with value: 0.5938729559500415 and parameters: {'depth': 5, 'learning_rate': 0.14278575741818328, 'iterations': 299}. Best is trial 42 with value: 0.6099489074230415.


298:	learn: 0.2341727	total: 3.68s	remaining: 0us
0:	learn: 2.2761428	total: 30.3ms	remaining: 7.99s
200:	learn: 0.1729836	total: 5.94s	remaining: 1.89s
264:	learn: 0.1194901	total: 7.89s	remaining: 0us
0:	learn: 2.3216291	total: 29.4ms	remaining: 7.77s
200:	learn: 0.1723122	total: 5.7s	remaining: 1.81s
264:	learn: 0.1202334	total: 7.47s	remaining: 0us
0:	learn: 2.2209977	total: 26.7ms	remaining: 7.05s
200:	learn: 0.1785396	total: 5.57s	remaining: 1.77s


[I 2024-08-28 18:50:09,450] Trial 48 finished with value: 0.5820681360079739 and parameters: {'depth': 6, 'learning_rate': 0.187199261902281, 'iterations': 265}. Best is trial 42 with value: 0.6099489074230415.


264:	learn: 0.1217007	total: 7.36s	remaining: 0us
0:	learn: 2.4538861	total: 28.2ms	remaining: 5.75s
200:	learn: 0.5107875	total: 5.73s	remaining: 114ms
204:	learn: 0.5022485	total: 5.85s	remaining: 0us
0:	learn: 2.4729349	total: 29.6ms	remaining: 6.03s
200:	learn: 0.5055049	total: 5.79s	remaining: 115ms
204:	learn: 0.4949439	total: 5.9s	remaining: 0us
0:	learn: 2.4307158	total: 28.9ms	remaining: 5.89s


[I 2024-08-28 18:50:28,361] Trial 49 finished with value: 0.5785729447439034 and parameters: {'depth': 6, 'learning_rate': 0.06729395418863261, 'iterations': 205}. Best is trial 42 with value: 0.6099489074230415.


200:	learn: 0.5007946	total: 6.24s	remaining: 124ms
204:	learn: 0.4926396	total: 6.35s	remaining: 0us
Best CatBoost hyperparameters:  {'depth': 5, 'learning_rate': 0.0626785956571374, 'iterations': 399}


In [75]:
import optuna
from xgboost import XGBClassifier

# Optuna objective function for XGBoost hyperparameter tuning
def optimize_xgboost(trial):
    """Score one Optuna trial of XGBoost hyperparameters.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean ``f1_macro`` score from ``cross_val_score`` with ``cv=3`` on
        ``train_x_shap`` / ``train_y_encoded_down``.
    """
    # Search space
    n_estimators = trial.suggest_int('n_estimators', 100, 600)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    # suggest_loguniform is deprecated in Optuna; suggest_float(log=True)
    # samples the same range on a log scale under the same parameter name.
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.2, log=True)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)

    # Candidate model for this trial
    # NOTE(review): 'logloss' is the binary metric name; if the target is
    # multiclass, 'mlogloss' may be what was intended. No eval_set is passed
    # here, so this presumably does not influence the CV score — confirm.
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        eval_metric='logloss',
        random_state=42
    )

    # Evaluate with cross-validation (macro F1), averaged over the folds
    score = cross_val_score(model, train_x_shap, train_y_encoded_down, cv=3, scoring='f1_macro').mean()
    return score

# Create the Optuna study (maximising the objective) and run 200 trials
xgboost_study = optuna.create_study(direction='maximize')
xgboost_study.optimize(optimize_xgboost, n_trials=200)

# Report the best hyperparameters found
best_xgb_params = xgboost_study.best_params
print("Best XGBoost hyperparameters: ", best_xgb_params)

# Build a (not yet fitted) XGBoost model from the best hyperparameters;
# only the tuned keys are forwarded, the rest mirror the objective's settings.
tuned_xgb_keys = (
    'n_estimators',
    'max_depth',
    'learning_rate',
    'subsample',
    'colsample_bytree',
)
best_xgboost_model = XGBClassifier(
    **{key: best_xgb_params[key] for key in tuned_xgb_keys},
    eval_metric='logloss',
    random_state=42,
)


[I 2024-08-28 18:50:28,375] A new study created in memory with name: no-name-99887387-5401-4023-88c5-af274dd31230
[I 2024-08-28 18:50:32,904] Trial 0 finished with value: 0.6345178215699324 and parameters: {'n_estimators': 107, 'max_depth': 7, 'learning_rate': 0.004998375176079724, 'subsample': 0.6216686201157264, 'colsample_bytree': 0.8801802861189068}. Best is trial 0 with value: 0.6345178215699324.
[I 2024-08-28 18:50:39,696] Trial 1 finished with value: 0.6312797900493908 and parameters: {'n_estimators': 450, 'max_depth': 5, 'learning_rate': 0.10574901182131174, 'subsample': 0.851487661080798, 'colsample_bytree': 0.5110574796518926}. Best is trial 0 with value: 0.6345178215699324.
[I 2024-08-28 18:50:52,105] Trial 2 finished with value: 0.6321053941992916 and parameters: {'n_estimators': 422, 'max_depth': 5, 'learning_rate': 0.003994491843163224, 'subsample': 0.5193418733604906, 'colsample_bytree': 0.9111988417350632}. Best is trial 0 with value: 0.6345178215699324.
[I 2024-08-28 1

Best XGBoost hyperparameters:  {'n_estimators': 572, 'max_depth': 5, 'learning_rate': 0.012612357316550681, 'subsample': 0.992096007767186, 'colsample_bytree': 0.9672455205640248}


In [76]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier

# Members of the ensemble. A RandomForest member is currently disabled:
# randomforest_model = RandomForestClassifier(random_state=42)
ensemble_members = [
    ('catboost', best_catboost_model),
    ('xgboost', best_xgboost_model),
    # ('randomforest', randomforest_model),
]

# Soft voting: the ensemble combines the members' predicted probabilities
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# Train the ensemble
voting_clf.fit(train_x_shap, train_y_encoded_down)


In [77]:
# Predict on the test features with the fitted voting ensemble, then map the
# predictions back through le_subclass to obtain the label values
# (presumably the LabelEncoder fitted on Fraud_Type in an earlier cell — confirm).
predictions = voting_clf.predict(test_x_shap)
predictions_label = le_subclass.inverse_transform(predictions)

# Submission

In [78]:
# Classification submission DataFrame: start from the sample submission file
# and fill in the predicted Fraud_Type labels.
# This frame must be saved under the file name clf_submission.csv.
clf_submission = pd.read_csv("sample_submission.csv").assign(
    Fraud_Type=predictions_label
)
clf_submission.head()

Unnamed: 0,ID,Fraud_Type
0,TEST_000000,b
1,TEST_000001,m
2,TEST_000002,m
3,TEST_000003,m
4,TEST_000004,h


In [79]:
# 합성 데이터 생성 결과 제출 데이터프레임(DataFrame)
# 합성 데이터 생성 결과 데이터프레임 파일명을 반드시 syn_submission.csv 로 지정해야합니다.
# all_synthetic_data.head()

In [80]:
# Submission packaging rules (check the file names when saving):
#   1. Classification predictions must be saved as clf_submission.csv
#   2. Synthetic data must be saved as syn_submission.csv
# Both files must exist under exactly those names inside the submitted zip;
# scoring is not possible if the names do not match.
from datetime import datetime

today_datetime = datetime.today().strftime('%y%m%d_%H%M')

# Work from the project directory. Everything below uses explicit paths, so
# the working directory is never left inside ./submission if a step fails.
project_dir = 'G:/내 드라이브/DACON_proj/DACON/2024_FSI_AIxData_Challenge'
os.chdir(project_dir)

# Make sure the output folder exists
submission_dir = os.path.join(project_dir, 'submission')
os.makedirs(submission_dir, exist_ok=True)

clf_csv_path = os.path.join(submission_dir, 'clf_submission.csv')
syn_csv_path = os.path.join(submission_dir, 'syn_submission.csv')
zip_path = os.path.join(submission_dir, f'submission_{today_datetime}.zip')

# Save the classification predictions as CSV
clf_submission.to_csv(clf_csv_path, encoding='UTF-8-sig', index=False)
# The synthetic-data export is disabled in this cell, so syn_submission.csv
# must already be present in the submission folder:
# all_synthetic_data.to_csv(syn_csv_path, encoding='UTF-8-sig', index=False)

# Fail before opening the archive so a missing file does not leave a
# truncated zip behind.
if not os.path.isfile(syn_csv_path):
    raise FileNotFoundError(
        f"syn_submission.csv not found at {syn_csv_path}; "
        "generate and save the synthetic data before packaging."
    )

# Create the zip; arcname keeps both CSVs at the archive root under the
# required file names.
with zipfile.ZipFile(zip_path, 'w') as submission:
    submission.write(clf_csv_path, arcname='clf_submission.csv')
    submission.write(syn_csv_path, arcname='syn_submission.csv')

print('Done.')

Done.


In [81]:
# winsound ships only with Python on Windows; guard the import so this
# completion signal does not crash the notebook on other platforms.
try:
    import winsound
except ImportError:
    winsound = None

# Tone settings (frequency in Hertz, duration in milliseconds)
frequency = 1000  # frequency (Hertz)
duration = 300    # duration (milliseconds)

# Play the notification sound
if winsound is not None:
    winsound.Beep(frequency, duration)
else:
    # Best-effort fallback: emit the terminal bell character
    print('\a', end='')