# Anomalous Financial Transaction Detection

본 대회의 과제는 금융 거래 데이터에서 **이상 거래를 탐지하는 기능**을 개선하고 활용도를 높이는 분류 AI 모델을 개발하는 것입니다.

특히, 클래스 불균형 문제를 해결하기 위해 오픈소스 생성형 AI 모델을 활용하여 부족한 클래스의 데이터를 보완하고, 이를 통해 분류 모델의 성능을 향상시키는 것이 핵심 목표입니다. 

이러한 접근을 통해 금융보안에 특화된 데이터 분석 및 활용 역량을 강화하여 전문 인력을 양성하고, 금융권의 AI 활용 어려움에 따른 해결 방안을 함께 모색하며 금융 산업의 AI 활용 활성화를 지원하는 것을 목표로 합니다.

# Import Library

In [1]:
# pip install sdv

In [2]:
# 제출 파일 생성 관련
import os
import zipfile

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# 머신러닝 전처리
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# 머신러닝 모델
import xgboost as xgb

# 합성 데이터 생성
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

# 생성 🏭

# Load Data

In [3]:
# Read the training and test tables from CSV files in the working directory.
train_all = pd.read_csv("train.csv")
test_all = pd.read_csv("test.csv")

In [4]:
# Working copy of the training table without the ID column; train_all itself
# is left untouched (drop returns a new frame).
train = train_all.drop(columns="ID")

In [5]:
train.shape

(120000, 63)

In [6]:
'''
(*) 리더보드 산식 중 생성데이터의 익명성(TCAP)채점을 위해 각 클래스 별로 1000개의 생성데이터가 반드시 필요합니다.
(*) 본 베이스 라인에서는 "Fraud_Type" 13종류에 대해 1000개씩 , 총 13,000개의 데이터를 생성할 예정입니다.
(*) 분류 모델 성능 개선을 위해 생성 데이터를 활용하는 것에는 생성 데이터의 Row 개수에 제한이 없습니다. 단, 리더보드 평가를 위해 제출을 하는 생성 데이터 프레임은 익명성(TCAP) 평가를 위함이며, 위의 조건을 갖춘 생성 데이터를 제출해야합니다.
'''
# Rows per Fraud_Type for the submitted synthetic set described in the note
# above. In the code visible here it is referenced only by the commented-out
# submission cell.
N_CLS_PER_GEN = 1000
# Rows sampled per Fraud_Type from each CTGAN synthesizer in the
# performance-oriented generation loop further down.
N_CLS_PER_GEN_2 = 1

In [7]:
# pd.set_option('display.max_rows', None)  # 모든 행 표시
# cond_all = pd.read_excel("데이터_명세_및_생성조건.xlsx", header=1)
# cond_all.iloc[:,2:-1]

## 제출용 합성데이터

In [8]:
# from ctgan import CTGAN 
# import pandas as pd
# from tqdm import tqdm
# import numpy as np
# from scipy import stats
# from sdv.metadata import SingleTableMetadata

# # 이상치 처리 함수
# def handle_outliers(series, n_std=3):
#     mean = series.mean()
#     std = series.std()
#     z_scores = np.abs(stats.zscore(series))
#     return series.mask(z_scores > n_std, mean)

# # 범주형 데이터 조건 강제 함수
# def enforce_categorical_conditions(df):
#     # Customer_Gender: 'male', 'female'만 허용
#     df['Customer_Gender'] = df['Customer_Gender'].apply(lambda x: 'male' if x == 'male' else 'female')
    
#     # Customer_credit_rating: 'S', 'A', 'B', 'C', 'D', 'E'만 허용
#     valid_ratings = ['S', 'A', 'B', 'C', 'D', 'E']
#     df['Customer_credit_rating'] = df['Customer_credit_rating'].apply(lambda x: x if x in valid_ratings else 'B')
    
#     # Customer_loan_type: 'a', 'b', 'c', 'd', 'e'만 허용
#     valid_loan_types = ['a', 'b', 'c', 'd', 'e']
#     df['Customer_loan_type'] = df['Customer_loan_type'].apply(lambda x: x if x in valid_loan_types else 'c')
    
#     # 기타 범주형 변수들 0, 1만 허용
#     binary_columns = ['Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
#                       'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4',
#                       'Customer_rooting_jailbreak_indicator', 'Customer_mobile_roaming_indicator', 
#                       'Customer_VPN_Indicator', 'Customer_flag_terminal_malicious_behavior_1',
#                       'Customer_flag_terminal_malicious_behavior_2', 'Customer_flag_terminal_malicious_behavior_3',
#                       'Customer_flag_terminal_malicious_behavior_4', 'Customer_flag_terminal_malicious_behavior_5',
#                       'Customer_flag_terminal_malicious_behavior_6', 'Customer_inquery_atm_limit',
#                       'Customer_increase_atm_limit', 'Account_indicator_release_limit_excess',
#                       'Account_indicator_Openbanking', 'Account_release_suspention', 'Transaction_Failure_Status',
#                       'Another_Person_Account', 'Unused_terminal_status', 'Flag_deposit_more_than_tenMillion',
#                       'Unused_account_status', 'Recipient_account_suspend_status', 'First_time_iOS_by_vulnerable_user']
#     for col in binary_columns:
#         df[col] = df[col].apply(lambda x: 1 if x == 1 else 0)
    
#     return df

# # 수치형 데이터 조건 강제 함수
# def enforce_numerical_conditions(df):
#     # Customer_Birthyear: 1950 ~ 2004 범위 제한
#     df['Customer_Birthyear'] = df['Customer_Birthyear'].clip(1950, 2004)
    
#     # Account_initial_balance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Account_initial_balance'] = df['Account_initial_balance'].clip(lower=0)
    
#     # Account_balance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Account_balance'] = df['Account_balance'].clip(lower=0)
    
#     # Account_amount_daily_limit: 0 이상의 값으로 설정 (예시로 최대값도 설정 가능)
#     df['Account_amount_daily_limit'] = df['Account_amount_daily_limit'].clip(lower=0)
    
#     # Account_remaining_amount_daily_limit_exceeded: 0 이상의 값으로 설정
#     df['Account_remaining_amount_daily_limit_exceeded'] = df['Account_remaining_amount_daily_limit_exceeded'].clip(lower=0)
    
#     # Account_one_month_max_amount, Account_dawn_one_month_max_amount: 음수일 수 있으므로, 필요시 범위 설정
#     # 이 항목들은 특정 조건이 있다면 적용
#     # 예: 최댓값을 특정 범위로 제한
#     df['Account_one_month_max_amount'] = df['Account_one_month_max_amount'].clip(lower=-1000000, upper=1000000)
#     df['Account_dawn_one_month_max_amount'] = df['Account_dawn_one_month_max_amount'].clip(lower=-1000000, upper=1000000)
    
#     # Account_one_month_std_dev, Account_dawn_one_month_std_dev: 표준편차는 음수가 될 수 없으므로 0으로 제한
#     df['Account_one_month_std_dev'] = df['Account_one_month_std_dev'].clip(lower=0)
#     df['Account_dawn_one_month_std_dev'] = df['Account_dawn_one_month_std_dev'].clip(lower=0)
    
#     # Transaction_Amount: 특정 범위 내로 설정 (예: 음수도 허용, 최대값 제한)
#     df['Transaction_Amount'] = df['Transaction_Amount'].clip(lower=-100000000, upper=100000000)
    
#     # Distance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Distance'] = df['Distance'].clip(lower=0)
    
#     # Number_of_transaction_with_the_account, Transaction_history_with_the_account: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Number_of_transaction_with_the_account'] = df['Number_of_transaction_with_the_account'].clip(lower=0)
#     df['Transaction_history_with_the_account'] = df['Transaction_history_with_the_account'].clip(lower=0)
    
#     return df

# # Time_difference 컬럼을 총 초로 변환 및 이상치 처리
# train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()
# train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

# # 모든 Fraud_Type 목록 생성
# fraud_types = train['Fraud_Type'].unique()  # Fraud_Type로 변경

# # 모든 합성 데이터를 저장할 DataFrame 초기화
# all_synthetic_data = pd.DataFrame()

# N_SAMPLE = 100

# # 각 Fraud_Type에 대해 합성 데이터 생성 및 저장
# for fraud_type in tqdm(fraud_types):
    
#     # 해당 Fraud_Type에 대한 서브셋 생성
#     subset = train[train["Fraud_Type"] == fraud_type]

#     # 모든 Fraud_Type에 대해 100개씩 샘플링
#     subset = subset.sample(n=N_SAMPLE, random_state=42)
    
#     # Time_difference 열 제외 (초 단위로 변환된 컬럼만 사용)
#     subset = subset.drop('Time_difference', axis=1)
    
#     # 메타데이터 생성 및 모델 학습
#     metadata = SingleTableMetadata()

#     metadata.detect_from_dataframe(subset)
#     metadata.set_primary_key(None)

#     # 데이터 타입 설정
#     column_sdtypes = {
#         'Customer_Birthyear': 'numerical',
#         # 'Customer_Gender': 'categorical',
#         'Customer_personal_identifier': 'categorical',
#         'Customer_identification_number': 'categorical',
#         # 'Customer_registration_datetime': 'datetime',
#         'Customer_credit_rating': 'categorical',
#         # 'Customer_flag_change_of_authentication_1': 'categorical',
#         # 'Customer_flag_change_of_authentication_2': 'categorical',
#         # 'Customer_flag_change_of_authentication_3': 'categorical',
#         # 'Customer_flag_change_of_authentication_4': 'categorical',
#         # 'Customer_rooting_jailbreak_indicator': 'categorical',
#         # 'Customer_mobile_roaming_indicator': 'categorical',
#         # 'Customer_VPN_Indicator': 'categorical',
#         # 'Customer_loan_type': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_1': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_2': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_3': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_4': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_5': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_6': 'categorical',
#         # 'Customer_inquery_atm_limit': 'categorical',
#         # 'Customer_increase_atm_limit': 'categorical',
#         'Account_account_number': 'categorical',
#         # 'Account_account_type': 'categorical',
#         # 'Account_creation_datetime': 'datetime',
#         'Account_initial_balance': 'numerical',
#         'Account_balance': 'numerical',
#         # 'Account_indicator_release_limit_excess': 'categorical',
#         'Account_amount_daily_limit': 'numerical',
#         'Account_indicator_Openbanking': 'categorical',
#         'Account_remaining_amount_daily_limit_exceeded': 'numerical',
#         # 'Account_release_suspention': 'categorical',
#         'Account_one_month_max_amount': 'numerical',
#         'Account_one_month_std_dev': 'numerical',
#         'Account_dawn_one_month_max_amount': 'numerical',
#         'Account_dawn_one_month_std_dev': 'numerical',
#         # 'Transaction_Datetime': 'datetime',
#         'Transaction_Amount': 'numerical',
#         # 'Channel': 'categorical',
#         # 'Operating_System': 'categorical',
#         # 'Error_Code': 'categorical',
#         # 'Transaction_Failure_Status': 'categorical',
#         # 'Type_General_Automatic': 'categorical',
#         'IP_Address': 'ipv4_address',
#         # 'Access_Medium': 'categorical',
#         'Location': 'categorical',
#         'Recipient_Account_Number': 'categorical',
#         'Transaction_num_connection_failure': 'numerical',
#         # 'Another_Person_Account': 'categorical',
#         'Distance': 'numerical',
#         'Time_difference_seconds': 'numerical',
#         # 'Unused_terminal_status': 'categorical',
#         # 'Last_atm_transaction_datetime': 'datetime',
#         # 'Last_bank_branch_transaction_datetime': 'datetime',
#         # 'Flag_deposit_more_than_tenMillion': 'categorical',
#         # 'Unused_account_status': 'categorical',
#         # 'Recipient_account_suspend_status': 'categorical',
#         'Number_of_transaction_with_the_account': 'numerical',
#         'Transaction_history_with_the_account': 'numerical',
#         # 'First_time_iOS_by_vulnerable_user': 'categorical',
#         # 'Transaction_resumed_date': 'datetime',
#         'Fraud_Type': 'categorical'
#     }

#     # 각 컬럼에 대해 데이터 타입 설정
#     for column, sdtype in column_sdtypes.items():
#         metadata.update_column(
#             column_name=column,
#             sdtype=sdtype
#         )
        
#     synthesizer = CTGANSynthesizer(
#                             metadata,
#                             epochs= 2000
#                         )
#     synthesizer.fit(subset)

#     synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)  # 합성 데이터 생성 수 설정
    
#     # 생성된 Time_difference_seconds의 이상치 처리
#     synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])
    
#     # Time_difference_seconds를 다시 timedelta로 변환
#     synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    
#     # Time_difference_seconds 컬럼 제거
#     synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)
    
#     # 생성 조건 반영 (범주형, 수치형, 형식 조건)
#     synthetic_subset = enforce_categorical_conditions(synthetic_subset)
#     synthetic_subset = enforce_numerical_conditions(synthetic_subset)
    
#     # 생성된 데이터를 all_synthetic_data에 추가
#     all_synthetic_data = pd.concat([all_synthetic_data, synthetic_subset], ignore_index=True)

# # 최종 결과 확인
# print("\nFinal All Synthetic Data Shape:", all_synthetic_data.shape)


In [9]:
# all_synthetic_data.to_csv('submission/syn_submission.csv', encoding='UTF-8-sig', index=False)

## 성능용 합성데이터

CTGAN

In [10]:
from ctgan import CTGAN 
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
from sdv.metadata import SingleTableMetadata

def handle_outliers(series, n_std=3):
    """Replace values lying more than ``n_std`` standard deviations from the mean.

    Args:
        series: Numeric pandas Series to clean.
        n_std: Absolute z-score above which a value counts as an outlier.

    Returns:
        A new Series in which every outlier is replaced by the mean of the
        non-missing input values. Missing values stay missing, and the input
        Series is not modified.
    """
    mean = series.mean()
    # Population standard deviation (ddof=0) gives the same z-scores as
    # scipy.stats.zscore, but pandas skips NaN. zscore propagates a single NaN
    # to every element, which silently disabled the replacement altogether.
    std = series.std(ddof=0)
    if not std > 0:
        # Constant, single-element, empty or all-NaN input: z-scores are
        # undefined, so there is nothing to replace.
        return series.copy()
    z_scores = ((series - mean) / std).abs()
    return series.mask(z_scores > n_std, mean)

def enforce_categorical_conditions(df):
    """Force the categorical columns of ``df`` into their allowed value sets.

    The frame is modified in place and also returned.

    Rules applied:
        * ``Customer_Gender``: anything other than ``'male'`` becomes ``'female'``.
        * ``Customer_credit_rating``: values outside S/A/B/C/D/E become ``'B'``.
        * ``Customer_loan_type``: values outside a/b/c/d/e become ``'c'``.
        * Binary flag columns: a value equal to 1 stays 1, everything else
          (including missing values) becomes 0.

    Args:
        df: DataFrame containing every column named below.

    Returns:
        The same DataFrame object, after the in-place corrections.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    # Vectorized column operations replace the former per-element lambdas.
    gender = df['Customer_Gender']
    df['Customer_Gender'] = gender.where(gender == 'male', 'female')

    valid_ratings = ['S', 'A', 'B', 'C', 'D', 'E']
    ratings = df['Customer_credit_rating']
    df['Customer_credit_rating'] = ratings.where(ratings.isin(valid_ratings), 'B')

    valid_loan_types = ['a', 'b', 'c', 'd', 'e']
    loan_types = df['Customer_loan_type']
    df['Customer_loan_type'] = loan_types.where(loan_types.isin(valid_loan_types), 'c')

    binary_columns = [
        'Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
        'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4',
        'Customer_rooting_jailbreak_indicator', 'Customer_mobile_roaming_indicator',
        'Customer_VPN_Indicator', 'Customer_flag_terminal_malicious_behavior_1',
        'Customer_flag_terminal_malicious_behavior_2', 'Customer_flag_terminal_malicious_behavior_3',
        'Customer_flag_terminal_malicious_behavior_4', 'Customer_flag_terminal_malicious_behavior_5',
        'Customer_flag_terminal_malicious_behavior_6', 'Customer_inquery_atm_limit',
        'Customer_increase_atm_limit', 'Account_indicator_release_limit_excess',
        'Account_indicator_Openbanking', 'Account_release_suspention', 'Transaction_Failure_Status',
        'Another_Person_Account', 'Unused_terminal_status', 'Flag_deposit_more_than_tenMillion',
        'Unused_account_status', 'Recipient_account_suspend_status', 'First_time_iOS_by_vulnerable_user',
    ]
    for col in binary_columns:
        # eq(1) is False for NaN and for non-numeric values, so they map to 0.
        df[col] = df[col].eq(1).astype(int)

    return df

def enforce_numerical_conditions(df):
    """Clamp the numeric columns of ``df`` to their permitted ranges.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing every column listed in the bounds table.

    Returns:
        The same DataFrame object with each listed column clipped.
    """
    # column -> (lower, upper); None leaves that side unbounded.
    bounds = {
        # Birth year is limited to 1950-2004.
        'Customer_Birthyear': (1950, 2004),
        # Balances and limits cannot be negative.
        'Account_initial_balance': (0, None),
        'Account_balance': (0, None),
        'Account_amount_daily_limit': (0, None),
        'Account_remaining_amount_daily_limit_exceeded': (0, None),
        # Monthly maxima may be negative; they are capped to +/- 1,000,000.
        'Account_one_month_max_amount': (-1000000, 1000000),
        'Account_dawn_one_month_max_amount': (-1000000, 1000000),
        # A standard deviation cannot be negative.
        'Account_one_month_std_dev': (0, None),
        'Account_dawn_one_month_std_dev': (0, None),
        # Negative amounts are allowed; magnitude is capped at 100,000,000.
        'Transaction_Amount': (-100000000, 100000000),
        # Distances and counts cannot be negative.
        'Distance': (0, None),
        'Number_of_transaction_with_the_account': (0, None),
        'Transaction_history_with_the_account': (0, None),
    }
    for column, (low, high) in bounds.items():
        df[column] = df[column].clip(lower=low, upper=high)
    return df



# Convert Time_difference to total seconds and replace its outliers, so that
# the synthesizer can treat it as a plain numerical column.
train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()
train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

# One synthesizer is fitted per distinct Fraud_Type value.
fraud_types = train['Fraud_Type'].unique()

# Upper bound on the number of real rows used to fit each synthesizer.
N_SAMPLE = 100

# Explicit sdtype overrides applied on top of the auto-detected metadata.
# Columns not listed here keep whatever detect_from_dataframe inferred.
# The mapping is identical for every class, so it is built once.
column_sdtypes = {
    'Customer_Birthyear': 'numerical',
    'Customer_personal_identifier': 'categorical',
    'Customer_identification_number': 'categorical',
    'Customer_credit_rating': 'categorical',
    'Account_account_number': 'categorical',
    'Account_initial_balance': 'numerical',
    'Account_balance': 'numerical',
    'Account_amount_daily_limit': 'numerical',
    'Account_indicator_Openbanking': 'categorical',
    'Account_remaining_amount_daily_limit_exceeded': 'numerical',
    'Account_one_month_max_amount': 'numerical',
    'Account_one_month_std_dev': 'numerical',
    'Account_dawn_one_month_max_amount': 'numerical',
    'Account_dawn_one_month_std_dev': 'numerical',
    'Transaction_Amount': 'numerical',
    'IP_Address': 'ipv4_address',
    'Location': 'categorical',
    'Recipient_Account_Number': 'categorical',
    'Transaction_num_connection_failure': 'numerical',
    'Distance': 'numerical',
    'Time_difference_seconds': 'numerical',
    'Number_of_transaction_with_the_account': 'numerical',
    'Transaction_history_with_the_account': 'numerical',
    'Fraud_Type': 'categorical',
}

# Per-class synthetic frames are collected here and concatenated once at the
# end, instead of re-copying a growing DataFrame on every iteration.
synthetic_parts = []

for fraud_type in tqdm(fraud_types):

    # Real rows of this class, capped at N_SAMPLE. The cap is limited to the
    # class size so a class with fewer rows does not make sample() raise.
    subset = train[train["Fraud_Type"] == fraud_type]
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # Only the seconds representation of the time difference is modelled.
    subset = subset.drop('Time_difference', axis=1)

    # Auto-detect the metadata, drop the primary key, then apply the overrides.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)
    metadata.set_primary_key(None)
    for column, sdtype in column_sdtypes.items():
        metadata.update_column(column_name=column, sdtype=sdtype)

    synthesizer = CTGANSynthesizer(metadata, epochs=200)
    synthesizer.fit(subset)

    # Number of synthetic rows generated for this class.
    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN_2)

    # Clean the generated seconds, rebuild Time_difference as a timedelta and
    # drop the helper column so the layout matches the original table.
    synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])
    synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)

    # Enforce the generation conditions on categorical and numerical columns.
    synthetic_subset = enforce_categorical_conditions(synthetic_subset)
    synthetic_subset = enforce_numerical_conditions(synthetic_subset)

    synthetic_parts.append(synthetic_subset)

# pd.concat rejects an empty list, so fall back to an empty frame.
if synthetic_parts:
    all_synthetic_data_ctgan = pd.concat(synthetic_parts, ignore_index=True)
else:
    all_synthetic_data_ctgan = pd.DataFrame()

# Final result check.
print("\nFinal All Synthetic Data ctgan Shape:", all_synthetic_data_ctgan.shape)


100%|██████████| 13/13 [10:55<00:00, 50.41s/it]


Final All Synthetic Data ctgan Shape: (13, 63)





## 원본 데이터와 concat

In [63]:
# Start again from the untouched training table (the working copy `train` has
# gained a Time_difference_seconds helper column) and append the CTGAN rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the synthetic rows would repeat index labels already used by the real rows.
origin_train = train_all.drop(columns="ID")
train_total = pd.concat([origin_train, all_synthetic_data_ctgan], ignore_index=True)
train_total.shape

(120013, 63)

# Data Preprocessing 1 : Select x, y

In [64]:
# Split the combined training frame into features and the Fraud_Type target.
train_x = train_total.drop(columns=['Fraud_Type'])
train_y = train_total['Fraud_Type']

# Test features: every test column except the ID.
test_x = test_all.drop(columns=['ID'])

# Data Preprocessing 2 : 범주형 변수 인코딩

In [65]:
# Encode the Fraud_Type labels as integers.
le_subclass = LabelEncoder()
train_y_encoded = le_subclass.fit_transform(train_y)

# Show which integer each original label was mapped to.
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

원래 레이블: a, 변환된 숫자: 0
원래 레이블: b, 변환된 숫자: 1
원래 레이블: c, 변환된 숫자: 2
원래 레이블: d, 변환된 숫자: 3
원래 레이블: e, 변환된 숫자: 4
원래 레이블: f, 변환된 숫자: 5
원래 레이블: g, 변환된 숫자: 6
원래 레이블: h, 변환된 숫자: 7
원래 레이블: i, 변환된 숫자: 8
원래 레이블: j, 변환된 숫자: 9
원래 레이블: k, 변환된 숫자: 10
원래 레이블: l, 변환된 숫자: 11
원래 레이블: m, 변환된 숫자: 12


In [66]:
# Cast Time_difference to string. The synthetic rows store it as a timedelta
# (built with pd.to_timedelta in the generation loop); the original rows
# presumably hold CSV strings -- TODO confirm. The cast gives the column one
# type before it is encoded as a category.
train_x['Time_difference'] = train_x['Time_difference'].astype(str)

# Categorical columns are chosen from the training frame by dtype. The same
# column list is reused later when the test frame is encoded.
categorical_columns = train_x.select_dtypes(include=['object', 'category']).columns
# Categories not seen during fit are encoded as -1 at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit the encoder on the training data and write the codes into a copy.
train_x_encoded = train_x.copy()
train_x_encoded[categorical_columns] = ordinal_encoder.fit_transform(train_x[categorical_columns])

In [67]:
# Engineered features for the training frame. Columns are appended in the
# order below, which determines the column order of the frame afterwards.
_auth_change_cols = [
    'Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
    'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4',
]
_malicious_flag_cols = [
    'Customer_flag_terminal_malicious_behavior_1', 'Customer_flag_terminal_malicious_behavior_2',
    'Customer_flag_terminal_malicious_behavior_3', 'Customer_flag_terminal_malicious_behavior_4',
    'Customer_flag_terminal_malicious_behavior_5', 'Customer_flag_terminal_malicious_behavior_6',
]

# Row-wise totals of the two flag groups.
train_x_encoded['Customer_Total_Authentication_Changes'] = train_x_encoded[_auth_change_cols].sum(axis=1)
train_x_encoded['Customer_Total_Malicious_Behavior_Flags'] = train_x_encoded[_malicious_flag_cols].sum(axis=1)

# Ratio features. A zero denominator produces inf or NaN, cleaned up below.
train_x_encoded['Daily_Usage_Ratio'] = train_x_encoded['Account_amount_daily_limit'].div(
    train_x_encoded['Account_remaining_amount_daily_limit_exceeded']
)
train_x_encoded['Transaction_Amount_per_Distance'] = train_x_encoded['Transaction_Amount'].div(
    train_x_encoded['Distance']
)

# Interaction and magnitude features.
train_x_encoded['Channel_Distance_Interaction'] = train_x_encoded['Channel'].mul(
    train_x_encoded['Distance']
)
train_x_encoded['Abs_Transaction_Amount'] = train_x_encoded['Transaction_Amount'].abs()
# The +1 keeps the denominator away from zero for a count of 0.
train_x_encoded['Transaction_Amount_per_Transaction_Count'] = train_x_encoded['Transaction_Amount'].div(
    train_x_encoded['Number_of_transaction_with_the_account'].add(1)
)
train_x_encoded['Channel_Transaction_Count_Interaction'] = train_x_encoded['Channel'].mul(
    train_x_encoded['Number_of_transaction_with_the_account']
)
train_x_encoded['Flag_Transaction_Interaction'] = train_x_encoded['Flag_deposit_more_than_tenMillion'].mul(
    train_x_encoded['Transaction_Amount']
)

# NOTE(review): mean() is a single scalar broadcast to every row, so this
# column holds one constant value for the whole frame.
train_x_encoded['Transaction_Failure_Rate'] = train_x_encoded['Transaction_Failure_Status'].mean()

# 1 where the ATM-limit-increase value is positive, otherwise 0.
train_x_encoded['ATM_Limit_Increased'] = train_x_encoded['Customer_increase_atm_limit'].gt(0).astype(int)

# Turn +/-inf into NaN, then fill every NaN with its column mean.
train_x_encoded.replace([np.inf, -np.inf], np.nan, inplace=True)
_train_column_means = train_x_encoded.mean()
train_x_encoded.fillna(_train_column_means, inplace=True)


In [68]:
# Remember the training column order so the test frame can be aligned to it.
feature_order = train_x_encoded.columns.tolist()

### test

In [69]:
# Encode the test data with the encoder already fitted on the training data
# (transform only, no refit), using the training frame's categorical columns.
test_x_encoded = test_x.copy()
test_x_encoded[categorical_columns] = ordinal_encoder.transform(test_x[categorical_columns])

In [70]:
# Engineered features for the test frame, mirroring the training frame.

# Row-wise totals of the two flag groups.
test_x_encoded['Customer_Total_Authentication_Changes'] = test_x_encoded[
    ['Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
     'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4']
].sum(axis=1)

test_x_encoded['Customer_Total_Malicious_Behavior_Flags'] = test_x_encoded[
    ['Customer_flag_terminal_malicious_behavior_1', 'Customer_flag_terminal_malicious_behavior_2',
     'Customer_flag_terminal_malicious_behavior_3', 'Customer_flag_terminal_malicious_behavior_4',
     'Customer_flag_terminal_malicious_behavior_5', 'Customer_flag_terminal_malicious_behavior_6']
].sum(axis=1)

# Ratio features. A zero denominator produces inf or NaN, cleaned up below.
test_x_encoded['Daily_Usage_Ratio'] = test_x_encoded['Account_amount_daily_limit'] / test_x_encoded['Account_remaining_amount_daily_limit_exceeded']

test_x_encoded['Transaction_Amount_per_Distance'] = test_x_encoded['Transaction_Amount'] / test_x_encoded['Distance']
test_x_encoded['Channel_Distance_Interaction'] = test_x_encoded['Channel'] * test_x_encoded['Distance']
test_x_encoded['Abs_Transaction_Amount'] = test_x_encoded['Transaction_Amount'].abs()
# The +1 keeps the denominator away from zero for a count of 0.
test_x_encoded['Transaction_Amount_per_Transaction_Count'] = test_x_encoded['Transaction_Amount'] / (test_x_encoded['Number_of_transaction_with_the_account'] + 1)

test_x_encoded['Channel_Transaction_Count_Interaction'] = test_x_encoded['Channel'] * test_x_encoded['Number_of_transaction_with_the_account']
test_x_encoded['Flag_Transaction_Interaction'] = test_x_encoded['Flag_deposit_more_than_tenMillion'] * test_x_encoded['Transaction_Amount']
# NOTE(review): mean() is a single scalar broadcast to every row, so this
# column holds one constant value for the whole frame.
test_x_encoded['Transaction_Failure_Rate'] = test_x_encoded['Transaction_Failure_Status'].mean()

# 1 where the ATM-limit-increase value is positive, otherwise 0.
test_x_encoded['ATM_Limit_Increased'] = (test_x_encoded['Customer_increase_atm_limit'] > 0).astype(int)

# Turn +/-inf into NaN, then impute with the TRAINING column means. Filling
# with the test set's own means would make the imputed values depend on the
# evaluation data and differ from the statistics the model was trained on.
# fillna matches the means to columns by name.
test_x_encoded.replace([np.inf, -np.inf], np.nan, inplace=True)
test_x_encoded.fillna(train_x_encoded.mean(), inplace=True)

In [71]:
# Put the test columns in the training order, then cast each column to the
# dtype of the matching training column in a single astype call.
test_x_encoded = test_x_encoded[feature_order]
test_x_encoded = test_x_encoded.astype(train_x_encoded[feature_order].dtypes.to_dict())

In [72]:
train_x_encoded.shape

(120013, 73)

In [73]:
test_x_encoded.shape

(120000, 73)

In [74]:
# Attach the integer-encoded target so the per-class down-sampling that
# follows can filter rows on it.
train_x_encoded['Fraud_Type'] = train_y_encoded

In [75]:
# Fraction of each encoded class to keep: classes 0-11 are kept in full and
# class 12 is cut to 0.28% of its rows.
target_ratios = dict.fromkeys(range(12), 1.0)
target_ratios[12] = 0.0028


def _sample_class(frame, label, keep_ratio):
    """Return a seeded sample holding ``keep_ratio`` of the rows of one class."""
    rows = frame[frame['Fraud_Type'] == label]
    return rows.sample(n=int(len(rows) * keep_ratio), replace=False, random_state=42)


# One sampled frame per class, in the order of target_ratios.
df_list = [
    _sample_class(train_x_encoded, label, keep_ratio)
    for label, keep_ratio in target_ratios.items()
]

# Stack the per-class samples and renumber the rows.
df_concat = pd.concat(df_list, axis=0).reset_index(drop=True)

# Show the class counts of the down-sampled frame.
print(df_concat.value_counts('Fraud_Type'))

Fraud_Type
12    332
0     101
1     101
2     101
3     101
4     101
5     101
6     101
7     101
8     101
9     101
10    101
11    101
dtype: int64


In [76]:
# Split the down-sampled frame back into features and target.
train_x_encoded_down = df_concat.drop(columns=['Fraud_Type'])
train_y_encoded_down = df_concat['Fraud_Type']

In [77]:
import xgboost as xgb
import shap
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 25% of the down-sampled training data; SHAP values are computed on
# this hold-out (X_test here is NOT the competition test set).
X_train, X_test, y_train, y_test = train_test_split(train_x_encoded_down, train_y_encoded_down, test_size=0.25, random_state=42)

# Fit a multi-class XGBoost model used only to rank features.
model = xgb.XGBClassifier(objective="multi:softprob", eval_metric="mlogloss")
model.fit(X_train, y_train)

# Compute SHAP values on the hold-out.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Normalise to a list with one (n_samples, n_features) array per class.
# Older shap releases return such a list for multi-class models. Newer ones
# return a single 3-D array, laid out (samples, features, classes) per the
# shap documentation, which the per-class loop would otherwise walk by sample.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

# Top features per class, ranked by mean absolute SHAP value.
for i, class_shap in enumerate(shap_values):
    print(f"클래스 {i}에 대한 피처 중요도 순위:")
    shap_importance = np.abs(class_shap).mean(axis=0)
    feature_importance_df = pd.DataFrame({
        'Feature': X_test.columns,
        'Importance': shap_importance
    }).sort_values(by='Importance', ascending=False)
    print(feature_importance_df.head())

# Overall importance: the per-class mean absolute SHAP values, averaged over
# all classes.
shap_values_mean = np.mean([np.abs(s).mean(axis=0) for s in shap_values], axis=0)

# Build and sort the overall importance table.
overall_importance_df = pd.DataFrame({
    'Feature': X_test.columns,
    'Importance': shap_values_mean
}).sort_values(by='Importance', ascending=False)

print("전체 모델의 피처 중요도:")
overall_importance_df.head()
# # 전체 모델의 피처 중요도 시각화
# shap.summary_plot(np.mean(shap_values, axis=0), X_test)


클래스 0에 대한 피처 중요도 순위:
                                     Feature  Importance
50                                  Distance    3.019128
51                           Time_difference    1.063906
65           Transaction_Amount_per_Distance    0.539671
25                   Account_initial_balance    0.411040
68  Transaction_Amount_per_Transaction_Count    0.330866
클래스 1에 대한 피처 중요도 순위:
                                 Feature  Importance
5                 Customer_credit_rating    1.885414
10  Customer_rooting_jailbreak_indicator    1.194003
12                Customer_VPN_Indicator    1.064338
28            Account_amount_daily_limit    0.446523
38                               Channel    0.271331
클래스 2에 대한 피처 중요도 순위:
                                        Feature  Importance
63      Customer_Total_Malicious_Behavior_Flags    2.440728
38                                      Channel    0.355793
66                 Channel_Distance_Interaction    0.335461
19  Customer_flag_terminal_malicious_

Unnamed: 0,Feature,Importance
70,Flag_Transaction_Interaction,0.497391
38,Channel,0.381931
50,Distance,0.3475
37,Transaction_Amount,0.282398
58,Number_of_transaction_with_the_account,0.261223


In [78]:
# Importance cut-off. The comparison is strict, so a feature whose importance
# equals the threshold is dropped.
threshold = 0

# Keep only the rows of the importance table above the cut-off.
_above_threshold = overall_importance_df['Importance'].gt(threshold)
selected_features = overall_importance_df.loc[_above_threshold]

# Show the surviving features.
print("임계값 이상인 피처들:")
print(selected_features)

# Names of the surviving features, in importance order.
selected_feature_names = selected_features['Feature'].to_list()

임계값 이상인 피처들:
                                        Feature  Importance
70                 Flag_Transaction_Interaction    0.497391
38                                      Channel    0.381931
50                                     Distance    0.347500
37                           Transaction_Amount    0.282398
58       Number_of_transaction_with_the_account    0.261223
..                                          ...         ...
9      Customer_flag_change_of_authentication_4    0.007695
27       Account_indicator_release_limit_excess    0.004273
7      Customer_flag_change_of_authentication_2    0.003475
18  Customer_flag_terminal_malicious_behavior_5    0.001745
14  Customer_flag_terminal_malicious_behavior_1    0.000560

[66 rows x 2 columns]


In [79]:
# Restrict both the down-sampled training frame and the test frame to the
# SHAP-selected feature columns.
train_x_shap = train_x_encoded_down[selected_feature_names]
test_x_shap = test_x_encoded[selected_feature_names]

In [80]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

def optimize_catboost(trial):
    """Optuna objective: mean 3-fold macro-F1 of a CatBoost classifier.

    Args:
        trial: Optuna trial used to sample ``depth``, ``learning_rate``
            and ``iterations``.

    Returns:
        Mean ``f1_macro`` score over the 3 cross-validation folds, computed
        on ``train_x_shap`` / ``train_y_encoded_down``.
    """
    # Search space. Sampling order (depth, learning_rate, iterations) is
    # kept as before.
    params = {
        'depth': trial.suggest_int('depth', 4, 8),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform; the range is unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'iterations': trial.suggest_int('iterations', 100, 500),
    }

    # verbose=200 prints a progress line every 200 iterations per fold.
    model = CatBoostClassifier(**params, verbose=200, random_state=42)

    # Score the candidate with cross-validation (macro-averaged F1).
    score = cross_val_score(
        model, train_x_shap, train_y_encoded_down, cv=3, scoring='f1_macro'
    ).mean()
    return score

# Create the Optuna study and run the search.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable between runs (provided the objective itself is deterministic),
# in line with the random_state=42 already pinned on the models.
catboost_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
catboost_study.optimize(optimize_catboost, n_trials=30)

# Report the best hyperparameters found.
print("Best CatBoost hyperparameters: ", catboost_study.best_params)

# Build the final (unfitted) CatBoost model from the best trial.
# best_params holds exactly the keys sampled by the objective
# (depth, learning_rate, iterations), which are CatBoostClassifier kwargs.
best_catboost_model = CatBoostClassifier(
    **catboost_study.best_params,
    verbose=0,
    random_state=42,
)


[I 2024-08-26 16:50:50,986] A new study created in memory with name: no-name-55eda4f3-d2c5-4859-9461-9a15aa780c27


0:	learn: 2.5601714	total: 15.4ms	remaining: 3.53s
200:	learn: 1.9361294	total: 2.89s	remaining: 432ms
230:	learn: 1.8829701	total: 3.3s	remaining: 0us
0:	learn: 2.5591248	total: 14.7ms	remaining: 3.39s
200:	learn: 1.9223599	total: 2.72s	remaining: 406ms
230:	learn: 1.8654429	total: 3.13s	remaining: 0us
0:	learn: 2.5597337	total: 15.9ms	remaining: 3.65s
200:	learn: 1.9322021	total: 2.62s	remaining: 391ms


[I 2024-08-26 16:51:01,279] Trial 0 finished with value: 0.46048138186217963 and parameters: {'depth': 5, 'learning_rate': 0.00502818822437096, 'iterations': 231}. Best is trial 0 with value: 0.46048138186217963.


230:	learn: 1.8800948	total: 3.01s	remaining: 0us
0:	learn: 2.5070490	total: 54.8ms	remaining: 25.4s
200:	learn: 0.5667812	total: 14.1s	remaining: 18.5s
400:	learn: 0.2495444	total: 27.8s	remaining: 4.36s
463:	learn: 0.2074124	total: 32.1s	remaining: 0us
0:	learn: 2.5242047	total: 57ms	remaining: 26.4s
200:	learn: 0.5697483	total: 12s	remaining: 15.7s
400:	learn: 0.2507987	total: 26s	remaining: 4.08s
463:	learn: 0.2079007	total: 30.4s	remaining: 0us
0:	learn: 2.5072762	total: 68.5ms	remaining: 31.7s
200:	learn: 0.5830274	total: 14.5s	remaining: 18.9s
400:	learn: 0.2581138	total: 29.8s	remaining: 4.68s


[I 2024-08-26 16:52:39,879] Trial 1 finished with value: 0.5706176167538223 and parameters: {'depth': 7, 'learning_rate': 0.050886173550703816, 'iterations': 464}. Best is trial 1 with value: 0.5706176167538223.


463:	learn: 0.2150440	total: 34.3s	remaining: 0us
0:	learn: 2.5615183	total: 147ms	remaining: 1m 9s
200:	learn: 2.1121959	total: 29.3s	remaining: 39.7s
400:	learn: 1.8209764	total: 56.6s	remaining: 10.2s
472:	learn: 1.7333566	total: 1m 6s	remaining: 0us
0:	learn: 2.5623564	total: 131ms	remaining: 1m 1s
200:	learn: 2.1069904	total: 26.6s	remaining: 36s
400:	learn: 1.8129845	total: 54.2s	remaining: 9.73s
472:	learn: 1.7260062	total: 1m 4s	remaining: 0us
0:	learn: 2.5613616	total: 146ms	remaining: 1m 9s
200:	learn: 2.1081892	total: 28.8s	remaining: 38.9s
400:	learn: 1.8172277	total: 58s	remaining: 10.4s


[I 2024-08-26 16:56:00,718] Trial 2 finished with value: 0.484288390902553 and parameters: {'depth': 8, 'learning_rate': 0.003558400706939005, 'iterations': 473}. Best is trial 1 with value: 0.5706176167538223.


472:	learn: 1.7271858	total: 1m 8s	remaining: 0us
0:	learn: 2.5630893	total: 68.3ms	remaining: 25.2s
200:	learn: 2.3083913	total: 14.1s	remaining: 11.9s
369:	learn: 2.1419301	total: 25s	remaining: 0us
0:	learn: 2.5636427	total: 53.8ms	remaining: 19.9s
200:	learn: 2.3045868	total: 12.3s	remaining: 10.3s
369:	learn: 2.1401157	total: 23.2s	remaining: 0us
0:	learn: 2.5630972	total: 70.5ms	remaining: 26s
200:	learn: 2.3038496	total: 12.5s	remaining: 10.5s


[I 2024-08-26 16:57:13,831] Trial 3 finished with value: 0.4628757343180235 and parameters: {'depth': 7, 'learning_rate': 0.0016145639792699666, 'iterations': 370}. Best is trial 1 with value: 0.5706176167538223.


369:	learn: 2.1388006	total: 23.5s	remaining: 0us
0:	learn: 2.5623241	total: 15ms	remaining: 1.69s
113:	learn: 2.2850367	total: 1.46s	remaining: 0us
0:	learn: 2.5617488	total: 13.6ms	remaining: 1.53s
113:	learn: 2.2766755	total: 1.46s	remaining: 0us
0:	learn: 2.5620837	total: 22.9ms	remaining: 2.59s


[I 2024-08-26 16:57:18,778] Trial 4 finished with value: 0.4537361537550934 and parameters: {'depth': 5, 'learning_rate': 0.0027611456283510266, 'iterations': 114}. Best is trial 1 with value: 0.5706176167538223.


113:	learn: 2.2820868	total: 1.51s	remaining: 0us
0:	learn: 2.5537554	total: 111ms	remaining: 27.7s
200:	learn: 1.5337640	total: 28.6s	remaining: 7.12s
250:	learn: 1.3973242	total: 35.8s	remaining: 0us
0:	learn: 2.5564888	total: 147ms	remaining: 36.7s
200:	learn: 1.5318763	total: 28.1s	remaining: 6.99s
250:	learn: 1.3932547	total: 35.2s	remaining: 0us
0:	learn: 2.5532448	total: 141ms	remaining: 35.2s
200:	learn: 1.5395366	total: 28.6s	remaining: 7.11s


[I 2024-08-26 16:59:06,370] Trial 5 finished with value: 0.4902498831232533 and parameters: {'depth': 8, 'learning_rate': 0.011626566365202711, 'iterations': 251}. Best is trial 1 with value: 0.5706176167538223.


250:	learn: 1.4041285	total: 35.7s	remaining: 0us
0:	learn: 2.5555928	total: 36.8ms	remaining: 18s
200:	learn: 1.6770689	total: 6.73s	remaining: 9.64s
400:	learn: 1.3018517	total: 12.7s	remaining: 2.78s
488:	learn: 1.1839060	total: 15.2s	remaining: 0us
0:	learn: 2.5563660	total: 27.2ms	remaining: 13.3s
200:	learn: 1.6834700	total: 5.68s	remaining: 8.15s
400:	learn: 1.3097545	total: 11.4s	remaining: 2.49s
488:	learn: 1.1909534	total: 13.8s	remaining: 0us
0:	learn: 2.5551053	total: 30ms	remaining: 14.6s
200:	learn: 1.6836090	total: 6.04s	remaining: 8.66s
400:	learn: 1.3094634	total: 11.9s	remaining: 2.62s


[I 2024-08-26 16:59:51,789] Trial 6 finished with value: 0.5093185525377496 and parameters: {'depth': 6, 'learning_rate': 0.008771641260798893, 'iterations': 489}. Best is trial 1 with value: 0.5706176167538223.


488:	learn: 1.1935823	total: 14.6s	remaining: 0us
0:	learn: 2.5616285	total: 116ms	remaining: 11.7s
101:	learn: 2.3181453	total: 14.8s	remaining: 0us
0:	learn: 2.5624397	total: 141ms	remaining: 14.2s
101:	learn: 2.3140554	total: 14.8s	remaining: 0us
0:	learn: 2.5614768	total: 150ms	remaining: 15.1s


[I 2024-08-26 17:00:36,851] Trial 7 finished with value: 0.45566889863913157 and parameters: {'depth': 8, 'learning_rate': 0.0034440244866855495, 'iterations': 102}. Best is trial 1 with value: 0.5706176167538223.


101:	learn: 2.3133632	total: 14.9s	remaining: 0us
0:	learn: 2.5629844	total: 32.4ms	remaining: 5.48s
169:	learn: 2.2960340	total: 5.83s	remaining: 0us
0:	learn: 2.5631468	total: 33.6ms	remaining: 5.69s
169:	learn: 2.2918228	total: 5.59s	remaining: 0us
0:	learn: 2.5628822	total: 30.2ms	remaining: 5.1s


[I 2024-08-26 17:00:54,265] Trial 8 finished with value: 0.44935164996644256 and parameters: {'depth': 6, 'learning_rate': 0.0018388406259450061, 'iterations': 170}. Best is trial 1 with value: 0.5706176167538223.


169:	learn: 2.2985755	total: 5.2s	remaining: 0us
0:	learn: 2.5034434	total: 113ms	remaining: 49.8s
200:	learn: 0.3783375	total: 29s	remaining: 34.9s
400:	learn: 0.1471013	total: 57.3s	remaining: 6s
442:	learn: 0.1278616	total: 1m 2s	remaining: 0us
0:	learn: 2.5184079	total: 125ms	remaining: 55.1s
200:	learn: 0.3747272	total: 28.1s	remaining: 33.8s
400:	learn: 0.1457559	total: 55.5s	remaining: 5.82s
442:	learn: 0.1261798	total: 1m 1s	remaining: 0us
0:	learn: 2.5006709	total: 140ms	remaining: 1m 1s
200:	learn: 0.3898551	total: 28s	remaining: 33.8s
400:	learn: 0.1494382	total: 57s	remaining: 5.97s


[I 2024-08-26 17:04:03,152] Trial 9 finished with value: 0.5527979811029762 and parameters: {'depth': 8, 'learning_rate': 0.06455067543315406, 'iterations': 443}. Best is trial 1 with value: 0.5706176167538223.


442:	learn: 0.1303993	total: 1m 3s	remaining: 0us
0:	learn: 2.4908157	total: 10.8ms	remaining: 3.9s
200:	learn: 0.6609349	total: 1.34s	remaining: 1.08s
361:	learn: 0.4515506	total: 2.36s	remaining: 0us
0:	learn: 2.4750287	total: 8.29ms	remaining: 2.99s
200:	learn: 0.6586417	total: 1.24s	remaining: 995ms
361:	learn: 0.4466730	total: 2.23s	remaining: 0us
0:	learn: 2.4851647	total: 6.94ms	remaining: 2.5s
200:	learn: 0.6779134	total: 1.25s	remaining: 999ms


[I 2024-08-26 17:04:11,205] Trial 10 finished with value: 0.6170752738756614 and parameters: {'depth': 4, 'learning_rate': 0.07776953061852382, 'iterations': 362}. Best is trial 10 with value: 0.6170752738756614.


361:	learn: 0.4749776	total: 2.3s	remaining: 0us
0:	learn: 2.4752790	total: 7.53ms	remaining: 2.73s
200:	learn: 0.5876848	total: 1.27s	remaining: 1.03s
362:	learn: 0.3860475	total: 2.25s	remaining: 0us
0:	learn: 2.4562780	total: 6.53ms	remaining: 2.36s
200:	learn: 0.5799884	total: 1.28s	remaining: 1.04s
362:	learn: 0.3818785	total: 2.35s	remaining: 0us
0:	learn: 2.4684327	total: 8.28ms	remaining: 3s
200:	learn: 0.6001870	total: 1.29s	remaining: 1.04s


[I 2024-08-26 17:04:19,226] Trial 11 finished with value: 0.6177974951013133 and parameters: {'depth': 4, 'learning_rate': 0.09461234810054772, 'iterations': 363}. Best is trial 11 with value: 0.6177974951013133.


362:	learn: 0.3998473	total: 2.26s	remaining: 0us
0:	learn: 2.4707905	total: 7.01ms	remaining: 2.45s
200:	learn: 0.5696253	total: 1.29s	remaining: 961ms
350:	learn: 0.3764610	total: 2.2s	remaining: 0us
0:	learn: 2.4508699	total: 7.3ms	remaining: 2.55s
200:	learn: 0.5724246	total: 1.27s	remaining: 945ms
350:	learn: 0.3738549	total: 2.19s	remaining: 0us
0:	learn: 2.4635985	total: 7.34ms	remaining: 2.57s
200:	learn: 0.5898505	total: 1.34s	remaining: 1s


[I 2024-08-26 17:04:27,076] Trial 12 finished with value: 0.6109163425437646 and parameters: {'depth': 4, 'learning_rate': 0.09952067957622725, 'iterations': 351}. Best is trial 11 with value: 0.6177974951013133.


350:	learn: 0.3987878	total: 2.34s	remaining: 0us
0:	learn: 2.5391507	total: 7.9ms	remaining: 2.91s
200:	learn: 1.2167144	total: 1.28s	remaining: 1.07s
368:	learn: 0.8790754	total: 2.29s	remaining: 0us
0:	learn: 2.5335948	total: 7.57ms	remaining: 2.79s
200:	learn: 1.2290058	total: 1.3s	remaining: 1.09s
368:	learn: 0.8844727	total: 2.32s	remaining: 0us
0:	learn: 2.5371954	total: 8.99ms	remaining: 3.31s
200:	learn: 1.2260450	total: 1.27s	remaining: 1.07s


[I 2024-08-26 17:04:35,126] Trial 13 finished with value: 0.5828754561330882 and parameters: {'depth': 4, 'learning_rate': 0.026637735191983706, 'iterations': 369}. Best is trial 11 with value: 0.6177974951013133.


368:	learn: 0.9000906	total: 2.29s	remaining: 0us
0:	learn: 2.5409782	total: 6.94ms	remaining: 2.05s
200:	learn: 1.2529027	total: 1.27s	remaining: 602ms
295:	learn: 1.0458579	total: 1.88s	remaining: 0us
0:	learn: 2.5358141	total: 7.87ms	remaining: 2.32s
200:	learn: 1.2633381	total: 1.26s	remaining: 596ms
295:	learn: 1.0522428	total: 1.83s	remaining: 0us
0:	learn: 2.5391618	total: 8.16ms	remaining: 2.41s
200:	learn: 1.2648352	total: 1.26s	remaining: 598ms


[I 2024-08-26 17:04:41,656] Trial 14 finished with value: 0.5470440099098663 and parameters: {'depth': 4, 'learning_rate': 0.024737314814788615, 'iterations': 296}. Best is trial 11 with value: 0.6177974951013133.


295:	learn: 1.0653384	total: 1.83s	remaining: 0us
0:	learn: 2.5291965	total: 14.6ms	remaining: 5.87s
200:	learn: 0.9220763	total: 2.55s	remaining: 2.56s
400:	learn: 0.5501078	total: 5.09s	remaining: 25.4ms
402:	learn: 0.5475878	total: 5.12s	remaining: 0us
0:	learn: 2.5214079	total: 12.9ms	remaining: 5.19s
200:	learn: 0.9135427	total: 2.53s	remaining: 2.54s
400:	learn: 0.5466644	total: 5.04s	remaining: 25.1ms
402:	learn: 0.5451441	total: 5.06s	remaining: 0us
0:	learn: 2.5258997	total: 14.6ms	remaining: 5.85s
200:	learn: 0.9234946	total: 2.62s	remaining: 2.63s


[I 2024-08-26 17:04:58,335] Trial 15 finished with value: 0.592968335963341 and parameters: {'depth': 5, 'learning_rate': 0.03797165360527909, 'iterations': 403}. Best is trial 11 with value: 0.6177974951013133.


400:	learn: 0.5670809	total: 5.25s	remaining: 26.2ms
402:	learn: 0.5647591	total: 5.28s	remaining: 0us
0:	learn: 2.4742926	total: 8.11ms	remaining: 2.48s
200:	learn: 0.5811151	total: 1.31s	remaining: 693ms
306:	learn: 0.4363000	total: 2.01s	remaining: 0us
0:	learn: 2.4550892	total: 8.87ms	remaining: 2.71s
200:	learn: 0.5663788	total: 1.24s	remaining: 653ms
306:	learn: 0.4237775	total: 1.94s	remaining: 0us
0:	learn: 2.4673704	total: 6.9ms	remaining: 2.11s
200:	learn: 0.6012765	total: 1.26s	remaining: 663ms


[I 2024-08-26 17:05:05,224] Trial 16 finished with value: 0.6173734550256714 and parameters: {'depth': 4, 'learning_rate': 0.09568927941512038, 'iterations': 307}. Best is trial 11 with value: 0.6177974951013133.


306:	learn: 0.4554160	total: 1.91s	remaining: 0us
0:	learn: 2.5511391	total: 15.1ms	remaining: 4.56s
200:	learn: 1.4405278	total: 2.58s	remaining: 1.32s
303:	learn: 1.2298991	total: 3.85s	remaining: 0us
0:	learn: 2.5481182	total: 14.7ms	remaining: 4.44s
200:	learn: 1.4461987	total: 2.59s	remaining: 1.33s
303:	learn: 1.2331864	total: 3.89s	remaining: 0us
0:	learn: 2.5498715	total: 15ms	remaining: 4.55s
200:	learn: 1.4509848	total: 2.56s	remaining: 1.31s


[I 2024-08-26 17:05:17,866] Trial 17 finished with value: 0.5071518027163749 and parameters: {'depth': 5, 'learning_rate': 0.014570970219286236, 'iterations': 304}. Best is trial 11 with value: 0.6177974951013133.


303:	learn: 1.2411562	total: 3.9s	remaining: 0us
0:	learn: 2.5317889	total: 7.62ms	remaining: 2.29s
200:	learn: 1.0783391	total: 1.31s	remaining: 658ms
301:	learn: 0.8527461	total: 1.95s	remaining: 0us
0:	learn: 2.5246571	total: 7.08ms	remaining: 2.13s
200:	learn: 1.0834997	total: 1.34s	remaining: 672ms
301:	learn: 0.8504022	total: 1.95s	remaining: 0us
0:	learn: 2.5292732	total: 9.02ms	remaining: 2.71s
200:	learn: 1.0936814	total: 1.28s	remaining: 644ms


[I 2024-08-26 17:05:24,646] Trial 18 finished with value: 0.5938627731444569 and parameters: {'depth': 4, 'learning_rate': 0.034315699794573086, 'iterations': 302}. Best is trial 11 with value: 0.6177974951013133.


301:	learn: 0.8604051	total: 1.9s	remaining: 0us
0:	learn: 2.5464867	total: 13.9ms	remaining: 3.44s
200:	learn: 1.2912979	total: 2.67s	remaining: 624ms
247:	learn: 1.1862861	total: 3.27s	remaining: 0us
0:	learn: 2.5424513	total: 14.1ms	remaining: 3.48s
200:	learn: 1.2927086	total: 2.65s	remaining: 619ms
247:	learn: 1.1784681	total: 3.27s	remaining: 0us
0:	learn: 2.5447904	total: 15.2ms	remaining: 3.75s
200:	learn: 1.2993457	total: 2.6s	remaining: 608ms


[I 2024-08-26 17:05:35,266] Trial 19 finished with value: 0.5138342562193444 and parameters: {'depth': 5, 'learning_rate': 0.019506046537043544, 'iterations': 248}. Best is trial 11 with value: 0.6177974951013133.


247:	learn: 1.1935305	total: 3.19s	remaining: 0us
0:	learn: 2.5103497	total: 27.3ms	remaining: 11.3s
200:	learn: 0.6278374	total: 5.72s	remaining: 6.12s
400:	learn: 0.3242916	total: 11.5s	remaining: 429ms
415:	learn: 0.3117820	total: 11.9s	remaining: 0us
0:	learn: 2.5148451	total: 27.1ms	remaining: 11.2s
200:	learn: 0.6316806	total: 5.67s	remaining: 6.06s
400:	learn: 0.3146068	total: 11.4s	remaining: 425ms
415:	learn: 0.3028255	total: 11.8s	remaining: 0us
0:	learn: 2.5074639	total: 28.8ms	remaining: 11.9s
200:	learn: 0.6395674	total: 5.85s	remaining: 6.25s
400:	learn: 0.3241221	total: 11.7s	remaining: 437ms


[I 2024-08-26 17:06:12,778] Trial 20 finished with value: 0.5859078823321825 and parameters: {'depth': 6, 'learning_rate': 0.05180241857567632, 'iterations': 416}. Best is trial 11 with value: 0.6177974951013133.


415:	learn: 0.3123616	total: 12.1s	remaining: 0us
0:	learn: 2.4755607	total: 6.86ms	remaining: 2.3s
200:	learn: 0.5832225	total: 1.24s	remaining: 833ms
335:	learn: 0.4101116	total: 2.05s	remaining: 0us
0:	learn: 2.4566175	total: 6.72ms	remaining: 2.25s
200:	learn: 0.5943003	total: 1.24s	remaining: 830ms
335:	learn: 0.4141623	total: 2.05s	remaining: 0us
0:	learn: 2.4687361	total: 7.61ms	remaining: 2.55s
200:	learn: 0.6045094	total: 1.29s	remaining: 870ms


[I 2024-08-26 17:06:20,069] Trial 21 finished with value: 0.6170464705143726 and parameters: {'depth': 4, 'learning_rate': 0.09430496840168869, 'iterations': 336}. Best is trial 11 with value: 0.6177974951013133.


335:	learn: 0.4254190	total: 2.13s	remaining: 0us
0:	learn: 2.4987963	total: 6.1ms	remaining: 2.38s
200:	learn: 0.7071282	total: 1.26s	remaining: 1.2s
391:	learn: 0.4616953	total: 2.43s	remaining: 0us
0:	learn: 2.4846768	total: 7.74ms	remaining: 3.02s
200:	learn: 0.6989627	total: 1.22s	remaining: 1.16s
391:	learn: 0.4541949	total: 2.4s	remaining: 0us
0:	learn: 2.4937580	total: 6.96ms	remaining: 2.72s
200:	learn: 0.7274300	total: 1.27s	remaining: 1.21s


[I 2024-08-26 17:06:28,597] Trial 22 finished with value: 0.6192131398131927 and parameters: {'depth': 4, 'learning_rate': 0.06920161965531786, 'iterations': 392}. Best is trial 22 with value: 0.6192131398131927.


391:	learn: 0.4831439	total: 2.45s	remaining: 0us
0:	learn: 2.5110331	total: 6.67ms	remaining: 2.73s
200:	learn: 0.8067671	total: 1.25s	remaining: 1.3s
400:	learn: 0.5299763	total: 2.48s	remaining: 61.9ms
410:	learn: 0.5218760	total: 2.54s	remaining: 0us
0:	learn: 2.4994893	total: 6.22ms	remaining: 2.55s
200:	learn: 0.8018049	total: 1.26s	remaining: 1.31s
400:	learn: 0.5241426	total: 2.47s	remaining: 61.7ms
410:	learn: 0.5143285	total: 2.54s	remaining: 0us
0:	learn: 2.5069325	total: 7.51ms	remaining: 3.08s
200:	learn: 0.8284899	total: 1.35s	remaining: 1.41s


[I 2024-08-26 17:06:37,643] Trial 23 finished with value: 0.6185019390791047 and parameters: {'depth': 4, 'learning_rate': 0.05616634412503374, 'iterations': 411}. Best is trial 22 with value: 0.6192131398131927.


400:	learn: 0.5395291	total: 2.63s	remaining: 65.5ms
410:	learn: 0.5304617	total: 2.69s	remaining: 0us
0:	learn: 2.5162673	total: 13.2ms	remaining: 5.42s
200:	learn: 0.7277367	total: 2.6s	remaining: 2.71s
400:	learn: 0.4297449	total: 5.13s	remaining: 128ms
410:	learn: 0.4212885	total: 5.26s	remaining: 0us
0:	learn: 2.5056930	total: 14.9ms	remaining: 6.11s
200:	learn: 0.7251158	total: 2.55s	remaining: 2.66s
400:	learn: 0.4164347	total: 5.09s	remaining: 127ms
410:	learn: 0.4076047	total: 5.22s	remaining: 0us
0:	learn: 2.5117664	total: 14.6ms	remaining: 5.98s
200:	learn: 0.7429732	total: 2.63s	remaining: 2.75s
400:	learn: 0.4454821	total: 5.17s	remaining: 129ms
410:	learn: 0.4380260	total: 5.29s	remaining: 0us


[I 2024-08-26 17:06:54,680] Trial 24 finished with value: 0.602431792542165 and parameters: {'depth': 5, 'learning_rate': 0.05191775378764752, 'iterations': 411}. Best is trial 22 with value: 0.6192131398131927.


0:	learn: 2.5301738	total: 7.16ms	remaining: 3.14s
200:	learn: 1.0494178	total: 1.26s	remaining: 1.5s
400:	learn: 0.6983965	total: 2.48s	remaining: 241ms
439:	learn: 0.6585217	total: 2.73s	remaining: 0us
0:	learn: 2.5226971	total: 6.35ms	remaining: 2.79s
200:	learn: 1.0496828	total: 1.23s	remaining: 1.46s
400:	learn: 0.6955488	total: 2.5s	remaining: 243ms
439:	learn: 0.6562009	total: 2.74s	remaining: 0us
0:	learn: 2.5275351	total: 7.08ms	remaining: 3.11s
200:	learn: 1.0694114	total: 1.38s	remaining: 1.65s
400:	learn: 0.7123795	total: 2.69s	remaining: 261ms


[I 2024-08-26 17:07:04,428] Trial 25 finished with value: 0.6127895118246526 and parameters: {'depth': 4, 'learning_rate': 0.03600496661527211, 'iterations': 440}. Best is trial 22 with value: 0.6192131398131927.


439:	learn: 0.6721742	total: 2.93s	remaining: 0us
0:	learn: 2.5079024	total: 15.4ms	remaining: 5.98s
200:	learn: 0.6433095	total: 2.62s	remaining: 2.46s
389:	learn: 0.3799112	total: 5.13s	remaining: 0us
0:	learn: 2.4955373	total: 13ms	remaining: 5.06s
200:	learn: 0.6463795	total: 2.54s	remaining: 2.38s
389:	learn: 0.3722999	total: 4.89s	remaining: 0us
0:	learn: 2.5026192	total: 16.2ms	remaining: 6.28s
200:	learn: 0.6602734	total: 2.56s	remaining: 2.4s


[I 2024-08-26 17:07:20,649] Trial 26 finished with value: 0.6034321912194458 and parameters: {'depth': 5, 'learning_rate': 0.06100856168878192, 'iterations': 390}. Best is trial 22 with value: 0.6192131398131927.


389:	learn: 0.3966097	total: 5s	remaining: 0us
0:	learn: 2.5586121	total: 8.37ms	remaining: 3.65s
200:	learn: 1.8615507	total: 1.27s	remaining: 1.5s
400:	learn: 1.5627356	total: 2.54s	remaining: 228ms
436:	learn: 1.5251565	total: 2.76s	remaining: 0us
0:	learn: 2.5572431	total: 7.5ms	remaining: 3.27s
200:	learn: 1.8566755	total: 1.25s	remaining: 1.47s
400:	learn: 1.5536048	total: 2.5s	remaining: 224ms
436:	learn: 1.5176595	total: 2.72s	remaining: 0us
0:	learn: 2.5581330	total: 6.42ms	remaining: 2.8s
200:	learn: 1.8633429	total: 1.28s	remaining: 1.51s
400:	learn: 1.5746523	total: 2.56s	remaining: 230ms


[I 2024-08-26 17:07:30,282] Trial 27 finished with value: 0.4828527081589507 and parameters: {'depth': 4, 'learning_rate': 0.006506618059757569, 'iterations': 437}. Best is trial 22 with value: 0.6192131398131927.


436:	learn: 1.5369949	total: 2.8s	remaining: 0us
0:	learn: 2.5451727	total: 56.5ms	remaining: 18.5s
200:	learn: 1.2948905	total: 14.2s	remaining: 8.96s
327:	learn: 0.9718936	total: 22.9s	remaining: 0us
0:	learn: 2.5510498	total: 55.4ms	remaining: 18.1s
200:	learn: 1.2910563	total: 13.5s	remaining: 8.56s
327:	learn: 0.9715275	total: 22.4s	remaining: 0us
0:	learn: 2.5452538	total: 75ms	remaining: 24.5s
200:	learn: 1.2952038	total: 13.3s	remaining: 8.41s


[I 2024-08-26 17:08:38,534] Trial 28 finished with value: 0.5007999479831571 and parameters: {'depth': 7, 'learning_rate': 0.017231218180295067, 'iterations': 328}. Best is trial 22 with value: 0.6192131398131927.


327:	learn: 0.9869778	total: 21.7s	remaining: 0us
0:	learn: 2.5254016	total: 16ms	remaining: 6.15s
200:	learn: 0.8557367	total: 2.57s	remaining: 2.37s
385:	learn: 0.5187565	total: 4.93s	remaining: 0us
0:	learn: 2.5167933	total: 14.1ms	remaining: 5.43s
200:	learn: 0.8487412	total: 2.68s	remaining: 2.46s
385:	learn: 0.5133237	total: 5.07s	remaining: 0us
0:	learn: 2.5217520	total: 15.5ms	remaining: 5.95s
200:	learn: 0.8620021	total: 2.6s	remaining: 2.4s


[I 2024-08-26 17:08:54,679] Trial 29 finished with value: 0.605031712786341 and parameters: {'depth': 5, 'learning_rate': 0.04205225416900106, 'iterations': 386}. Best is trial 22 with value: 0.6192131398131927.


385:	learn: 0.5354350	total: 4.96s	remaining: 0us
Best CatBoost hyperparameters:  {'depth': 4, 'learning_rate': 0.06920161965531786, 'iterations': 392}


In [81]:
import optuna
from xgboost import XGBClassifier

def optimize_xgboost(trial):
    """Optuna objective: mean 3-fold macro-F1 of an XGBoost classifier.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``max_depth``,
            ``learning_rate``, ``subsample`` and ``colsample_bytree``.

    Returns:
        Mean ``f1_macro`` score over the 3 cross-validation folds, computed
        on ``train_x_shap`` / ``train_y_encoded_down``.
    """
    # Search space. Sampling order is kept as before.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform; the range is unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # NOTE(review): 'logloss' is the binary metric name and the notebook
    # header describes 13 Fraud_Type classes -- confirm whether 'mlogloss'
    # was intended. Left unchanged here.
    model = XGBClassifier(**params, eval_metric='logloss', random_state=42)

    # Score the candidate with cross-validation (macro-averaged F1).
    score = cross_val_score(
        model, train_x_shap, train_y_encoded_down, cv=3, scoring='f1_macro'
    ).mean()
    return score

# Create the Optuna study and run the search.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable between runs (provided the objective itself is deterministic),
# in line with the random_state=42 already pinned on the models.
xgboost_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgboost_study.optimize(optimize_xgboost, n_trials=70)

# Report the best hyperparameters found.
print("Best XGBoost hyperparameters: ", xgboost_study.best_params)

# Build the final (unfitted) XGBoost model from the best trial.
# best_params holds exactly the keys sampled by the objective
# (n_estimators, max_depth, learning_rate, subsample, colsample_bytree),
# which are XGBClassifier kwargs.
best_xgboost_model = XGBClassifier(
    **xgboost_study.best_params,
    eval_metric='logloss',
    random_state=42,
)


[I 2024-08-26 17:08:54,692] A new study created in memory with name: no-name-e3c2da34-5a4d-41e1-87ea-0fbcae6da6a9
[I 2024-08-26 17:09:00,369] Trial 0 finished with value: 0.6565462116551062 and parameters: {'n_estimators': 187, 'max_depth': 5, 'learning_rate': 0.046006754225130606, 'subsample': 0.7435003412172435, 'colsample_bytree': 0.9630990557534018}. Best is trial 0 with value: 0.6565462116551062.
[I 2024-08-26 17:09:11,513] Trial 1 finished with value: 0.6721973626458202 and parameters: {'n_estimators': 466, 'max_depth': 4, 'learning_rate': 0.016254441032166216, 'subsample': 0.9913379656833206, 'colsample_bytree': 0.8922244768514667}. Best is trial 1 with value: 0.6721973626458202.
[I 2024-08-26 17:09:20,038] Trial 2 finished with value: 0.645805789576431 and parameters: {'n_estimators': 561, 'max_depth': 10, 'learning_rate': 0.10172342717633755, 'subsample': 0.6020244809095336, 'colsample_bytree': 0.7217794885748546}. Best is trial 1 with value: 0.6721973626458202.
[I 2024-08-26 

Best XGBoost hyperparameters:  {'n_estimators': 192, 'max_depth': 4, 'learning_rate': 0.033137537489353004, 'subsample': 0.9120130514591027, 'colsample_bytree': 0.8500883988635178}


In [88]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier

# Members of the ensemble: the two tuned boosters.
# NOTE: a default RandomForestClassifier(random_state=42) was previously
# listed as a third estimator ('randomforest') and is currently left out.
ensemble_members = [
    ('catboost', best_catboost_model),
    ('xgboost', best_xgboost_model),
]

# Combine the members with voting='soft'.
voting_clf = VotingClassifier(voting='soft', estimators=ensemble_members)

# Fit the ensemble on the selected training features.
voting_clf.fit(train_x_shap, train_y_encoded_down)


In [89]:
# Predict on the selected test features with the fitted voting ensemble,
# then pass the predictions through le_subclass.inverse_transform
# (presumably mapping encoded classes back to the original Fraud_Type
# labels -- verify against where le_subclass is fitted).
predictions = voting_clf.predict(test_x_shap)
predictions_label = le_subclass.inverse_transform(predictions)

# Submission

In [90]:
# Classification submission frame: start from the sample submission and
# fill in the predicted labels.
# The saved file for this frame must be named clf_submission.csv.
clf_submission = pd.read_csv("sample_submission.csv").assign(
    Fraud_Type=predictions_label
)
clf_submission.head()

Unnamed: 0,ID,Fraud_Type
0,TEST_000000,b
1,TEST_000001,m
2,TEST_000002,m
3,TEST_000003,m
4,TEST_000004,h


In [85]:
# 합성 데이터 생성 결과 제출 데이터프레임(DataFrame)
# 합성 데이터 생성 결과 데이터프레임 파일명을 반드시 syn_submission.csv 로 지정해야합니다.
# all_synthetic_data.head()

In [91]:
# (*) Double-check each file name before saving.
#     1. Classification prediction frame -> clf_submission.csv
#     2. Synthetic data frame            -> syn_submission.csv
#
# (*) The submission zip must contain both frames under exactly these names.
# (*) Scoring is impossible if the file names do not match.
from datetime import datetime

today_datetime = datetime.today().strftime('%y%m%d_%H%M')

# Work from the project root, as before; the cell also ends with the
# project root as the working directory.
project_dir = 'G:/내 드라이브/DACON_proj/DACON/2024_FSI_AIxData_Challenge'
os.chdir(project_dir)

# Use explicit paths instead of chdir-ing into ./submission, so a failure
# part-way through cannot leave the kernel in the wrong working directory.
submission_dir = os.path.join(project_dir, 'submission')
os.makedirs(submission_dir, exist_ok=True)

clf_path = os.path.join(submission_dir, 'clf_submission.csv')
syn_path = os.path.join(submission_dir, 'syn_submission.csv')
zip_path = os.path.join(submission_dir, f'submission_{today_datetime}.zip')

# Save the classification predictions as CSV.
clf_submission.to_csv(clf_path, encoding='UTF-8-sig', index=False)
# all_synthetic_data.to_csv(syn_path, encoding='UTF-8-sig', index=False)

# The synthetic-data CSV is not written by this cell (its export above is
# disabled), so it must already exist from an earlier run. Check before
# creating the zip so a missing file does not leave a partial archive.
if not os.path.isfile(syn_path):
    raise FileNotFoundError(
        f"syn_submission.csv not found in {submission_dir}; "
        "generate it before building the submission zip."
    )

# Create the zip; arcname keeps the required bare file names in the archive.
with zipfile.ZipFile(zip_path, 'w') as submission:
    submission.write(clf_path, arcname='clf_submission.csv')
    submission.write(syn_path, arcname='syn_submission.csv')

print('Done.')

Done.


In [87]:
# winsound only exists on Windows; guard the import so this cell does not
# crash the notebook on other platforms.
try:
    import winsound
except ImportError:
    winsound = None

# Tone settings (frequency in Hertz, duration in milliseconds).
frequency = 1000  # Hertz
duration = 300    # milliseconds

# Play the completion beep when the platform supports it.
if winsound is not None:
    winsound.Beep(frequency, duration)