# Anomalous Financial Transaction Detection

본 대회의 과제는 금융 거래 데이터에서 **이상 거래를 탐지하는 기능**을 개선하고 활용도를 높이는 분류 AI모델을 개발하는 것입니다. 

특히, 클래스 불균형 문제를 해결하기 위해 오픈소스 생성형 AI 모델을 활용하여 부족한 클래스의 데이터를 보완하고, 이를 통해 분류 모델의 성능을 향상시키는 것이 핵심 목표입니다. 

이러한 접근을 통해 금융보안에 특화된 데이터 분석 및 활용 역량을 강화하여 전문 인력을 양성하고, 금융권의 AI 활용 어려움에 따른 해결 방안을 함께 모색하며 금융 산업의 AI 활용 활성화를 지원하는 것을 목표로 합니다.

# Import Library

In [36]:
# pip install sdv

In [37]:
# 제출 파일 생성 관련
import os
import zipfile

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# 머신러닝 전처리
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# 머신러닝 모델
import xgboost as xgb

# 합성 데이터 생성
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')

# pd.set_option('display.max_columns', None)

# 생성 🏭

# Load Data

In [38]:
# Read the raw training and test tables from CSV files (paths are relative to the working directory).
train_all = pd.read_csv("train.csv")
test_all = pd.read_csv("test.csv")

In [39]:
# Working copy of the training table without the "ID" column; train_all itself is left untouched.
train = train_all.drop(columns="ID")

In [40]:
train.shape

(120000, 63)

In [41]:
# The bare string below is the organisers' note (kept verbatim): TCAP anonymity scoring
# needs exactly 1000 generated rows per class, i.e. 13 "Fraud_Type" classes x 1000 = 13,000
# rows in the submitted synthetic frame; synthetic data used only to improve the classifier
# has no row-count limit.
'''
(*) 리더보드 산식 중 생성데이터의 익명성(TCAP)채점을 위해 각 클래스 별로 1000개의 생성데이터가 반드시 필요합니다.
(*) 본 베이스 라인에서는 "Fraud_Type" 13종류에 대해 1000개씩 , 총 13,000개의 데이터를 생성할 예정입니다.
(*) 분류 모델 성능 개선을 위해 생성 데이터를 활용하는 것에는 생성 데이터의 Row 개수에 제한이 없습니다. 단, 리더보드 평가를 위해 제출을 하는 생성 데이터 프레임은 익명성(TCAP) 평가를 위함이며, 위의 조건을 갖춘 생성 데이터를 제출해야합니다.
'''
# Rows sampled per Fraud_Type from each fitted synthesizer for the submission file.
N_CLS_PER_GEN = 1000
# NOTE(review): not referenced in the visible part of the notebook — presumably a
# per-class row count for a second synthetic set; confirm against later cells.
N_CLS_PER_GEN_2 = 5

In [42]:
# pd.set_option('display.max_rows', None)  # 모든 행 표시
# cond_all = pd.read_excel("데이터_명세_및_생성조건.xlsx", header=1)
# cond_all.iloc[:,2:-1]

## 제출용 합성데이터

### 현재까지 가장 높은 점수를 기록한 설정

In [43]:
# from ctgan import CTGAN 
# import pandas as pd
# from tqdm import tqdm
# import numpy as np
# from scipy import stats
# from sdv.metadata import SingleTableMetadata

# # 이상치 처리 함수
# def handle_outliers(series, n_std=3):
#     mean = series.mean()
#     std = series.std()
#     z_scores = np.abs(stats.zscore(series))
#     return series.mask(z_scores > n_std, mean)

# # 범주형 데이터 조건 강제 함수
# def enforce_categorical_conditions(df):
#     # Customer_Gender: 'male', 'female'만 허용
#     df['Customer_Gender'] = df['Customer_Gender'].apply(lambda x: 'male' if x == 'male' else 'female')
    
#     # Customer_credit_rating: 'S', 'A', 'B', 'C', 'D', 'E'만 허용
#     valid_ratings = ['S', 'A', 'B', 'C', 'D', 'E']
#     df['Customer_credit_rating'] = df['Customer_credit_rating'].apply(lambda x: x if x in valid_ratings else 'B')
    
#     # Customer_loan_type: 'a', 'b', 'c', 'd', 'e'만 허용
#     valid_loan_types = ['a', 'b', 'c', 'd', 'e']
#     df['Customer_loan_type'] = df['Customer_loan_type'].apply(lambda x: x if x in valid_loan_types else 'c')
    
#     # 기타 범주형 변수들 0, 1만 허용
#     binary_columns = ['Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
#                       'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4',
#                       'Customer_rooting_jailbreak_indicator', 'Customer_mobile_roaming_indicator', 
#                       'Customer_VPN_Indicator', 'Customer_flag_terminal_malicious_behavior_1',
#                       'Customer_flag_terminal_malicious_behavior_2', 'Customer_flag_terminal_malicious_behavior_3',
#                       'Customer_flag_terminal_malicious_behavior_4', 'Customer_flag_terminal_malicious_behavior_5',
#                       'Customer_flag_terminal_malicious_behavior_6', 'Customer_inquery_atm_limit',
#                       'Customer_increase_atm_limit', 'Account_indicator_release_limit_excess',
#                       'Account_indicator_Openbanking', 'Account_release_suspention', 'Transaction_Failure_Status',
#                       'Another_Person_Account', 'Unused_terminal_status', 'Flag_deposit_more_than_tenMillion',
#                       'Unused_account_status', 'Recipient_account_suspend_status', 'First_time_iOS_by_vulnerable_user']
#     for col in binary_columns:
#         df[col] = df[col].apply(lambda x: 1 if x == 1 else 0)
    
#     return df

# # 수치형 데이터 조건 강제 함수
# def enforce_numerical_conditions(df):
#     # Customer_Birthyear: 1950 ~ 2004 범위 제한
#     df['Customer_Birthyear'] = df['Customer_Birthyear'].clip(1950, 2004)
    
#     # Account_initial_balance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Account_initial_balance'] = df['Account_initial_balance'].clip(lower=0)
    
#     # Account_balance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Account_balance'] = df['Account_balance'].clip(lower=0)
    
#     # Account_amount_daily_limit: 0 이상의 값으로 설정 (예시로 최대값도 설정 가능)
#     df['Account_amount_daily_limit'] = df['Account_amount_daily_limit'].clip(lower=0)
    
#     # Account_remaining_amount_daily_limit_exceeded: 0 이상의 값으로 설정
#     df['Account_remaining_amount_daily_limit_exceeded'] = df['Account_remaining_amount_daily_limit_exceeded'].clip(lower=0)
    
#     # Account_one_month_max_amount, Account_dawn_one_month_max_amount: 음수일 수 있으므로, 필요시 범위 설정
#     # 이 항목들은 특정 조건이 있다면 적용
#     # 예: 최댓값을 특정 범위로 제한
#     df['Account_one_month_max_amount'] = df['Account_one_month_max_amount'].clip(lower=-1000000, upper=1000000)
#     df['Account_dawn_one_month_max_amount'] = df['Account_dawn_one_month_max_amount'].clip(lower=-1000000, upper=1000000)
    
#     # Account_one_month_std_dev, Account_dawn_one_month_std_dev: 표준편차는 음수가 될 수 없으므로 0으로 제한
#     df['Account_one_month_std_dev'] = df['Account_one_month_std_dev'].clip(lower=0)
#     df['Account_dawn_one_month_std_dev'] = df['Account_dawn_one_month_std_dev'].clip(lower=0)
    
#     # Transaction_Amount: 특정 범위 내로 설정 (예: 음수도 허용, 최대값 제한)
#     df['Transaction_Amount'] = df['Transaction_Amount'].clip(lower=-100000000, upper=100000000)
    
#     # Distance: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Distance'] = df['Distance'].clip(lower=0)
    
#     # Number_of_transaction_with_the_account, Transaction_history_with_the_account: 음수가 될 수 없으므로 최소값을 0으로 설정
#     df['Number_of_transaction_with_the_account'] = df['Number_of_transaction_with_the_account'].clip(lower=0)
#     df['Transaction_history_with_the_account'] = df['Transaction_history_with_the_account'].clip(lower=0)
    
#     return df

# # Time_difference 컬럼을 총 초로 변환 및 이상치 처리
# train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()
# train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

# # 모든 Fraud_Type 목록 생성
# fraud_types = train['Fraud_Type'].unique()  # Fraud_Type로 변경

# # 모든 합성 데이터를 저장할 DataFrame 초기화
# all_synthetic_data = pd.DataFrame()

# N_SAMPLE = 100

# # 각 Fraud_Type에 대해 합성 데이터 생성 및 저장
# for fraud_type in tqdm(fraud_types):
    
#     # 해당 Fraud_Type에 대한 서브셋 생성
#     subset = train[train["Fraud_Type"] == fraud_type]

#     # 모든 Fraud_Type에 대해 100개씩 샘플링
#     subset = subset.sample(n=N_SAMPLE, random_state=42)
    
#     # Time_difference 열 제외 (초 단위로 변환된 컬럼만 사용)
#     subset = subset.drop('Time_difference', axis=1)
    
#     # 메타데이터 생성 및 모델 학습
#     metadata = SingleTableMetadata()

#     metadata.detect_from_dataframe(subset)
#     metadata.set_primary_key(None)

#     # 데이터 타입 설정
#     column_sdtypes = {
#         'Customer_Birthyear': 'numerical',
#         # 'Customer_Gender': 'categorical',
#         'Customer_personal_identifier': 'categorical',
#         'Customer_identification_number': 'categorical',
#         # 'Customer_registration_datetime': 'datetime',
#         'Customer_credit_rating': 'categorical',
#         # 'Customer_flag_change_of_authentication_1': 'categorical',
#         # 'Customer_flag_change_of_authentication_2': 'categorical',
#         # 'Customer_flag_change_of_authentication_3': 'categorical',
#         # 'Customer_flag_change_of_authentication_4': 'categorical',
#         # 'Customer_rooting_jailbreak_indicator': 'categorical',
#         # 'Customer_mobile_roaming_indicator': 'categorical',
#         # 'Customer_VPN_Indicator': 'categorical',
#         # 'Customer_loan_type': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_1': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_2': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_3': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_4': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_5': 'categorical',
#         # 'Customer_flag_terminal_malicious_behavior_6': 'categorical',
#         # 'Customer_inquery_atm_limit': 'categorical',
#         # 'Customer_increase_atm_limit': 'categorical',
#         'Account_account_number': 'categorical',
#         # 'Account_account_type': 'categorical',
#         # 'Account_creation_datetime': 'datetime',
#         'Account_initial_balance': 'numerical',
#         'Account_balance': 'numerical',
#         # 'Account_indicator_release_limit_excess': 'categorical',
#         'Account_amount_daily_limit': 'numerical',
#         'Account_indicator_Openbanking': 'categorical',
#         'Account_remaining_amount_daily_limit_exceeded': 'numerical',
#         # 'Account_release_suspention': 'categorical',
#         'Account_one_month_max_amount': 'numerical',
#         'Account_one_month_std_dev': 'numerical',
#         'Account_dawn_one_month_max_amount': 'numerical',
#         'Account_dawn_one_month_std_dev': 'numerical',
#         # 'Transaction_Datetime': 'datetime',
#         'Transaction_Amount': 'numerical',
#         # 'Channel': 'categorical',
#         # 'Operating_System': 'categorical',
#         # 'Error_Code': 'categorical',
#         # 'Transaction_Failure_Status': 'categorical',
#         # 'Type_General_Automatic': 'categorical',
#         'IP_Address': 'ipv4_address',
#         # 'Access_Medium': 'categorical',
#         'Location': 'categorical',
#         'Recipient_Account_Number': 'categorical',
#         'Transaction_num_connection_failure': 'numerical',
#         # 'Another_Person_Account': 'categorical',
#         'Distance': 'numerical',
#         'Time_difference_seconds': 'numerical',
#         # 'Unused_terminal_status': 'categorical',
#         # 'Last_atm_transaction_datetime': 'datetime',
#         # 'Last_bank_branch_transaction_datetime': 'datetime',
#         # 'Flag_deposit_more_than_tenMillion': 'categorical',
#         # 'Unused_account_status': 'categorical',
#         # 'Recipient_account_suspend_status': 'categorical',
#         'Number_of_transaction_with_the_account': 'numerical',
#         'Transaction_history_with_the_account': 'numerical',
#         # 'First_time_iOS_by_vulnerable_user': 'categorical',
#         # 'Transaction_resumed_date': 'datetime',
#         'Fraud_Type': 'categorical'
#     }

#     # 각 컬럼에 대해 데이터 타입 설정
#     for column, sdtype in column_sdtypes.items():
#         metadata.update_column(
#             column_name=column,
#             sdtype=sdtype
#         )
        
#     synthesizer = CTGANSynthesizer(
#                             metadata,
#                             epochs= 2000
#                         )
#     synthesizer.fit(subset)

#     synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)  # 합성 데이터 생성 수 설정
    
#     # 생성된 Time_difference_seconds의 이상치 처리
#     synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])
    
#     # Time_difference_seconds를 다시 timedelta로 변환
#     synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    
#     # Time_difference_seconds 컬럼 제거
#     synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)
    
#     # 생성 조건 반영 (범주형, 수치형, 형식 조건)
#     synthetic_subset = enforce_categorical_conditions(synthetic_subset)
#     synthetic_subset = enforce_numerical_conditions(synthetic_subset)
    
#     # 생성된 데이터를 all_synthetic_data에 추가
#     all_synthetic_data = pd.concat([all_synthetic_data, synthetic_subset], ignore_index=True)

# # 최종 결과 확인
# print("\nFinal All Synthetic Data Shape:", all_synthetic_data.shape)


In [44]:
# all_synthetic_data.to_csv('submission/syn_submission.csv', encoding='UTF-8-sig', index=False)

In [45]:
from ctgan import CTGAN 
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
from sdv.metadata import SingleTableMetadata

# 이상치 처리 함수
def handle_outliers(series, n_std=3):
    """Replace values further than ``n_std`` standard deviations from the mean with the mean.

    Missing values are ignored when computing the mean and the (population,
    ``ddof=0``) standard deviation, and are left untouched in the result.
    Previously a single NaN made every z-score NaN, so no outlier was ever
    replaced.

    Args:
        series: Numeric ``pandas.Series`` to clean. It is not modified.
        n_std: Threshold, in standard deviations, above which a value counts
            as an outlier.

    Returns:
        A new ``pandas.Series`` with the same index in which outliers are
        replaced by the mean of the non-missing values.
    """
    mean = series.mean()
    # ddof=0 keeps the same definition scipy.stats.zscore used before.
    std = series.std(ddof=0)

    # Constant, empty or all-NaN input: a z-score is undefined, nothing to mask.
    if not std > 0:
        return series.copy()

    z_scores = ((series - mean).abs() / std).to_numpy()
    # NaN z-scores compare False, so missing values pass through unchanged.
    return series.mask(z_scores > n_std, mean)

# 범주형 데이터 조건 강제 함수
def enforce_categorical_conditions(df):
    """Force the categorical columns of ``df`` back into their allowed value sets.

    The frame is modified in place and also returned.

    * ``Customer_Gender``: anything other than ``'male'`` becomes ``'female'``.
    * ``Customer_credit_rating``: values outside S/A/B/C/D/E become ``'B'``.
    * ``Customer_loan_type``: values outside a/b/c/d/e become ``'c'``.
    * Binary flag columns: values equal to 1 become 1, everything else 0.
    """
    gender = df['Customer_Gender']
    df['Customer_Gender'] = gender.where(gender == 'male', 'female')

    # (column, allowed values, replacement for anything not allowed)
    fallback_rules = (
        ('Customer_credit_rating', ['S', 'A', 'B', 'C', 'D', 'E'], 'B'),
        ('Customer_loan_type', ['a', 'b', 'c', 'd', 'e'], 'c'),
    )
    for name, allowed, replacement in fallback_rules:
        values = df[name]
        df[name] = values.where(values.isin(allowed), replacement)

    flag_columns = (
        'Customer_flag_change_of_authentication_1',
        'Customer_flag_change_of_authentication_2',
        'Customer_flag_change_of_authentication_3',
        'Customer_flag_change_of_authentication_4',
        'Customer_rooting_jailbreak_indicator',
        'Customer_mobile_roaming_indicator',
        'Customer_VPN_Indicator',
        'Customer_flag_terminal_malicious_behavior_1',
        'Customer_flag_terminal_malicious_behavior_2',
        'Customer_flag_terminal_malicious_behavior_3',
        'Customer_flag_terminal_malicious_behavior_4',
        'Customer_flag_terminal_malicious_behavior_5',
        'Customer_flag_terminal_malicious_behavior_6',
        'Customer_inquery_atm_limit',
        'Customer_increase_atm_limit',
        'Account_indicator_release_limit_excess',
        'Account_indicator_Openbanking',
        'Account_release_suspention',
        'Transaction_Failure_Status',
        'Another_Person_Account',
        'Unused_terminal_status',
        'Flag_deposit_more_than_tenMillion',
        'Unused_account_status',
        'Recipient_account_suspend_status',
        'First_time_iOS_by_vulnerable_user',
    )
    for name in flag_columns:
        # Only an exact 1 survives; every other value collapses to 0.
        df[name] = df[name].eq(1).astype(int)

    return df

# 수치형 데이터 조건 강제 함수
def enforce_numerical_conditions(df):
    """Clip the numeric columns of ``df`` to their allowed ranges.

    The frame is modified in place and also returned. Each entry of the
    table below is ``column: (lower, upper)``; ``None`` leaves that side
    unbounded.
    """
    limits = {
        # Birth year restricted to 1950-2004.
        'Customer_Birthyear': (1950, 2004),
        # Balances and limits are floored at zero.
        'Account_initial_balance': (0, None),
        'Account_balance': (0, None),
        'Account_amount_daily_limit': (0, None),
        'Account_remaining_amount_daily_limit_exceeded': (0, None),
        # Monthly maxima may be negative, so they get a symmetric range.
        'Account_one_month_max_amount': (-1000000, 1000000),
        'Account_dawn_one_month_max_amount': (-1000000, 1000000),
        # Standard deviations are floored at zero.
        'Account_one_month_std_dev': (0, None),
        'Account_dawn_one_month_std_dev': (0, None),
        # Transaction amount may be negative but is capped on both sides.
        'Transaction_Amount': (-100000000, 100000000),
        # Distance and the two counters are floored at zero.
        'Distance': (0, None),
        'Number_of_transaction_with_the_account': (0, None),
        'Transaction_history_with_the_account': (0, None),
    }
    for name, (low, high) in limits.items():
        df[name] = df[name].clip(lower=low, upper=high)

    return df

# Express Time_difference as total seconds so it can be declared as a numerical
# column below, then replace extreme values with the mean before training.
train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()
train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

# Every distinct Fraud_Type label; one CTGAN synthesizer is trained per label.
fraud_types = train['Fraud_Type'].unique()

# Upper bound on the number of real rows used to train each per-class synthesizer.
N_SAMPLE = 100

# Explicit sdtype overrides applied on top of SDV's automatic detection.
# Commented-out entries keep whatever detect_from_dataframe inferred.
# Defined once here because it does not depend on the class being processed.
column_sdtypes = {
    'Customer_Birthyear': 'numerical',
    # 'Customer_Gender': 'categorical',
    'Customer_personal_identifier': 'categorical',
    'Customer_identification_number': 'categorical',
    # 'Customer_registration_datetime': 'datetime',
    'Customer_credit_rating': 'categorical',
    # 'Customer_flag_change_of_authentication_1': 'categorical',
    # 'Customer_flag_change_of_authentication_2': 'categorical',
    # 'Customer_flag_change_of_authentication_3': 'categorical',
    # 'Customer_flag_change_of_authentication_4': 'categorical',
    # 'Customer_rooting_jailbreak_indicator': 'categorical',
    # 'Customer_mobile_roaming_indicator': 'categorical',
    # 'Customer_VPN_Indicator': 'categorical',
    # 'Customer_loan_type': 'categorical',
    # 'Customer_flag_terminal_malicious_behavior_1': 'categorical',
    # 'Customer_flag_terminal_malicious_behavior_2': 'categorical',
    # 'Customer_flag_terminal_malicious_behavior_3': 'categorical',
    # 'Customer_flag_terminal_malicious_behavior_4': 'categorical',
    # 'Customer_flag_terminal_malicious_behavior_5': 'categorical',
    # 'Customer_flag_terminal_malicious_behavior_6': 'categorical',
    # 'Customer_inquery_atm_limit': 'categorical',
    # 'Customer_increase_atm_limit': 'categorical',
    'Account_account_number': 'categorical',
    # 'Account_account_type': 'categorical',
    # 'Account_creation_datetime': 'datetime',
    'Account_initial_balance': 'numerical',
    'Account_balance': 'numerical',
    # 'Account_indicator_release_limit_excess': 'categorical',
    'Account_amount_daily_limit': 'numerical',
    'Account_indicator_Openbanking': 'categorical',
    'Account_remaining_amount_daily_limit_exceeded': 'numerical',
    # 'Account_release_suspention': 'categorical',
    'Account_one_month_max_amount': 'numerical',
    'Account_one_month_std_dev': 'numerical',
    'Account_dawn_one_month_max_amount': 'numerical',
    'Account_dawn_one_month_std_dev': 'numerical',
    # 'Transaction_Datetime': 'datetime',
    'Transaction_Amount': 'numerical',
    # 'Channel': 'categorical',
    # 'Operating_System': 'categorical',
    # 'Error_Code': 'categorical',
    # 'Transaction_Failure_Status': 'categorical',
    # 'Type_General_Automatic': 'categorical',
    'IP_Address': 'ipv4_address',
    # 'Access_Medium': 'categorical',
    'Location': 'categorical',
    'Recipient_Account_Number': 'categorical',
    'Transaction_num_connection_failure': 'numerical',
    # 'Another_Person_Account': 'categorical',
    'Distance': 'numerical',
    'Time_difference_seconds': 'numerical',
    # 'Unused_terminal_status': 'categorical',
    # 'Last_atm_transaction_datetime': 'datetime',
    # 'Last_bank_branch_transaction_datetime': 'datetime',
    # 'Flag_deposit_more_than_tenMillion': 'categorical',
    # 'Unused_account_status': 'categorical',
    # 'Recipient_account_suspend_status': 'categorical',
    'Number_of_transaction_with_the_account': 'numerical',
    'Transaction_history_with_the_account': 'numerical',
    # 'First_time_iOS_by_vulnerable_user': 'categorical',
    # 'Transaction_resumed_date': 'datetime',
    'Fraud_Type': 'categorical',
}

# Per-class synthetic frames are collected here and concatenated once after the
# loop, instead of re-copying a growing DataFrame on every iteration.
synthetic_frames = []

# Train one synthesizer per Fraud_Type and sample N_CLS_PER_GEN rows from it.
for fraud_type in tqdm(fraud_types):

    # Real rows belonging to this class.
    subset = train[train["Fraud_Type"] == fraud_type]

    # Use at most N_SAMPLE rows; capping at len(subset) avoids the ValueError
    # that sample() raises when a class has fewer rows than requested.
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # Train on the seconds column only; the raw Time_difference is dropped.
    subset = subset.drop('Time_difference', axis=1)

    # Auto-detect metadata, clear the primary key, then apply the overrides.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)
    metadata.set_primary_key(None)
    for column, sdtype in column_sdtypes.items():
        metadata.update_column(
            column_name=column,
            sdtype=sdtype
        )

    synthesizer = CTGANSynthesizer(
                            metadata,
                            epochs= 4000
                        )
    synthesizer.fit(subset)

    # Number of synthetic rows generated for this class.
    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)

    # Replace outliers in the generated seconds column with the mean as well.
    synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])

    # Convert the seconds back into a timedelta column and drop the helper column.
    synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)

    # Apply the generation constraints (allowed categories and numeric ranges).
    synthetic_subset = enforce_categorical_conditions(synthetic_subset)
    synthetic_subset = enforce_numerical_conditions(synthetic_subset)

    synthetic_frames.append(synthetic_subset)

# Combined synthetic data for all classes (an empty frame if there were no classes).
all_synthetic_data = (
    pd.concat(synthetic_frames, ignore_index=True) if synthetic_frames else pd.DataFrame()
)

# Report the final shape.
print("\nFinal All Synthetic Data Shape:", all_synthetic_data.shape)


100%|██████████| 13/13 [3:06:02<00:00, 858.62s/it] 


Final All Synthetic Data Shape: (13000, 63)





In [46]:
# Make sure the output directory exists so the export cannot fail with a
# missing-directory error after the long generation run above.
os.makedirs('submission', exist_ok=True)
# UTF-8 with BOM, no index column.
all_synthetic_data.to_csv('submission/syn_submission.csv', encoding='UTF-8-sig', index=False)

## 성능용 합성데이터

CTGAN

In [47]:
from ctgan import CTGAN 
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
from sdv.metadata import SingleTableMetadata

# 이상치 처리 함수
def handle_outliers(series, n_std=3):
    """Replace values further than ``n_std`` standard deviations from the mean with the mean.

    Missing values are ignored when computing the mean and the (population,
    ``ddof=0``) standard deviation, and are left untouched in the result.
    Previously a single NaN made every z-score NaN, so no outlier was ever
    replaced.

    Args:
        series: Numeric ``pandas.Series`` to clean. It is not modified.
        n_std: Threshold, in standard deviations, above which a value counts
            as an outlier.

    Returns:
        A new ``pandas.Series`` with the same index in which outliers are
        replaced by the mean of the non-missing values.
    """
    mean = series.mean()
    # ddof=0 keeps the same definition scipy.stats.zscore used before.
    std = series.std(ddof=0)

    # Constant, empty or all-NaN input: a z-score is undefined, nothing to mask.
    if not std > 0:
        return series.copy()

    z_scores = ((series - mean).abs() / std).to_numpy()
    # NaN z-scores compare False, so missing values pass through unchanged.
    return series.mask(z_scores > n_std, mean)

# 범주형 데이터 조건 강제 함수
def enforce_categorical_conditions(df):
    """Force the categorical columns of ``df`` back into their allowed value sets.

    The frame is modified in place and also returned.

    * ``Customer_Gender``: anything other than ``'male'`` becomes ``'female'``.
    * ``Customer_credit_rating``: values outside S/A/B/C/D/E become ``'B'``.
    * ``Customer_loan_type``: values outside a/b/c/d/e become ``'c'``.
    * Binary flag columns: values equal to 1 become 1, everything else 0.
    """
    gender = df['Customer_Gender']
    df['Customer_Gender'] = gender.where(gender == 'male', 'female')

    # (column, allowed values, replacement for anything not allowed)
    fallback_rules = (
        ('Customer_credit_rating', ['S', 'A', 'B', 'C', 'D', 'E'], 'B'),
        ('Customer_loan_type', ['a', 'b', 'c', 'd', 'e'], 'c'),
    )
    for name, allowed, replacement in fallback_rules:
        values = df[name]
        df[name] = values.where(values.isin(allowed), replacement)

    flag_columns = (
        'Customer_flag_change_of_authentication_1',
        'Customer_flag_change_of_authentication_2',
        'Customer_flag_change_of_authentication_3',
        'Customer_flag_change_of_authentication_4',
        'Customer_rooting_jailbreak_indicator',
        'Customer_mobile_roaming_indicator',
        'Customer_VPN_Indicator',
        'Customer_flag_terminal_malicious_behavior_1',
        'Customer_flag_terminal_malicious_behavior_2',
        'Customer_flag_terminal_malicious_behavior_3',
        'Customer_flag_terminal_malicious_behavior_4',
        'Customer_flag_terminal_malicious_behavior_5',
        'Customer_flag_terminal_malicious_behavior_6',
        'Customer_inquery_atm_limit',
        'Customer_increase_atm_limit',
        'Account_indicator_release_limit_excess',
        'Account_indicator_Openbanking',
        'Account_release_suspention',
        'Transaction_Failure_Status',
        'Another_Person_Account',
        'Unused_terminal_status',
        'Flag_deposit_more_than_tenMillion',
        'Unused_account_status',
        'Recipient_account_suspend_status',
        'First_time_iOS_by_vulnerable_user',
    )
    for name in flag_columns:
        # Only an exact 1 survives; every other value collapses to 0.
        df[name] = df[name].eq(1).astype(int)

    return df

# Clamp numeric columns to their permitted ranges
def enforce_numerical_conditions(df):
    """Clip the numeric columns of ``df`` to fixed bounds.

    The frame is modified in place and the same object is returned.
    Each entry below maps a column to ``(lower, upper)``; ``None`` leaves
    that side unbounded.
    """
    bounds = {
        # Birth year is restricted to 1950-2004
        'Customer_Birthyear': (1950, 2004),
        # Balances, limits and remaining limits may not be negative
        'Account_initial_balance': (0, None),
        'Account_balance': (0, None),
        'Account_amount_daily_limit': (0, None),
        'Account_remaining_amount_daily_limit_exceeded': (0, None),
        # Monthly maxima may be negative, so they get a symmetric range
        'Account_one_month_max_amount': (-1000000, 1000000),
        'Account_dawn_one_month_max_amount': (-1000000, 1000000),
        # A standard deviation cannot be negative
        'Account_one_month_std_dev': (0, None),
        'Account_dawn_one_month_std_dev': (0, None),
        # Transaction amounts may be negative but are capped on both sides
        'Transaction_Amount': (-100000000, 100000000),
        # Distance and transaction counters may not be negative
        'Distance': (0, None),
        'Number_of_transaction_with_the_account': (0, None),
        'Transaction_history_with_the_account': (0, None),
    }

    for column, (low, high) in bounds.items():
        df[column] = df[column].clip(lower=low, upper=high)

    return df



# Convert Time_difference to total seconds and run it through the outlier handler
train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()
train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

# Distinct Fraud_Type labels; one CTGAN synthesizer is trained per label
fraud_types = train['Fraud_Type'].unique()

# Maximum number of real rows per Fraud_Type used to fit each synthesizer
N_SAMPLE = 100

# Explicit SDV sdtypes that override the auto-detected metadata.
# Columns not listed here keep whatever detect_from_dataframe() infers.
# The mapping does not depend on the loop variable, so it is built once.
column_sdtypes = {
    'Customer_Birthyear': 'numerical',
    'Customer_personal_identifier': 'categorical',
    'Customer_identification_number': 'categorical',
    'Customer_credit_rating': 'categorical',
    'Account_account_number': 'categorical',
    'Account_initial_balance': 'numerical',
    'Account_balance': 'numerical',
    'Account_amount_daily_limit': 'numerical',
    'Account_indicator_Openbanking': 'categorical',
    'Account_remaining_amount_daily_limit_exceeded': 'numerical',
    'Account_one_month_max_amount': 'numerical',
    'Account_one_month_std_dev': 'numerical',
    'Account_dawn_one_month_max_amount': 'numerical',
    'Account_dawn_one_month_std_dev': 'numerical',
    'Transaction_Amount': 'numerical',
    'IP_Address': 'ipv4_address',
    'Location': 'categorical',
    'Recipient_Account_Number': 'categorical',
    'Transaction_num_connection_failure': 'numerical',
    'Distance': 'numerical',
    'Time_difference_seconds': 'numerical',
    'Number_of_transaction_with_the_account': 'numerical',
    'Transaction_history_with_the_account': 'numerical',
    'Fraud_Type': 'categorical',
}

# Per-class synthetic frames are collected here and concatenated once at the end
# (avoids repeatedly concatenating onto an initially empty DataFrame).
synthetic_parts = []

# Train one synthesizer per Fraud_Type and sample from it
for fraud_type in tqdm(fraud_types):

    # Real rows of this Fraud_Type
    subset = train[train["Fraud_Type"] == fraud_type]

    # Use at most N_SAMPLE rows; min() keeps sample() from raising when a
    # class has fewer rows than N_SAMPLE
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # Only the seconds representation of Time_difference is modelled
    subset = subset.drop('Time_difference', axis=1)

    # Auto-detect metadata, drop any primary key, then apply the explicit overrides
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)
    metadata.set_primary_key(None)
    for column, sdtype in column_sdtypes.items():
        metadata.update_column(
            column_name=column,
            sdtype=sdtype
        )

    synthesizer = CTGANSynthesizer(
                            metadata,
                            epochs=1000
                        )
    synthesizer.fit(subset)

    # Number of synthetic rows generated per class
    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN_2)

    # Outlier handling on the generated seconds, then back to a timedelta column
    synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])
    synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)

    # Enforce the generation constraints (categorical and numeric)
    synthetic_subset = enforce_categorical_conditions(synthetic_subset)
    synthetic_subset = enforce_numerical_conditions(synthetic_subset)

    synthetic_parts.append(synthetic_subset)

# Single concatenation of all per-class frames
all_synthetic_data_ctgan = (
    pd.concat(synthetic_parts, ignore_index=True) if synthetic_parts else pd.DataFrame()
)

# Final shape check
print("\nFinal All Synthetic Data ctgan Shape:", all_synthetic_data_ctgan.shape)


100%|██████████| 13/13 [46:29<00:00, 214.59s/it]


Final All Synthetic Data ctgan Shape: (65, 63)





## 원본 데이터와 concat

In [48]:
# Append the synthetic rows to the original training data (ID column removed).
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the synthetic rows keep index labels that already exist in the original
# data, which makes any later label-based lookup or alignment ambiguous.
origin_train = train_all.drop(columns="ID")
train_total = pd.concat([origin_train, all_synthetic_data_ctgan], ignore_index=True)
train_total.shape

(120065, 63)

# Data Preprocessing 1 : Select x, y

In [49]:
# Target column, then every remaining column as model input
train_y = train_total['Fraud_Type']
train_x = train_total.drop(columns='Fraud_Type')

# The test frame only has its ID column removed
test_x = test_all.drop(columns='ID')

# Data Preprocessing 2 : 범주형 변수 인코딩

In [50]:
# Fit the label encoder on the target, then turn the labels into integer codes
le_subclass = LabelEncoder().fit(train_y)
train_y_encoded = le_subclass.transform(train_y)

# Print which integer each original label was mapped to
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

원래 레이블: a, 변환된 숫자: 0
원래 레이블: b, 변환된 숫자: 1
원래 레이블: c, 변환된 숫자: 2
원래 레이블: d, 변환된 숫자: 3
원래 레이블: e, 변환된 숫자: 4
원래 레이블: f, 변환된 숫자: 5
원래 레이블: g, 변환된 숫자: 6
원래 레이블: h, 변환된 숫자: 7
원래 레이블: i, 변환된 숫자: 8
원래 레이블: j, 변환된 숫자: 9
원래 레이블: k, 변환된 숫자: 10
원래 레이블: l, 변환된 숫자: 11
원래 레이블: m, 변환된 숫자: 12


In [51]:
# Cast Time_difference to str before the categorical columns are selected
train_x['Time_difference'] = train_x['Time_difference'].astype(str)

# Ordinal-encode every object/category column; categories that were not seen
# when the encoder was fitted are encoded as -1
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical_columns = train_x.select_dtypes(include=['object', 'category']).columns

# Fit on the training features and store the codes in a copy of the frame
train_x_encoded = train_x.copy()
train_x_encoded[categorical_columns] = ordinal_encoder.fit_transform(
    train_x[categorical_columns]
)

In [52]:
# --- Engineered features for the training frame ---

# Row-wise sum of the four authentication-change flag columns
auth_change_flags = [
    'Customer_flag_change_of_authentication_1',
    'Customer_flag_change_of_authentication_2',
    'Customer_flag_change_of_authentication_3',
    'Customer_flag_change_of_authentication_4',
]
train_x_encoded['Customer_Total_Authentication_Changes'] = train_x_encoded[auth_change_flags].sum(axis=1)

# Row-wise sum of the six terminal malicious-behaviour flag columns
malicious_behavior_flags = [
    'Customer_flag_terminal_malicious_behavior_1',
    'Customer_flag_terminal_malicious_behavior_2',
    'Customer_flag_terminal_malicious_behavior_3',
    'Customer_flag_terminal_malicious_behavior_4',
    'Customer_flag_terminal_malicious_behavior_5',
    'Customer_flag_terminal_malicious_behavior_6',
]
train_x_encoded['Customer_Total_Malicious_Behavior_Flags'] = train_x_encoded[malicious_behavior_flags].sum(axis=1)

# Ratios; a zero denominator gives inf/NaN, which is cleaned up at the end
train_x_encoded['Daily_Usage_Ratio'] = train_x_encoded['Account_amount_daily_limit'].div(
    train_x_encoded['Account_remaining_amount_daily_limit_exceeded']
)
train_x_encoded['Transaction_Amount_per_Distance'] = train_x_encoded['Transaction_Amount'].div(
    train_x_encoded['Distance']
)

# Products with the (ordinal-encoded) channel, absolute amount, amount per prior transaction
train_x_encoded['Channel_Distance_Interaction'] = train_x_encoded['Channel'].mul(
    train_x_encoded['Distance']
)
train_x_encoded['Abs_Transaction_Amount'] = train_x_encoded['Transaction_Amount'].abs()
train_x_encoded['Transaction_Amount_per_Transaction_Count'] = train_x_encoded['Transaction_Amount'].div(
    train_x_encoded['Number_of_transaction_with_the_account'].add(1)
)

train_x_encoded['Channel_Transaction_Count_Interaction'] = train_x_encoded['Channel'].mul(
    train_x_encoded['Number_of_transaction_with_the_account']
)
train_x_encoded['Flag_Transaction_Interaction'] = train_x_encoded['Flag_deposit_more_than_tenMillion'].mul(
    train_x_encoded['Transaction_Amount']
)

# NOTE: .mean() is a single scalar, so this column holds the same value in every row
train_x_encoded['Transaction_Failure_Rate'] = train_x_encoded['Transaction_Failure_Status'].mean()

# 1 where the ATM-limit-increase column is positive, else 0
train_x_encoded['ATM_Limit_Increased'] = train_x_encoded['Customer_increase_atm_limit'].gt(0).astype(int)

# Turn +/-inf into NaN, then fill every NaN with its column mean
train_x_encoded = train_x_encoded.replace([np.inf, -np.inf], np.nan)
train_x_encoded = train_x_encoded.fillna(train_x_encoded.mean())


In [53]:
# Remember the training column order so the test frame can be aligned to it
feature_order = list(train_x_encoded.columns)

### test

In [54]:
# Encode the test features with the ordinal encoder that was fitted on the
# training features; the result is written into a copy of the test frame.
test_x_encoded = test_x.copy()
test_x_encoded[categorical_columns] = ordinal_encoder.transform(test_x[categorical_columns])

In [55]:
# --- Engineered features for the test frame ---
# Same feature definitions as for the training frame, so both frames end up
# with the same set of columns.

# Row-wise sum of the four authentication-change flag columns
test_x_encoded['Customer_Total_Authentication_Changes'] = test_x_encoded[
    ['Customer_flag_change_of_authentication_1', 'Customer_flag_change_of_authentication_2',
     'Customer_flag_change_of_authentication_3', 'Customer_flag_change_of_authentication_4']
].sum(axis=1)

# Row-wise sum of the six terminal malicious-behaviour flag columns
test_x_encoded['Customer_Total_Malicious_Behavior_Flags'] = test_x_encoded[
    ['Customer_flag_terminal_malicious_behavior_1', 'Customer_flag_terminal_malicious_behavior_2',
     'Customer_flag_terminal_malicious_behavior_3', 'Customer_flag_terminal_malicious_behavior_4',
     'Customer_flag_terminal_malicious_behavior_5', 'Customer_flag_terminal_malicious_behavior_6']
].sum(axis=1)

# Ratios; a zero denominator gives inf/NaN, which is cleaned up at the end
test_x_encoded['Daily_Usage_Ratio'] = test_x_encoded['Account_amount_daily_limit'] / test_x_encoded['Account_remaining_amount_daily_limit_exceeded']

test_x_encoded['Transaction_Amount_per_Distance'] = test_x_encoded['Transaction_Amount'] / test_x_encoded['Distance']
test_x_encoded['Channel_Distance_Interaction'] = test_x_encoded['Channel'] * test_x_encoded['Distance']
test_x_encoded['Abs_Transaction_Amount'] = test_x_encoded['Transaction_Amount'].abs()
test_x_encoded['Transaction_Amount_per_Transaction_Count'] = test_x_encoded['Transaction_Amount'] / (test_x_encoded['Number_of_transaction_with_the_account'] + 1)

test_x_encoded['Channel_Transaction_Count_Interaction'] = test_x_encoded['Channel'] * test_x_encoded['Number_of_transaction_with_the_account']
test_x_encoded['Flag_Transaction_Interaction'] = test_x_encoded['Flag_deposit_more_than_tenMillion'] * test_x_encoded['Transaction_Amount']

# NOTE: .mean() is a single scalar, so this column holds the same value in every row
test_x_encoded['Transaction_Failure_Rate'] = test_x_encoded['Transaction_Failure_Status'].mean()

# 1 where the ATM-limit-increase column is positive, else 0
test_x_encoded['ATM_Limit_Increased'] = (test_x_encoded['Customer_increase_atm_limit'] > 0).astype(int)

# Turn +/-inf into NaN so they are imputed together with the other missing values
test_x_encoded.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute with the TRAINING column means rather than the test-set means:
# the model is fitted on features imputed with training statistics, so the
# test features must be filled with the same values to stay consistent.
# Keys of the mean Series that are not test columns are simply ignored.
test_x_encoded.fillna(train_x_encoded.mean(), inplace=True)

In [56]:
# Put the test columns in the training order and cast every column to the
# dtype of the matching training column, in a single astype() call
training_dtypes = train_x_encoded[feature_order].dtypes.to_dict()
test_x_encoded = test_x_encoded[feature_order].astype(training_dtypes)

In [57]:
train_x_encoded.shape

(120065, 73)

In [58]:
test_x_encoded.shape

(120000, 73)

In [59]:
# Attach the encoded target so rows can be sampled per class below
train_x_encoded = train_x_encoded.assign(Fraud_Type=train_y_encoded)

In [60]:
# Fraction of each encoded class to keep: classes 0-11 are kept in full,
# class 12 is reduced to 0.4% of its rows
target_ratios = {encoded_class: 1.0 for encoded_class in range(12)}
target_ratios[12] = 0.004


def _sample_class(frame, encoded_class, keep_ratio):
    """Return a reproducible sample of ``keep_ratio`` of the rows of one class."""
    class_rows = frame[frame['Fraud_Type'] == encoded_class]
    return class_rows.sample(
        n=int(len(class_rows) * keep_ratio), replace=False, random_state=42
    )


# One sampled frame per class, in the order of target_ratios
df_list = [
    _sample_class(train_x_encoded, encoded_class, keep_ratio)
    for encoded_class, keep_ratio in target_ratios.items()
]

# Stack the per-class samples and renumber the rows
df_concat = pd.concat(df_list, axis=0).reset_index(drop=True)

# Class counts after sampling
print(df_concat.value_counts('Fraud_Type'))

Fraud_Type
12    475
0     105
1     105
2     105
3     105
4     105
5     105
6     105
7     105
8     105
9     105
10    105
11    105
dtype: int64


In [61]:
# Split the down-sampled frame back into target and features
train_y_encoded_down = df_concat['Fraud_Type']
train_x_encoded_down = df_concat.drop(columns='Fraud_Type')

In [62]:
import xgboost as xgb
import shap
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 25% of the down-sampled data; SHAP values are computed on that part
X_train, X_test, y_train, y_test = train_test_split(train_x_encoded_down, train_y_encoded_down, test_size=0.25, random_state=42)

# Multi-class XGBoost model used only to rank the features
model = xgb.XGBClassifier(objective="multi:softprob", eval_metric="mlogloss")
model.fit(X_train, y_train)

# SHAP values for the held-out rows
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Depending on the installed SHAP release, multi-class output is either a list
# with one (n_samples, n_features) array per class, or a single array of shape
# (n_samples, n_features, n_classes). Bring both into the 3-D layout so the
# aggregation below always yields one value per feature; iterating the 3-D
# array directly would iterate over samples and give one value per class.
if isinstance(shap_values, list):
    abs_shap = np.stack([np.abs(class_values) for class_values in shap_values], axis=-1)
else:
    abs_shap = np.abs(np.asarray(shap_values))
    if abs_shap.ndim == 2:
        # single-output case: add a trailing class axis of length 1
        abs_shap = abs_shap[:, :, np.newaxis]

# Mean |SHAP| over the samples for each class, then averaged over the classes
shap_values_mean = abs_shap.mean(axis=0).mean(axis=-1)

# Importance table, most important feature first
overall_importance_df = pd.DataFrame({
    'Feature': X_test.columns,
    'Importance': shap_values_mean
}).sort_values(by='Importance', ascending=False)

print("전체 모델의 피처 중요도:")
overall_importance_df.head()


전체 모델의 피처 중요도:


Unnamed: 0,Feature,Importance
70,Flag_Transaction_Interaction,0.535425
38,Channel,0.385711
50,Distance,0.367875
37,Transaction_Amount,0.351628
58,Number_of_transaction_with_the_account,0.27675


In [63]:
# Importance cut-off: only features whose importance is strictly greater are kept
threshold = 0

# Rows of the importance table that pass the cut-off
passes_threshold = overall_importance_df['Importance'].gt(threshold)
selected_features = overall_importance_df.loc[passes_threshold]

# Show the surviving features
print("임계값 이상인 피처들:")
print(selected_features)

# Names of the surviving features, most important first
selected_feature_names = list(selected_features['Feature'])

임계값 이상인 피처들:
                                        Feature  Importance
70                 Flag_Transaction_Interaction    0.535425
38                                      Channel    0.385711
50                                     Distance    0.367875
37                           Transaction_Amount    0.351628
58       Number_of_transaction_with_the_account    0.276750
..                                          ...         ...
7      Customer_flag_change_of_authentication_2    0.004009
16  Customer_flag_terminal_malicious_behavior_3    0.003481
14  Customer_flag_terminal_malicious_behavior_1    0.001109
17  Customer_flag_terminal_malicious_behavior_4    0.001075
18  Customer_flag_terminal_malicious_behavior_5    0.000735

[66 rows x 2 columns]


In [64]:
# Keep only the selected features, in importance order, for training and test
train_x_shap = train_x_encoded_down.loc[:, selected_feature_names]
test_x_shap = test_x_encoded.loc[:, selected_feature_names]

In [65]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score


def optimize_catboost(trial):
    """Optuna objective: mean 3-fold macro-F1 of a CatBoost model.

    Args:
        trial: the Optuna trial used to sample ``depth``, ``learning_rate``
            and ``iterations``.

    Returns:
        Mean ``f1_macro`` score over 3 cross-validation folds on
        ``train_x_shap`` / ``train_y_encoded_down``.
    """
    params = {
        'depth': trial.suggest_int('depth', 4, 8),
        # suggest_loguniform is deprecated; suggest_float(..., log=True)
        # samples from the same log-uniform range
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.2, log=True),
        'iterations': trial.suggest_int('iterations', 100, 600),
    }

    model = CatBoostClassifier(**params, verbose=0, random_state=42)

    # Cross-validated macro-F1 is the value Optuna maximises
    return cross_val_score(model, train_x_shap, train_y_encoded_down, cv=3, scoring='f1_macro').mean()


# Create the study and run the search
catboost_study = optuna.create_study(direction='maximize')
catboost_study.optimize(optimize_catboost, n_trials=100)

# Best hyperparameters found
print("Best CatBoost hyperparameters: ", catboost_study.best_params)

# Unfitted CatBoost model configured with the best hyperparameters
best_catboost_model = CatBoostClassifier(
    depth=catboost_study.best_params['depth'],
    learning_rate=catboost_study.best_params['learning_rate'],
    iterations=catboost_study.best_params['iterations'],
    verbose=0,
    random_state=42
)


[I 2024-08-29 19:59:29,178] A new study created in memory with name: no-name-e7573517-dde7-4d9d-8aa4-23506a1aee0e
[I 2024-08-29 20:00:27,234] Trial 0 finished with value: 0.5818069404504893 and parameters: {'depth': 7, 'learning_rate': 0.13784275298491652, 'iterations': 352}. Best is trial 0 with value: 0.5818069404504893.
[I 2024-08-29 20:00:39,153] Trial 1 finished with value: 0.418086458497784 and parameters: {'depth': 5, 'learning_rate': 0.0017413944477009783, 'iterations': 324}. Best is trial 0 with value: 0.5818069404504893.
[I 2024-08-29 20:00:46,620] Trial 2 finished with value: 0.4728390067955925 and parameters: {'depth': 4, 'learning_rate': 0.008582048380670181, 'iterations': 392}. Best is trial 0 with value: 0.5818069404504893.
[I 2024-08-29 20:01:05,227] Trial 3 finished with value: 0.46940580836252405 and parameters: {'depth': 7, 'learning_rate': 0.02945593564824172, 'iterations': 112}. Best is trial 0 with value: 0.5818069404504893.
[I 2024-08-29 20:01:17,753] Trial 4 fin

Best CatBoost hyperparameters:  {'depth': 4, 'learning_rate': 0.09029978363192846, 'iterations': 484}


In [66]:
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier


def optimize_xgboost(trial):
    """Optuna objective: mean 3-fold macro-F1 of an XGBoost model.

    Args:
        trial: the Optuna trial used to sample ``n_estimators``, ``max_depth``,
            ``learning_rate``, ``subsample`` and ``colsample_bytree``.

    Returns:
        Mean ``f1_macro`` score over 3 cross-validation folds on
        ``train_x_shap`` / ``train_y_encoded_down``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        # suggest_loguniform is deprecated; suggest_float(..., log=True)
        # samples from the same log-uniform range
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # The target has more than two classes, so the multi-class log loss is the
    # matching metric (the feature-ranking model in this notebook uses it too)
    model = XGBClassifier(**params, eval_metric='mlogloss', random_state=42)

    # Cross-validated macro-F1 is the value Optuna maximises
    return cross_val_score(model, train_x_shap, train_y_encoded_down, cv=3, scoring='f1_macro').mean()


# Create the study and run the search
xgboost_study = optuna.create_study(direction='maximize')
xgboost_study.optimize(optimize_xgboost, n_trials=200)

# Best hyperparameters found
print("Best XGBoost hyperparameters: ", xgboost_study.best_params)

# Unfitted XGBoost model configured with the best hyperparameters
best_xgboost_model = XGBClassifier(
    n_estimators=xgboost_study.best_params['n_estimators'],
    max_depth=xgboost_study.best_params['max_depth'],
    learning_rate=xgboost_study.best_params['learning_rate'],
    subsample=xgboost_study.best_params['subsample'],
    colsample_bytree=xgboost_study.best_params['colsample_bytree'],
    eval_metric='mlogloss',
    random_state=42
)


[I 2024-08-29 20:30:46,862] A new study created in memory with name: no-name-f55b4507-16b1-4262-8b56-76248efef5cd
[I 2024-08-29 20:30:55,293] Trial 0 finished with value: 0.6328392425228907 and parameters: {'n_estimators': 225, 'max_depth': 7, 'learning_rate': 0.0022073388499135433, 'subsample': 0.7076315678547631, 'colsample_bytree': 0.9509490162048375}. Best is trial 0 with value: 0.6328392425228907.
[I 2024-08-29 20:30:59,028] Trial 1 finished with value: 0.6592205887952011 and parameters: {'n_estimators': 149, 'max_depth': 5, 'learning_rate': 0.021743280421842107, 'subsample': 0.822315952291619, 'colsample_bytree': 0.7091142690541201}. Best is trial 1 with value: 0.6592205887952011.
[I 2024-08-29 20:31:19,295] Trial 2 finished with value: 0.6464109756863377 and parameters: {'n_estimators': 535, 'max_depth': 8, 'learning_rate': 0.004195611562779018, 'subsample': 0.6945560745583423, 'colsample_bytree': 0.7509293648658424}. Best is trial 1 with value: 0.6592205887952011.
[I 2024-08-29

Best XGBoost hyperparameters:  {'n_estimators': 394, 'max_depth': 4, 'learning_rate': 0.021195374812380065, 'subsample': 0.9889743943366038, 'colsample_bytree': 0.9658659242376666}


In [67]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier

# Members of the ensemble: the two tuned gradient-boosting models
ensemble_members = [
    ('catboost', best_catboost_model),
    ('xgboost', best_xgboost_model),
]

# Soft voting averages the members' predicted class probabilities
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# Fit the ensemble on the down-sampled, feature-selected training data
voting_clf.fit(train_x_shap, train_y_encoded_down)


In [68]:
# Predict the encoded class of every test row with the fitted ensemble,
# then map the integer codes back to the original Fraud_Type labels.
predictions = voting_clf.predict(test_x_shap)
predictions_label = le_subclass.inverse_transform(predictions)

# Submission

In [69]:
# Classification submission frame: the sample submission with its Fraud_Type
# column set to the predicted labels.
# It must be saved under the file name clf_submission.csv.
clf_submission = pd.read_csv("sample_submission.csv").assign(Fraud_Type=predictions_label)
clf_submission.head()

Unnamed: 0,ID,Fraud_Type
0,TEST_000000,b
1,TEST_000001,m
2,TEST_000002,m
3,TEST_000003,m
4,TEST_000004,h


In [70]:
# 합성 데이터 생성 결과 제출 데이터프레임(DataFrame)
# 합성 데이터 생성 결과 데이터프레임 파일명을 반드시 syn_submission.csv 로 지정해야합니다.
# all_synthetic_data.head()

In [71]:
# (*) Double-check the file names when saving:
#     1. classification predictions -> clf_submission.csv
#     2. synthetic data             -> syn_submission.csv
# (*) Both files must be inside the submitted zip under exactly these names;
#     otherwise the submission cannot be scored.
from datetime import datetime
from pathlib import Path

today_datetime = datetime.today().strftime('%y%m%d_%H%M')

# Work from the project root. Files are addressed through explicit paths
# instead of chdir-ing into the sub-folder, so an error part-way through
# cannot leave the kernel in the wrong working directory.
os.chdir('G:/내 드라이브/DACON_proj/DACON/2024_FSI_AIxData_Challenge')
submission_dir = Path('submission')
submission_dir.mkdir(exist_ok=True)

clf_path = submission_dir / 'clf_submission.csv'
syn_path = submission_dir / 'syn_submission.csv'

# Save the classification predictions
clf_submission.to_csv(clf_path, encoding='UTF-8-sig', index=False)

# syn_submission.csv is NOT written by this cell; it has to exist already.
# Check before creating the archive so a missing file gives a clear message
# instead of a half-written zip.
if not syn_path.is_file():
    raise FileNotFoundError(
        f"{syn_path} not found: save the synthetic-data submission there before packaging."
    )

# Build the zip; arcname keeps both files at the archive root under the required names
with zipfile.ZipFile(submission_dir / f'submission_{today_datetime}.zip', 'w') as submission:
    submission.write(clf_path, arcname=clf_path.name)
    submission.write(syn_path, arcname=syn_path.name)

print('Done.')

Done.


In [72]:
# Audible notification that the notebook has finished
frequency = 1000  # tone frequency in Hertz
duration = 300    # tone length in milliseconds

try:
    # winsound only exists on Windows
    import winsound
except ImportError:
    # Elsewhere fall back to the terminal bell instead of failing the cell
    print('\a', end='', flush=True)
else:
    winsound.Beep(frequency, duration)