In [18]:
# 제출 파일 생성 관련
import os
import zipfile

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# 머신러닝 전처리
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# 머신러닝 모델
import xgboost as xgb

# 합성 데이터 생성
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer, TVAESynthesizer

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')

In [19]:
# Read the raw competition tables; paths are relative to the notebook's working directory.
test_all = pd.read_csv("../data/test.csv")
train_all = pd.read_csv("../data/train.csv")

# Working copy of the training table without the ID column.
train = train_all.drop("ID", axis=1)

# Show how many rows each Fraud_Type label has.
train.loc[:, "Fraud_Type"].value_counts()

Fraud_Type
m    118800
a       100
j       100
h       100
k       100
c       100
g       100
i       100
b       100
f       100
d       100
e       100
l       100
Name: count, dtype: int64

In [32]:
# Display the test table loaded from ../data/test.csv.
test_all

Unnamed: 0,ID,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,...,Unused_terminal_status,Last_atm_transaction_datetime,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Transaction_resumed_date
0,TEST_000000,1960,female,주지아,DOMcBN-kRMFflJ,2003-01-07 10:59:08,E,1,0,0,...,1,2003-01-10 05:27:56,2003-01-08 05:27:56,0,1,1,0,0,0,2003-01-08 05:27:56
1,TEST_000001,1960,female,주지아,DOMcBN-kRMFflJ,2003-01-07 10:59:08,E,1,1,1,...,0,2003-01-11 21:29:50,2003-01-08 05:27:56,0,1,0,0,0,0,2003-01-08 05:27:56
2,TEST_000002,1951,male,김정수,pZrAvI-mhxfVyw,2003-01-06 18:10:55,B,1,1,1,...,0,2003-01-13 01:08:19,2003-01-13 01:08:19,1,0,0,2,2,0,2003-01-13 01:08:19
3,TEST_000003,1999,female,김현지,fVlbzX-wvugTpH,2003-01-08 05:28:53,B,0,1,1,...,1,2003-01-21 10:03:32,2003-01-26 13:49:24,0,1,1,0,0,0,2003-01-20 10:03:32
4,TEST_000004,1996,female,박은정,chYftA-AjVuXMW,2003-01-17 03:37:22,A,0,1,0,...,1,2003-01-28 19:04:19,2003-01-28 19:04:19,0,1,1,0,0,0,2003-01-28 19:04:19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,TEST_119995,2004,male,김준혁,bGEIhk-AliYKgY,2024-12-24 02:29:11,D,1,0,0,...,1,2054-02-20 19:20:47,2053-02-13 08:49:51,0,1,0,0,0,0,2054-02-20 19:20:47
119996,TEST_119996,1955,male,백지후,QiwTkT-mJyULxr,2010-07-15 01:27:01,B,0,0,1,...,1,2054-08-17 01:50:35,2053-08-03 00:00:34,0,0,0,1,4,0,2054-08-17 01:50:35
119997,TEST_119997,1987,female,김예원,GqwqCU-MxnBKdl,2011-09-30 02:32:19,B,0,1,1,...,1,2054-10-15 23:21:57,2054-10-16 02:17:10,1,1,0,0,0,0,2054-10-15 18:58:16
119998,TEST_119998,2004,female,이예진,FuMoei-UYOnYPv,2024-04-02 11:15:06,C,1,1,1,...,1,2055-02-15 08:59:13,2050-12-20 07:39:32,0,1,1,0,0,0,2055-02-15 08:59:13


In [24]:
# Number of synthetic rows sampled from each per-class synthesizer.
N_CLS_PER_GEN = 1000


def handle_outliers(series, n_std=3):
    """Replace values far from the mean with the mean.

    A value is treated as an outlier when its absolute z-score, computed with
    the population standard deviation (ddof=0), is strictly greater than
    ``n_std``. Missing values are skipped when computing the mean and the
    standard deviation and are returned unchanged, so a single NaN no longer
    disables the outlier replacement for the whole series.

    Parameters
    ----------
    series : pandas.Series
        Numeric series to clean.
    n_std : float, default 3
        Z-score threshold above which a value is replaced.

    Returns
    -------
    pandas.Series
        A new series with outliers replaced by the mean of ``series``.
        The input is not modified.
    """
    mean = series.mean()
    std = series.std(ddof=0)

    # Empty, all-NaN or constant input has no meaningful z-score: nothing to replace.
    if not np.isfinite(std) or std == 0:
        return series.copy()

    z_scores = (series - mean).abs() / std
    # NaN z-scores compare False, so missing values pass through untouched.
    return series.mask(z_scores > n_std, mean)

# Express Time_difference as a number of seconds in a new column.
elapsed = pd.to_timedelta(train['Time_difference'])
train['Time_difference_seconds'] = elapsed.dt.total_seconds()

# Outlier replacement on the real training data is left disabled:
# train['Time_difference_seconds'] = handle_outliers(train['Time_difference_seconds'])

In [3]:
# Every Fraud_Type label present in the training data (including "m").
fraud_types = train['Fraud_Type'].unique()

# Maximum number of real rows used to fit each per-class synthesizer.
N_SAMPLE = 100

# Column types that override SDV's automatic metadata detection.
# The mapping is the same for every class, so it is built once outside the loop.
column_sdtypes = {
    'Account_initial_balance': 'numerical',
    'Account_balance': 'numerical',
    'Customer_identification_number': 'categorical',
    'Customer_personal_identifier': 'categorical',
    'Account_account_number': 'categorical',
    'IP_Address': 'ipv4_address',
    'Location': 'categorical',
    'Recipient_Account_Number': 'categorical',
    'Fraud_Type': 'categorical',
    'Time_difference_seconds': 'numerical',
    'Customer_Birthyear': 'numerical'
}

# Synthetic rows are collected per class and concatenated once after the loop,
# instead of re-copying a growing DataFrame on every iteration.
synthetic_parts = []

# Fit one synthesizer per Fraud_Type and sample N_CLS_PER_GEN rows from it.
for fraud_type in tqdm(fraud_types):

    # Rows of the current class only.
    subset = train[train["Fraud_Type"] == fraud_type]

    # Use at most N_SAMPLE rows; capping at len(subset) keeps a class with
    # fewer rows from raising inside DataFrame.sample.
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # Drop the original Time_difference column; only the seconds version is modeled.
    subset = subset.drop('Time_difference', axis=1)

    # Detect metadata from the subset, then apply the explicit column types.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)
    # NOTE(review): presumably clears any primary key guessed by detection — confirm
    # against the installed SDV version.
    metadata.set_primary_key(None)

    for column, sdtype in column_sdtypes.items():
        metadata.update_column(
            column_name=column,
            sdtype=sdtype
        )

    synthesizer = TVAESynthesizer(
        metadata,
        epochs=100
    )
    synthesizer.fit(subset)

    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)

    # Replace outliers in the generated Time_difference_seconds values.
    synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])

    # Convert the seconds back into a timedelta column named Time_difference.
    synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')

    # The helper seconds column is not part of the final table.
    synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)

    synthetic_parts.append(synthetic_subset)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was generated.
if synthetic_parts:
    all_synthetic_data = pd.concat(synthetic_parts, ignore_index=True)
else:
    all_synthetic_data = pd.DataFrame()

# Inspect the final result.
print("\nFinal All Synthetic Data Shape:", all_synthetic_data.shape)
all_synthetic_data.head()

100%|██████████| 13/13 [01:14<00:00,  5.72s/it]


Final All Synthetic Data Shape: (13000, 63)





Unnamed: 0,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,Customer_flag_change_of_authentication_4,...,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Fraud_Type,Transaction_resumed_date,Time_difference
0,1961,male,김민서,uhnwZi-vUVOlHE,2011-03-02 06:16:34,B,1,1,1,1,...,2017-03-06 15:33:46,0,1,1,0,0,0,m,2020-05-14 02:02:51,0 days 13:08:30
1,1971,female,김민서,uhnwZi-vUVOlHE,2009-03-07 00:58:14,B,1,1,1,1,...,2029-02-25 20:36:23,0,1,0,0,0,0,m,2029-10-20 14:17:05,1 days 07:42:08
2,1957,female,한준혁,uhnwZi-vUVOlHE,2009-04-28 23:53:01,B,1,1,1,1,...,2019-12-21 10:14:03,0,1,1,0,0,0,m,2026-03-31 07:50:46,3 days 04:15:16
3,1964,male,이상훈,uhnwZi-vUVOlHE,2004-12-07 02:45:09,B,1,1,1,1,...,2021-07-05 12:14:53,0,1,0,0,0,0,m,2015-05-15 15:29:53,6 days 05:35:08
4,1959,male,김정훈,YuMAdG-cURFSEF,2005-10-23 15:55:22,B,1,1,1,1,...,2012-04-16 02:43:13,0,1,0,0,0,0,m,2007-05-19 14:09:24,0 days 00:01:38


In [5]:
# Per-class row counts of the generated synthetic data.
all_synthetic_data["Fraud_Type"].value_counts()

Fraud_Type
m    1000
a    1000
j    1000
h    1000
k    1000
c    1000
g    1000
i    1000
b    1000
f    1000
d    1000
e    1000
l    1000
Name: count, dtype: int64

In [9]:
# Real training rows without the ID column.
origin_train = train_all.drop(columns="ID")

# Stack the real rows and the synthetic rows. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it the synthetic rows reuse index labels that
# real rows already hold, which makes label-based selection ambiguous.
train_total = pd.concat([origin_train, all_synthetic_data], ignore_index=True)
train_total.shape

(133000, 63)

In [10]:
# Separate the target column from the training features.
train_y = train_total['Fraud_Type']
train_x = train_total.drop('Fraud_Type', axis=1)

# Test features: every column except the row identifier.
test_x = test_all.drop('ID', axis=1)

In [11]:
# Per-class row counts after combining real and synthetic rows.
train_total["Fraud_Type"].value_counts()

Fraud_Type
m    119800
a      1100
j      1100
h      1100
k      1100
c      1100
g      1100
i      1100
b      1100
f      1100
d      1100
e      1100
l      1100
Name: count, dtype: int64

In [12]:
# Cast Time_difference to str, presumably so the column holds a single type before encoding.
# NOTE(review): the synthetic rows carry timedelta values (built with pd.to_timedelta) while
# the rows read from CSV presumably carry strings — confirm both stringify to the same format.
train_x["Time_difference"] = train_x["Time_difference"].astype(str)

In [13]:
# Encode the target labels as integers.
le_subclass = LabelEncoder()
train_y_encoded = le_subclass.fit_transform(train_y)

# Print which integer each original label was mapped to.
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

# Ordinal-encode every object/category column of the training features.
# Categories not seen during fitting are encoded as -1 at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical_columns = train_x.select_dtypes(include=['object', 'category']).columns
encoded_train_values = ordinal_encoder.fit_transform(train_x[categorical_columns])

# Work on a copy so train_x keeps its original values.
train_x_encoded = train_x.copy()
train_x_encoded[categorical_columns] = encoded_train_values

# Remember the column order used for training.
feature_order = list(train_x_encoded.columns)

원래 레이블: a, 변환된 숫자: 0
원래 레이블: b, 변환된 숫자: 1
원래 레이블: c, 변환된 숫자: 2
원래 레이블: d, 변환된 숫자: 3
원래 레이블: e, 변환된 숫자: 4
원래 레이블: f, 변환된 숫자: 5
원래 레이블: g, 변환된 숫자: 6
원래 레이블: h, 변환된 숫자: 7
원래 레이블: i, 변환된 숫자: 8
원래 레이블: j, 변환된 숫자: 9
원래 레이블: k, 변환된 숫자: 10
원래 레이블: l, 변환된 숫자: 11
원래 레이블: m, 변환된 숫자: 12


In [20]:
# Count the rows of each encoded class and record, for every row, the position
# of its class in the sorted unique-label array.
unique_, inverse_, counts_ = np.unique(train_y_encoded, return_inverse=True, return_counts=True)
class_counts = dict(zip(unique_, counts_))

# Overall number of rows and number of distinct classes.
total_samples = len(train_y_encoded)
num_classes = len(class_counts)

# Weight of a class = total_samples / (num_classes * rows in that class).
per_class_weight = total_samples / (num_classes * counts_)
weights = dict(zip(unique_, per_class_weight))

# One weight per training row, looked up through the inverse index.
sample_weight = per_class_weight[inverse_]

In [21]:
# Hyperparameters of the XGBoost classifier.
# NOTE(review): use_label_encoder is reported as deprecated by recent XGBoost releases —
# confirm against the installed version before removing it.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=10,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
)

# Build the classifier and fit it on the encoded features with per-row weights.
model = xgb.XGBClassifier(**xgb_params)
model.fit(train_x_encoded[feature_order], train_y_encoded, sample_weight=sample_weight)

In [22]:
# Encode the test features with the encoder fitted on the training data.
encoded_test_values = ordinal_encoder.transform(test_x[categorical_columns])
test_x_encoded = test_x.copy()
test_x_encoded[categorical_columns] = encoded_test_values

# Use the training column order and cast every column to its training dtype.
test_x_encoded = test_x_encoded[feature_order]
test_x_encoded = test_x_encoded.astype(
    {col: train_x_encoded[col].dtype for col in feature_order}
)

# Predict encoded classes, then map them back to the original labels.
predictions = model.predict(test_x_encoded)
predictions_label = le_subclass.inverse_transform(predictions)

In [23]:
from datetime import datetime

In [24]:
# Classification submission: the sample submission with Fraud_Type filled with the predictions.
clf_submission = pd.read_csv("../submit/sample_submission.csv")
clf_submission["Fraud_Type"] = predictions_label

# File-name requirements (from the competition instructions):
#   1. The classification result must be saved as clf_submission.csv.
#   2. The synthetic data result must be saved as syn_submission.csv.
# Both files must exist under exactly these names inside the submitted zip;
# otherwise the submission cannot be scored.

# Timestamped output folder. Paths are built explicitly instead of calling os.chdir,
# so the kernel's working directory is unchanged and the notebook's other relative
# paths (../data, ../submit) keep working when cells are re-run.
now_ = datetime.now().strftime("%y%m%d%H%M%S")
submission_dir = f'../submit/{now_}submission'
os.makedirs(submission_dir, exist_ok=True)

# Save both CSV files into the output folder.
clf_path = os.path.join(submission_dir, 'clf_submission.csv')
syn_path = os.path.join(submission_dir, 'syn_submission.csv')
clf_submission.to_csv(clf_path, encoding='UTF-8-sig', index=False)
all_synthetic_data.to_csv(syn_path, encoding='UTF-8-sig', index=False)

# Create the zip next to the folder; arcname stores each file under its bare
# required name at the root of the archive.
with zipfile.ZipFile(f"{submission_dir}.zip", 'w') as submission:
    submission.write(clf_path, arcname='clf_submission.csv')
    submission.write(syn_path, arcname='syn_submission.csv')

print('Done.')

Done.
