1. 원본 데이터 불러오기
2. 데이터 생성 전 데이터 전처리 (이상치 처리 정도만)
3. 데이터 생성
4. 데이터 생성 후 데이터 전처리 
5. 변수 생성 및 선택
6. 분류
7. 검증 (stratified K-fold)
8. 예측

In [1]:
import sys
# Make the parent directory importable so project modules can be imported
# from this notebook. The guard keeps repeated executions of this cell from
# stacking duplicate "../" entries onto sys.path.
# NOTE: the entry is relative, so it resolves against the current working
# directory of the running kernel.
if "../" not in sys.path:
    sys.path.append("../")

In [2]:
from data_loaders import DataLoader
# Instantiate the project's DataLoader pointed at the ../data directory.
# Later cells read the training table from `data_loader.train`.
# NOTE(review): what DataLoader loads eagerly vs. lazily is not visible here.
data_loader = DataLoader(data_dir="../data")

In [4]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer, TVAESynthesizer
from tqdm import tqdm 
import warnings
# Suppress every warning category from this point on.
warnings.filterwarnings('ignore')

# Number of synthetic rows drawn from each fitted synthesizer.
N_CLS_PER_GEN = 1000
# Number of real rows sampled per fraud type before fitting a synthesizer.
N_SAMPLE = 100

# Explicit sdtype overrides applied on top of the auto-detected metadata.
# Insertion order matches the order in which the overrides are applied.
# 'Time_difference' ('numerical') is currently not overridden.
column_sdtypes = dict(
    Account_initial_balance='numerical',
    Account_balance='numerical',
    Customer_identification_number='categorical',
    Customer_personal_identifier='categorical',
    Account_account_number='categorical',
    IP_Address='ipv4_address',
    Location='categorical',
    Recipient_Account_Number='categorical',
    Fraud_Type='categorical',
    Customer_Birthyear='numerical',
)

import pandas as pd

# Fit one CTGAN synthesizer per fraud type on a small real sample and draw
# N_CLS_PER_GEN synthetic rows from each. Every class's synthetic rows are
# kept in `synthetic_subsets` and combined into `synthetic_data`; previously
# each iteration overwrote `synthetic_subset`, so only the last class's
# output survived the loop.
# After the loop, `subset`, `metadata`, `synthesizer` and `synthetic_subset`
# still refer to the last fraud type processed, as before.
fraud_types = data_loader.train['Fraud_Type'].unique()
synthetic_subsets = []
for fraud_type in tqdm(fraud_types):
    subset = data_loader.train[data_loader.train["Fraud_Type"] == fraud_type]
    # Cap the sample size at the class size: sampling without replacement
    # raises when a class has fewer than N_SAMPLE rows.
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)
    # Clear any auto-detected primary key before overriding column sdtypes.
    # NOTE(review): presumably needed because detection may pick one of the
    # overridden columns as the key -- confirm against the SDV version in use.
    metadata.set_primary_key(None)

    for column, sdtype in column_sdtypes.items():
        metadata.update_column(column_name=column, sdtype=sdtype)

    synthesizer = CTGANSynthesizer(metadata, epochs=100)
    synthesizer.fit(subset)

    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)
    synthetic_subsets.append(synthetic_subset)

# Combined synthetic table across all fraud types (empty frame if there were
# no fraud types to iterate over).
synthetic_data = (
    pd.concat(synthetic_subsets, ignore_index=True)
    if synthetic_subsets
    else pd.DataFrame()
)

100%|██████████| 13/13 [13:38<00:00, 62.97s/it]


In [23]:
# Run metadata auto-detection again on `subset`, this time without the
# primary-key reset or sdtype overrides, so the raw detection result can be
# inspected.
# NOTE(review): `subset` is whatever an earlier cell left bound -- presumably
# the sample for the last fraud type processed by the generation loop.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(subset)

In [24]:
# Show the detected metadata as this cell's output.
metadata

{
    "primary_key": "Account_initial_balance",
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "Customer_Birthyear": {
            "sdtype": "numerical"
        },
        "Customer_Gender": {
            "sdtype": "categorical"
        },
        "Customer_personal_identifier": {
            "sdtype": "unknown",
            "pii": true
        },
        "Customer_identification_number": {
            "sdtype": "unknown",
            "pii": true
        },
        "Customer_registration_datetime": {
            "sdtype": "datetime",
            "datetime_format": "%Y-%m-%d %H:%M:%S"
        },
        "Customer_credit_rating": {
            "sdtype": "categorical"
        },
        "Customer_flag_change_of_authentication_1": {
            "sdtype": "categorical"
        },
        "Customer_flag_change_of_authentication_2": {
            "sdtype": "categorical"
        },
        "Customer_flag_change_of_authentication_3": {
            "sdtype": "categorica

In [18]:
# Persist the fitted synthesizer to "test_synthesizer.pkl" in the current
# working directory.
# NOTE(review): `synthesizer` is whatever an earlier cell left bound --
# presumably the model for the last fraud type fitted; confirm this is the
# one intended to be saved.
synthesizer.save(
    "test_synthesizer.pkl"
)

In [19]:
import pickle

# Reload the saved synthesizer through SDV's own loader (the counterpart of
# `synthesizer.save`) instead of calling pickle.load on the file directly.
# SECURITY: this still unpickles the file under the hood -- only load
# synthesizer files that come from a trusted source.
ts = CTGANSynthesizer.load("test_synthesizer.pkl")

In [21]:
# Check that the reloaded synthesizer still works by drawing N_CLS_PER_GEN
# synthetic rows; the resulting table is shown as the cell output.
ts.sample(num_rows=N_CLS_PER_GEN)

Unnamed: 0,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,Customer_flag_change_of_authentication_4,...,Last_atm_transaction_datetime,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Fraud_Type,Transaction_resumed_date
0,1957,male,우지현,emitpe-rYrfRKX,2012-01-05 21:38:51,B,1,1,1,1,...,2022-05-09 23:29:41,2046-12-11 09:58:19,1,1,0,0,1,0,l,2046-11-27 16:12:07
1,1971,male,김정훈,ifreSY-NmNAKAY,2006-05-07 19:20:55,B,1,1,1,1,...,2035-02-24 03:53:25,2040-03-24 18:41:47,1,1,1,0,0,0,l,2043-02-06 22:46:25
2,1958,female,송서연,dtKzeu-JBIzyUK,2006-03-21 17:03:26,B,1,1,1,1,...,2030-12-04 01:16:26,2029-06-24 07:35:57,0,0,1,2,0,0,l,2043-10-27 08:16:12
3,1971,male,황광수,CydnTW-PxvvTxW,2005-12-06 22:02:03,B,1,1,1,1,...,2027-12-01 19:16:34,2035-02-10 21:30:54,1,1,0,1,0,0,l,2037-08-20 02:09:09
4,1961,female,손준영,MzNMMd-fPiEnme,2007-06-21 15:30:35,B,1,1,1,1,...,2033-11-02 21:35:05,2036-06-18 21:52:30,0,0,1,0,0,0,l,2046-11-27 16:12:07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1963,female,나민준,dxZwWA-kelhBDb,2010-06-16 11:08:34,A,1,1,1,1,...,2022-05-18 17:30:12,2030-03-09 07:27:52,0,1,0,0,0,0,l,2044-02-10 02:56:14
996,1973,male,김광수,SnjfWt-uwygNAu,2007-08-05 08:55:50,A,1,1,0,0,...,2028-03-19 00:02:05,2042-06-21 01:33:31,1,0,0,0,2,0,l,2046-11-27 16:12:07
997,1976,male,김준호,dtKzeu-JBIzyUK,2008-11-12 12:31:58,D,1,1,1,1,...,2026-01-28 15:56:19,2046-12-11 09:58:19,1,1,0,0,0,0,l,2042-10-03 00:47:25
998,1972,female,김수민,VYOHdA-iiByQVf,2008-01-12 03:52:26,B,1,1,1,1,...,2035-02-26 13:10:10,2032-07-27 08:00:48,0,0,0,0,0,0,l,2046-11-27 16:12:07
