# Loss 정상화를 해보겠습니다.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from Models.model import BaseModel
from torch.utils.data import Dataset, DataLoader
from utils.utils import *
from tqdm import tqdm
from sklearn.metrics import f1_score

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## 1. 데이터 로드 및 전처리

In [2]:
# Feature selection: columns are split into categorical inputs, numeric
# inputs, and columns that are not fed to the model directly.
cat_features = [
    'Gender',
    'Card Brand',
    'Card Type',
    'Expires',
    'Has Chip',
    'Year PIN last Changed',
    'Whether Security Chip is Used',
    'Day',
    'Error Message',
]

num_features = [
    'Current Age',
    'Retirement Age',
    'Per Capita Income - Zipcode',
    'Yearly Income',
    'Total Debt',
    'Credit Score',
    'Credit Limit',
    'Amount',
    'Since Open Month',
]

discarded = [
    'User',
    'Birth Year',
    'Birth Month',
    'Card',
    'Card Number',
    'Zipcode',
    'Merchandise Code',
    'Acct Open Date',
    'Year',
    'Month',
]

# Total input width: 5 values per categorical column plus one per numeric column.
print(5 * len(cat_features) + len(num_features))


54


In [None]:
def process_data(data_path, cat_features, num_features, mode='AE'):
    """Load the transaction CSV and build train/validation inputs.

    Steps: derive features, drop IQR outliers on every numeric column,
    split by month (months 1-9 train, the rest validation), standardize the
    numeric columns with statistics from the training split, and
    label-encode the categorical columns.

    Args:
        data_path: Path of the CSV file to read.
        cat_features: Names of the categorical columns to label-encode.
        num_features: Names of the numeric columns to filter and standardize.
        mode: When ``'AE'`` the training split keeps only rows whose
            ``'Is Fraud?'`` value is ``'No'``.

    Returns:
        ``(train_cat_X, train_num_X, train_y)``,
        ``(valid_cat_X, valid_num_X, valid_y)`` and ``label_encoders``, a
        dict mapping each categorical column to its fitted ``LabelEncoder``.
        The targets are 0 for ``'No'`` and 1 for any other value.
    """
    df = pd.read_csv(data_path)

    ## Derived features
    # A missing error message means "no error". NaN casts to True under
    # ``astype(bool)``, so missing values have to be masked out explicitly;
    # otherwise every row would be flagged as an error.
    error_col = df['Error Message']
    df['Error Message'] = error_col.notna() & error_col.astype(bool)

    # Months elapsed between account opening and the transaction.
    # NOTE(review): the slicing assumes 'Acct Open Date' looks like
    # "MM/YYYY" (month in the first two chars, year in the last four) - confirm.
    open_year = df['Acct Open Date'].str[-4:].astype(int)
    open_month = df['Acct Open Date'].str[:2].astype(int)
    df['Since Open Month'] = (
        (df['Year'] - open_year) * 12 + (df['Month'] - open_month).astype(int)
    )

    ## Outlier removal with the 1.5 * IQR rule.
    # The filter is applied column by column, so each column's quartiles are
    # computed on the rows that survived the previous columns.
    for col in num_features:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

    ## Train/validation split by month.
    # Explicit copies: the frames are modified below, and writing into a
    # slice of ``df`` triggers chained-assignment warnings.
    mask = df['Month'].between(1, 9)
    train_df = df[mask].copy()
    valid_df = df[~mask].copy()
    if mode == 'AE':
        train_df = train_df[train_df['Is Fraud?'] == 'No'].copy()

    ## Standardization, fitted on the training split only.
    scaler = StandardScaler()
    scaler.fit(train_df[num_features])
    for frame in (train_df, valid_df):
        frame[num_features] = pd.DataFrame(
            scaler.transform(frame[num_features]),
            columns=num_features,
            index=frame.index,
        )

    ## Label encoding
    # The encoder is fitted on the values of both splits: a category that
    # only appears in validation would otherwise make ``transform`` raise.
    # When validation has no unseen category the resulting classes are the
    # same as when fitting on the training split alone.
    label_encoders = {}
    for col in cat_features:
        le = LabelEncoder()
        le.fit(pd.concat([train_df[col], valid_df[col]]))
        train_df[col] = le.transform(train_df[col])
        valid_df[col] = le.transform(valid_df[col])
        label_encoders[col] = le

    ## Final feature/target separation.
    # 'Is Fraud?' holds strings (the 'AE' filter above compares against
    # 'No'), so it cannot be cast to int directly; encode 'No' as 0 and
    # everything else as 1.
    train_cat_X = train_df[cat_features]
    train_num_X = train_df[num_features]
    train_y = (train_df['Is Fraud?'] != 'No').astype(int)

    valid_cat_X = valid_df[cat_features]
    valid_num_X = valid_df[num_features]
    valid_y = (valid_df['Is Fraud?'] != 'No').astype(int)
    return (train_cat_X, train_num_X, train_y), (valid_cat_X, valid_num_X, valid_y), label_encoders

    



In [None]:
# Data preprocessing: build the train/validation inputs and the fitted label
# encoders. ``mode`` is not passed, so process_data uses its default ('AE').
data_path = 'Data/[24-2 DS_Project2] Data.csv'
(train_cat_X, train_num_X, train_y), (valid_cat_X, valid_num_X, valid_y), label_encoders = process_data(
    data_path,
    cat_features=cat_features,
    num_features=num_features,
)

## 2. Model, Dataset, DataLoader 설정
## 3. 학습 및 검증
## 4. 결과 분석

In [None]:
class AutoEncoder(BaseModel):
    """Autoencoder that adds a reconstruction decoder on top of ``BaseModel``.

    The forward pass returns both the reconstruction and the concatenated
    input it is meant to reproduce, so a caller can compare the two.

    NOTE(review): ``cat_embeddings``, ``fc_cat`` and ``encoder`` are not
    defined in this class; they are expected to be created by ``BaseModel``.
    """

    def __init__(self, encoding_dim, cat_features, num_features, num_classes=1):
        super().__init__(encoding_dim, cat_features, num_features, num_classes)

        # Width of the concatenated input that the decoder reconstructs.
        # NOTE(review): the factor 5 is presumably the per-column embedding
        # size used by BaseModel - confirm.
        self.input_dim = 5 * len(cat_features) + len(num_features)

        # Decoder: encoding_dim -> 64 -> 48 -> input_dim. Each hidden stage is
        # Linear + BatchNorm + LeakyReLU + Dropout(0.3); the output layer is a
        # plain Linear.
        stages = []
        width_in = encoding_dim
        for width_out in (64, 48):
            stages.extend([
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),
                nn.LeakyReLU(),
                nn.Dropout(0.3),
            ])
            width_in = width_out
        stages.append(nn.Linear(width_in, self.input_dim))
        self.decoder = nn.Sequential(*stages)

        # Re-initialize every embedding table with small uniform values.
        for table in self.cat_embeddings:
            nn.init.uniform_(table.weight, a=-0.05, b=0.05)

    def forward(self, x_cat, x_num):
        """Encode and reconstruct one batch.

        Column ``i`` of ``x_cat`` is looked up in the ``i``-th embedding table,
        each embedding is L2-normalized along dim 1, and the results are
        concatenated with ``x_num`` along dim 1.

        Returns:
            ``(decoded, original_x)``: the decoder output and the concatenated
            input that was fed to the network.
        """
        pieces = []
        for idx, table in enumerate(self.cat_embeddings):
            looked_up = table(x_cat[:, idx])
            # Unit-normalize each embedding vector.
            pieces.append(nn.functional.normalize(looked_up, p=2, dim=1))
        pieces.append(x_num)
        original_x = torch.cat(pieces, dim=1)

        encoded = self.encoder(self.fc_cat(original_x))
        return self.decoder(encoded), original_x