In [1]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import os
import json
from Models.AutoEncoder import AutoEncoder, AE_validDataset, AE_trainDataset
from utils.utils import process_data
from sklearn.metrics import f1_score
import numpy as np

# Auto Encoder Test

* L1 정규화 추가
* weight_decay 추가
* BatchNorm 추가

## Base Setup

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
num_epochs = 100
batch_size = 128
lr = 1e-4
encoding_dim = 28

In [4]:
# Feature Selection
cat_features = ['Card', 'Gender', 'Card Brand', 'Card Type', 'Expires', 'Has Chip', 
                'Year PIN last Changed', 'Whether Security Chip is Used', 'Day']
num_features = ['Current Age', 'Retirement Age', 'Per Capita Income - Zipcode', 'Zipcode',
                'Yearly Income', 'Total Debt', 'Credit Score', 'Credit Limit', 'Amount']
discarded = ['User', 'Birth Year', 'Birth Month']
print(len(cat_features)*5 + len(num_features))


54


In [5]:
# 데이터 전처리
data_path = 'Data/[24-2 DS_Project2] Data.csv'
(train_cat_X, train_num_X, train_y), (valid_cat_X, valid_num_X, valid_y), label_encoders = process_data(
    data_path,
    cat_features,
    num_features,
    discarded
)

TRANSITION
IQR
SPLIT
DISCARD
SCALE
ENCODE
UNLABEL
TARGET
TRAIN CAT/NUM
VALID CAT/NUM
RETURN


In [6]:
print(train_y.shape)
print(train_cat_X.shape)

(902733, 1)
(902733, 9)


In [7]:
train_cat_X,
train_num_X,
train_y,
valid_cat_X,
valid_num_X,
valid_y,
label_encoders

{'Card': LabelEncoder(),
 'Gender': LabelEncoder(),
 'Card Brand': LabelEncoder(),
 'Card Type': LabelEncoder(),
 'Expires': LabelEncoder(),
 'Has Chip': LabelEncoder(),
 'Year PIN last Changed': LabelEncoder(),
 'Whether Security Chip is Used': LabelEncoder(),
 'Day': LabelEncoder()}

In [8]:
train_dataset = AE_trainDataset(train_cat_X, train_num_X, device)
valid_dataset = AE_validDataset(valid_cat_X, valid_num_X, valid_y, device)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [9]:
model = AutoEncoder(encoding_dim=encoding_dim, cat_features=cat_features, num_features=num_features).to(device)
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
criterion = nn.MSELoss()


In [10]:
# Kaiming Initialization 적용
for name, param in model.named_parameters():
    if isinstance(param, nn.Linear):
        nn.init.kaiming_normal_(param.weight, mode='fan_in', nonlinearity='relu')
        if param.bias is not None:
            nn.init.zeros_(param.bias)

In [11]:
best_f1 = 0
l1_lambda = 1e-5
from tqdm import tqdm
for epoch in tqdm(range(num_epochs)):
    # 학습 단계
    model.train()
    train_loss = 0
    for cat_features, num_features in train_loader:
        optimizer.zero_grad()
        y_hat, y = model(cat_features, num_features)
        
        # MSE 손실 계산
        mse_loss = criterion(y_hat, y)
        
        # L1 정규화 계산
        l1_reg = torch.tensor(0., requires_grad=True).to(device)
        for param in model.parameters():
            l1_reg = l1_reg + torch.norm(param, 1)
        
        # 총 손실 = MSE 손실 + L1 정규화
        loss = mse_loss + l1_lambda * l1_reg
        
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # 평균 train_loss 계산
    train_loss /= len(train_loader)
    
    # 검증 단계 (10 에포크마다)
    if epoch % 10 == 0:
        model.eval()
        valid_loss = 0
        reconstruction_errors = []
        all_labels = []
        
        with torch.no_grad():
            for cat_features, num_features, labels in valid_loader:
                y_hat, y = model(cat_features, num_features)
                batch_loss = criterion(y_hat, y)
                valid_loss += batch_loss.item()
                
                sample_errors = torch.mean((y_hat - y) ** 2, dim=1)
                reconstruction_errors.extend(sample_errors.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
            
            # 성능 평가
            threshold = np.percentile(reconstruction_errors, 90)
            predictions = (np.array(reconstruction_errors) > threshold).astype(int)
            f1 = f1_score(all_labels, predictions)
            
            # 결과 로깅
            print(f"Epoch {epoch}: Valid Loss = {valid_loss:.4f}, F1 Score = {f1:.4f}")
            
            # 최고 성능 모델 저장
            if f1 > best_f1:
                best_f1 = f1

print(f'best F1 Score: {best_f1}')

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 0: Valid Loss = 4020.9072, F1 Score = 0.0020


 10%|█         | 10/100 [08:50<1:19:19, 52.88s/it]

Epoch 10: Valid Loss = 185.0619, F1 Score = 0.0016


 20%|██        | 20/100 [17:26<1:07:47, 50.84s/it]

Epoch 20: Valid Loss = 89.4141, F1 Score = 0.0036


 30%|███       | 30/100 [27:28<1:09:33, 59.62s/it]

Epoch 30: Valid Loss = 46.4173, F1 Score = 0.0027


 40%|████      | 40/100 [37:32<1:00:17, 60.29s/it]

Epoch 40: Valid Loss = 33.2769, F1 Score = 0.0019


 50%|█████     | 50/100 [46:47<45:08, 54.17s/it]  

Epoch 50: Valid Loss = 17.4496, F1 Score = 0.0023


 60%|██████    | 60/100 [56:15<37:57, 56.95s/it]

Epoch 60: Valid Loss = 24.1408, F1 Score = 0.0019


 70%|███████   | 70/100 [1:06:01<30:21, 60.71s/it]

Epoch 70: Valid Loss = 19.3517, F1 Score = 0.0029


 80%|████████  | 80/100 [1:18:08<22:05, 66.30s/it]

Epoch 80: Valid Loss = 33.9858, F1 Score = 0.0033


 90%|█████████ | 90/100 [1:28:11<09:33, 57.39s/it]

Epoch 90: Valid Loss = 21.1166, F1 Score = 0.0037


100%|██████████| 100/100 [1:37:16<00:00, 58.36s/it]


best F1 Score: 0.003692033482924345
