In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from Models.AutoEncoder import AutoEncoder, AE_trainDataset, AE_validDataset
from utils.utils import *
from tqdm import tqdm
from sklearn.metrics import f1_score


# Auto Encoder Test

* L1 정규화 추가
* weight_decay 추가
* BatchNorm 추가

## Base Setup

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
num_epochs = 100
batch_size = 128
lr = 1e-4

In [4]:
# Feature Selection
cat_features = ['Gender', 'Card Brand', 'Card Type', 'Expires', 'Has Chip', 'Year PIN last Changed', 'Whether Security Chip is Used', 'Day', 'Error Message']

num_features = ['Current Age', 'Retirement Age', 'Per Capita Income - Zipcode', 'Yearly Income', 'Total Debt', 'Credit Score', 'Credit Limit', 'Amount','Since Open Month']

discarded = ['User', 'Birth Year', 'Birth Month', 'Card', 'Card Number', 'Zipcode', 'Merchandise Code', 'Acct Open Date', 'Year', 'Month']

print(len(cat_features)*5 + len(num_features))


54


In [5]:
# 데이터 전처리
data_path = 'Data/[24-2 DS_Project2] Data.csv'
(train_cat_X, train_num_X, train_y), (valid_cat_X, valid_num_X, valid_y), label_encoders = process_data(
    data_path,
    cat_features,
    num_features,
    discarded
)

TRANSITION
IQR
SPLIT
DISCARD
SCALE
ENCODE
TARGET
UNLABEL
TRAIN CAT/NUM
VALID CAT/NUM
RETURN


In [1]:
print(train_y.count())
print(train_cat_X.shape)

NameError: name 'train_y' is not defined

In [7]:
train_cat_X,
train_num_X,
train_y,
valid_cat_X,
valid_num_X,
valid_y,
label_encoders

{'Gender': LabelEncoder(),
 'Card Brand': LabelEncoder(),
 'Card Type': LabelEncoder(),
 'Expires': LabelEncoder(),
 'Has Chip': LabelEncoder(),
 'Year PIN last Changed': LabelEncoder(),
 'Whether Security Chip is Used': LabelEncoder(),
 'Day': LabelEncoder(),
 'Error Message': LabelEncoder()}

In [8]:
train_dataset = AE_trainDataset(train_cat_X, train_num_X, device)
valid_dataset = AE_validDataset(valid_cat_X, valid_num_X, valid_y, device)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [9]:
model = AutoEncoder(encoding_dim=32, cat_features=cat_features, num_features=num_features).to(device)
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
criterion = nn.MSELoss()


In [10]:
best_f1 = 0
l1_lambda = 1e-5
for epoch in tqdm(range(num_epochs), desc='학습 진행률', ncols=100, position=0, leave=True):
    # 학습 단계
    model.train()
    train_loss = 0
    for cat_features, num_features in train_loader:
        optimizer.zero_grad()
        y_hat, y = model(cat_features, num_features)
        
        # MSE 손실 계산
        mse_loss = criterion(y_hat, y)
        
        # L1 정규화 계산
        l1_reg = torch.tensor(0., requires_grad=True).to(device)
        for param in model.parameters():
            l1_reg = l1_reg + torch.norm(param, 1)
        
        # 총 손실 = MSE 손실 + L1 정규화
        loss = mse_loss + l1_lambda * l1_reg
        
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # 평균 train_loss 계산
    train_loss /= len(train_loader)
    
    # 검증 단계 (10 에포크마다)
# ... existing code ...

    if epoch % 10 == 0:
        model.eval()
        valid_loss = 0
        reconstruction_errors = []
        all_labels = []
        
        with torch.no_grad():
            for cat_features, num_features, labels in valid_loader:
                y_hat, y = model(cat_features, num_features)
                # 배치 단위의 MSE 손실 계산
                batch_loss = criterion(y_hat, y)
                valid_loss += batch_loss.item()
                
                # 이상치 탐지를 위한 샘플별 reconstruction error 계산
                sample_errors = torch.mean((y_hat - y) ** 2, dim=1)
                reconstruction_errors.extend(sample_errors.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
            
            # 평균 validation loss 계산
            valid_loss /= len(valid_loader)
            
            # 이상치 탐지를 위한 임계값 설정 및 예측
            threshold = np.percentile(reconstruction_errors, 95)
            predictions = (np.array(reconstruction_errors) > threshold).astype(int)
            f1 = f1_score(all_labels, predictions)           
            tqdm.write(f"에포크 {epoch}: Train Loss = {train_loss:.4f}, Valid Loss = {valid_loss:.4f}, F1 Score = {f1:.4f}")
    else:
        # 10 에포크마다가 아닐 때는 train_loss만 출력
        tqdm.write(f"에포크 {epoch}: Train Loss = {train_loss:.4f}")

학습 진행률:   1%|▍                                               | 1/100 [00:37<1:02:38, 37.96s/it]     

에포크 0: Train Loss = 0.1581, Valid Loss = 0.0236, F1 Score = 0.0015


학습 진행률:   2%|█                                                 | 2/100 [01:11<57:24, 35.15s/it]     

에포크 1: Train Loss = 0.0311


학습 진행률:   3%|█▌                                                | 3/100 [01:45<56:10, 34.74s/it]     

에포크 2: Train Loss = 0.0152


학습 진행률:   4%|█▉                                              | 4/100 [02:27<1:00:08, 37.58s/it]     

에포크 3: Train Loss = 0.0103


학습 진행률:   5%|██▌                                               | 5/100 [02:51<51:52, 32.76s/it]     

에포크 4: Train Loss = 0.0078


학습 진행률:   6%|███                                               | 6/100 [03:22<50:19, 32.12s/it]     

에포크 5: Train Loss = 0.0061


학습 진행률:   7%|███▌                                              | 7/100 [03:53<49:04, 31.66s/it]     

에포크 6: Train Loss = 0.0050


학습 진행률:   8%|████                                              | 8/100 [04:26<49:12, 32.10s/it]     

에포크 7: Train Loss = 0.0042


학습 진행률:   9%|████▌                                             | 9/100 [05:03<51:17, 33.82s/it]     

에포크 8: Train Loss = 0.0038


학습 진행률:  10%|████▉                                            | 10/100 [05:38<51:04, 34.05s/it]     

에포크 9: Train Loss = 0.0035


학습 진행률:  11%|█████▍                                           | 11/100 [06:14<51:19, 34.60s/it]     

에포크 10: Train Loss = 0.0032, Valid Loss = 0.0001, F1 Score = 0.0004


학습 진행률:  12%|█████▉                                           | 12/100 [06:46<49:34, 33.81s/it]     

에포크 11: Train Loss = 0.0031


학습 진행률:  13%|██████▎                                          | 13/100 [07:18<48:34, 33.50s/it]     

에포크 12: Train Loss = 0.0030


학습 진행률:  14%|██████▊                                          | 14/100 [07:49<46:47, 32.64s/it]     

에포크 13: Train Loss = 0.0029


학습 진행률:  15%|███████▎                                         | 15/100 [08:21<45:52, 32.39s/it]     

에포크 14: Train Loss = 0.0029


학습 진행률:  16%|███████▊                                         | 16/100 [09:01<48:47, 34.85s/it]     

에포크 15: Train Loss = 0.0028


학습 진행률:  16%|███████▊                                         | 16/100 [09:23<49:20, 35.24s/it]


KeyboardInterrupt: 