In [1]:
import pandas as pd
import numpy as np
from future.utils.surrogateescape import encoded

from Models.DT import *
from utils.utils import *
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from Models.AutoEncoder import AE_trainDataset, AE_validDataset, AE_Dataset
from utils.utils import *
from tqdm import tqdm
from sklearn.metrics import f1_score
import experiments.autoencoder_experiment_ver4_1 as AE

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
cat_features = [
    'Gender',
    'Zipcode',
    'Day',
    'Card Brand',
    'Card Type',
    'Has Chip',
    'Whether Security Chip is Used',
    'Error Message',
    'WeekDay',
    'Credit Signal',
    'PIN Change',
    'Security Level'
]
num_features = [
    'Current Age',
    'Retirement Age',
    'Per Capita Income - Zipcode',
    'Yearly Income',
    'Total Debt',
    'Credit Score',
    'Valid Month',
    'Credit Limit',
    'Since Open Month',
    'Year PIN last Changed',
    'Amount',
    'Credit Util',
    'Years Changed PIN',
    'Security Score'
]
discarded = [
    'User',
    'Birth Year',
    'Birth Month',
    'Year',
    'Month',
    'Merchandise Code',
    'Card',
    'Card Number',
    'Expires',
    'Acct Open Date',
]

In [4]:
model_path = 'experiments/AutoEncoder4_1/AE4_1_dim31_batch256_lr0.000100_l10.000003.pth'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AE.AutoEncoder(encoding_dim=31, cat_features=cat_features, num_features=num_features, num_classes=1)
model.load_state_dict(torch.load(model_path))
model = model.to(device)
print(model)

  model.load_state_dict(torch.load(model_path))


AutoEncoder(
  (cat_embeddings): ModuleList(
    (0-11): 12 x Embedding(100, 5)
  )
  (fc_cat): Linear(in_features=74, out_features=64, bias=True)
  (encoder): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=31, bias=True)
    (3): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=31, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=64, out_features=74, bias=True)
  )
  (classifier): Sequential(
    (0): Linear(in_features=32, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
  )
)


In [5]:
(train_cat_X, train_num_X, train_y), (valid_cat_X, valid_num_X, valid_y), label_encoders, _ = dt_process_data(
    './Data/[24-2 DS_Project2] Data.csv',
    cat_features,
    num_features,
    discarded
)

TRANSITION
IQR
SPLIT
DISCARD
SCALE
ENCODE
TARGET
TRAIN CAT/NUM
VALID CAT/NUM
RETURN


In [6]:
smote = SMOTE(random_state=42, sampling_strategy=0.5)
train_X_resampled, train_y_resampled = smote.fit_resample(
    pd.concat([train_cat_X, train_num_X], axis=1), train_y['Is Fraud?']
)
# Resampled 데이터를 나누기
train_cat_X_resampled = train_X_resampled[cat_features]
train_num_X_resampled = train_X_resampled[num_features]
train_y_resampled = pd.DataFrame(train_y_resampled, columns=['Is Fraud?'])

In [7]:
print("Before SMOTE:", train_y['Is Fraud?'].value_counts())
print("After SMOTE:", train_y_resampled['Is Fraud?'].value_counts())


Before SMOTE: Is Fraud?
0.0    733455
1.0       897
Name: count, dtype: int64
After SMOTE: Is Fraud?
0.0    733455
1.0    366727
Name: count, dtype: int64


In [8]:
model.eval()

AutoEncoder(
  (cat_embeddings): ModuleList(
    (0-11): 12 x Embedding(100, 5)
  )
  (fc_cat): Linear(in_features=74, out_features=64, bias=True)
  (encoder): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=31, bias=True)
    (3): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=31, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=64, out_features=74, bias=True)
  )
  (classifier): Sequential(
    (0): Linear(in_features=32, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [9]:
train_embeddings = model.get_embedding(
    torch.tensor(train_cat_X_resampled.values, dtype=torch.long).to(device),
    torch.tensor(train_num_X_resampled.values, dtype=torch.float).to(device),
)

In [10]:
valid_embeddings = model.get_embedding(
    torch.tensor(valid_cat_X.values, dtype=torch.long).to(device),
    torch.tensor(valid_num_X.values, dtype=torch.float).to(device),
)

In [11]:
train_embeddings = train_embeddings.cpu().detach().numpy()
valid_embeddings = valid_embeddings.cpu().detach().numpy()


In [16]:
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=300,          # 트리 개수 더욱 증가
    max_depth=15,              # 더 깊은 트리 허용
    min_samples_leaf=1,        # 리프 노드 최소 샘플 수 더 감소
    min_samples_split=3,       # 분할 기준 완화
    class_weight={0: 1, 1: 12},  # 사기 클래스에 더 높은 가중치 부여
    max_features='sqrt',       
    bootstrap=True,
    oob_score=True,           # Out-of-bag 점수 확인
    n_jobs=-1
)

In [17]:
rf_classifier.fit(train_embeddings, train_y_resampled)
y_pred = rf_classifier.predict(valid_embeddings)
conf_matrix = confusion_matrix(valid_y, y_pred)
class_report = classification_report(valid_y, y_pred)
print(conf_matrix)
print(class_report)

  return fit_method(estimator, *args, **kwargs)


[[132890 116832]
 [   143    215]]
              precision    recall  f1-score   support

         0.0       1.00      0.53      0.69    249722
         1.0       0.00      0.60      0.00       358

    accuracy                           0.53    250080
   macro avg       0.50      0.57      0.35    250080
weighted avg       1.00      0.53      0.69    250080



In [14]:
from sklearn.ensemble import IsolationForest

# Isolation Forest를 사용한 이상 탐지
# Isolation Forest 파라미터 조정
iso_forest = IsolationForest(
    n_estimators=500,          # 트리 개수 증가
    max_samples=256,           # 명시적인 샘플 크기 지정
    contamination=0.002,       # 실제 사기 비율에 더 가깝게 조정
    max_features=0.8,          # 특성 샘플링 비율 지정
    bootstrap=True,            # 부트스트랩 샘플링 활성화
    random_state=42,
    n_jobs=-1
)

# 학습 데이터로 모델 학습
iso_forest.fit(train_embeddings)

# 예측 수행 (1: 정상, -1: 이상치)
iso_pred = iso_forest.predict(valid_embeddings)

# -1을 1로, 1을 0으로 변환하여 fraud/non-fraud 레이블로 매핑
iso_pred_mapped = np.where(iso_pred == -1, 1, 0)

# 성능 평가
iso_conf_matrix = confusion_matrix(valid_y, iso_pred_mapped)
iso_class_report = classification_report(valid_y, iso_pred_mapped)

print("Isolation Forest 결과:")
print("\n혼동 행렬:")
print(iso_conf_matrix)
print("\n분류 보고서:")
print(iso_class_report)


Isolation Forest 결과:

혼동 행렬:
[[249022    700]
 [   358      0]]

분류 보고서:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    249722
         1.0       0.00      0.00      0.00       358

    accuracy                           1.00    250080
   macro avg       0.50      0.50      0.50    250080
weighted avg       1.00      1.00      1.00    250080

