In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Torch 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터 불러오기
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
X_cols = X.columns
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [15]:
# 데이터 스플릿
X_train, X_test = train_test_split(X, test_size=0.3, random_state=42)

# 데이터 표준화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [16]:
# Denoising SSAE 모델 정의
class DenoisingSSAE(nn.Module):
    def __init__(self, input_dim):
        super(DenoisingSSAE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.Tanh(),
            nn.Linear(16, 8),
            nn.Tanh(),
            nn.Linear(8, 4),
            nn.Tanh(),

        )
        self.decoder = nn.Sequential(
            nn.Linear(4, 8),
            nn.Tanh(),
            nn.Linear(8, 16),
            nn.Tanh(),
            nn.Linear(16, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Noise 추가 함수
def add_noise(data, noise_factor=0.2):
    noise = noise_factor * np.random.randn(*data.shape)
    noisy_data = data + noise
    noisy_data = np.clip(noisy_data, 0., 1.)
    return noisy_data

In [17]:
# 모델 생성 및 학습 설정
input_dim = X_train_scaled.shape[1]
model = DenoisingSSAE(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [18]:
# 모델 학습
epochs = 100000
X_train_noisy = add_noise(X_train_scaled)

for epoch in range(epochs):
    model.train()
    inputs = torch.FloatTensor(X_train_noisy).to(device)
    targets = torch.FloatTensor(X_train_scaled).to(device)

    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, targets)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10000 == 0 or epoch == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [1/100000], Loss: 1.0474
Epoch [10000/100000], Loss: 0.1270
Epoch [20000/100000], Loss: 0.1217
Epoch [30000/100000], Loss: 0.1212
Epoch [40000/100000], Loss: 0.1196
Epoch [50000/100000], Loss: 0.1156
Epoch [60000/100000], Loss: 0.1148
Epoch [70000/100000], Loss: 0.1140
Epoch [80000/100000], Loss: 0.1142
Epoch [90000/100000], Loss: 0.1112
Epoch [100000/100000], Loss: 0.1114


In [24]:
# 테스트 데이터 준비
X_test_scaled = scaler.transform(X_test)

# 테스트 적용
model.eval()
X_test_inputs = torch.FloatTensor(X_test_scaled).to(device)
X_test_outputs = model(X_test_inputs).detach().cpu().numpy()

# Reconstruction 에러 계산
reconstruction_error = np.sqrt(np.mean((X_test_outputs - X_test_scaled) ** 2))
print(f'Reconstruction Error: {reconstruction_error:.4f}')

Reconstruction Error: 2.1293


In [25]:
# Reconstruction 데이터 프레임 생성
X_re_cols = [f're_{col}' for col in X_cols]
X_reconstruction = pd.DataFrame(scaler.inverse_transform(X_test_outputs), columns = X_re_cols)

In [26]:
# 기존 값과 Reconstrcution 값 비교
pd.concat([X_test.loc[:, 'mean radius'].reset_index(drop=True), X_reconstruction.loc[:, 're_mean radius'].reset_index(drop=True)], axis=1)

Unnamed: 0,mean radius,re_mean radius
0,12.470,12.250829
1,18.940,17.543982
2,15.460,20.198011
3,12.400,14.309513
4,11.540,12.491550
...,...,...
166,12.780,15.182795
167,14.740,19.649630
168,9.904,10.767224
169,13.820,12.138457
