# Dataset

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader, random_split

import warnings
warnings.filterwarnings(action='ignore')

plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [14]:
class TitanicDataset(Dataset):
    def __init__(self, dataframe, target_column=None, transform=None, is_train=True): ## 기본값 필수
        self.dataframe = dataframe.copy() ## 복사본
        self.target_column = target_column
        self.transform = transform
        self.is_train = is_train

        self._preprocess()
        
        # ❗훈련세트와 테스트세트는 다르기 때문에
        if self.is_train and target_column:
            self.targets = self.dataframe[target_column].values
            self.features = self.dataframe.drop([target_column], axis=1).values
        else:
            self.targets = None
            self.features = self.dataframe.values
    
    # _: 내부에서만 쓰고 외부에선 부르지마라
    def _preprocess(self):
        # 불필요한 컬럼을 삭제
        columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]
        existing_columns = [
            col for col in columns_to_drop if col in self.dataframe.columns
        ]
        if existing_columns:
            self.dataframe.drop(existing_columns, axis=1, inplace=True)

        # 나이 결측값 처리 (중앙값)
        if "Age" in self.dataframe.columns:
            self.dataframe["Age"].fillna(self.dataframe["Age"].median(), inplace=True)

        # 승선항구 결측값 처리 (최빈값: 제일많은 빈도)
        if "Embarked" in self.dataframe.columns:
            self.dataframe["Embarked"].fillna(
                self.dataframe["Embarked"].mode()[0], inplace=True
            )

        # 요금 (중앙값)
        if "Fare" in self.dataframe.columns:
            self.dataframe["Fare"].fillna(self.dataframe["Fare"].median(), inplace=True)

        # 새로운 특성
        if "SibSp" in self.dataframe.columns and "Parch" in self.dataframe.columns:
            self.dataframe["FamilySize"] = (
                self.dataframe["SibSp"] + self.dataframe["Parch"] + 1
            )
            self.dataframe["IsAlone"] = (self.dataframe["FamilySize"] == 1).astype(int)
        
        # 나이 그룹
        if "Age" in self.dataframe.columns:
            self.dataframe["AgeGroup"] = pd.cut(
                self.dataframe["Age"],
                bins=[0, 12, 18, 35, 60, 100],
                labels=[0, 1, 2, 3, 4],
            ).astype(int)

        # 요금 그룹
        if "Fare" in self.dataframe.columns:
            self.dataframe["FareGroup"] = pd.qcut(
                self.dataframe["Fare"], q=4, labels=[0, 1, 2, 3]
            ).astype(int)

        # 원-핫 인코딩
        if "Sex" in self.dataframe.columns:
            sex_dummies = pd.get_dummies(self.dataframe["Sex"], drop_first=True)
            self.dataframe = pd.concat([self.dataframe, sex_dummies], axis=1)
            self.dataframe.drop(["Sex"], axis=1, inplace=True)

        if "Embarked" in self.dataframe.columns:
            embarked_dummies = pd.get_dummies(
                self.dataframe["Embarked"], drop_first=True
            )
            self.dataframe = pd.concat([self.dataframe, embarked_dummies], axis=1)
            self.dataframe.drop(["Embarked"], axis=1, inplace=True)

        # 나머지 결측 (평균)
        self.dataframe.fillna(self.dataframe.mean(), inplace=True)
        print(f"전처리 후 특성 수: {len(self.dataframe.columns)}")
        print(f"특성 목록: {list(self.dataframe.columns)}")


    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        features = self.features[idx]

        # 변환 적용
        if self.transform:
            features = self.transform(features)
        features = torch.FloatTensor(features)

        if self.is_train and self.targets is not None:
            target = torch.LongTensor([self.targets[idx]])[0]
            return features, target
        else:
            return features


In [15]:
from sklearn.preprocessing import StandardScaler

# 2. 데이터 변환 클래스
class StandardScaleTransform:

    def __init__(self):
        self.scaler = StandardScaler()
        self.fitted = False

    # torch가 아니라 사이키런을 사용하는거기때문에
    def fit(self, data):
        self.scaler.fit(data)
        self.fitted = True
        return self

    def __call__(self, sample):
        if not self.fitted:
            raise ValueError(
                "스케일러가 아직 학습되지 않았습니다. fit() 메서드를 먼저 호출하세요."
            )

        if sample.ndim == 1:
            sample = sample.reshape(1, -1)
            return self.scaler.transform(sample).flatten()
        else:
            return self.scaler.transform(sample)

In [16]:
# 데이터가 train, test로 나뉘어진 경우 속성이 다르다 == test.csv엔 target값이 없다
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

train_data =TitanicDataset(df_train, target_column="Survived")
test_data =TitanicDataset(df_test, is_train=False) 

transform = StandardScaleTransform()
transform.fit(train_data.features)

train_data.transform = transform
test_data.transform = transform

전처리 후 특성 수: 13
특성 목록: ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 'AgeGroup', 'FareGroup', 'male', 'Q', 'S']
전처리 후 특성 수: 12
특성 목록: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 'AgeGroup', 'FareGroup', 'male', 'Q', 'S']


In [17]:
# train_data.features, train_data.targets

In [18]:
train_dataset, val_dataset = random_split(train_data, [0.2, 0.8])

In [19]:
# train_dataset[0]

In [20]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# 모델(은닉층 구성)

### 활성화 함수
- 은닉층
    - ❗ReLU:: 모르겠으면 렐루
    - ❗Leaky ReLU
    - ELU
    - (LSTM)
    - ❗Tanh
    - SiLU:: 최근 연구에서 가장 좋다고 알려짐

- 출력층
    - ❗Sigmoid (이진분류)
    - ❗Softmax (다진분류)
    - ❗Linear (회귀)

- 타이타닉 데이터:: ```Input --> Hidden --> Output```
    - Input으로 12개(특성 개수)가 들어감
    - Output으로 2가지(Survived)가 나옴 ==> ``시그모이드 함수``

In [21]:
# input_size = train_data.features.shape[1]
# input_size

- nn.Linear(input_size, 256) 
    - 12개를 넣어서 hidden_size가 256개
    - hidden_size든 몇개든 상관xx __ 클수록 좋겠지..
- nn.BatchNorm1d(256)
    - 배치 정규화
- nn.ReLU() 
    - 활성화 함수
- nn.Dropout(0.5) 
    - Dropout Layer:: 50%확률로 아무거나 자름 -> 떨어지는게 정상이지만 ''성능이 올라감'', 내려갈때도 있음\\


In [22]:
class TitanicNet(nn.Module):
    def __init__(self, input_size, hidden_size=[256, 128, 64], dropout_rate=0.3):
        super(TitanicNet, self).__init__()
        layers = []
        prev_size = input_size

        for i, hidden_size in enumerate(hidden_size):
            layers.extend([
                nn.Linear(prev_size, hidden_size), 
                nn.BatchNorm1d(hidden_size), 
                nn.ReLU(),
                nn.Dropout(dropout_rate), 
            ])       
            prev_size = hidden_size
        
        # 출력층
        layers.append(nn.Linear(prev_size, 2))
        self.network = nn.Sequential(*layers)

    def forward(self, x):   # 순전파
        return self.network(x)

# 학습 및 평가

In [23]:
def train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    scheduler=None,
    num_epochs=100,
    device="cpu",
):
    """모델 훈련 함수"""
    # 상태값
    model.to(device)
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    best_val_loss = float("inf")
    best_model_state = None
    patience_counter = 0
    patience = 20  # 조기 종료를 위한 patience
    
    # 학습데이터
    for epoch in range(num_epochs):
        model.train() # 학습모델
        running_loss = 0.0
        correct_train = 0
        total_train = 0

       
        for batch_idx, (data, target) in enumerate(train_loader): # 배치사이즈만큼 돌때는 DataLoader(빈부분알아서 처리)가 필요함
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)
            print(output.shape)
            loss = criterion(output, target)
            loss.backward() # 자동미분
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(output.data, 1)
            total_train += target.size(0)
            correct_train += (predicted == target).sum().item()

        # 검증 모드
        model.eval()
        val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                val_loss += criterion(output, target).item()
                _, predicted = torch.max(output.data, 1)
                total_val += target.size(0)
                correct_val += (predicted == target).sum().item()

        # 평균 계산
        avg_train_loss = running_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        train_acc = 100.0 * correct_train / total_train
        val_acc = 100.0 * correct_val / total_val

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)

        # 학습률 스케줄러
        if scheduler:
            scheduler.step(avg_val_loss)

        # 최고 성능 모델 저장
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model_state = model.state_dict().copy()
            patience_counter = 0
            if (epoch + 1) % 10 == 0:
                print(
                    f"=*=*=*= Validation Loss decreased to {avg_val_loss:.6f}. Saving the model! =*=*=*="
                )
        else:
            patience_counter += 1

        if (epoch + 1) % 20 == 0:
            print(
                f"Epoch [{epoch+1}/{num_epochs}] - "
                f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.2f}%, "
                f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%"
            )

        # 조기 종료
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    # 최고 성능 모델 로드
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return {
        "train_losses": train_losses,
        "val_losses": val_losses,
        "train_accuracies": train_accuracies,
        "val_accuracies": val_accuracies,
    }

In [24]:
input_size = train_data.features.shape[1]
model = TitanicNet(input_size)
criterion = nn.CrossEntropyLoss()   # 손실함수와 옵티마이저
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", patience=10, factor=0.5
)

# 5. 모델 훈련
print(f"\n5. 모델 훈련 시작...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"사용 장치: {device}")

history = train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    scheduler,
    num_epochs=300,
    device=device,
)


5. 모델 훈련 시작...
사용 장치: cpu
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([19, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([19, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([19, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([19, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([19, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([19, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([19, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([19, 2])
torch.Size([3

In [None]:
model
# 결과:: 3층짜리 은닉층 + (마지막층(12)) 출 력층

TitanicNet(
  (network): Sequential(
    (0): Linear(in_features=12, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=64, out_features=2, bias=True)
  )
)