+# hw2 - titanic 

In [1]:
import os, torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
# train_data를 위한 데이터셋 클래스 정의 -> feature와 target 모두 정의하고 반환한다.
class TitanicDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X.values)
        self.y = torch.LongTensor(y.values)
    
    #데이터 셋의 크기(가지고 있는 데이터(샘플)의 개수)
    def __len__(self): 
        return len(self.X)
    
    #인덱스 입력시 출력되는 값들을 정의
    def __getitem__(self, idx):
        feature = self.X[idx]
        target = self.y[idx]
        return {'input':feature, 'target':target} #튜플
    
    #dataset의 정보를 출력해준다.
    def __str__(self):
        str = "Data Size: {0}, Input Shape: {1}, Target Shape: {2}".format(len(self.X), self.X.shape, self.y.shape)
        return str
        

In [3]:
# test_data를 위한 데이터셋 클래스 정의 -> feature만을 정의하고 반환한다.
class TitanicTestDataset(Dataset): 
    def __init__(self, X): 
        self.X = torch.FloatTensor(X.values)  
    def __len__(self): 
        return len(self.X)  
    def __getitem__(self, idx):  
        feature = self.X[idx] 
        return {'input': feature}  
    def __str__(self): 
        str = "Data Size: {0}, Input Shape: {1}".format(  len(self.X), self.X.shape )  
        return str

test data를 다루기 위한 데이터셋으로 feature만 나오게 되어있다.

In [4]:
def get_preprocessed_dataset():
    #CURRENT_FILE_PATH = os.path.dirname(os.path.abspath(__file__)) #dirname: path의 디렉토리 부분만 가져오기,abspath: 현재 파일의 절대 경로가져오기
    CURRENT_FILE_PATH = os.getcwd() # ipynb에서 __file__이 정의되어있지 않음으로 현재 작업 디렉토리를 가져오는 getcwd를 이용
    train_data_path = os.path.join(CURRENT_FILE_PATH, 'train.csv')
    test_data_path = os.path.join(CURRENT_FILE_PATH, 'test.csv')
    
    train_df = pd.read_csv(train_data_path)# train data load
    test_df = pd.read_csv(test_data_path) # test data load
    
    #train_df와 test_df 결합
    all_df = pd.concat([train_df, test_df], sort=False)
    
    #전처리 함수 (전처리 하고 len(train_df)와 len(test_df)를 이용해 다시 분할해야 train, val, test 분할이 가능하다.) train_data와 test_data의 개수로 봤을때 val을 구분하는게 맞는가?
    #전처리 1: 데이터에서 승객번호, 이름, 티켓번호, 방 호수 삭제
    #전처리 2: 적절한 타입으로 변경(feature는 float으로 target은 int64로 변경해야 한다.)
    #전처리 3: NaN값을 가지고 있는 Age에 대한 처리
    #전처리 4: 데이터를 train과 test 분리
    #전처리 5: dataset 클래스에 데이터 저장
    #전처리 6: train_dataset에 random_split을 적용해서 train과 val로 분할

    #전처리 1 - 원하는 열만 남기기
    all_df = all_df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch","Fare", "Embarked" ]]
    
    #전처리 2 - Dataset클래스에 맞게 데이터 타입 변경
    #2-1 object타입인 Sex, Embarked를 One-Hot Encoding
    all_df = pd.get_dummies(all_df, columns = ["Sex", "Embarked"], drop_first=True) # drop_first: 첫번째 카테고리를 제고한다. -> 하나를 삭제하더라도 나머지가 False이면 당연히 삭제된 카테고리가 True이므로 삭제해도 문제 없다. 검색에서는 다중 공선성을 방지한다 하지만 아직 이해가 안된다. 
    #2-2 int64타입인 Pclass, SibSp, Parch을 float으로 변경
    all_df[["Pclass", "SibSp", "Parch"]] = all_df[["Pclass", "SibSp", "Parch"]].astype(float)
    #2-3 One-Hot Encoding으로 bool 타입이된 Sex와 Embarked를 float으로 변경한다.
    all_df[["Sex_male", "Embarked_Q", "Embarked_S"]] = all_df[["Sex_male", "Embarked_Q", "Embarked_S"]].astype(float)
    
    #전처리 3 - NaN값 이 있는 경우 해당 값을 평균값으로 대체
    all_df.fillna(all_df.mean(), inplace=True)#fillna: NaN값을 대체한다. 이 경우 각 열의 평균값으로 채우는 것이다. inplace는 원본을 수정할지 여부를 정하는 것으로 False이면 새로운 DataFrame을 반환한다.
    
    #전처리 4 - all_df를 train과 test로 분리
    train_df = all_df.iloc[:len(train_df),] 
    test_df = all_df.iloc[len(train_df):,] 
    
    #전처리 5 - 적절한 dataset 클래스에 데이터 저장
    #5-1: 데이터를 target과 feature로 분리
    train_df_target = train_df[["Survived"]]
    train_df_feature = train_df.drop(["Survived"], axis=1)
    test_df_feature = test_df.drop(["Survived"], axis=1, errors='ignore')
    #5-2: 데이터를 TitanicDataset에 저장
    train_dataset = TitanicDataset(train_df_feature, train_df_target)
    test_dataset = TitanicTestDataset(test_df_feature)
    
    #전처리 6 - random_split을 이용한 train과 validation 분리
    train_dataset, validation_dataset = random_split(train_dataset,[0.8,0.2])
    
    return train_dataset, validation_dataset, test_dataset

feature, target 정리
- target: Sruvived
- feature: Pclass, Sex_male, Age, SibSp, Parch, Fare, Embarked_Q, Embarked_S


In [5]:
from torch import nn, optim

class MyModel(nn.Module):
    def __init__(self, n_input, n_output):
        super().__init__()
        
        self.model = nn.Sequential( #안의 것을 순차적으로 실행
            nn.Linear(n_input, 30), 
            nn.ReLU(),
            nn.Linear(30, 30),
            nn.ReLU(),
            nn.Linear(30, n_output),
        ) # hidden은 뉴런 30짜리 2개, 활성화 함수는 ReLU 사용
        
    def forward(self, x):
        x = self.model(x)
        return x

In [6]:

if __name__ == '__main__':
    #타이타닉 데이터 로드
    train_dataset, validation_dataset, test_dataset = get_preprocessed_dataset()
    print("train_dataset: {0}, validation_dataset: {1}, test_dataset: {2}".format(len(train_dataset), len(validation_dataset), len(test_dataset)))
    print("#"*50, 1)
    
    #train 데이터를 "(인덱스) - (feature): (target)" 형태로 전체 출력하기 
    for idx, sample in enumerate(train_dataset):
        print("{0} - {1}: {2}".format(idx, sample['input'], sample['target']))
    print("#"*50, 2)
    
    #데이터셋을 DataLoader에 batch_size와 shuffle 유무를 설정해 저장하기
    train_data_loader = DataLoader(dataset = train_dataset, batch_size = 16, shuffle = True)
    validation_data_loader = DataLoader(dataset = validation_dataset, batch_size = 16, shuffle = True)
    test_data_loader = DataLoader(dataset = test_dataset, batch_size = len(test_dataset))
    
    print("[TRAIN]")
    #배치 단위로 idx와 feature의 구조, target 구조를 출력
    for idx, batch in enumerate(train_data_loader):
        print("{0} - {1}: {2}".format(idx, batch['input'].shape, batch['target'].shape))
    print("[VALIDATION]")
    for idx, batch in enumerate(validation_data_loader):
        print("{0} - {1}: {2}".format(idx, batch['input'].shape, batch['target'].shape))
        
    print("#" * 50, 3)
    
    print("[TEST]")
    #test_data_Loader의 데이터를 가져와 my_model에 입력해 예측 결과 출력하기
    #현재 model의 학습이 안된 상태이므로 학습하는 코드의 추가가 필요
    batch = next(iter(test_data_loader))
    print("{0}".format(batch['input'].shape))
    my_model = MyModel(n_input=8, n_output=2)
    output_batch = my_model(batch['input'])
    prediction_batch = torch.argmax(output_batch, dim=1)
    for idx, prediction in enumerate(prediction_batch, start=892):
        print(idx, prediction.item())

train_dataset: 713, validation_dataset: 178, test_dataset: 418
################################################## 1
0 - tensor([ 3.0000,  5.0000,  0.0000,  0.0000, 12.4750,  0.0000,  0.0000,  1.0000]): tensor([1])
1 - tensor([ 1.0000, 65.0000,  0.0000,  1.0000, 61.9792,  1.0000,  0.0000,  0.0000]): tensor([0])
2 - tensor([ 3.0000, 33.0000,  3.0000,  0.0000, 15.8500,  0.0000,  0.0000,  1.0000]): tensor([1])
3 - tensor([ 3.0000, 29.8811,  0.0000,  0.0000,  7.8292,  1.0000,  1.0000,  0.0000]): tensor([0])
4 - tensor([ 3.0000, 29.8811,  0.0000,  0.0000,  7.7500,  1.0000,  1.0000,  0.0000]): tensor([0])
5 - tensor([ 3.0000, 22.0000,  0.0000,  0.0000,  9.8375,  0.0000,  0.0000,  1.0000]): tensor([0])
6 - tensor([ 3.0000, 22.0000,  0.0000,  0.0000,  7.2292,  1.0000,  0.0000,  0.0000]): tensor([0])
7 - tensor([ 3.0000, 24.0000,  0.0000,  0.0000,  8.0500,  1.0000,  0.0000,  1.0000]): tensor([0])
8 - tensor([ 2., 52.,  0.,  0., 13.,  1.,  0.,  1.]): tensor([0])
9 - tensor([ 1.0000, 30.0000,  0.0

# 요구사항 2 - _01_code/_07_learning_and_optimization/f_my_model_training_with_argparse_wandb.py 를 이용해 코드 수정하기

In [7]:
import wandb
from datetime import datetime

#chekpoint의 경로를 설정하고, 만약 경로의 디렉토리가 없다면 디렉토리를 만든다.

In [8]:
from torch import nn, optim

class MyModel(nn.Module):
    def __init__(self, n_input, n_output):
        super().__init__()
        Activation_Function = wandb.config.Activation_Function
        self.model = nn.Sequential( #안의 것을 순차적으로 실행
            nn.Linear(n_input, wandb.config.n_hidden_unit_list[0]), 
            nn.ReLU(),
            nn.Linear(wandb.config.n_hidden_unit_list[0],wandb.config.n_hidden_unit_list[1]),
            nn.ReLU(),
            nn.Linear(wandb.config.n_hidden_unit_list[1], n_output),
        ) # hidden은 뉴런 30짜리 2개, 활성화 함수는 ReLU 사용
        
    def forward(self, x):
        x = self.model(x)
        return x

In [9]:
def get_model_and_optimizer():
    my_model = MyModel(n_input=8, n_output=2)
    optimizer = optim.SGD(my_model.parameters(), lr=wandb.config.learning_rate)
    
    return my_model, optimizer

In [10]:
def training_loop(model, optimizer, train_data_loader, validation_data_loader):
    n_epochs = wandb.config.epochs
    loss_fn = nn.CrossEntropyLoss()
    next_print_epoch = 100  #100 epoch마다 출력
    
    for epoch in range(1, n_epochs + 1):
        loss_train= 0.0
        num_trains = 0
        for train_batch in train_data_loader:
            input = train_batch['input']            #input, target = train_batch  이런 형태로 하면 input이 str 자료형이라는 오류가 생김
            target = train_batch['target'].view(-1)
            output_train = model(input)             #예측 시작
            loss = loss_fn(output_train, target)    #예측을 토대로 loss 계산
            
            loss_train += loss.item() # epoch동안 손실을 누적
            num_trains += 1
            
            #파라미터 최적화 진행
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        #validation에 대해서 train과 동일하게 적용해 validation_loss 계산
        loss_validation = 0.0
        num_validations = 0
        with torch.no_grad():
            for validation_batch in validation_data_loader:
                input = validation_batch['input']
                target = validation_batch['target'].view(-1)
                output_validation = model(input)
                loss = loss_fn(output_validation, target)
                loss_validation += loss.item()
                num_validations += 1
                
        #학습 결과로 나온 Loss를 wandb에 기록
        wandb.log({
            "Epoch": epoch,
            "Training Loss": loss_train/num_trains,
            "Validation Loss": loss_validation/num_validations,
        })
        
        if epoch >= next_print_epoch:
          print(
            f"Epoch {epoch}, "
            f"Training loss {loss_train / num_trains:.4f}, "
            f"Validation loss {loss_validation / num_validations:.4f}"
          )
          next_print_epoch += 100
    

In [11]:
def main(epochs=1000, batch_size=512, learning_rate=1e-3, activation_function=0):
    current_time_str = datetime.now().astimezone().strftime("%Y_%m_%d_%H_%M_%S")
    Activation_Function_list = [nn.ReLU(), nn.ELU(), nn.LeakyReLU(), nn.PReLU(), nn.SELU(), nn.Softplus(), nn.SiLU()]   #확인할 activation_function 리스트
    config = {
        'epochs': epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'n_hidden_unit_list': [30,30],
        'Activation_Function': Activation_Function_list[activation_function]
    }
    wandb.init( #wandb설정 값들
        mode = "online" if wandb else "disabled",
        project = "link_DL",
        notes = "HW2_Titanic",
        tags = ["my_model", "Titanic"],
        name = current_time_str,
        config = config
    )
    
    print(wandb.config)
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #학습을 할 device 확인 및 지정
    print(f"Device: {device}")

    
    #사용할 데이터를 전처리해 로드
    train_dataset, validation_dataset, test_dataset = get_preprocessed_dataset()
    
    linear_model, optimizer = get_model_and_optimizer() # 모델과 최적화 방법을 로드
    linear_model.to(device)
    
    train_data_loader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = True)
    validation_data_loader = DataLoader(dataset = validation_dataset, batch_size = batch_size, shuffle = True)

    
    training_loop( # 학습을 진행
        model= linear_model,
        optimizer= optimizer,
        train_data_loader= train_data_loader,
        validation_data_loader= validation_data_loader,
    )
    wandb.finish()
    
    return linear_model

In [67]:
if __name__ == "__main__":
    epochs = 1000
    batch_size = 16
    learning_rate = 1e-3
    
    models = []
    
    #확인할 활성화 함수 목록: [nn.ReLU(), nn.ELU(), nn.LeakyReLU(), nn.PReLU(), nn.SELU(), nn.Softplus(), nn.SiLU()]
    for activation_function in range(7):
        models.append(main(epochs, batch_size, learning_rate, activation_function))

{'epochs': 1000, 'batch_size': 16, 'learning_rate': 0.001, 'n_hidden_unit_list': [30, 30], 'Activation_Function': 'ReLU()'}
Device: cpu
Epoch 100, Training loss 0.5776, Validation loss 0.5619
Epoch 200, Training loss 0.5384, Validation loss 0.5531
Epoch 300, Training loss 0.5250, Validation loss 0.5078
Epoch 400, Training loss 0.5113, Validation loss 0.4384
Epoch 500, Training loss 0.4861, Validation loss 0.4972
Epoch 600, Training loss 0.4634, Validation loss 0.4491
Epoch 700, Training loss 0.4680, Validation loss 0.5176
Epoch 800, Training loss 0.4535, Validation loss 0.4724
Epoch 900, Training loss 0.4514, Validation loss 0.3899
Epoch 1000, Training loss 0.4656, Validation loss 0.4531


0,1
Epoch,▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
Training Loss,█▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▄▃▃▃▃▃▂▃▂▂▂▂▂▁▂▂▁▂▂▂▁▁▁▁
Validation Loss,▇▄▄▄▄▃▄▃▃▃▃▃█▂▃▃▂▄▂▂▃▂▃▃▂▂▂▂▂▂▂▂▂▄▂▁▁▂▂▁

0,1
Epoch,1000.0
Training Loss,0.46558
Validation Loss,0.45313


{'epochs': 1000, 'batch_size': 16, 'learning_rate': 0.001, 'n_hidden_unit_list': [30, 30], 'Activation_Function': 'ELU(alpha=1.0)'}
Device: cpu
Epoch 100, Training loss 0.5811, Validation loss 0.5659
Epoch 200, Training loss 0.5374, Validation loss 0.5549
Epoch 300, Training loss 0.5026, Validation loss 0.5440
Epoch 400, Training loss 0.4733, Validation loss 0.6037
Epoch 500, Training loss 0.4773, Validation loss 0.5289
Epoch 600, Training loss 0.4500, Validation loss 0.5404
Epoch 700, Training loss 0.4418, Validation loss 0.4524
Epoch 800, Training loss 0.4406, Validation loss 0.4709
Epoch 900, Training loss 0.4349, Validation loss 0.4536
Epoch 1000, Training loss 0.4284, Validation loss 0.4471


0,1
Epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇███
Training Loss,███▇▇▇▇▆▇▆▅▅▅▅▄▃▃▃▃▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁
Validation Loss,▅▅▅▄▄▅▄▄▅▅▆▄▃▄▄▄▄▃▃▃▃▃▃█▂▁▃▃▃▁▁▄▁▂▁▂▂▁▂▂

0,1
Epoch,1000.0
Training Loss,0.42842
Validation Loss,0.44708


{'epochs': 1000, 'batch_size': 16, 'learning_rate': 0.001, 'n_hidden_unit_list': [30, 30], 'Activation_Function': 'LeakyReLU(negative_slope=0.01)'}
Device: cpu
Epoch 100, Training loss 0.5831, Validation loss 0.5937
Epoch 200, Training loss 0.5582, Validation loss 0.5503
Epoch 300, Training loss 0.5239, Validation loss 0.6275
Epoch 400, Training loss 0.4938, Validation loss 0.5276
Epoch 500, Training loss 0.4789, Validation loss 0.5788
Epoch 600, Training loss 0.4744, Validation loss 0.5027
Epoch 700, Training loss 0.4468, Validation loss 0.4733
Epoch 800, Training loss 0.4454, Validation loss 0.5305
Epoch 900, Training loss 0.4361, Validation loss 0.4818
Epoch 1000, Training loss 0.4299, Validation loss 0.5562


0,1
Epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
Training Loss,██▇▇▆▆▆▆▅▅▅▅▅▅▄▃▃▃▄▃▂▃▃▃▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁
Validation Loss,▃▄▄▃▃▃▄▃▃▃▃▃▃▃▃▃▃▃▂▃▃▃▃▃▂▂▄▁▂█▂▁▇▂▁▁▂▂▂▁

0,1
Epoch,1000.0
Training Loss,0.42992
Validation Loss,0.55616


{'epochs': 1000, 'batch_size': 16, 'learning_rate': 0.001, 'n_hidden_unit_list': [30, 30], 'Activation_Function': 'PReLU(num_parameters=1)'}
Device: cpu
Epoch 100, Training loss 0.5650, Validation loss 0.6132
Epoch 200, Training loss 0.5214, Validation loss 0.5511
Epoch 300, Training loss 0.5031, Validation loss 0.5059
Epoch 400, Training loss 0.4715, Validation loss 0.6201
Epoch 500, Training loss 0.4817, Validation loss 0.4809
Epoch 600, Training loss 0.4486, Validation loss 0.5172
Epoch 700, Training loss 0.4585, Validation loss 0.5354
Epoch 800, Training loss 0.4324, Validation loss 0.4999
Epoch 900, Training loss 0.4368, Validation loss 0.4646
Epoch 1000, Training loss 0.4323, Validation loss 0.5219


0,1
Epoch,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇█
Training Loss,█▇▆▆▅▄▅▅▄▄▄▄▄▄▃▃▃▂▃▂▃▃▃▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▂▁
Validation Loss,▇▇▇██▆▇▇▆▅▅▅▂▃▅▂▄█▆▆▄▆▃▆▆▇▁▇▃▂▂▃▄▄▁▂▄▁▁▃

0,1
Epoch,1000.0
Training Loss,0.4323
Validation Loss,0.52187


{'epochs': 1000, 'batch_size': 16, 'learning_rate': 0.001, 'n_hidden_unit_list': [30, 30], 'Activation_Function': 'SELU()'}
Device: cpu
Epoch 100, Training loss 0.5753, Validation loss 0.6362
Epoch 200, Training loss 0.5456, Validation loss 0.5706
Epoch 300, Training loss 0.5042, Validation loss 0.6075
Epoch 400, Training loss 0.4810, Validation loss 0.6511
Epoch 500, Training loss 0.4514, Validation loss 0.5459
Epoch 600, Training loss 0.4526, Validation loss 0.4673
Epoch 700, Training loss 0.4395, Validation loss 0.4703
Epoch 800, Training loss 0.4325, Validation loss 0.4599
Epoch 900, Training loss 0.4237, Validation loss 0.5540
Epoch 1000, Training loss 0.4348, Validation loss 0.4952


0,1
Epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇██
Training Loss,██▇▇▇▆▆▆▆▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▂▁▁▂▂▁▁▁▁
Validation Loss,▇▆▆▆█▆▆▅▄▄▃▄▅▄▃▄▃▂▃▂▂▂▁▂▁▃▂▇▁▂▂▁▁▂▁▁▅▃▅▁

0,1
Epoch,1000.0
Training Loss,0.43484
Validation Loss,0.49523


{'epochs': 1000, 'batch_size': 16, 'learning_rate': 0.001, 'n_hidden_unit_list': [30, 30], 'Activation_Function': 'Softplus(beta=1.0, threshold=20.0)'}
Device: cpu
Epoch 100, Training loss 0.5801, Validation loss 0.6005
Epoch 200, Training loss 0.5357, Validation loss 0.5657
Epoch 300, Training loss 0.5015, Validation loss 0.5448
Epoch 400, Training loss 0.4862, Validation loss 0.4649
Epoch 500, Training loss 0.4721, Validation loss 0.4905
Epoch 600, Training loss 0.4491, Validation loss 0.4573
Epoch 700, Training loss 0.4422, Validation loss 0.4308
Epoch 800, Training loss 0.4589, Validation loss 0.4911
Epoch 900, Training loss 0.4426, Validation loss 0.4659
Epoch 1000, Training loss 0.4274, Validation loss 0.4885


0,1
Epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
Training Loss,███▇▇▇▇▇▆▆▆▆▆▆▅▄▄▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▁▂▂▂▂▁▁▁
Validation Loss,▅▆▆▅▇▅▅▅▄▅▄▄▄▅▅▃██▂▃▂▂▃▁▁▄▂▆▂▂▁▅▁▂▁▁▁▁▂▃

0,1
Epoch,1000.0
Training Loss,0.42743
Validation Loss,0.48853


{'epochs': 1000, 'batch_size': 16, 'learning_rate': 0.001, 'n_hidden_unit_list': [30, 30], 'Activation_Function': 'SiLU()'}
Device: cpu
Epoch 100, Training loss 0.5984, Validation loss 0.5659
Epoch 200, Training loss 0.5756, Validation loss 0.5402
Epoch 300, Training loss 0.5440, Validation loss 0.5144
Epoch 400, Training loss 0.5177, Validation loss 0.4767
Epoch 500, Training loss 0.4911, Validation loss 0.4307
Epoch 600, Training loss 0.4834, Validation loss 0.4354
Epoch 700, Training loss 0.4774, Validation loss 0.4178
Epoch 800, Training loss 0.4616, Validation loss 0.4551
Epoch 900, Training loss 0.4544, Validation loss 0.4084
Epoch 1000, Training loss 0.4514, Validation loss 0.4340


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Epoch,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
Training Loss,██▇▇▇▇▆▆▅▅▅▅▄▄▅▄▄▃▄▃▂▂▂▂▂▂▂▂▁▂▁▂▂▁▁▁▁▁▁▁
Validation Loss,█▇▆█▆▇▇▆▇▅█▅▄▅▄▂▂▃▁▂▃▂▁▂▄▁▂▅▄▁▁▂▁▁▂▁▃▂▂▂

0,1
Epoch,1000.0
Training Loss,0.45142
Validation Loss,0.43403


![wandb_activation_function_result](HW2_Titanic_Activation_Function.png)

[그림 1] 여러 activation_function별 결과



![wandb_config](HW2_Titanic_config.png)

그림 2] 2024_10_22_23_30_40의 config


epochs 1000을 기준으로 activation_function을 SiLU로 사용했을 때 validation_loss가 가장 작았다.
어느 시점에 epochs을 멈추는 것이 좋은지 알기 위해서 early stop을 이용해 overfitting이 발생하기 전의 모델을 사용해 test data를 확인하기로 했다.

In [42]:
class EarlyStopping: #모델을 저장하고, early stop 여부를 정하는 클래스
    def __init__(
            self, patience = 10, delta =0.0001, project_name = None, checkpoint_file_path = None, run_time_str = None
    ): #early_stop_patience, 얼마나 차이나는 것을 count 할것인가., 모델 저장 명, 파일을 저장할 경로, 모델 저장 명
        self.patience = patience
        self.counter = 0
        self.delta = delta
        self.val_loss_min = None
        self.file_path = os.path.join(
            checkpoint_file_path, f"{project_name}_checkpoint_{run_time_str}.pt"
        )
        self.latest_file_path = os.path.join(
            checkpoint_file_path, f"{project_name}_checkpoint_latest.pt"
        )
    
    def check_and_save(self, new_validation_loss, model): #val_loss를 확인하고, 작아졌는지 확인하고 count 여부를 정하며, 만약 counter가 patience보다 커지면 이를 알린다.
        early_stop = False
        message = None
        
        if self.val_loss_min is None:
            self.val_loss_min = new_validation_loss
            message =f"Early stopping is stated!"
        elif new_validation_loss < self.val_loss_min - self.delta:
            message =f'V_loss decreased ({self.val_loss_min:6.3f} --> {new_validation_loss:6.3f}). Saving model..'
            self.save_checkpoint(new_validation_loss, model)
            self.val_loss_min = new_validation_loss
            self.counter = 0
        else:
            self.counter += 1
            message = f'Early stopping counter: {self.counter} out of {self.patience}'
            if self.counter >= self.patience:
                early_stop = True
                message += "*** TRAIN EARLY STOPPED! ***"
            
        return message, early_stop
    
    def save_checkpoint(self, val_loss, model): #모델을 저장하는 함수
        torch.save(model.state_dict(), self.file_path)
        torch.save(model.state_dict(), self.latest_file_path)
        self.val_loss_min = val_loss
        

In [49]:
class ClassificationTrainer: # 모델의 학습을 담당하는 클래스
    def __init__(
            self, train_data_loader, validation_data_loader, test_data_loader, run_time_str, wandb, device, checkpoint_file_path
    ):
        self.train_data_loader = train_data_loader
        self.validation_data_loader = validation_data_loader
        self.test_data_loader = test_data_loader
        self.run_time_str = run_time_str
        self.wandb = wandb
        self.device = device
        self.checkpoint_file_path = checkpoint_file_path
        self.loss_fn = nn.CrossEntropyLoss()
        self.model, self.optimizer = get_model_and_optimizer()
        self.model.to(self.device)
        
        
    def do_train(self): # training 과정
        self.model.train()
        
        loss_train = 0.0
        num_corrects_train = 0
        num_trained_samples = 0
        num_trains = 0
        
        for train_batch in self.train_data_loader:
            input_train = train_batch["input"]
            target_train = train_batch["target"].view(-1)
            input_train = input_train.to(device=self.device)
            target_train = target_train.to(device = self.device)
            
            output_train = self.model(input_train)
            loss = self.loss_fn(output_train, target_train)
            loss_train += loss.item()
        
            predicted_train = torch.argmax(output_train, dim =1 )
            
            num_corrects_train += torch.sum(torch.eq(predicted_train, target_train)).item()
            num_trained_samples += len(input_train)
            num_trains += 1
            
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
        train_loss = loss_train / num_trains
        train_accuracy = 100.0 * num_corrects_train / num_trained_samples
        return train_loss, train_accuracy
    
    def do_validation(self): # validation 과정
        self.model.eval()
        
        loss_validation = 0.0
        num_corrects_validation =0
        num_validated_samples = 0
        num_validations =0 
        
        with torch.no_grad():
            for validation_batch in self.validation_data_loader:
                input_validation = validation_batch["input"]
                target_validation = validation_batch["target"].view(-1)
                input_validation = input_validation.to(device = self.device)
                target_validation = target_validation.to(device = self.device)
                
                output_validation = self.model(input_validation)
                loss_validation += self.loss_fn(output_validation, target_validation).item()
                
                predicted_validation = torch.argmax(output_validation, dim = 1)
                num_corrects_validation += torch.sum(torch.eq(predicted_validation, target_validation)).item()
                num_validated_samples += len(input_validation)
                num_validations += 1
                
            validation_loss = loss_validation / num_validations
            validation_accuracy = 100.0 * num_corrects_validation / num_validated_samples
            
            return validation_loss, validation_accuracy
        
    def train_loop(self): # 위에서 만든 do_train과 do_validation을 이용해 학습
        early_stopping = EarlyStopping(
            patience = self.wandb.config.early_stop_patience, project_name = self.wandb.config.project_name, checkpoint_file_path = self.checkpoint_file_path, run_time_str = self.run_time_str
        )
        n_epochs = self.wandb.config.epochs
        
        for epoch in range(1, n_epochs + 1):
            train_loss, train_accuracy = self.do_train()
            
            if epoch == 1 or epoch % self.wandb.config.validation_intervals == 0:
                validation_loss, validation_accuracy = self.do_validation()
                
                message, early_stop = early_stopping.check_and_save(validation_loss, self.model)
                
                print(
                    f"[Epoch {epoch: >3}]"
                    f"T_loss: {train_loss:6.3f},"
                    f"T_accuracy: {train_accuracy:6.3f} | "
                    f"V_loss: {validation_loss:6.3f}, "
                    f"V_accuracy: {validation_accuracy:6.3f} | "
                    f"{message} | "
                )
                
                self.wandb.log({
                    "Epoch": epoch,
                    "Training loss": train_loss,
                    "Training accuracy (%)": train_accuracy,
                    "Validation loss": validation_loss,
                    "Validation accuracy (%)": validation_accuracy,
                })
                
                if early_stop:
                    break
                    
    def do_test(self): # 학습 후 만들어진 모델을 이용해서 test_data를 예측한 결과를 csv파일로 내보내는 함수
        self.model.eval()
        
        predictions = []
        indices = []
        
        with torch.no_grad():
            test_batch = next(iter(self.test_data_loader))
            output_test = self.model(test_batch["input"])
            predicted_test = torch.argmax(output_test, dim= 1)
            for idx, prediction in enumerate(predicted_test, start= 892):
                predictions.append(prediction.item())
                indices.append(idx)
        results_df = pd.DataFrame({
            'PassengerId': indices,
            'Survived': predictions
        })
        
        results_df.to_csv('submission.csv', index=False)
        

In [50]:
def main(epochs=1000, batch_size=512, learning_rate=1e-3, early_stop_patience = 10, validation_intervals = 10):
    current_time_str = datetime.now().astimezone().strftime("%Y_%m_%d_%H_%M_%S")
    CHECKPOINT_FILE_PATH = os.path.join(os.getcwd(), "checkpoints")
    config = {
        'epochs': epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'n_hidden_unit_list': [30,30],
        'Activation_Function': nn.SiLU(),
        'early_stop_patience': early_stop_patience,
        'validation_intervals': validation_intervals,
        'project_name': "Titanic"
    }
    wandb.init( #wandb설정 값들
        mode = "online" if wandb else "disabled",
        project = "link_DL_titanic",
        notes = "HW2_Titanic",
        tags = ["my_model", "Titanic", "SiLU"],
        name = current_time_str,
        config = config
    )
    
    print(wandb.config)
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    
    #데이터, 모델, 최적화 방법 로드
    train_dataset, validation_dataset, test_dataset = get_preprocessed_dataset()
    
    linear_model, optimizer = get_model_and_optimizer()
    linear_model.to(device)
    
    train_data_loader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = True)
    validation_data_loader = DataLoader(dataset = validation_dataset, batch_size = batch_size, shuffle = True)
    test_data_loader = DataLoader(dataset = test_dataset, batch_size = len(test_dataset))
    
    classification_trainer = ClassificationTrainer(
        train_data_loader=train_data_loader, 
        validation_data_loader=validation_data_loader, 
        test_data_loader = test_data_loader,
        run_time_str=current_time_str, 
        wandb=wandb, device=device, 
        checkpoint_file_path=CHECKPOINT_FILE_PATH
    )
    classification_trainer.train_loop()
    
    wandb.finish()
    
    classification_trainer.do_test()
    
    return linear_model

In [45]:
if __name__ == "__main__":
    epochs = 1000
    batch_size = 16
    learning_rate = 1e-3
    early_stop_patience = 10
    validation_intervals =10
    
    model = main(epochs, batch_size, learning_rate, early_stop_patience, validation_intervals)

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Epoch,▁
Training accuracy (%),▁
Training loss,▁
Validation accuracy (%),▁
Validation loss,▁

0,1
Epoch,1.0
Training accuracy (%),64.65638
Training loss,0.67124
Validation accuracy (%),64.04494
Validation loss,0.63988


{'epochs': 1000, 'batch_size': 16, 'learning_rate': 0.001, 'n_hidden_unit_list': [30, 30], 'Activation_Function': 'SiLU()', 'early_stop_patience': 10, 'validation_intervals': 10, 'project_name': 'Titanic'}
Device: cpu
[Epoch   1]T_loss:  0.990,T_accuracy: 47.546 | V_loss:  0.647, V_accuracy: 69.101 | Early stopping is stated! | 
[Epoch  10]T_loss:  0.600,T_accuracy: 68.864 | V_loss:  0.592, V_accuracy: 69.663 | V_loss decreased ( 0.647 -->  0.592). Saving model.. | 
[Epoch  20]T_loss:  0.600,T_accuracy: 69.425 | V_loss:  0.613, V_accuracy: 67.978 | Early stopping counter: 1 out of 10 | 
[Epoch  30]T_loss:  0.592,T_accuracy: 68.163 | V_loss:  0.611, V_accuracy: 67.416 | Early stopping counter: 2 out of 10 | 
[Epoch  40]T_loss:  0.596,T_accuracy: 69.846 | V_loss:  0.642, V_accuracy: 67.416 | Early stopping counter: 3 out of 10 | 
[Epoch  50]T_loss:  0.590,T_accuracy: 68.864 | V_loss:  0.604, V_accuracy: 65.730 | Early stopping counter: 4 out of 10 | 
[Epoch  60]T_loss:  0.589,T_accuracy:

0,1
Epoch,▁▁▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇██
Training accuracy (%),▁▇▇▇▇▇▇▇▇▇█████████
Training loss,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
Validation accuracy (%),▅▆▄▃▃▁▄▅▄▅▅▆▇▇▆▇▇█▇
Validation loss,█▂▄▄▇▄▄▄▁▂▄▇▂▂▂▃▅▄▄

0,1
Epoch,180.0
Training accuracy (%),72.65077
Training loss,0.5557
Validation accuracy (%),70.22472
Validation loss,0.60512


![Early_stop1](HW2_Titanic_Early_stop1.png)

[그림 3] early_stop적용 결과
 
early_stop_patience = 10
validation_intervals = 10
인 경우로 18epoch만에 학습이 종료되었고, 8epoch의 경우가 최적의 epoch으로 나왔지만, 이전의 validation_loss 와 비교했을 때 더 큰값을 가지고 있었다.

early_stop_patience를 100으로 늘리고, epochs을 10000으로 변경해서 테스트 진행

In [48]:
if __name__ == "__main__":
    epochs = 10000
    batch_size = 16
    learning_rate = 1e-3
    early_stop_patience = 100
    validation_intervals =10
    
    model = main(epochs, batch_size, learning_rate, early_stop_patience, validation_intervals)

{'epochs': 10000, 'batch_size': 16, 'learning_rate': 0.001, 'n_hidden_unit_list': [30, 30], 'Activation_Function': 'SiLU()', 'early_stop_patience': 100, 'validation_intervals': 10, 'project_name': 'Titanic'}
Device: cpu
[Epoch   1]T_loss:  0.695,T_accuracy: 58.205 | V_loss:  0.709, V_accuracy: 69.663 | Early stopping is stated! | 
[Epoch  10]T_loss:  0.613,T_accuracy: 68.583 | V_loss:  0.587, V_accuracy: 69.663 | V_loss decreased ( 0.709 -->  0.587). Saving model.. | 
[Epoch  20]T_loss:  0.613,T_accuracy: 69.004 | V_loss:  0.601, V_accuracy: 66.854 | Early stopping counter: 1 out of 100 | 
[Epoch  30]T_loss:  0.605,T_accuracy: 69.986 | V_loss:  0.614, V_accuracy: 70.225 | Early stopping counter: 2 out of 100 | 
[Epoch  40]T_loss:  0.602,T_accuracy: 70.266 | V_loss:  0.628, V_accuracy: 70.225 | Early stopping counter: 3 out of 100 | 
[Epoch  50]T_loss:  0.599,T_accuracy: 69.986 | V_loss:  0.564, V_accuracy: 70.225 | V_loss decreased ( 0.587 -->  0.564). Saving model.. | 
[Epoch  60]T_lo

0,1
Epoch,▁▁▁▁▁▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▇▇▇▇▇█████
Training accuracy (%),▁▄▄▅▄▅▅▆▇▆▇▇▇▇█▇▇▇██▇██▇███▇████████████
Training loss,██▇▇▆▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁
Validation accuracy (%),▁▁▂▁▂▆▄▆▆▇▇▅▇▅▅▅▅█▅▇█▇▄▄▄▄▅▇▄█▅▅▆▅██▆▅▅▇
Validation loss,███▃▃▂▂▅▃▃▂▇▅▂▃▃▄▆▃▂▄▁▇▂▂▃▂▂▁▇▁▅▂▄▄▅▆▄█▅

0,1
Epoch,2670.0
Training accuracy (%),82.32819
Training loss,0.40477
Validation accuracy (%),76.40449
Validation loss,0.51743


![Titanic_early_stop2](HW2_Titanic_Early_stop2.png)

[그림 4] 수정 후 early_stop


# Kaggle 제출 결과
![Kaggle_score1](Kaggle_score.png)

# 숙제 후기
어째서인지 모르겠지만, 코드를 그대로 따라했을 때 train_loop에서 'input, target = train_batch'를 한 경우 input이 'str' type으로 나와서 오류가 발생하는 문제가 발생했다. 어때서 이렇게 되는 것인지 궁금합니다. 