### dataset 클래스
- 다음과 같이 데이터 로딩함
    * Data File
    * DataFrame, Numpy (전처리)
    * Tensor
    * Dataset (피쳐+타켓)
    * Dataloader (데이터 로드 및 배치 처리)

### 데이터셋 & 데이터로더 살펴보기
- Pytorch 에서 배치크기만 데이터를 조절하기 위한 메커니즘
- Dataset : 사용 데이터를 기반으로 사용자 정의 클래스 작성
- Dataloader : 지정된 Dataset 에서 지정된 batch size 만큼 피쳐와 타켓을 추출하여 전달


### (1) 모듈 로딩 및 데이터 준비
<hr>

In [1]:
### 모듈 로딩
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
array = [[10,20,30], [20,30,40], [40,50,60], [50,60,70] , [70,80,90]]
x_data = torch.IntTensor(array, device=DEVICE)
y_data = torch.FloatTensor([[np.mean(x)] for x in array], device=DEVICE)

In [4]:
x_data.shape, x_data.ndim, y_data.shape, y_data.ndim

(torch.Size([5, 3]), 2, torch.Size([5, 1]), 2)

### (2) 데이터셋 생성
<hr>

- #### (2-1) TensorDataset 활용 : Dataset의 sub_class

In [5]:
# TensorDataset 클래스 로딩
from torch.utils.data import TensorDataset

In [6]:
dataset = TensorDataset(x_data, y_data)
dataset

<torch.utils.data.dataset.TensorDataset at 0x764ec819cb20>

In [7]:
## __getitem()__()가 호출됨

dataset[0], dataset.__getitem__(0), dataset.__len__(), dataset.tensors

((tensor([10, 20, 30], dtype=torch.int32), tensor([20.])),
 (tensor([10, 20, 30], dtype=torch.int32), tensor([20.])),
 5,
 (tensor([[10, 20, 30],
          [20, 30, 40],
          [40, 50, 60],
          [50, 60, 70],
          [70, 80, 90]], dtype=torch.int32),
  tensor([[20.],
          [30.],
          [50.],
          [60.],
          [80.]])))

#### [2-2] 사용자 정의 데이터셋 생성

In [8]:
### 데이터준비
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True, as_frame=False)
data = load_iris(as_frame=True)

- 만약 타켓 이름을 숫자로 바꾸고 싶은 경우 LabelEncoder, cat.codes 등을 활용 

In [9]:
df = data['frame']
featuredf = df[df.columns[0:-1]]
targetdf = df[df.columns[-1]].to_numpy().reshape(-1,1)

In [10]:
Xt = torch.FloatTensor(X, device=DEVICE)
yt = torch.IntTensor(y, device=DEVICE)
# Xt, yt

In [11]:
### 사용자정의 DataSet 클래스
# 데이터의 Tensor 변환
class CustomDataset(Dataset):

    # 초기화 함수
    def __init__(self, X, y):
        # 부모 객체로 초기화
        super().__init__()

        # x, y 데이터 ==> ndarray
        if isinstance(X, pd.DataFrame) and isinstance(y, pd.Series):
            self.X = X.values
            self.y = y.values
        else:
            self.X = X
            self.y = y

        # ndarray ===> tensor화
        self.X = torch.tensor(X, device=DEVICE, dtype=torch.float).to(DEVICE)
        self.y = torch.tensor(y, device=DEVICE, dtype=torch.long).reshape(-1,).to(DEVICE)


    # 데이터셋의 갯수 체크 함수
    def __len__(self):
        return self.X.shape[0]

    # 특정 인덱스 데이터 + 라벨 반환 콜백함수 (callback function)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = CustomDataset(X, targetdf)
loader = DataLoader(dataset, batch_size=32, shuffle=True)



#### (2-2) 데이터셋 생성

In [12]:
dataset[0]

(tensor([5.1000, 3.5000, 1.4000, 0.2000]), tensor(0))

#### [2-3] 학습용, 검증용, 테스트용 Dataset 분할

<hr>

In [13]:
### => PyTorch
from torch.utils.data import random_split

# 학습용, 검증용, 테스트 데이터 비율 = 7 : 1 : 2
train_size = int(0.7 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# 비율을 제공하여 3개 데이터로 나눌 수 있다.
seed = torch.Generator().manual_seed(42)
train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size], seed)


train_set.__len__(), val_set.__len__(), test_set.__len__()

(105, 15, 30)

In [14]:
[print(x) for x in [ '데이터의 원래 인덱스 => ',train_set.indices, val_set.indices, test_set.indices]]

데이터의 원래 인덱스 => 
[42, 95, 30, 64, 52, 35, 130, 40, 82, 17, 108, 94, 68, 97, 117, 127, 41, 44, 57, 140, 149, 32, 23, 102, 16, 113, 71, 18, 67, 66, 0, 25, 101, 112, 91, 3, 59, 116, 86, 84, 106, 142, 43, 39, 26, 98, 93, 20, 87, 19, 120, 114, 7, 63, 76, 89, 36, 45, 37, 56, 58, 122, 51, 145, 24, 21, 105, 62, 15, 11, 48, 133, 88, 50, 6, 134, 111, 8, 49, 75, 69, 124, 4, 147, 80, 100, 99, 141, 47, 107, 13, 109, 129, 28, 38, 53, 121, 5, 55, 31, 73, 74, 54, 29, 12]
[22, 104, 81, 1, 103, 125, 85, 2, 96, 128, 27, 118, 77, 110, 146]
[72, 139, 131, 60, 65, 92, 135, 83, 14, 34, 137, 10, 119, 9, 148, 79, 78, 70, 144, 143, 123, 115, 61, 132, 90, 46, 126, 136, 33, 138]


[None, None, None, None]

In [15]:
for y in (print(x) for x in [x for x in torch.rand(10)]): pass

tensor(0.0756)
tensor(0.4418)
tensor(0.5369)
tensor(0.2991)
tensor(0.2700)
tensor(0.5641)
tensor(0.4180)
tensor(0.9769)
tensor(0.4710)
tensor(0.2614)


### (3) DataLoader 생성

- 학습용
- 검증용
- 테스트용

위 세가지 종류의 데이터로 나누어야함

<hr>

In [16]:
SIZE = 10
trainDL = DataLoader(train_set, batch_size=SIZE, shuffle=False, drop_last=False)
valDL = DataLoader(val_set, batch_size=SIZE, shuffle=True, drop_last=False) # drop last를 통해 epoch 개수보다 모자른 데이터 학습을 제외시킬 수 있음
testDL = DataLoader(test_set, batch_size=SIZE, shuffle=True)


In [17]:
# epoch당 반복단위 ==> iterator
# DataLoader는 iterating 시 epoch size 행만큼의 x, y가 나옴
for _, (x, y) in enumerate(trainDL):
    ## 로더에서 가지고온 데이터 만큼 학습 진행
    print(f'{_ + 1}번째')
    print(torch.concat([x, y.reshape(-1,1)], dim=1))

1번째
tensor([[4.4000, 3.2000, 1.3000, 0.2000, 0.0000],
        [5.7000, 3.0000, 4.2000, 1.2000, 1.0000],
        [4.8000, 3.1000, 1.6000, 0.2000, 0.0000],
        [5.6000, 2.9000, 3.6000, 1.3000, 1.0000],
        [6.9000, 3.1000, 4.9000, 1.5000, 1.0000],
        [5.0000, 3.2000, 1.2000, 0.2000, 0.0000],
        [7.4000, 2.8000, 6.1000, 1.9000, 2.0000],
        [5.0000, 3.5000, 1.3000, 0.3000, 0.0000],
        [5.8000, 2.7000, 3.9000, 1.2000, 1.0000],
        [5.1000, 3.5000, 1.4000, 0.3000, 0.0000]])
2번째
tensor([[6.7000, 2.5000, 5.8000, 1.8000, 2.0000],
        [5.6000, 2.7000, 4.2000, 1.3000, 1.0000],
        [6.2000, 2.2000, 4.5000, 1.5000, 1.0000],
        [6.2000, 2.9000, 4.3000, 1.3000, 1.0000],
        [7.7000, 3.8000, 6.7000, 2.2000, 2.0000],
        [6.1000, 3.0000, 4.9000, 1.8000, 2.0000],
        [4.5000, 2.3000, 1.3000, 0.3000, 0.0000],
        [5.1000, 3.8000, 1.9000, 0.4000, 0.0000],
        [4.9000, 2.4000, 3.3000, 1.0000, 1.0000],
        [6.7000, 3.1000, 5.6000, 2.4000, 

### (4) 모델 클래스 정의
<hr>

- 입/출력 피쳐수
- 층 수 
- 타켓 출력
- 은닉층의 노드수


  

- 구조 설계
    * 입력층 : 입력 <= 피쳐 갯수, iris 4개
    * 은닉층 : 사용자가 임의로 정의
    * 출력층 : 출력 <= [분류] 타켓 클래스 갯수 [회귀] 1개


In [18]:
class CustomModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.seq = nn.Sequential()
        self.seq.add_module('fc1', nn.Linear(input_size, hidden_size, device=DEVICE))
        self.seq.add_module('fc1_act', nn.ReLU())
        self.seq.add_module('fc2', nn.Linear(hidden_size, num_classes, device=DEVICE))

        # self.fc1 = nn.Linear(input_size, hidden_size)
        # self.fc1_act = nn.ReLU()
        # self.fc2 = nn.Linear(hidden_size, num_classes)
        # self.fc2_act = nn.Softmax(dim=1)

    def forward(self, x):
        # x = self.fc1(x)
        # x = self.fc1_act(x)
        # x = self.fc2(x)
        # x = self.fc2_act(x)
        x = self.seq(x)
        return x 



### (5) 학습 준비 : 실행디바이스, 모델, 최적화, 손실함수, 학습횟수, 학습함수, 평가함수, 예측함수
<hr>


In [19]:
# 실행 디바이스 설정
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 학습횟수
EPOCHS = 10000

# 모델 인스턴스
flower = CustomModel(featuredf.shape[1], 12, np.unique(targetdf).size)

# 손실함수 
LOSS_FN = nn.CrossEntropyLoss().to(DEVICE)

In [20]:
import torchmetrics

metric = torchmetrics.classification.MulticlassAccuracy(num_classes=3, average='micro')
metric_train = torchmetrics.classification.MulticlassAccuracy(num_classes=3, average='micro')

def training(model, loss_fn, optimizer, train_dl, epochs=EPOCHS):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', patience=5, eps=1e-8)

    tmptrainacc = []
    for _, (x, y) in enumerate(train_dl):
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        # y = nn.functional.one_hot(y, 3).float()

        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        acc = metric(y_pred, y)
        tmptrainacc.append(acc)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    
    tmpacc = []
    for _, (x_test, y_test) in enumerate(testDL):
        x_test = x_test.to(DEVICE)
        y_test = y_test.to(DEVICE)
        # y_test = nn.functional.one_hot(y_test, 3).float()

        y_test_predict = model(x_test)
        acc = metric(y_test_predict, y_test)
        tmpacc.append(acc.item())
    y_test_predict = model(x_test.to(DEVICE))
    print(f"{y.shape, y_pred.shape}")
    
    # print(f'Epoch: {epoch+1}, Loss: {loss.item()}, Accuracy : train = {np.mean(tmptrainacc)} test = {np.mean(tmpacc)}')

    return loss

training(flower, LOSS_FN, torch.optim.Adam, trainDL, 10)

flower.eval()

(torch.Size([5]), torch.Size([5, 3]))


CustomModel(
  (seq): Sequential(
    (fc1): Linear(in_features=4, out_features=12, bias=True)
    (fc1_act): ReLU()
    (fc2): Linear(in_features=12, out_features=3, bias=True)
  )
)

In [21]:
### 검증 및 평가 진행함수
# 매개변수 dataLoader : 검증 또는 테스트 데이터셋에 대한 Loader
# 

def testing(model, loss_fn, dataloader):

    model.eval()
    with torch.no_grad():

        tmptrainacc = []
        for _, (x, y) in enumerate(dataloader):
            x = x.to(DEVICE)
            y = y.to(DEVICE)
            # y = nn.functional.one_hot(y, 3).float()

            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            tmptrainacc.append(loss)

    return np.mean(tmptrainacc)      
        # tmpacc = []
        # for _, (x_test, y_test) in enumerate(dataloader):
        #     x_test = x_test.to(DEVICE)
        #     y_test = y_test.to(DEVICE)
        #     # y_test = nn.functional.one_hot(y_test, 3).float()

        #     y_test_predict = model(x_test)

        #     tmpacc.append(acc.item())
        # y_test_predict = model(x_test.to(DEVICE))
        # print(f"{y.shape, y_pred.shape}")
        
        # print(f'Epoch: {epoch+1}, Loss: {loss.item()}, Accuracy : train = {np.mean(tmptrainacc)} test = {np.mean(tmpacc)}')

In [22]:
## == > 지정된 횟수 만큼 처음부터 끝까지 학습 및 검증 진행
## == > 목표 : 최적(Error 최소화)의 W, b를 가진 모델 완성
## == > 

valList = []
for eps in range(EPOCHS):
    train_loss = training(flower, LOSS_FN, torch.optim.Adam, trainDL, 1)
    val_loss = testing(flower, LOSS_FN, valDL)

    print(f'Epoch: {eps+1}, Loss : train = {train_loss} test = {val_loss}')
    # 조기 종료 기준 ==> 조건 : val_loss가 지정된 횟수 (예: 5) 이상 개선이 안되면 학습 종료
    valList.append(val_loss)
    if np.mean(valList[-5:]) > np.mean(valList[-10:-5]):
        break

(torch.Size([5]), torch.Size([5, 3]))
Epoch: 1, Loss : train = 0.7861799001693726 test = 0.8946115970611572
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 2, Loss : train = 0.7295676469802856 test = 0.6560657024383545
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 3, Loss : train = 0.5609229803085327 test = 0.5008491277694702
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 4, Loss : train = 0.46419721841812134 test = 0.4513162672519684
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 5, Loss : train = 0.3899132311344147 test = 0.357729434967041
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 6, Loss : train = 0.3366197943687439 test = 0.31142929196357727
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 7, Loss : train = 0.293018639087677 test = 0.23709315061569214
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 8, Loss : train = 0.2463933229446411 test = 0.20581971108913422
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 9, Loss : train = 0.20545415580272675 test = 0.20599254965782166
(torch.Size([5]), torch.

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


(torch.Size([5]), torch.Size([5, 3]))
Epoch: 19, Loss : train = 0.1123715415596962 test = 0.08442850410938263
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 20, Loss : train = 0.09909794479608536 test = 0.08671922236680984
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 21, Loss : train = 0.09178783744573593 test = 0.09012433886528015
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 22, Loss : train = 0.08629320561885834 test = 0.09874676167964935
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 23, Loss : train = 0.05255081504583359 test = 0.12854909896850586
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 24, Loss : train = 0.07656274735927582 test = 0.1044299453496933
(torch.Size([5]), torch.Size([5, 3]))
Epoch: 25, Loss : train = 0.04362393915653229 test = 0.13350088894367218
