## 데이터 분할

현재 원본 데이터 셋은 Rawdata에 저장되어 있으며, 각 이미지의 분류 클래스가 폴더로 구분되어 있는 형태 <br>
_e.g. Apple\_\_healthy, Apple\_\_Black\_rot_ <br><br>
이 데이터 셋을 train, val, test 세 데이터 셋으로 분할 해야함

### [데이터 분할을 위한 폴더 생성]

In [2]:
import os
import shutil

original_dataset_dir = './Rawdata'
classes_list = os.listdir(original_dataset_dir)

base_dir = './dataset'

if not os.path.isdir(base_dir):
    os.mkdir(base_dir)

train_dir = os.path.join(base_dir, 'train')
val_dir = os.path.join(base_dir, 'val')
test_dir = os.path.join(base_dir, 'test')

if not os.path.isdir(train_dir):
    os.mkdir(train_dir)
if not os.path.isdir(val_dir):
    os.mkdir(val_dir)
if not os.path.isdir(test_dir):
    os.mkdir(test_dir)

for clss in classes_list:
    clss_dir = os.path.join(train_dir, clss)
    if not os.path.isdir(clss_dir):
        os.mkdir(clss_dir)
    clss_dir = os.path.join(val_dir, clss)
    if not os.path.isdir(clss_dir):
        os.mkdir(clss_dir)
    clss_dir = os.path.join(test_dir, clss)
    if not os.path.isdir(clss_dir):
        os.mkdir(clss_dir)

### [데이터 분할과 클래스별 데이터 수 확인]

In [3]:
import math

print(classes_list)

for clss in classes_list:
    if (clss == '.DS_Store'): continue
    path = os.path.join(original_dataset_dir, clss)
    fnames = os.listdir(path)

    train_size = math.floor(len(fnames) * 0.6)
    val_size = math.floor(len(fnames) * 0.2)
    test_size = math.floor(len(fnames) * 0.2)

    train_fnames = fnames[:train_size]
    print('Train size(', clss, '): ', len(train_fnames))
    for fname in train_fnames:
        src = os.path.join(path, fname)
        dst = os.path.join(os.path.join(train_dir, clss), fname)
        if not os.path.exists(dst):
            shutil.copyfile(src, dst)
    
    val_fnames = fnames[train_size:(train_size+val_size)]
    print('Val size(', clss, '): ', len(val_fnames))
    for fname in val_fnames:
        src = os.path.join(path, fname)
        dst = os.path.join(os.path.join(val_dir, clss), fname)
        if not os.path.exists(dst):
            shutil.copyfile(src, dst)
    
    test_fnames = fnames[(train_size+val_size):]
    print('Test size(', clss, '): ', len(test_fnames))
    for fname in test_fnames:
        src = os.path.join(path, fname)
        dst = os.path.join(os.path.join(test_dir, clss), fname)
        if not os.path.exists(dst):
            shutil.copyfile(src, dst)


['Strawberry___healthy', 'Grape___Black_rot', 'Potato___Early_blight', 'Cherry___Powdery_mildew', 'Tomato___Target_Spot', '.DS_Store', 'Peach___healthy', 'Potato___Late_blight', 'Tomato___Late_blight', 'Tomato___Tomato_mosaic_virus', 'Pepper,_bell___healthy', 'Tomato___Leaf_Mold', 'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)', 'Apple___Cedar_apple_rust', 'Tomato___Bacterial_spot', 'Grape___healthy', 'Corn___Cercospora_leaf_spot Gray_leaf_spot', 'Tomato___Early_blight', 'Grape___Esca_(Black_Measles)', 'Tomato___healthy', 'Corn___Northern_Leaf_Blight', 'Tomato___Tomato_Yellow_Leaf_Curl_Virus', 'Cherry___healthy', 'Apple___Apple_scab', 'Tomato___Spider_mites Two-spotted_spider_mite', 'Corn___Common_rust', 'Peach___Bacterial_spot', 'Pepper,_bell___Bacterial_spot', 'Tomato___Septoria_leaf_spot', 'Corn___healthy', 'Apple___Black_rot', 'Apple___healthy', 'Strawberry___Leaf_scorch', 'Potato___healthy']
Train size( Strawberry___healthy ):  273
Val size( Strawberry___healthy ):  91
Test size( Str

### [베이스라인 모델 학습을 위한 준비]

_Rawdata -> Rawdata Processing -> Dataset -> DataLoader_

_num\_workers 인수는 GPU와 관련이 있다._

In [7]:
import torch

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda' if USE_CUDA else 'cpu')

BATCH_SIZE = 256
EPOCH = 30

import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder

# transforms.Compose 함수는 이미지 전처리에 사용됨
# Image Augmentation이나 Rotation 할때도 사용됨
transform_base = transforms.Compose([transforms.Resize((64, 64)), transforms.ToTensor()]) 

# ImageFolder 함수는 하나의 클래스가 하나의 폴더에 대응될때 사용됨
train_ds = ImageFolder(root='./dataset/train', transform=transform_base)
val_ds = ImageFolder(root='./dataset/val', transform=transform_base)

from torch.utils.data import DataLoader

train_loader = torch.utils.data.DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size = BATCH_SIZE, shuffle=True, num_workers=4)

In [11]:
first_batch = train_loader.__iter__().__next__()

print('{:15s} | {:<25s} | {}'.format('name', 'type', 'size'))
print('{:15s} | {:<25s} | {}'.format('Num of Batch', '', len(train_loader)))
print('{:15s} | {:<25s} | {}'.format('first_batch', str(type(first_batch)), len(first_batch)))
print('{:15s} | {:<25s} | {}'.format('first_batch[0]', str(type(first_batch[0])), first_batch[0].shape))
print('{:15s} | {:<25s} | {}'.format('first_batch[1]', str(type(first_batch[1])), first_batch[1].shape))

name            | type                      | size
Num of Batch    |                           | 94
first_batch     | <class 'list'>            | 2
first_batch[0]  | <class 'torch.Tensor'>    | torch.Size([256, 3, 64, 64])
first_batch[1]  | <class 'torch.Tensor'>    | torch.Size([256])


### [베이스라인 모델 설계]

In [10]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 64, 3, padding=1)

        self.fc1 = nn.Linear(4096, 512)
        self.fc2 = nn.Linear(512, 33)
    
    def forward(self, x):

        x = self.conv1(x)
        x = F.relu(x)
        x = self.pool(x)
        x = F.dropout(x, p=0.25, training=self.training)

        x = self.conv2(x)
        x = F.relu(x)
        x = self.pool(x)
        x = F.dropout(x, p=0.25, training=self.training)

        x = self.conv3(x)
        x = F.relu(x)
        x = self.pool(x)
        x = F.dropout(x, p=0.25, training=self.training)

        x = x.view(-1, 4096) # Flatten
        x = self.fc1(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

model_base = Net().to(DEVICE)
optimizer = optim.Adam(model_base.parameters(), lr=0.001)



### [모델 학습을 위한 함수]

In [12]:
def train(model, train_loader, optimizer):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(DEVICE), target.to(DEVICE)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

### [모델 평가를 위한 함수]

In [13]:
def evaluate(model, test_loader):
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad:
        for data, target in test_loader:
            data, target = data.to(DEVICE), target.to(DEVICE)
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()

            pred = output.max(1, keepdim=True)[1]

            print(pred.shape)
            break

            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset) # test dataset의 전체 loss
    test_acc = 100 * correct / len(test_loader.dataset)
    return test_loss, test_acc

### [모델 학습 실행하기]

In [None]:
import time
