In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import os
import shutil
import time
import pandas as pd
import random

In [2]:
# Input layout: root of the original data set and the entries inside it.
data_dir = './data/dog-breed-identification'
label_file = 'labels.csv'
train_dir = 'train'
test_dir = 'test'

# Root directory that receives the processed (train/valid) copy of the data.
new_data_dir = './data/train_valid'

# Fraction of the training files that is set aside for validation.
valid_ratio = 0.1

# Seed Python's RNG and PyTorch (CPU and CUDA) with a fixed value so the
# shuffle and any later torch randomness start from a known state.
random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

def mkdir_if_not_exist(path):
    """Create the directory named by joining the components of *path*.

    Missing parent directories are created as well. Nothing happens if the
    directory already exists.

    Args:
        path: Sequence of path components; they are joined with
            ``os.path.join`` to form the directory to create.

    Raises:
        FileExistsError: If the target exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same directory in between.
    os.makedirs(os.path.join(*path), exist_ok=True)
        
def reorg_dog_data(data_dir, label_file, train_dir, test_dir, new_data_dir, valid_ratio):
    """Copy the labelled training images into per-label train/valid folders.

    Reads the label CSV, shuffles the training files, and copies the first
    ``int(len(files) * valid_ratio)`` of them to
    ``new_data_dir/valid/<label>/`` and the rest to
    ``new_data_dir/train/<label>/``. The source files are left in place.

    Args:
        data_dir: Root directory of the original data set.
        label_file: Name of the CSV file inside ``data_dir``; each row must
            be an ``(image id, label)`` pair.
        train_dir: Name of the sub-directory of ``data_dir`` that holds the
            training images.
        test_dir: Unused; kept so existing callers keep working.
        new_data_dir: Root directory that receives the reorganised copy.
        valid_ratio: Fraction of the training files to place in ``valid``.

    Raises:
        KeyError: If a file's id (the text before its first ``.``) has no
            row in the label file.
    """
    # Map image id -> label.
    labels = pd.read_csv(os.path.join(data_dir, label_file))
    id2label = {img_id: label for img_id, label in labels.values}

    # os.listdir returns entries in arbitrary order, so sort first; otherwise
    # the shuffle below would not be reproducible for a fixed random seed.
    src_dir = os.path.join(data_dir, train_dir)
    train_files = sorted(os.listdir(src_dir))
    random.shuffle(train_files)

    # The first `valid_ds_size` shuffled files form the validation set.
    valid_ds_size = int(len(train_files) * valid_ratio)
    for i, filename in enumerate(train_files):
        img_id = filename.split('.')[0]  # file name up to the first dot
        img_label = id2label[img_id]
        subset = 'valid' if i < valid_ds_size else 'train'
        dest_dir = os.path.join(new_data_dir, subset, img_label)
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(os.path.join(src_dir, filename), dest_dir)
        
# Build the train/valid directory tree from the original data set.
reorg_dog_data(
    data_dir=data_dir,
    label_file=label_file,
    train_dir=train_dir,
    test_dir=test_dir,
    new_data_dir=new_data_dir,
    valid_ratio=valid_ratio,
)