## DataLoader与Dataset

**DataLoader的基本结构**

- 功能

    数据集读取核心

- Sampler 

    生成数据索引
    
- Dataset

    读取Image和对应的Label

    DataLoader(dataset,batch_size=1,shuffle=False,sampler=None,batch_sampler=None,num_workers=0,collate_fn=None,pin_memory=False,drop_last=False,timeout=0,worker_init_fn=None,multiprocessing_context=None)
    
    
- 功能

    构建可迭代的数据装载器

- dataset

    Dataset类，决定数据从哪读取及如何读取

- batch_size

    批量大小

- num_workers

    读取数据所用的子进程数量（0表示只在主进程中读取）

- shuffle

    每个epoch是否乱序
    
- drop_last

    当样本量不能被batchsize整除时，是否丢弃最后一个不完整的batch

**Epoch、Iteration和Batchsize的关系**

样本总数：80 

Batchsize：8

Iteration：10

- 所以1个epoch包含 80 / 8 = 10 个Iteration

**torch.utils.data.Dataset**

- 功能

    Dataset抽象类，所有自定义的Dataset需要继承它，并且复写\__getitem\__()

- getitem:

    接受一个索引，返回一个样本
    
        class Dataset(object):
        
            def __getitem__(self,index):
                raise NotImplementedError
            
            def __add__(self,other):
                return ConcatDataset([self,other])
                

**路径拼接函数**

    os.path.join(path1,path2)
    
- path

    路径，格式为'C:/yyy/yyy_data/'或者'abc'类型

In [34]:
# ===============================  example 1 ===============================
# Build a path with os.path.join()

diry=os.path.join('E:/Jupyter/','pytorch')
print(diry)

E:/Jupyter/pytorch


    for root,dirs,files in os.walk(path)

- path

    文件路径
    
- root

    是当前正在遍历的这个文件夹的本身的地址

- dirs

    是一个 list ，内容是该文件夹中所有的目录的名字(不包括子目录)

- files

     同样是 list , 内容是该文件夹中所有的文件(不包括子目录)

    os.listdir(path)

- 功能

    用于返回指定的文件夹包含的文件或文件夹的名字的列表

    filter(function, iterable)

- 功能

    用于过滤序列，过滤掉不符合条件的元素

## 数据集划分

In [59]:
import os
import random
import shutil


def makedir(new_dir):
    """Create the directory ``new_dir``, including any missing parents.

    Calling this on a directory that already exists is a no-op.
    ``exist_ok=True`` replaces the earlier "check, then create" sequence,
    which could fail if the directory appeared between the two steps.

    :param new_dir: str, path of the directory to create
    :raises FileExistsError: if ``new_dir`` exists but is not a directory
    """
    os.makedirs(new_dir, exist_ok=True)


In [60]:
# Source dataset directory and the root directory for the split output.
# Raw strings keep the leading backslash literal: "\J" is not a valid escape
# sequence and triggers a warning on recent Python versions.
dataset_dir = os.path.join("E:", r"\Jupyter", "pytorch_example", "data",
                           "RMB_data")
split_dir = os.path.join("E:", r"\Jupyter", "pytorch_example", "data",
                         "rmb_split")

train_dir = os.path.join(split_dir, "train")
valid_dir = os.path.join(split_dir, "valid")
test_dir = os.path.join(split_dir, "test")

# Fractions of each class assigned to the train / validation / test splits.
# The test split receives whatever remains after train and validation.
train_pct = 0.8
valid_pct = 0.1
test_pct = 0.1
print(dataset_dir)

# Walk the dataset; every sub-directory is treated as one class folder.
for root, dirs, files in os.walk(dataset_dir):
    print("-------------------------")
    print(root, dirs, files)
    for sub_dir in dirs:
        print("---------------------------")
        print(sub_dir)
        print("---------------------------")
        class_dir = os.path.join(root, sub_dir)
        imgs = os.listdir(class_dir)
        print(imgs)
        # Keep only the .jpg files of this class.
        imgs = [name for name in imgs if name.endswith('.jpg')]
        print("---------------------------")
        print(imgs)
        # Shuffle so the split boundaries below select random subsets.
        random.shuffle(imgs)
        img_count = len(imgs)

        # Indices below train_point go to train, below valid_point to
        # validation, and the rest to test.
        train_point = int(img_count * train_pct)
        valid_point = int(img_count * (train_pct + valid_pct))

        for i, img_name in enumerate(imgs):
            if i < train_point:
                out_dir = os.path.join(train_dir, sub_dir)
            elif i < valid_point:
                out_dir = os.path.join(valid_dir, sub_dir)
            else:
                out_dir = os.path.join(test_dir, sub_dir)

            makedir(out_dir)

            target_path = os.path.join(out_dir, img_name)
            # Build the source path from the directory that was actually
            # listed, so it stays valid when os.walk descends below the
            # top level of the dataset.
            src_path = os.path.join(class_dir, img_name)

            shutil.copy(src_path, target_path)

        print('Class:{}, train:{}, valid:{}, test:{}'.format(
            sub_dir, train_point, valid_point - train_point,
            img_count - valid_point))

E:\Jupyter\pytorch_example\data\RMB_data
-------------------------
E:\Jupyter\pytorch_example\data\RMB_data ['1', '100'] []
---------------------------
1
---------------------------
['01B68AKT.jpg', '01EIM65B.jpg', '01LNYXO4.jpg', '01MF2W5S.jpg', '01NISKCG.jpg', '02C4V1SW.jpg', '03WGM2XG.jpg', '03WV6GFZ.jpg', '049I6MVB.jpg', '04A32I57.jpg', '04MGL637.jpg', '04QE2KHA.jpg', '04QGLB16.jpg', '04RWK2B5.jpg', '04VRAHK2.jpg', '04YVW9CN.jpg', '059GS728.jpg', '05MLGSGI.jpg', '05MO1N93.jpg', '067TZA8C.jpg', '069N3OK2.jpg', '073LW92O.jpg', '07GVXBMG.jpg', '07HTXU3W.jpg', '07IUEGQX.jpg', '07PVUGTB.jpg', '07R6PKIX.jpg', '08596RNG.jpg', '08C3EHPG.jpg', '09F2SGOT.jpg', '09PUM1HY.jpg', '0B89KOA3.jpg', '0BGHNV6P.jpg', '0BRO7XVG.jpg', '0C4UDH9S.jpg', '0CNU427V.jpg', '0CTO7MER.jpg', '0D29EFZO.jpg', '0D6HCAXL.jpg', '0D73KYGN.jpg', '0DLF8NU6.jpg', '0DLW9G7O.jpg', '0DRZXTK3.jpg', '0E4QRCTS.jpg', '0E5Q62TM.jpg', '0E6AGCOW.jpg', '0EBSK2GF.jpg', '0EMSWVIR.jpg', '0EP9R4N8.jpg', '0EZ7ND18.jpg', '0F9CEKGH.jpg', '

## 数据读取

In [61]:
# Parameter settings
MAX_EPOCH = 10  # presumably the number of training epochs; usage not shown in this section
BATCH_SIZE = 16  # passed as batch_size to the DataLoaders built further down
LR = 0.01  # presumably the learning rate; usage not shown in this section
log_interval = 10  # presumably a logging interval; usage not shown in this section
val_interval = 1  # presumably a validation interval; usage not shown in this section

In [62]:
'''
数据集读取
'''
import os
import random
from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader


random.seed(1)  # fix the seed of Python's random module
rmb_label = {"1": 0, "100": 1}  # class folder name -> integer label


class RMBDataset(Dataset):
    """Dataset for the RMB denomination classification task.

    Samples are discovered on disk: every sub-directory found while walking
    ``data_dir`` is treated as a class folder, and every ``.jpg`` file inside
    it becomes one ``(image_path, label)`` entry.
    """

    # Maps a class folder name to its integer label. Kept on the class so
    # that get_img_info() does not depend on a module-level global.
    label_name = {"1": 0, "100": 1}

    def __init__(self, data_dir, transform=None):
        """
        :param data_dir: str, root directory of the dataset
        :param transform: optional callable applied to each loaded image
        """
        # List of (image_path, label) tuples, indexed by __getitem__.
        self.data_info = self.get_img_info(data_dir)
        self.transform = transform

    def __getitem__(self, index):
        """Load the sample at ``index`` and return ``(img, label)``."""
        path_img, label = self.data_info[index]
        # convert() returns a new, fully loaded image, so the underlying
        # file can be closed as soon as the with-block exits.
        with Image.open(path_img) as raw_img:
            img = raw_img.convert('RGB')

        if self.transform is not None:
            img = self.transform(img)  # e.g. conversion to a tensor

        return img, label

    def __len__(self):
        """Return the number of samples found under ``data_dir``."""
        return len(self.data_info)

    @staticmethod
    def get_img_info(data_dir):
        """Collect ``(image_path, label)`` pairs for all .jpg files.

        :param data_dir: str, root directory of the dataset
        :return: list of ``(path, int_label)`` tuples
        :raises KeyError: if a sub-directory name is not in ``label_name``
        """
        data_info = []
        for root, dirs, _ in os.walk(data_dir):
            # Each sub-directory is one class.
            for sub_dir in dirs:
                label = RMBDataset.label_name[sub_dir]
                class_dir = os.path.join(root, sub_dir)
                # Each .jpg file in the class folder is one sample.
                for img_name in os.listdir(class_dir):
                    if img_name.endswith('.jpg'):
                        data_info.append(
                            (os.path.join(class_dir, img_name), label))

        return data_info

In [63]:
# Image preprocessing: convert each image to a tensor, then normalize it
# per channel with the mean / std values below.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

train_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(norm_mean, norm_std)]
)
valid_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(norm_mean, norm_std)]
)

# Build the RMBDataset instances for the train and validation splits.
train_data = RMBDataset(train_dir, transform=train_transform)
valid_data = RMBDataset(valid_dir, transform=valid_transform)

# Build the DataLoaders; only the training loader shuffles its samples.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)

In [73]:
# Print only the first batch produced by the training loader.
for X,y in train_loader:
    # X holds the batched images and y the matching labels, as returned by
    # RMBDataset.__getitem__.
    print(X)
    print("------------------")
    print(y)
    break

tensor([[[[-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179,  0.4508,  0.4508,  ...,  1.0844,  1.2214,  1.3584],
          ...,
          [-2.1179,  0.9303,  0.8961,  ...,  1.6667,  1.7180,  1.7009],
          [-2.1179,  0.9474,  0.9474,  ...,  1.6324,  1.6838,  1.7009],
          [-2.1179,  0.9646,  0.9646,  ...,  1.6495,  1.7009,  1.7180]],

         [[-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357,  0.6779,  0.6779,  ...,  1.4132,  1.5532,  1.6758],
          ...,
          [-2.0357,  1.1856,  1.1681,  ...,  1.8508,  1.9384,  2.0259],
          [-2.0357,  1.2556,  1.2381,  ...,  1.9734,  2.0084,  2.0434],
          [-2.0357,  1.2906,  1.2556,  ...,  2.0084,  2.0434,  2.0609]],

         [[-1.8044, -1.8044, -1.8044,  ..., -1.8044, -1.8044, -1.8044],
          [-1.8044, -1.8044, -