In [97]:
import torch
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


class DiabetesDataset(Dataset):
    """Map-style dataset backed by a comma-separated numeric file.

    Every row is one sample: all columns except the last are the input
    features and the last column is the target.
    """

    def __init__(self, filepath) -> None:
        """Load the whole file into memory and split it into features/targets.

        Loading everything up front is reasonable for a small, structured
        dataset such as this one. Large unstructured data (images, audio)
        usually cannot be held in memory at once and should be read lazily
        in ``__getitem__`` instead.

        Args:
            filepath: Path to a comma-delimited text file of numbers.
        """
        # ndmin=2 keeps the array two-dimensional even when the file holds a
        # single row; without it np.loadtxt returns a 1-D array and the
        # column slicing below raises IndexError.
        xy = np.loadtxt(filepath, delimiter=',', dtype=np.float32, ndmin=2)
        self.len = xy.shape[0]  # number of samples (rows)
        self.x_data = torch.from_numpy(xy[:, :-1])   # every column but the last
        self.y_data = torch.from_numpy(xy[:, [-1]])  # last column, kept 2-D: (rows, 1)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair selected by ``dataset[index]``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples; called by ``len(dataset)``."""
        return self.len


# Instantiate the custom dataset from the CSV file.
dataset = DiabetesDataset('../PyTorch深度学习实践/diabetes.csv/diabetes.csv')

# Wrap the dataset in a mini-batch loader:
#   batch_size  - samples per mini-batch
#   shuffle     - reshuffle the samples every epoch
#   num_workers - worker processes used to load data in parallel
#                 (0 loads in the main process)
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0,
)
dataset

<__main__.DiabetesDataset at 0x22a49f13310>

In [98]:
# 759 / 32 ≈ 23.7, so the loader yields 24 batches; the last batch holds the remaining 23 samples
# for i, data in enumerate(train_loader):
    # print(i, len(data), len(data[0]))


In [99]:
class Model(torch.nn.Module):
    """Fully connected network 8 -> 6 -> 4 -> 1 with a sigmoid after every layer.

    Because the last activation is a sigmoid, each output lies in (0, 1).
    """

    def __init__(self) -> None:
        super().__init__()
        # Three linear (affine) layers, narrowing from 8 input features to 1 output.
        self.linear1 = torch.nn.Linear(in_features=8, out_features=6)
        self.linear2 = torch.nn.Linear(in_features=6, out_features=4)
        self.linear3 = torch.nn.Linear(in_features=4, out_features=1)
        # One shared non-linearity. nn.Sigmoid is a parameter-free module, as
        # opposed to the functional form torch.nn.functional.sigmoid().
        # Note: if the hidden activations are switched to ReLU, the final
        # layer in forward() should still use Sigmoid.
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Apply linear1, linear2 and linear3 in sequence, each followed by the sigmoid."""
        hidden1 = self.sigmoid(self.linear1(x))
        hidden2 = self.sigmoid(self.linear2(hidden1))
        return self.sigmoid(self.linear3(hidden2))

model = Model()

# Binary cross-entropy, summed (not averaged) over the samples of a batch.
# reduction='sum' is the supported spelling of the deprecated
# size_average=False and yields the same loss value without the warning.
criterion = torch.nn.BCELoss(reduction='sum')

# Plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

引入mini-batch之后，训练时需要两个循环：<br>
外层循环为遍历total_epoch,每次迭代执行一个epoch<br>
每个内循环为遍历total_batch，每次迭代执行一个mini-batch

In [100]:
# Training loop: the outer loop runs over epochs, the inner loop over the
# mini-batches produced by train_loader.

for epoch in range(1000):
    for i, data in enumerate(train_loader):  # batch index i starts at 0
        # Unpack the i-th mini-batch into features and targets.
        inputs, labels = data
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Forward pass: predictions and loss for this batch.
        y_pred = model(inputs)
        loss = criterion(y_pred, labels)
        print('epoch:',epoch,'iteration:', i, loss.item())
        # Backward pass: compute gradients of the loss.
        loss.backward()
        # Update the parameters.
        optimizer.step()

epoch: 0 iteration: 0 24.23246192932129
epoch: 0 iteration: 1 22.53582191467285
epoch: 0 iteration: 2 21.82805633544922
epoch: 0 iteration: 3 21.52286148071289
epoch: 0 iteration: 4 20.919841766357422
epoch: 0 iteration: 5 22.611297607421875
epoch: 0 iteration: 6 20.538747787475586
epoch: 0 iteration: 7 20.85770606994629
epoch: 0 iteration: 8 19.10392189025879
epoch: 0 iteration: 9 21.709867477416992
epoch: 0 iteration: 10 20.650543212890625
epoch: 0 iteration: 11 20.648658752441406
epoch: 0 iteration: 12 21.719900131225586
epoch: 0 iteration: 13 19.07552719116211
epoch: 0 iteration: 14 21.80670166015625
epoch: 0 iteration: 15 22.323516845703125
epoch: 0 iteration: 16 21.163497924804688
epoch: 0 iteration: 17 21.172016143798828
epoch: 0 iteration: 18 20.650936126708984
epoch: 0 iteration: 19 21.17947769165039
epoch: 0 iteration: 20 21.165775299072266
epoch: 0 iteration: 21 19.064706802368164
epoch: 0 iteration: 22 20.592077255249023
epoch: 0 iteration: 23 15.469459533691406
epoch: 1 it