## 本地数据集

In [19]:
import torch
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# A dataset stored in a local file has to be wrapped in a custom Dataset subclass.
class DiabetesDataset(Dataset):
    """Map-style dataset backed by a comma-separated numeric text file.

    Each row of the file is one sample: every column except the last is a
    feature and the last column is the target.  The whole file is loaded into
    memory in ``__init__``; that is fine for small structured data, whereas
    large unstructured data (images, audio) should not be loaded all at once.
    """

    def __init__(self, filepath) -> None:
        """Read ``filepath`` and split it into feature and target tensors.

        Args:
            filepath: Source handed to ``np.loadtxt``; parsed as
                comma-delimited ``float32`` values.
        """
        # ndmin=2 keeps the array two-dimensional even when the file holds a
        # single row.  Without it np.loadtxt returns a 1-D array, so shape[0]
        # would report the column count and the 2-D slicing below would fail.
        xy = np.loadtxt(filepath, delimiter=',', dtype=np.float32, ndmin=2)
        self.len = xy.shape[0]  # number of samples (rows)
        # Features: every column except the last.
        self.x_data = torch.from_numpy(xy[:, :-1])
        # Indexing with the list [-1] keeps the column axis, so the targets
        # have shape (N, 1) instead of (N,).
        self.y_data = torch.from_numpy(xy[:, [-1]])

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair at ``index``.

        Called for ``dataset[index]``.
        """
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples; called for ``len(dataset)``."""
        return self.len


# Load the CSV into an in-memory dataset, then wrap it in a mini-batch loader.
dataset = DiabetesDataset('../data/diabetes/diabetes.csv')
train_loader = DataLoader(
    dataset=dataset,   # the data source to draw samples from
    batch_size=32,     # samples per mini-batch
    shuffle=True,      # shuffle the sample order
    num_workers=0,     # number of worker processes used to read the data
)
# Bare expression so the notebook displays the dataset object's repr.
dataset

<__main__.DiabetesDataset at 0x2398de2a4d0>

In [20]:
# 759 samples / batch size 32 = 23.7, so the loader yields 24 batches; the last batch holds the remaining 23 samples (759 - 23 * 32).
# for i, data in enumerate(train_loader):
    # print(i, len(data), len(data[0]))


## 以下为分类训练示例(效果貌似一般)

In [21]:
class Model(torch.nn.Module):
    """Fully connected network 8 -> 6 -> 4 -> 1 with a sigmoid after every layer."""

    def __init__(self) -> None:
        super().__init__()
        # Three linear layers that narrow the feature dimension step by step.
        self.linear1 = torch.nn.Linear(in_features=8, out_features=6)
        self.linear2 = torch.nn.Linear(in_features=6, out_features=4)
        self.linear3 = torch.nn.Linear(in_features=4, out_features=1)
        # One shared non-linearity.  torch.nn.Sigmoid is a module with no
        # parameters; it is not the same object as the functional form
        # torch.nn.functional.sigmoid().
        # If the hidden activations are switched to ReLU, the final layer
        # should still go through Sigmoid so the output stays in (0, 1).
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Apply linear1, linear2 and linear3 in turn, each followed by the sigmoid."""
        # For a purely sequential model a single running variable avoids
        # wiring the wrong intermediate tensor into the next layer.
        for layer in (self.linear1, self.linear2, self.linear3):
            x = self.sigmoid(layer(x))
        return x

# Instantiate the classifier; its parameters are handed to the optimizer below.
model = Model()

# Binary cross-entropy summed over the batch (not averaged).
# reduction='sum' is the supported spelling of the deprecated
# size_average=False argument and yields the same loss values without the
# deprecation warning.
criterion = torch.nn.BCELoss(reduction='sum')

# Plain stochastic gradient descent over all model parameters, learning rate 0.01.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

In [22]:
# training cycle

for epoch in range(500):
    # One pass over the loader per epoch; i is the zero-based batch index.
    for i, data in enumerate(train_loader):
        # Each item is one mini-batch, unpacked into features and targets.
        inputs, labels = data

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss for this mini-batch.
        y_pred = model(inputs)
        loss = criterion(y_pred, labels)
        print('epoch:',epoch,'iteration:', i, loss.item())

        # Backward pass, then update the parameters.
        loss.backward()
        optimizer.step()

epoch: 0 iteration: 0 20.691707611083984
epoch: 0 iteration: 1 23.10970687866211
epoch: 0 iteration: 2 21.023141860961914
epoch: 0 iteration: 3 20.578542709350586
epoch: 0 iteration: 4 21.21695327758789
epoch: 0 iteration: 5 19.967792510986328
epoch: 0 iteration: 6 19.278419494628906
epoch: 0 iteration: 7 23.358896255493164
epoch: 0 iteration: 8 18.802093505859375
epoch: 0 iteration: 9 20.622520446777344
epoch: 0 iteration: 10 19.47469711303711
epoch: 0 iteration: 11 21.820911407470703
epoch: 0 iteration: 12 21.194232940673828
epoch: 0 iteration: 13 19.4700927734375
epoch: 0 iteration: 14 21.219255447387695
epoch: 0 iteration: 15 21.81133460998535
epoch: 0 iteration: 16 17.743412017822266
epoch: 0 iteration: 17 22.594655990600586
epoch: 0 iteration: 18 19.375568389892578
epoch: 0 iteration: 19 21.2530460357666
epoch: 0 iteration: 20 19.963092803955078
epoch: 0 iteration: 21 20.596799850463867
epoch: 0 iteration: 22 23.199752807617188
epoch: 0 iteration: 23 14.284022331237793
epoch: 1 i