In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader

# 加载数据

## Relevant Definition

- Epoch: One forward and backward pass of the entire training set.
- Batch-Size: Number of training examples used per step.
- Iteration: Number of steps/batches in an epoch.

## Important Tools

- DataSet: an abstract class used to store your data.
- DataLoader: a class to load data in batches.

In [2]:
# Step.1
class TitanicDataset(Dataset):
    """ Diabetes dataset. """
    def __init__(self, filepath) -> None:
        dataset = np.loadtxt(filepath, delimiter=',', dtype=np.float32, skiprows=1)
        self.len = dataset.shape[0]
        self.x_data = torch.from_numpy(dataset[:, 0:8])
        self.y_data = torch.from_numpy(dataset[:, [8]])
    
    def __getitem__(self, index) -> None:
        return self.x_data[index], self.y_data[index]
    
    def __len__(self) -> int:
        return self.len

dataset = TitanicDataset('./dataset/diabetes/diabetes.csv')
train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True, num_workers=0) # Set num_worker to 0, not using the multi-processing on Windows.

# Step.2
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(8, 6)
        self.linear2 = torch.nn.Linear(6, 4)
        self.linear3 = torch.nn.Linear(4, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.sigmoid(self.linear1(x))
        x = self.sigmoid(self.linear2(x))
        x = self.sigmoid(self.linear3(x))
        return x
    
model = Model()

# Step.3
criterion = torch.nn.BCELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Step.4
for epoch in range(1000):
    print(f'\nEpoch {epoch+1} training...')
    for i, (inputs, labels) in enumerate(train_loader):
        y_pred = model(inputs)
        loss = criterion(y_pred, labels)
        print(f'\tStep {i+1}, Loss {loss.item():.4f}')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


Epoch 1 training...
	Step 1, Loss 0.7043
	Step 2, Loss 0.6922
	Step 3, Loss 0.6896
	Step 4, Loss 0.6761
	Step 5, Loss 0.6756
	Step 6, Loss 0.6715
	Step 7, Loss 0.6657
	Step 8, Loss 0.6622
	Step 9, Loss 0.6425
	Step 10, Loss 0.6950
	Step 11, Loss 0.6670
	Step 12, Loss 0.6775
	Step 13, Loss 0.6655
	Step 14, Loss 0.6204
	Step 15, Loss 0.6572
	Step 16, Loss 0.6445
	Step 17, Loss 0.6388
	Step 18, Loss 0.6458
	Step 19, Loss 0.6297
	Step 20, Loss 0.6584
	Step 21, Loss 0.6771
	Step 22, Loss 0.6422
	Step 23, Loss 0.6783
	Step 24, Loss 0.6229

Epoch 2 training...
	Step 1, Loss 0.6132
	Step 2, Loss 0.6440
	Step 3, Loss 0.5798
	Step 4, Loss 0.6730
	Step 5, Loss 0.6251
	Step 6, Loss 0.6288
	Step 7, Loss 0.6933
	Step 8, Loss 0.5911
	Step 9, Loss 0.5668
	Step 10, Loss 0.5633
	Step 11, Loss 0.6752
	Step 12, Loss 0.6126
	Step 13, Loss 0.6588
	Step 14, Loss 0.7141
	Step 15, Loss 0.6975
	Step 16, Loss 0.6220
	Step 17, Loss 0.7708
	Step 18, Loss 0.6907
	Step 19, Loss 0.6371
	Step 20, Loss 0.6525
	Step 21

## 作业

- 完成Kaggle上的[Titanic数据集](https://www.kaggle.com/c/titanic/data)的机器学习模型训练

In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Step.1
class TitanicDataset(Dataset):
    """ Titanic dataset. """
    def __init__(self, filepath) -> None:
        dataset = pd.read_csv(filepath, na_values=0.0)
        # 将 sex 列中的 male 替换成 0，female 替换成 1
        dataset['Sex'] = dataset['Sex'].replace({'male': 0, 'female': 1})

        # 创建一个 LabelEncoder 对象
        le = LabelEncoder()
        # 对 Name 列进行编码，把 string 转换成整数
        for col in 'Name', 'Cabin', 'Embarked', 'Ticket':
            dataset[col] = le.fit_transform(dataset[col])
        
        dataset = dataset.to_numpy(dtype=np.float32)

        self.len = dataset.shape[0]
        self.x_data = torch.from_numpy(dataset[:, [i for i in range(dataset.shape[1]) if i != 1]])
        self.y_data = torch.from_numpy(dataset[:, [1]])
    
    def __getitem__(self, index) -> None:
        return self.x_data[index], self.y_data[index]
    
    def __len__(self) -> int:
        return self.len

dataset = TitanicDataset('./dataset/titanic/train.csv')
train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True, num_workers=0) # Set num_worker to 0, not using the multi-processing on Windows.

# Step.2
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(11, 8)
        self.linear2 = torch.nn.Linear(8, 6)
        self.linear3 = torch.nn.Linear(6, 3)
        self.linear4 = torch.nn.Linear(3, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.linear3(x)
        x = self.sigmoid(self.linear4(x))
        return x
    
model = Model()

# Step.3
criterion = torch.nn.BCELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Step.4
for epoch in range(1000):
    print(f'\nEpoch {epoch+1} training...')
    for i, (inputs, labels) in enumerate(train_loader):
        y_pred = model(inputs)
        y_pred = torch.clamp(y_pred, min=0, max=1)
        loss = criterion(y_pred, labels)
        print(f'\tStep {i+1}, Loss {loss.item():.4f}')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


Epoch 1 training...


RuntimeError: all elements of input should be between 0 and 1