In [128]:
import pandas as pd
# Read the breast-cancer table from disk.
df = pd.read_csv('breast_cancer_data/breast_cancer.csv', index_col=False)
print('df=', df)
# Every column except the last one is a feature; the last column is the label.
X = df.loc[:, df.columns[:-1]].to_numpy()
y = df.loc[:, df.columns[-1]].to_numpy()
print(X.shape, y.shape)


df=      mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0         17.990         10.38          122.80     1001.0          0.11840   
1         20.570         17.77          132.90     1326.0          0.08474   
2         19.690         21.25          130.00     1203.0          0.10960   
3         11.420         20.38           77.58      386.1          0.14250   
4         20.290         14.34          135.10     1297.0          0.10030   
5         12.450         15.70           82.57      477.1          0.12780   
6         18.250         19.98          119.60     1040.0          0.09463   
7         13.710         20.83           90.20      577.9          0.11890   
8         13.000         21.82           87.50      519.8          0.12730   
9         12.460         24.04           83.97      475.9          0.11860   
10        16.020         23.24          102.70      797.8          0.08206   
11        15.780         17.89          103.60      781.0   

[569 rows x 31 columns]
(569, 30) (569,)


In [4]:
from sklearn.model_selection import train_test_split
# Randomly split the dataset: 80% for training, 20% for testing.
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1234, test_size=0.2)
# Display the shapes of the four resulting arrays.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((455, 30), (455,), (114, 30), (114,))

In [5]:
from sklearn.preprocessing import StandardScaler
# Standardize the features only; the labels are not standardized because
# they only take the values 0 and 1.
sc = StandardScaler()
# Learn the scaling statistics from the training split alone, then apply the
# same fitted scaler to both splits.
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)
X_train

array([[-0.36180827, -0.26521011, -0.31715702, ..., -0.07967528,
        -0.52798733,  0.2506337 ],
       [-0.8632675 ,  0.71560604, -0.85646012, ..., -0.76980239,
         0.44312729, -0.20987332],
       [-0.4334453 ,  0.32513895, -0.41286667, ..., -0.06601541,
        -1.1169427 ,  0.0329492 ],
       ...,
       [-0.479293  , -0.17689018, -0.45697634, ..., -0.20261414,
         0.18670009,  0.17414996],
       [ 1.16835876, -0.15364809,  1.17466524, ...,  0.26789258,
         0.19828067, -0.23394164],
       [-0.40765597, -1.29715887, -0.42826344, ..., -0.78042674,
        -0.88036793, -0.80355834]])

In [6]:
import torch
import numpy as np

# Convert the feature matrices and label vectors to float32 tensors.
# torch.as_tensor accepts NumPy arrays as well as tensors, so this cell can be
# re-run safely: calling `.astype(...)` on names that were already rebound to
# tensors would raise AttributeError on a second execution.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.float32)
y_test = torch.as_tensor(y_test, dtype=torch.float32)

# Give the labels an explicit column dimension, (n,) -> (n, 1); feeding 1-D
# labels to the model/loss may otherwise fail. reshape(-1, 1) leaves labels
# that are already (n, 1) unchanged.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
X_train.size(), y_train.size()

(torch.Size([455, 30]), torch.Size([455, 1]))

In [7]:
import torch.nn as nn
# Our model is a linear function followed by an activation function, i.e. a nonlinear model:
# model(x) = sigmoid(w*x + b)


class Model(nn.Module):
    """Logistic-regression style classifier: one linear layer followed by a sigmoid.

    Args:
        n_input_features: number of input features per sample.
    """

    def __init__(self, n_input_features):
        super().__init__()
        # Single affine layer mapping all input features to one output value.
        self.linear = nn.Linear(n_input_features, 1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        logits = self.linear(x)
        # torch ships a ready-made sigmoid; it squashes the output into (0, 1).
        return torch.sigmoid(logits)


# Number of samples and number of features, read from the full feature matrix.
n_samples, n_features = X.shape
# Seed PyTorch before building the model so the randomly initialized weights
# of the linear layer (and therefore the training results) are reproducible,
# matching the fixed random_state used for the train/test split.
torch.manual_seed(1234)
# Initialize the model with one input per feature.
model = Model(n_features)
model

Model(
  (linear): Linear(in_features=30, out_features=1, bias=True)
)

In [8]:
# Loss function and optimizer setup.
# Training hyper-parameters: number of epochs and the SGD learning rate.
num_epochs, learning_rate = 100, 0.01
# Binary cross-entropy loss, applied to the sigmoid output of the model.
criterion = nn.BCELoss()
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)
(criterion, optimizer)

(BCELoss(), SGD (
 Parameter Group 0
     dampening: 0
     lr: 0.01
     momentum: 0
     nesterov: False
     weight_decay: 0
 ))

In [120]:
# Full-batch gradient descent: every epoch runs one forward/backward pass over
# the entire training set.
for epoch in range(num_epochs):
    # Clear gradients BEFORE the backward pass so that anything left over from
    # an interrupted or out-of-order cell execution cannot leak into this step.
    optimizer.zero_grad()

    # Forward pass and loss computation.
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    # Backward pass, then parameter update.
    loss.backward()
    optimizer.step()

    # Report the loss every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')
print("模型训练完毕！！")

epoch: 10, loss = 0.0950
epoch: 20, loss = 0.0946
epoch: 30, loss = 0.0942
epoch: 40, loss = 0.0938
epoch: 50, loss = 0.0934
epoch: 60, loss = 0.0930
epoch: 70, loss = 0.0926
epoch: 80, loss = 0.0923
epoch: 90, loss = 0.0919
epoch: 100, loss = 0.0915
模型训练完毕！！


In [122]:
# NOTE(review): `index` is not used by any visible cell; it is kept only so the
# notebook namespace stays unchanged. Remove it if nothing else relies on it.
index = np.random.randint(0, len(X_test))

# Per-sample accuracy on the test set.
acc = 0
# Inference only: disable autograd so no computation graph is built for each
# sample and the outputs can be read without detach().
with torch.no_grad():
    for i in range(len(X_test)):
        y_predicted = model(X_test[i])
        # Round the probability to the nearest class label (0 or 1).
        y_predicted_cls = y_predicted.round()

        # Extract plain Python scalars for the comparison.
        real = y_test[i].item()
        predict = y_predicted_cls.item()
        if real == predict:
            acc += 1
        # Uncomment to inspect individual predictions:
        #print("第 {} 条测试数据的真实结果为 {} ，预测结果为 {} ".format(i, real, predict))
# Turn the count of correct predictions into a fraction of the test set.
acc = acc / len(X_test)
print('accuracy: ', acc)

accuracy:  0.9122807017543859


In [116]:
# Vectorized accuracy over the whole test set; autograd is disabled because
# this is inference only.
with torch.no_grad():
    y_predicted = model(X_test)
    # Round the predicted probabilities to class labels.
    y_predicted_cls = torch.round(y_predicted)
    # Fraction of predictions that equal the true labels.
    acc = torch.eq(y_predicted_cls, y_test).sum().numpy() / float(y_test.size(0))
    print(f'accuracy: {acc.item():.4f}')

accuracy: 0.9035
