# 二分类问题-HR数据集

In [17]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 数据处理

In [18]:
# Load the HR dataset from the local CSV file and preview the first rows.
csv_path = './dataset/HR.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,part,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [19]:
# Basic information about the dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
data.info()
# Distinct values of the two categorical columns.
print(data.part.unique())
print(data.salary.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   part                   14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
None
['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'IT'
 'product_mng' 'marketing' 'RandD']
['low' 'medium' 'high']


In [20]:
# Preprocessing
# Optional quick look at the category combinations:
# data.groupby(['salary', 'part']).size()
# pd.get_dummies() turns a categorical column into one-hot indicator columns.
part_onehot = pd.get_dummies(data.part).astype(int)
salary_onehot = pd.get_dummies(data.salary).astype(int)
# Append the indicator columns (department first, then salary) ...
data = pd.concat([data, part_onehot, salary_onehot], axis=1)
# ... and remove the original categorical columns.
data = data.drop(columns=['part', 'salary'])
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,IT,RandD,...,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [21]:
# Print the column summary again to inspect the frame after one-hot encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   IT                     14999 non-null  int32  
 9   RandD                  14999 non-null  int32  
 10  accounting             14999 non-null  int32  
 11  hr                     14999 non-null  int32  
 12  management             14999 non-null  int32  
 13  marketing              14999 non-null  int32  
 14  product_mng            14999 non-null  int32  
 15  sa

In [22]:
# Class balance of the target column 'left'.
left_counts = data.left.value_counts()
print(left_counts)
# Baseline: the accuracy obtained by always predicting class 0 (did not leave).
print(left_counts[0] / left_counts.sum())

left
0    11428
1     3571
Name: count, dtype: int64
0.7619174611640777


In [23]:
# Target: the 'left' column reshaped to a single column, then a float tensor.
Y_data = data['left'].values.reshape(-1, 1)
Y = torch.from_numpy(Y_data).float()
print(Y.shape)

torch.Size([14999, 1])


In [24]:
# Features: every column except the target 'left'.
# The column names are collected with a list comprehension, and .values
# converts the selected DataFrame into a NumPy array.
feature_cols = [col for col in data.columns if col != 'left']
X_data = data[feature_cols].values
X = torch.from_numpy(X_data).float()
print(X.shape)

torch.Size([14999, 20])


## 创建模型

In [25]:
# Earlier module-style draft of the model, kept commented out for reference.
# from torch import nn
# # Custom model (called "Logistic" in this notebook)
# class Logistic(nn.Module):  # subclass nn.Module
#     def __init__(self):     # create all layers
#         super().__init__()  # run the parent-class initializer
#         self.lin_1 = nn.Linear(20, 64)  # first linear layer: 20 inputs -> 64 outputs
#         self.lin_2 = nn.Linear(64, 64)  # second linear layer: 64 inputs -> 64 outputs
#         self.lin_3 = nn.Linear(64, 1)   # third linear layer: 64 inputs -> 1 output
#         self.activate = nn.ReLU()       # ReLU activation
#         self.sigmoid = nn.Sigmoid()     # Sigmoid activation
#     def forward(self, input):   # forward pass; overrides nn.Module.forward
#         x = self.lin_1(input)   # feed the input through the first linear layer
#         x = self.activate(x)    # ReLU
#         x = self.lin_2(x)       # second linear layer
#         x = self.activate(x)    # ReLU
#         x = self.lin_3(x)       # third linear layer
#         x = self.sigmoid(x)     # Sigmoid
#         return x

In [26]:
# 模型改写
from torch import nn
import torch.nn.functional as F # 函数式API，调用方便使代码更简洁
class Logistic(nn.Module):
    """Small fully connected binary classifier.

    Despite its name this is not a plain logistic regression: it stacks two
    ReLU hidden layers in front of a single sigmoid output unit, so
    ``forward`` returns one probability in [0, 1] per input row.

    Args:
        in_features: number of input features per sample (default 20, the
            size used in this notebook).
        hidden_features: width of both hidden layers (default 64).
    """
    def __init__(self, in_features=20, hidden_features=64):
        super().__init__()
        self.lin_1 = nn.Linear(in_features, hidden_features)      # input -> hidden
        self.lin_2 = nn.Linear(hidden_features, hidden_features)  # hidden -> hidden
        self.lin_3 = nn.Linear(hidden_features, 1)                # hidden -> output

    def forward(self, input):
        """Map a ``(batch, in_features)`` float tensor to ``(batch, 1)`` probabilities."""
        x = F.relu(self.lin_1(input))
        x = F.relu(self.lin_2(x))
        # torch.sigmoid computes the same value as F.sigmoid, which older
        # PyTorch releases flag with a deprecation warning.
        return torch.sigmoid(self.lin_3(x))

In [27]:
# Factory that bundles model and optimizer creation so each experiment can
# start from a newly constructed pair.
lr = 0.0001
def get_model():
    """Return a new ``Logistic`` model and an Adam optimizer over its parameters.

    The learning rate is read from the module-level ``lr`` at call time.
    """
    net = Logistic()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    return net, optimizer
print(get_model())
model, opt = get_model()

(Logistic(
  (lin_1): Linear(in_features=20, out_features=64, bias=True)
  (lin_2): Linear(in_features=64, out_features=64, bias=True)
  (lin_3): Linear(in_features=64, out_features=1, bias=True)
), Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    weight_decay: 0
))


## 模型训练

In [28]:
# Binary cross-entropy computed on probabilities; the model's forward pass
# already ends with a sigmoid, so its outputs can be passed in directly.
loss_fn = nn.BCELoss()

In [29]:
# Mini-batch configuration for the manual training loops.
batch = 64
# Ceiling division so the final, shorter batch is included. Floor division
# (len(data) // batch) skipped the trailing len(data) % batch rows in every
# epoch; slicing past the end of a tensor simply yields the shorter tail.
no_of_batches = (len(data) + batch - 1) // batch
epochs = 100

### 1. 手动分批次训练

In [None]:
# Train by slicing the tensors into consecutive mini-batches by hand.
for epoch in range(epochs):
    for start in range(0, no_of_batches * batch, batch):
        stop = start + batch
        batch_x, batch_y = X[start:stop], Y[start:stop]

        # Clear gradients left over from the previous step.
        opt.zero_grad()
        # Forward pass and loss (BCELoss expects targets between 0 and 1).
        batch_loss = loss_fn(model(batch_x), batch_y)
        # Backward pass, then parameter update.
        batch_loss.backward()
        opt.step()

    # Full-dataset loss after each epoch; no gradients are needed for reporting.
    with torch.no_grad():
        epoch_loss = loss_fn(model(X), Y).item()
    print('epoch:', epoch, '   ', 'loss:', epoch_loss)

### 2. 使用dataset重构模型训练过程

PyTorch有一个抽象的Dataset类。Dataset可以是任何实现了`__len__`函数和用于索引的`__getitem__`方法的对象。PyTorch的TensorDataset是一个包装张量的Dataset。通过定义长度和索引方式，它为我们提供了沿张量的第一维进行迭代、索引和切片的方法，这使我们在训练时更容易在同一行代码中同时取出自变量和因变量。下面使用TensorDataset把X和Y包装成HRdataset。

In [30]:
from torch.utils.data import TensorDataset
# Wrap features and labels together so that slicing the dataset returns an (x, y) pair.
HRdataset = TensorDataset(X, Y)
# print(HRdataset[2: 5])
# Start over with a newly created model and optimizer.
model, opt = get_model()

In [31]:
# Same loop as the manual version, but each mini-batch comes from one dataset slice.
for epoch in range(epochs):
    for batch_idx in range(no_of_batches):
        lo = batch_idx * batch
        x, y = HRdataset[lo: lo + batch]
        opt.zero_grad()
        batch_loss = loss_fn(model(x), y)
        batch_loss.backward()
        opt.step()
    # Report the loss over the whole dataset once per epoch.
    with torch.no_grad():
        full_loss = loss_fn(model(X), Y).item()
    print('epoch:', epoch, '   ', 'loss:', full_loss)

epoch: 0     loss: 0.6582443118095398
epoch: 1     loss: 0.6570234894752502
epoch: 2     loss: 0.6577951312065125
epoch: 3     loss: 0.6454842686653137
epoch: 4     loss: 0.628150224685669
epoch: 5     loss: 0.6211585998535156
epoch: 6     loss: 0.6205790638923645
epoch: 7     loss: 0.6005281805992126
epoch: 8     loss: 0.5924190878868103
epoch: 9     loss: 0.5849792957305908
epoch: 10     loss: 0.5797553658485413
epoch: 11     loss: 0.5741011500358582
epoch: 12     loss: 0.5700239539146423
epoch: 13     loss: 0.5669781565666199
epoch: 14     loss: 0.5640222430229187
epoch: 15     loss: 0.5618149042129517
epoch: 16     loss: 0.5604045391082764
epoch: 17     loss: 0.5593510270118713
epoch: 18     loss: 0.5652616620063782
epoch: 19     loss: 0.5600892901420593
epoch: 20     loss: 0.5585730671882629
epoch: 21     loss: 0.5585653781890869
epoch: 22     loss: 0.5586372017860413
epoch: 23     loss: 0.5587912797927856
epoch: 24     loss: 0.5585929751396179
epoch: 25     loss: 0.55789661407470

### 3. 使用DataLoader重构模型训练过程

Pytorch DataLoader负责管理批次，DataLoader从Dataset创建，自动为我们提供每个小批量，使遍历批次变得更容易，无需使用`HRdataset[i * batch: i * batch + batch]`

In [38]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

# Wrap the tensors in a dataset and let DataLoader produce the mini-batches.
HRdataset = TensorDataset(X, Y)
# No shuffle argument is passed here, so the DataLoader default applies.
HRdataloader = DataLoader(HRdataset, batch_size=batch)

In [39]:
# Create a new model/optimizer pair for the DataLoader-based run.
model, opt = get_model()

In [None]:
for epoch in range(epochs):
    # Iterate the loader assigned to HRdataloader; the previously used name
    # HR_dl is never assigned in the cells above and raised NameError.
    for x, y in HRdataloader:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
    # Report the full-dataset loss once per epoch (inside the epoch loop),
    # without tracking gradients, as a plain Python float.
    with torch.no_grad():
        print('epoch:', epoch, '   ', 'loss:', loss_fn(model(X), Y).item())

# 添加验证

前面我们只是试图建立一个合理的训练循环以用于我们的训练数据。实际上，您始终还应该具有一个验证集，以识别您是否过度拟合。

训练数据的乱序（shuffle）很重要：它可以避免各批次之间存在相关性，从而降低过拟合的风险。另一方面，无论是否对验证集乱序，验证损失都是相同的。由于shuffle需要额外的开销，因此对验证数据进行shuffle没有任何意义。

我们将为验证集使用批大小，该批大小是训练集的两倍。这是因为验证集不需要反向传播，因此占用的内存更少（不需要存储梯度）。我们利用这一优势来使用更大的批量，并更快地计算损失。

!pip install scikit-learn -i https://pypi.douban.com/simple/

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
# Hold out a validation split (train_test_split's default split size is used).
# random_state pins the shuffle so the same rows land in each split on every
# Restart & Run All; without it the validation metrics change from run to run.
train_x, test_x, train_y, test_y = train_test_split(X_data, Y_data, random_state=42)

In [None]:
# Shapes of the four arrays produced by the split.
train_x.shape, train_y.shape, test_x.shape, test_y.shape

In [45]:
# Convert the split arrays to float32 tensors.
# torch.as_tensor accepts NumPy arrays as well as tensors, so this cell can be
# re-run safely; torch.from_numpy raised TypeError on a second run because
# the names already held tensors by then.
train_x = torch.as_tensor(train_x, dtype=torch.float32)
test_x = torch.as_tensor(test_x, dtype=torch.float32)
train_y = torch.as_tensor(train_y, dtype=torch.float32)
test_y = torch.as_tensor(test_y, dtype=torch.float32)

In [46]:
# Build both datasets first, then the loaders that batch them.
train_ds = TensorDataset(train_x, train_y)
valid_ds = TensorDataset(test_x, test_y)

# Training batches are shuffled; validation uses twice the training batch size.
train_dl = DataLoader(dataset=train_ds, batch_size=batch, shuffle=True)
valid_dl = DataLoader(dataset=valid_ds, batch_size=2 * batch)

# 定义计算正确率函数

In [49]:
def accuracy(out, yb):
    """Return the fraction of correct binary predictions as a 0-dim tensor.

    Args:
        out: predicted probabilities; values strictly above 0.5 count as class 1.
        yb: ground-truth labels (0 or 1) holding as many elements as ``out``.

    Returns:
        Mean of the element-wise matches between the thresholded ``out`` and ``yb``.
    """
    # Align the predictions with the label layout before comparing: an (N, 1)
    # output against (N,) labels would otherwise broadcast to (N, N) and
    # silently report a wrong accuracy.
    # Casting with .to(yb.dtype) keeps the tensor on its current device,
    # whereas torch.IntTensor is a CPU tensor type.
    preds = (out > 0.5).reshape(yb.shape).to(yb.dtype)
    return (preds == yb).float().mean()

model.train()在训练之前调用代表训练模式

model.eval() 推理之前进行调用代表推理模式

只有当模型包含nn.BatchNorm2d、nn.Dropout等在训练和推理阶段行为不同的层时，这两种模式才会产生区别；调用它们是为了确保这些层在不同阶段的行为正确。

In [54]:
# Train longer for the runs below (replaces the earlier value of 100).
epochs = 500

In [None]:
model, opt = get_model()

# epochs + 1 iterations so that the final epoch (a multiple of 50) is reported too.
for epoch in range(epochs + 1):
    model.train()
    for xb, yb in train_dl:
        loss = loss_fn(model(xb), yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

    if epoch % 50 == 0:
        model.eval()
        # Accumulate per-sample totals in a single pass over the validation
        # set. Averaging the per-batch means gave the shorter final batch the
        # same weight as a full one, and each batch was forwarded twice.
        loss_sum, correct_sum, n_seen = 0.0, 0.0, 0
        with torch.no_grad():
            for xb, yb in valid_dl:
                out = model(xb)
                n = len(xb)
                loss_sum += loss_fn(out, yb).item() * n
                correct_sum += accuracy(out, yb).item() * n
                n_seen += n
        print(epoch, loss_sum / n_seen, correct_sum / n_seen)

# 优化

In [57]:
class Logistic(nn.Module):
    """Deeper variant of the classifier: three ReLU hidden layers and a sigmoid output.

    Takes 20 input features per sample and returns one value in [0, 1] per row.
    """
    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # initialisation and the module listing stay unchanged.
        self.lin_1 = nn.Linear(20, 64)
        self.lin_2 = nn.Linear(64, 64)
        self.lin_3 = nn.Linear(64, 64)
        self.lin_4 = nn.Linear(64, 1)
        self.activate = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        """Run the three hidden layers with ReLU, then the sigmoid output layer."""
        hidden = input
        for layer in (self.lin_1, self.lin_2, self.lin_3):
            hidden = self.activate(layer(hidden))
        return self.sigmoid(self.lin_4(hidden))

In [None]:
model, opt = get_model()

# Accuracy history, one entry per reported epoch.
acc_val = []
acc_train = []


def _evaluate(net, loader):
    """Return (mean loss, mean accuracy) over ``loader``, weighted by batch size.

    Each batch is forwarded once and contributes in proportion to its number
    of samples, so a shorter final batch does not skew the averages.
    """
    loss_total, acc_total, n_total = 0.0, 0.0, 0
    for xb, yb in loader:
        out = net(xb)
        n = len(xb)
        loss_total += loss_fn(out, yb).item() * n
        acc_total += accuracy(out, yb).item() * n
        n_total += n
    return loss_total / n_total, acc_total / n_total


# epochs + 1 iterations so that the final epoch (a multiple of 50) is reported too.
for epoch in range(epochs + 1):
    model.train()
    for xb, yb in train_dl:
        loss = loss_fn(model(xb), yb)

        loss.backward()
        opt.step()
        opt.zero_grad()
    if epoch % 50 == 0:
        model.eval()
        with torch.no_grad():
            _, acc_mean_train = _evaluate(model, train_dl)
            valid_loss, acc_mean_val = _evaluate(model, valid_dl)
        acc_train.append(acc_mean_train)
        acc_val.append(acc_mean_val)
        print(epoch, valid_loss, acc_mean_train, acc_mean_val)

# 创建fit（）和get_data（）

In [72]:
def loss_batch(model, loss_func, xb, yb, opt=None):
    """Compute the loss for one batch and, if an optimizer is given, take one step.

    Args:
        model: callable applied to ``xb`` to produce predictions.
        loss_func: callable taking (predictions, ``yb``) and returning the loss.
        xb: batch of inputs.
        yb: batch of targets.
        opt: optional optimizer; when provided, gradients are back-propagated,
            the parameters are updated and the gradients are cleared.

    Returns:
        Tuple ``(loss value as a Python float, number of samples in xb)``.
    """
    batch_loss = loss_func(model(xb), yb)

    training = opt is not None
    if training:
        batch_loss.backward()
        opt.step()
        opt.zero_grad()

    return batch_loss.item(), len(xb)

In [73]:
import numpy as np

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    """Train ``model`` for ``epochs`` epochs and print the validation loss after each.

    Args:
        epochs: number of passes over ``train_dl``.
        model: module to train; switched between train and eval mode per phase.
        loss_func: loss callable forwarded to ``loss_batch``.
        opt: optimizer used for the training batches.
        train_dl: iterable of (xb, yb) training batches.
        valid_dl: iterable of (xb, yb) validation batches.
    """
    for epoch in range(epochs):
        # Training phase: one optimizer step per batch.
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)

        # Validation phase: collect (loss, batch size) pairs without gradients.
        model.eval()
        with torch.no_grad():
            results = [loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl]
        losses, nums = zip(*results)
        # Weight each batch loss by its size to get the per-sample mean.
        weighted = np.multiply(losses, nums)
        val_loss = weighted.sum() / np.sum(nums)

        print(epoch, val_loss)

In [74]:
def get_data(train_ds, valid_ds, bs):
    """Return ``(train_loader, valid_loader)`` for the given datasets.

    The training loader shuffles and uses batch size ``bs``; the validation
    loader is not shuffled and uses batch size ``2 * bs``.
    """
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True)
    valid_loader = DataLoader(valid_ds, batch_size=2 * bs)
    return train_loader, valid_loader

### 现在，我们获取数据加载器和拟合模型的整个过程可以在3行代码中运行：

In [None]:
# Build the loaders, create a new model/optimizer pair, then train and validate.
train_dl, valid_dl = get_data(train_ds, valid_ds, batch)
model, opt = get_model()
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)