# 逻辑回归实现
> numpy实现
> 使用 a9a 数据集（a9a.txt）做二分类

In [1]:
import random

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file
import numpy as np
import matplotlib.pyplot as plt
from utils import Accumulator

## 读取数据集
> 使用 load_svmlight_file 函数分别读取训练集文件与验证集文件（此处并未调用 train_test_split）

In [2]:
# Read the two svmlight-format files; both are forced to 123 feature columns
# so the training and validation matrices have the same width.
x_train, y_train = load_svmlight_file('./a9a_train.txt', n_features=123)
x_val, y_val = load_svmlight_file('./a9a.txt', n_features=123)


## 数据预处理
观察可以看到分类为-1,1

为了方便损失函数的计算我们把-1转变成0

In [3]:
# Relabel the negative class from -1 to 0 so the labels are {0, 1}.
y_train[y_train == -1] = 0
y_val[y_val == -1] = 0

# Densify the sparse feature matrices into plain ndarrays.
x_train = x_train.toarray()
x_val = x_val.toarray()

# Turn the label vectors into column vectors of shape (n_samples, 1).
y_train = np.array(y_train).reshape(-1, 1)
y_val = np.array(y_val).reshape(-1, 1)

## 参数设置

In [4]:


# Append a constant-1 bias column as the last feature. The insert position is
# taken from the matrix itself rather than hard-coded, so the cell keeps
# working if the number of features changes.
b = np.ones(x_train.shape[0])
x_train = np.insert(x_train, x_train.shape[1], values=b, axis=1)

b = np.ones(x_val.shape[0])
x_val = np.insert(x_val, x_val.shape[1], values=b, axis=1)

# Model parameters (weights plus bias), drawn from a standard normal
# distribution: one row per column of x_train.
theta = np.random.normal(size=(x_train.shape[1], 1))

x_train

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

## 定义损失函数

In [5]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The value is computed as exp(-log(1 + exp(-z))) through np.logaddexp,
    which never exponentiates a large positive number. The naive formula
    overflows in exp(-z) for strongly negative z; this form simply returns
    0.0 there.

    :param z: scalar or ndarray of logits
    :return: value(s) in [0, 1] with the same shape as z
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z))
    return np.exp(-np.logaddexp(0, -z))

def logisreg(x, w):
    """Linear part of the logistic-regression model.

    Returns the raw scores (logits) x @ w; the sigmoid is NOT applied here.

    :param x: feature matrix
    :param w: weight vector / matrix
    :return: matrix product of x and w
    """
    logits = x @ w
    return logits

def cost(model, y):
    """Mean binary cross-entropy computed from raw logits.

    The two log terms are evaluated with np.logaddexp instead of
    log(sigmoid(z)) and log(1 - sigmoid(z)). When the sigmoid saturates to
    exactly 0 or 1 the naive form takes log(0) = -inf and the loss becomes
    inf/nan; this form stays finite for any finite logit.

    :param model: raw model outputs (logits, before the sigmoid); reshaped
        to y.shape
    :param y: labels in {0, 1}
    :return: summed per-sample cross-entropy divided by len(y); a (1, 1)
        array when y is a column vector
    """
    model = model.reshape(y.shape)
    # log(sigmoid(z))     = -log(1 + exp(-z)) = -logaddexp(0, -z)
    # log(1 - sigmoid(z)) = -log(1 + exp(z))  = -logaddexp(0, z)
    log_p = -np.logaddexp(0, -model)
    log_not_p = -np.logaddexp(0, model)
    first = -y.T @ log_p
    second = (1 - y).T @ log_not_p
    return (first - second) / len(y)

def step_gradient(w_cur, x_value, y_value, lr):
    """
    Perform one gradient-descent update of the weights.

    :param w_cur: current weights
    :param x_value: feature matrix of the batch
    :param y_value: labels of the batch
    :param lr: learning rate
    :return: the updated weights (w_cur itself is not modified)
    """
    # Prediction error of the current weights on this batch.
    residual = sigmoid(x_value @ w_cur) - y_value
    # Average over the batch: the 1/n factor is applied to x^T first, then
    # the product with the residual is taken.
    inv_n = 1 / len(y_value)
    grad = inv_n * x_value.T @ residual
    return w_cur - lr * grad



## 计算准确性函数

In [6]:
# 预测检验
def accuracy(y_hat, y, threshold=0.7):
    """Count how many predictions match the labels.

    An input with more than one dimension is passed through the sigmoid
    first; a 1-D input is compared against the threshold as-is. Scores
    strictly greater than ``threshold`` are predicted as class 1, all others
    as class 0.

    Unlike the previous version, the ``threshold`` argument is honoured
    (0.7 was hard-coded in the body) and ``y_hat`` is never modified in place.

    :param y_hat: model outputs
    :param y: labels in {0, 1}
    :param threshold: decision threshold, default 0.7
    :return: number of correct predictions, as a float
    """
    scores = np.asarray(y_hat)
    if scores.ndim > 1:
        scores = sigmoid(scores)
    labels = np.asarray(y)
    # Written as two explicit cases so that a NaN score (neither > nor <= the
    # threshold) is never counted as a correct prediction.
    hits = ((scores > threshold) & (labels == 1)) | (
        (scores <= threshold) & (labels == 0))
    return float(hits.sum())


# 评估任意模型net的准确率
def evaluate_accuracy(model, y):
    """Return the fraction of correct predictions on one data set.

    :param model: model outputs for the data set; forwarded unchanged to
        accuracy() as its y_hat argument
    :param y: labels of the data set
    :return: correct-prediction count divided by the number of labels
    """
    tally = Accumulator(2)  # slot 0: correct predictions, slot 1: total
    n_correct = accuracy(model, y)
    tally.add(n_correct, len(y))
    return tally[0] / tally[1]

def data_iter(batch_size, features, labels):
    """Yield (features, labels) mini-batches in a freshly shuffled order.

    Each sample is visited once per call; the last batch is shorter when the
    number of samples is not a multiple of batch_size.

    :param batch_size: maximum number of samples per batch
    :param features: indexable feature container (indexed with an int array)
    :param labels: indexable label container aligned with features
    """
    order = list(range(len(features)))
    random.shuffle(order)  # random visiting order for this pass
    total = len(order)
    for start in range(0, total, batch_size):
        # Slicing clamps at the end of the list, so the tail batch is
        # automatically truncated.
        picked = np.array(order[start:start + batch_size])
        yield features[picked], labels[picked]


## 训练模型

In [7]:
# Training configuration: learning rate, number of passes over the training
# set, the model / loss callables, and the mini-batch size.
lr = 0.03
num_epochs = 200
model =logisreg
loss = cost
batch_size = 1000

for epoch in range(num_epochs):
    # One pass of mini-batch gradient descent over the shuffled training set.
    for X, y in data_iter(batch_size, x_train, y_train):
        l = loss(model(X, theta), y).sum()   # loss on this batch, computed before the update
        theta = step_gradient(theta, X, y, lr)
    # Validation with the weights obtained at the end of the epoch.
    l_val = loss(model(x_val, theta), y_val).sum()
    # NOTE: `l` still holds the loss of the LAST mini-batch of this epoch,
    # not an average over the whole training set.
    print(f'epoch {epoch + 1}, loss_train: {l:f}')  # training loss (last batch)
    print(f'epoch {epoch + 1}, loss_val: {l_val:f}')  # validation loss
    print(f'epoch {epoch + 1}, accuracy: {evaluate_accuracy(model(x_val, theta), y_val):f}')  # validation accuracy
    print("=================================================================================")

epoch 1, loss_train: 0.865168
epoch 1, loss_val: 0.836950
epoch 1, accuracy: 0.712118
epoch 2, loss_train: 0.690736
epoch 2, loss_val: 0.705450
epoch 2, accuracy: 0.751305
epoch 3, loss_train: 0.722224
epoch 3, loss_val: 0.664761
epoch 3, accuracy: 0.764941
epoch 4, loss_train: 0.637168
epoch 4, loss_val: 0.643748
epoch 4, accuracy: 0.771451
epoch 5, loss_train: 0.654321
epoch 5, loss_val: 0.628170
epoch 5, accuracy: 0.773601
epoch 6, loss_train: 0.676959
epoch 6, loss_val: 0.614944
epoch 6, accuracy: 0.775874
epoch 7, loss_train: 0.700486
epoch 7, loss_val: 0.603037
epoch 7, accuracy: 0.779805
epoch 8, loss_train: 0.598529
epoch 8, loss_val: 0.592172
epoch 8, accuracy: 0.782691
epoch 9, loss_train: 0.550900
epoch 9, loss_val: 0.582186
epoch 9, accuracy: 0.785271
epoch 10, loss_train: 0.550091
epoch 10, loss_val: 0.572988
epoch 10, accuracy: 0.786745
epoch 11, loss_train: 0.525847
epoch 11, loss_val: 0.564464
epoch 11, accuracy: 0.788342
epoch 12, loss_train: 0.587021
epoch 12, loss_va