In [4]:
import numpy as np
import pandas as pd

# 自定义逻辑斯蒂回归lf类

In [114]:
class lf(object):
    """Binary logistic regression fitted by batch gradient descent.

    The intercept is folded into the weight vector by appending a constant
    column of ones to the feature matrix. Training minimises the negative
    log-likelihood ``sum(log(1 + exp(wx)) - y * wx)``; the value stored in
    ``llList`` after every step is that loss (smaller is better).

    Attributes set by ``train``:
        w: 1-D array of feature weights.
        b: intercept.
        n_iters: number of gradient steps actually performed.
        accurancy: accuracy on the training set (historical spelling, kept
            for backward compatibility; ``accuracy`` is a read-only alias).
        llList: loss before the first step followed by the loss after each step.
    """

    def __init__(self):
        self.w = 0                  # feature weights
        self.b = 0                  # intercept
        self.trainSet = 0           # training features
        self.label = 0              # training labels
        self.learning_rate = 0.05   # learning rate (overwritten by train())
        self.n_iters = 1000         # number of iterations actually performed
        self.accurancy = None       # training-set accuracy
        self.tol = 0.001            # stopping tolerance
        self.llList = []            # history of the negative log-likelihood

    @property
    def accuracy(self):
        """Correctly spelled read-only alias for ``accurancy``."""
        return self.accurancy

    def train(self, X, y, n_iters=1000, learning_rate=0.01):
        """Fit the model on ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        Args:
            X: 2-D array-like of features.
            y: array-like of labels with one entry per row of ``X``; it is
                flattened to 1-D so column vectors are accepted.
            n_iters: maximum number of gradient steps.
            learning_rate: step size applied to the sample-averaged gradient.

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, or its row count does
                not match the number of labels.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so that the accuracy comparison never broadcasts to (n, n).
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional, got ndim={}".format(X.ndim))
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have inconsistent lengths: {} != {}".format(X.shape[0], y.shape[0])
            )
        self.trainSet = X
        self.label = y
        # Record the step size actually used instead of leaving a stale default.
        self.learning_rate = learning_rate
        # Start a fresh history so repeated fits do not accumulate old values.
        self.llList = []
        self.__train_gradient(n_iters, learning_rate)
        return

    # Compute p(y=1|x) and the negative log-likelihood.
    def PVandLLV(self, X, Y, W):
        """Return ``(p_value, loss)`` for design matrix ``X`` and weights ``W``.

        ``W`` is a (1, n_columns) row vector and ``Y`` an (n_samples, 1)
        column of labels. ``p_value`` has shape (n_samples, 1); ``loss`` is the
        summed negative log-likelihood.
        """
        wx = np.dot(X, W.T)
        # log(1 + exp(wx)) evaluated without overflowing for large |wx|.
        log1pexp = np.logaddexp(0, wx)
        # sigmoid(wx) = exp(wx - log(1 + exp(wx))); the exponent is always <= 0.
        p_value = np.exp(wx - log1pexp)
        LLarray = log1pexp - np.multiply(Y, wx)
        return p_value, LLarray.sum()

    # Compute the gradient of the loss with respect to the weights.
    def __calGradient(self, X, Y, Ypre):
        """Return the summed gradient ``-sum((Y - Ypre) * X)`` over all samples."""
        Gw = -1.*np.multiply((Y - Ypre), X).sum(axis=0)
        return Gw

    def __train_gradient(self, n_iters, learning_rate):
        """Run gradient descent and store weights, intercept and accuracy.

        Iteration stops when ``n_iters`` steps were taken, the loss drops to
        ``self.tol`` or below, or the loss changes by at most ``self.tol``
        between two consecutive steps.
        """
        n_samples, n_features = self.trainSet.shape
        X = self.trainSet
        y = self.label
        # Merge w and b: append a column of ones to X.
        X2 = np.hstack((X, np.ones((n_samples, 1))))
        # Turn y into an (n_samples, 1) column.
        Y = np.expand_dims(y, axis=1)
        # Initialise the coefficient row vector (weights + intercept).
        W = np.zeros((1, n_features+1))
        # Initial loss, loss change between updates, and step counter.
        Ypreprob, LL0 = self.PVandLLV(X2, Y, W)
        self.llList.append(LL0)
        deltaLL = np.inf
        n = 0
        while (n < n_iters) and (LL0 > self.tol) and (abs(deltaLL) > self.tol):
            # Compute the gradient and update W with its per-sample average.
            gra = self.__calGradient(X2, Y, Ypreprob)
            W = W - learning_rate*gra / n_samples
            # Re-evaluate the loss after the update and record it.
            Ypreprob, LL1 = self.PVandLLV(X2, Y, W)
            deltaLL = LL0 - LL1
            LL0 = LL1
            self.llList.append(LL0)
            n += 1
        self.n_iters = n
        # Split the flattened coefficients back into weights and intercept.
        flat_W = W.flatten()
        self.w = flat_W[:-1]
        self.b = flat_W[-1]
        # Predict class 1 only when p > 0.5 (argmax resolves ties to class 0).
        Ypre = np.argmax(np.column_stack((1 - Ypreprob, Ypreprob)), axis=1)
        self.accurancy = np.mean(Ypre == y)
        print("w:{}\nb:{}".format(self.w, self.b))
        print("score:",self.accurancy)
        return

In [115]:
from sklearn.datasets import make_classification
if __name__ == "__main__":
    # Generate a reproducible synthetic binary-classification dataset
    # (1000 samples, 5 features) so the run can be compared with sklearn.
    X, y = make_classification(random_state=123, n_features=5, n_samples=1000)
    # Fit the hand-written gradient-descent model with its default
    # hyper-parameters; train() prints the weights, intercept and score.
    logit_gd = lf()
    logit_gd.train(X=X, y=y)

w:[-0.05957069  0.33460339 -0.03802806  1.8243648   0.42519204]
b:0.0413666592885683
score: 0.947


# 调用sklearn的LogisticRegression

In [112]:
from sklearn.linear_model import LogisticRegression

# Rebuild the same synthetic dataset (identical seed) so this cell can run
# independently of the custom-model cell.
X, y = make_classification(random_state=123, n_features=5, n_samples=1000)
# fit() returns the estimator itself, so construction and fitting can be chained.
logit_sklearn = LogisticRegression(solver="lbfgs").fit(X, y)
# Report coefficients, intercept and training accuracy in the same layout
# as the custom model's output for side-by-side comparison.
print("w:{}\nb:{}".format(logit_sklearn.coef_[0], logit_sklearn.intercept_[0]))
print("score:", logit_sklearn.score(X, y))

w:[-0.30739251  0.88326724 -0.11476345  4.2841313   0.91362391]
b:0.46607030594821364
score: 0.95
