#### 批量梯度下降法

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Seed NumPy's global generator so the synthetic data set (and everything
# derived from it) is identical on every Restart & Run All.
np.random.seed(666)

# Synthetic 1-D linear regression problem: y = 4 * x + 3 plus Gaussian noise
# with standard deviation 3.
m = 100000
x = np.random.normal(size=m)
X = x.reshape(-1, 1)  # column vector: one row per sample, one feature
y = 4. * x + 3. + np.random.normal(0, 3, size=m)

In [9]:
def J(theta, X_b, y):
    """Mean squared error of the linear model ``X_b.dot(theta)`` against ``y``.

    Args:
        theta: parameter vector.
        X_b: design matrix, one row per sample.
        y: target values, one per row of ``X_b``.

    Returns:
        The sum of squared residuals divided by ``len(y)``, or ``float('inf')``
        when evaluating the cost raises an ordinary exception.
    """
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
    except Exception:
        # A cost that cannot be evaluated is treated as infinitely bad.
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must propagate so a long-running descent can still be interrupted.
        return float('inf')
    
def dJ(theta, X_b, y):
    """Gradient of the mean squared error with respect to ``theta``.

    Uses every row of ``X_b`` (full-batch gradient).
    """
    residual = X_b.dot(theta) - y
    projected = X_b.T.dot(residual)
    return projected * 2. / len(y)

def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Minimise the cost ``J`` with full-batch gradient descent.

    Args:
        X_b: design matrix, forwarded unchanged to ``J`` and ``dJ``.
        y: target values.
        initial_theta: starting parameter vector; it is never modified in
            place, every update builds a new array.
        eta: fixed learning rate (step size).
        n_iters: maximum number of updates. Only compared with ``<``, so the
            float default is kept for backward compatibility.
        epsilon: stop as soon as two consecutive costs differ by less than
            this amount.

    Returns:
        The parameter vector after the last update that was performed.
    """
    theta = initial_theta
    # The cost of the current theta is carried from one iteration to the next,
    # so each pass evaluates J once over the data instead of twice.
    cost = J(theta, X_b, y)
    i_iters = 0
    while i_iters < n_iters:
        theta = theta - eta * dJ(theta, X_b, y)
        new_cost = J(theta, X_b, y)
        if abs(cost - new_cost) < epsilon:
            break
        cost = new_cost
        i_iters += 1
    return theta


In [10]:
%%time
# Design matrix: a leading column of ones so that theta[0] is the intercept.
X_b = np.hstack((np.ones((X.shape[0], 1)), X))
n_params = X_b.shape[1]
initial_theta = np.zeros(n_params)
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)
theta

Wall time: 5.56 s


array([3.0049957 , 4.00911537])

#### 随机梯度下降法

In [11]:
def dJ_sgd(theta, X_b_i, y_i):
    """Gradient of the squared error of a single sample with respect to ``theta``.

    Args:
        theta: parameter vector.
        X_b_i: one row of the design matrix.
        y_i: the target value belonging to that row.
    """
    return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2.

def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
    """Fit ``theta`` with stochastic gradient descent.

    Each iteration draws one random row (with replacement) and takes a step
    along the gradient of that single sample.

    Args:
        X_b: design matrix, one row per sample.
        y: target values, one per row of ``X_b``.
        initial_theta: starting parameter vector; not modified in place.
        n_iters: number of single-sample updates to perform.
        t0: numerator of the learning-rate schedule.
        t1: offset of the learning-rate schedule. With the defaults the step
            size starts at 5 / 50 = 0.1 and decays as the iteration count grows.

    Returns:
        The parameter vector after ``n_iters`` updates.
    """

    def learning_rate(t):
        # Decaying step size. t0 and t1 are parameters of sgd, so the function
        # no longer depends on globals left behind in the kernel.
        return t0 / (t + t1)

    theta = initial_theta
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient
    return theta

In [12]:
%%time
# Design matrix: a leading column of ones so that theta[0] is the intercept.
X_b = np.hstack((np.ones((X.shape[0], 1)), X))
n_params = X_b.shape[1]
initial_theta = np.zeros(n_params)
# n_iters is the number of samples visited. A third of the data set is used
# here only to show how fast SGD is; it is not a recommended setting.
theta = sgd(X_b, y, initial_theta, n_iters=len(X_b) // 3)
theta

Wall time: 875 ms


array([3.02164026, 4.02886955])

#### 使用我们自己的SGD

In [20]:
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's global generator so this cell produces the same synthetic data
# set on every run.
np.random.seed(666)

# Synthetic 1-D linear regression problem: y = 4 * x + 3 plus Gaussian noise
# with standard deviation 3.
m = 100000
x = np.random.normal(size=m)
X = x.reshape(-1, 1)  # column vector: one row per sample, one feature
y = 4. * x + 3. + np.random.normal(0, 3, size=m)

# Fit the project-local LinearRegression with its SGD routine on the synthetic
# data generated above (true slope 4, true intercept 3).
from LinearRegression import LinearRegression
lin_reg = LinearRegression()
# NOTE(review): n_iters presumably counts passes over the data set rather than
# single-sample updates — confirm against LinearRegression.fit_sgd.
lin_reg.fit_sgd(X, y, n_iters=2)

print(lin_reg.coef_)  # coefficients
print(lin_reg.interception_)  # intercept

[3.97552478]
2.988719828241306


#### 真实数据下测试自行实现的SGD

In [22]:
import numpy as np
from sklearn import datasets

# NOTE(review): datasets.load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell only runs on an older scikit-learn — confirm
# the version this notebook is pinned to.
boston = datasets.load_boston()
X = boston.data
y = boston.target
# Keep only samples whose target is below 50.0 (presumably the values at 50
# are capped — verify). X is filtered first because the mask is built from
# the still-unfiltered y.
X = X[y < 50.0]
y = y[y < 50.0]

from model_selection import train_test_split  # split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)

from preprocessing import StandardScaler  # feature scaling
standardScaler = StandardScaler()
# The scaler is fitted on the training set only and then applied to both sets.
standardScaler.fit(X_train)
# NOTE(review): `tranform` (sic) presumably matches the method name in the
# project-local preprocessing module — verify before renaming.
X_train_standard = standardScaler.tranform(X_train)
X_test_standard = standardScaler.tranform(X_test)

from LinearRegression import LinearRegression
lin_reg = LinearRegression()
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=2)
lin_reg.score(X_test_standard, y_test)  # score is still too low (the reference score is 0.8129); more iterations are needed

Wall time: 8.98 ms


0.7911189802097698

In [23]:
# Re-fit the same model with n_iters raised from 2 to 50.
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=50)
lin_reg.score(X_test_standard, y_test)  # score is now about the same as the reference score of 0.8129

Wall time: 192 ms


0.8132588958621523

In [30]:
# Re-fit the same model with n_iters raised to 1000.
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=1000)
lin_reg.score(X_test_standard, y_test)  # this looks like too many iterations: the score is no better than with n_iters=50

Wall time: 4.46 s


0.8130056901553238

#### sklearn中的SGD

In [31]:
from sklearn.linear_model import SGDRegressor  # SGDRegressor lives in linear_model, so it only handles linear models
# Fit scikit-learn's SGD regressor with default settings on the standardized
# training data and score it on the standardized test data.
sgd_reg = SGDRegressor()
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)

Wall time: 5 ms


0.8120347719772759

In [32]:
# NOTE(review): n_iter_no_change is the early-stopping patience (how many
# epochs without improvement are tolerated), not the number of epochs; if the
# intent was to train for more epochs, max_iter is the relevant parameter —
# confirm against the SGDRegressor documentation.
sgd_reg = SGDRegressor(n_iter_no_change=100)
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)

Wall time: 28 ms


0.8130956690110988

sklearn中的SGD使用了很多的优化方案，因此与我们自行实现的算法相比速度快很多