# 随机梯度下降

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Synthetic 1-D regression data: m samples following y = 4*x + 3 plus
# Gaussian noise with standard deviation 3.
# NOTE(review): no random seed is set in this cell, so the data (and every
# number derived from it) differ from run to run.
m=100000
x=np.random.normal(size=m)
# Column-vector view of x, shape (m, 1), used as the feature matrix.
X=x.reshape(-1,1)
y=4.*x+3.+np.random.normal(0,3,size=m)

In [8]:
def J(theta,X_b,y):
    """Mean squared error of the linear model ``X_b.dot(theta)`` against ``y``.

    Args:
        theta: parameter vector, one entry per column of ``X_b``.
        X_b: design matrix (the notebook passes one whose first column is ones).
        y: target values, one per row of ``X_b``.

    Returns:
        ``sum((y - X_b.dot(theta))**2) / len(y)``, or ``float('inf')`` when the
        evaluation raises an ordinary exception (best-effort: a cost that cannot
        be computed is treated as infinitely bad).
    """
    try:
        return np.sum((y-X_b.dot(theta))**2)/len(y)
    except Exception:
        # Deliberately broad, but not a bare ``except``: KeyboardInterrupt and
        # SystemExit must propagate so a long-running cell can be interrupted.
        return float('inf')
def dJ(theta,X_b,y):
    """Gradient of the mean squared error with respect to ``theta``.

    Args:
        theta: parameter vector, one entry per column of ``X_b``.
        X_b: design matrix.
        y: target values, one per row of ``X_b``.

    Returns:
        ``X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y)``.
    """
    residual = X_b.dot(theta) - y
    # Scale by 2 first, then divide by the sample count.
    doubled = X_b.T.dot(residual) * 2.
    return doubled / len(y)
def gradient_descent(X_b,y,initial_theta,eta,n_iters=1e4,epsilon=1e-8):
    """Minimise the cost ``J`` by batch gradient descent.

    Args:
        X_b: design matrix passed through to ``J`` and ``dJ``.
        y: target values.
        initial_theta: starting parameter vector (not modified).
        eta: learning rate (step size).
        n_iters: maximum number of update steps.
        epsilon: stop early once the cost changes by less than this amount
            between two consecutive steps.

    Returns:
        The parameter vector after the last update performed.
    """
    theta=initial_theta
    # Carry the cost of the current theta from step to step so each iteration
    # evaluates J once rather than twice over the full data set.
    last_cost=J(theta,X_b,y)
    cur_iter=0
    while cur_iter<n_iters:
        gradient=dJ(theta,X_b,y)
        theta=theta-eta*gradient
        cost=J(theta,X_b,y)
        if abs(cost-last_cost)<epsilon:
            break
        last_cost=cost
        cur_iter+=1
    return theta

In [9]:
%%time
# Prepend a column of ones to X so that theta[0] acts as the intercept term.
X_b=np.hstack([np.ones((len(X),1)),X])
# Start from the zero vector, one entry per column of X_b.
initial_theta=np.zeros(X_b.shape[1])
eta=0.01  # learning rate handed to gradient_descent
# Batch gradient descent over the full data set; the %%time magic on the first
# line reports how long this cell takes.
theta=gradient_descent(X_b,y,initial_theta,eta)

CPU times: user 2.07 s, sys: 19 ms, total: 2.09 s
Wall time: 1.07 s


In [10]:
# Display the parameters found by batch gradient descent: theta[0] multiplies
# the ones column (intercept) and theta[1] multiplies x (slope).
theta

array([2.97238273, 3.98521944])

In [11]:
def dJ_sgd(theta,X_b_i,y_i):
    """Squared-error gradient contributed by a single sample.

    Args:
        theta: current parameter vector.
        X_b_i: one sample's feature row (the caller passes ``X_b[rand_i]``).
        y_i: that sample's target value.

    Returns:
        ``X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2``.
    """
    error = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(error) * 2.

In [12]:
def sgd(X_b,y,initial_theta,n_iters,t0=5,t1=50):
    """Stochastic gradient descent with a decaying learning rate.

    Each step draws one sample index uniformly at random (with replacement),
    takes the single-sample gradient from ``dJ_sgd`` and moves ``theta`` by
    ``learning_rate(step) * gradient``.

    Args:
        X_b: design matrix; one row is used per step.
        y: target values, indexed in step with the rows of ``X_b``.
        initial_theta: starting parameter vector (not modified).
        n_iters: number of single-sample update steps to perform.
        t0: numerator of the learning-rate schedule ``t0 / (t + t1)``.
        t1: offset in the schedule's denominator; together with ``t0`` it sets
            the initial rate ``t0 / t1`` and how quickly the rate decays.

    Returns:
        The parameter vector after ``n_iters`` updates.
    """
    def learning_rate(t):
        # Shrinks as t grows, so later steps move theta less.
        return t0/(t+t1)

    theta=initial_theta
    n_samples=len(X_b)
    for cur_iter in range(n_iters):
        rand_i=np.random.randint(n_samples)
        gradient=dJ_sgd(theta,X_b[rand_i],y[rand_i])
        theta=theta-learning_rate(cur_iter)*gradient
    return theta

In [14]:
%%time
# Rebuild the design matrix (ones column + X) and the zero starting point.
X_b=np.hstack([np.ones((len(X),1)),X])
initial_theta=np.zeros(X_b.shape[1])
# Run SGD for len(X_b)//3 single-sample updates, i.e. a third as many updates
# as there are samples.
theta=sgd(X_b,y,initial_theta,n_iters=len(X_b)//3)

CPU times: user 306 ms, sys: 4.6 ms, total: 310 ms
Wall time: 316 ms


In [15]:
# Display the parameters found by SGD: theta[0] multiplies the ones column
# (intercept) and theta[1] multiplies x (slope).
theta

array([2.94887059, 4.04657574])

# 使用我们的SGD

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Synthetic 1-D regression data: m samples following y = 4*x + 3 plus
# Gaussian noise with standard deviation 3.
# NOTE(review): no random seed is set in this cell, so the data differ from
# run to run.
m=100000
x=np.random.normal(size=m)
# Column-vector view of x, shape (m, 1), used as the feature matrix.
X=x.reshape(-1,1)
y=4.*x+3.+np.random.normal(0,3,size=m)

In [4]:
from playML.LinearRegression import LinearRegression

In [5]:
# Instantiate the project's own LinearRegression (imported from playML).
lin_reg=LinearRegression()

In [6]:
# Fit on the synthetic data with playML's SGD routine.
# NOTE(review): n_iters=2 presumably counts passes over the data rather than
# single-sample updates — confirm against playML.LinearRegression.fit_sgd.
lin_reg.fit_sgd(X,y,n_iters=2)

In [7]:
# Inspect the second entry of the fitted parameter vector by reading the
# private attribute _theta directly.
# NOTE(review): presumably the slope coefficient (the data were generated with
# slope 4) — confirm the layout of _theta in playML.
lin_reg._theta[1]

3.989419139278298

In [49]:
from sklearn import datasets
# Load the Boston housing data as feature matrix X and target vector y.
# NOTE(review): load_boston is deprecated upstream and believed removed in
# scikit-learn 1.2 — confirm against the installed version before re-running.
boston=datasets.load_boston()
X=boston.data
y=boston.target
# Keep only samples whose target is below 50.0. X is filtered first, while y
# still holds the unfiltered targets that the mask is built from.
X=X[y<50.0]
y=y[y<50.0]

In [50]:
from playML.model_selection import train_test_split
# Split into train and test sets with the playML helper.
# NOTE(review): seed=666 presumably fixes the shuffle so the split is
# reproducible — confirm in playML.model_selection.train_test_split.
X_train,X_test,y_train,y_test=train_test_split(X,y,seed=666)

In [51]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training set only, then apply that same fitted
# transform to both sets, so no test-set statistics enter the scaling.
standardScaler=StandardScaler()
standardScaler.fit(X_train)
X_train_standard=standardScaler.transform(X_train)
X_test_standard=standardScaler.transform(X_test)

In [52]:
from playML.LinearRegression import LinearRegression
lin_reg=LinearRegression()
# %time reports how long the SGD fit on the standardized training data takes.
%time lin_reg.fit_sgd(X_train_standard,y_train,n_iters=3)
# Evaluate on the standardized test set.
# NOTE(review): score presumably returns R^2 — confirm in playML.
lin_reg.score(X_test_standard,y_test)

CPU times: user 12.6 ms, sys: 3.61 ms, total: 16.2 ms
Wall time: 13.7 ms


0.8143115872074977

# scikit-learn 中的 SGD

In [53]:
from sklearn.linear_model import SGDRegressor
# scikit-learn's SGD-based linear regressor with its default hyperparameters.
sgd_reg=SGDRegressor()
# Time the fit on the standardized training data.
%time sgd_reg.fit(X_train_standard,y_train)
# Evaluate on the standardized test set.
sgd_reg.score(X_test_standard,y_test)

CPU times: user 4.46 ms, sys: 3.9 ms, total: 8.36 ms
Wall time: 9.27 ms




0.8066310485823219

In [61]:
# Same regressor, now with max_iter and tol set explicitly instead of relying
# on the defaults.
sgd_reg=SGDRegressor(max_iter=100,tol=1e-3)
# Time the fit on the standardized training data.
%time sgd_reg.fit(X_train_standard,y_train)
# Evaluate on the standardized test set.
sgd_reg.score(X_test_standard,y_test)

CPU times: user 2.82 ms, sys: 927 µs, total: 3.75 ms
Wall time: 2.3 ms


0.812739671193203