如何调试梯度

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Build a synthetic data set. The global NumPy seed is fixed so every run draws the same numbers.
np.random.seed(666)
# Feature matrix: 1000 samples by 10 features.
X = np.random.random((1000, 10))

In [3]:
# Ground-truth parameters 1.0, 2.0, ..., 11.0. They are fixed rather than random so
# that the fitted theta can be compared against them at a glance.
true_theta = np.arange(1.0, 12.0)

In [4]:
# Design matrix: X with a column of ones prepended, so the first coefficient is the intercept.
X_b = np.concatenate((np.ones((len(X), 1)), X), axis=1)

In [5]:
# Targets: the exact linear response plus standard-normal noise, one draw per sample.
# The noise length is taken from X_b instead of being hard-coded, so it always
# matches the number of rows in the design matrix.
y = X_b.dot(true_theta) + np.random.normal(size=len(X_b))

In [6]:
def J(theta, X_b, y):
    """Mean squared error of the linear model ``X_b.dot(theta)`` against ``y``.

    Args:
        theta: Parameter vector, multiplied by ``X_b`` via ``X_b.dot(theta)``.
        X_b: Design matrix (any object exposing a compatible ``dot`` method).
        y: Target values; ``len(y)`` is used as the averaging divisor.

    Returns:
        ``sum((y - X_b.dot(theta)) ** 2) / len(y)``, or ``float('inf')`` if
        evaluating that expression raises an ordinary exception.
    """
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
    except Exception:
        # A failed evaluation is reported as an infinitely bad cost. Catching
        # Exception instead of using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running fit can still be interrupted.
        return float('inf')

In [7]:
def dJ_math(theta, X_b, y):
    """Closed-form gradient: ``2 * X_b.T.dot(X_b.dot(theta) - y) / len(y)``.

    Args:
        theta: Parameter vector at which the gradient is evaluated.
        X_b: Design matrix.
        y: Target values.

    Returns:
        The gradient, one entry per component of ``theta``.
    """
    residual = X_b.dot(theta) - y
    doubled_projection = 2 * X_b.T.dot(residual)
    return doubled_projection / len(y)

In [8]:
def dJ_degub(theta, X_b, y, epsilon = 0.01):
    """Approximate the gradient of ``J`` at ``theta`` with central differences.

    Component ``i`` of the result is
    ``(J(theta + epsilon * e_i) - J(theta - epsilon * e_i)) / (2 * epsilon)``,
    where ``e_i`` is the i-th unit vector. The function only calls ``J``; it
    uses no analytic derivative.

    Args:
        theta: Point at which to estimate the gradient. It is converted to a
            float array first, so integer-valued input can be perturbed by a
            fractional ``epsilon``. The caller's object is not modified.
        X_b: Design matrix, passed through to ``J``.
        y: Target values, passed through to ``J``.
        epsilon: Half-width of the finite-difference step.

    Returns:
        A float array with ``len(theta)`` entries holding the estimated gradient.
    """
    theta = np.asarray(theta, dtype=float)
    res = np.empty(len(theta))
    for i in range(len(theta)):
        theta_plus = theta.copy()
        theta_plus[i] += epsilon
        theta_minus = theta.copy()
        theta_minus[i] -= epsilon
        # Central difference: (J(theta + epsilon) - J(theta - epsilon)) / (2 * epsilon).
        res[i] = (J(theta_plus, X_b, y) - J(theta_minus, X_b, y)) / (2 * epsilon)
    return res



In [9]:
def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters = 1e4, epsilon= 1e-8):
    """Minimise ``J`` by batch gradient descent using the supplied gradient function.

    Args:
        dJ: Callable ``dJ(theta, X_b, y)`` returning the gradient at ``theta``.
        X_b: Design matrix, passed through to ``dJ`` and ``J``.
        y: Target values, passed through to ``dJ`` and ``J``.
        initial_theta: Starting point. It is not modified, because every update
            builds a new array.
        eta: Step size applied to the gradient on every update.
        n_iters: Upper bound on the number of updates.
        epsilon: Stop once the cost changes by less than this between two
            consecutive iterates.

    Returns:
        The last computed ``theta``.
    """
    theta = initial_theta
    # Carry the cost of the current theta from one iteration to the next so
    # that J is evaluated once per update instead of twice.
    cost = J(theta, X_b, y)
    cur_iter = 0

    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        theta = theta - eta * gradient
        previous_cost, cost = cost, J(theta, X_b, y)
        if abs(cost - previous_cost) < epsilon:
            break
        cur_iter += 1
    return theta

In [10]:
# Design matrix: X with a column of ones prepended for the intercept term.
# NOTE(review): an identical X_b is already built in an earlier cell; this cell recomputes the same value.
X_b = np.concatenate((np.ones((len(X), 1)), X), axis=1)

In [12]:
# Step size used for every gradient-descent update.
eta = 0.01
# Start the search from the all-zero vector, one entry per column of X_b.
initial_theta = np.zeros(X_b.shape[1])


In [13]:
# Time a full gradient-descent run driven by the finite-difference gradient dJ_degub.
%time theta = gradient_descent(dJ_degub, X_b, y, initial_theta, eta)

CPU times: user 8.44 s, sys: 83 ms, total: 8.52 s
Wall time: 8.58 s


In [14]:
# Show the parameters fitted with dJ_degub; the data were generated from true_theta = 1, 2, ..., 11.
theta

array([ 1.1251597 ,  2.05312521,  2.91522497,  4.11895968,  5.05002117,
        5.90494046,  6.97383745,  8.00088367,  8.86213468,  9.98608331,
       10.90529198])

In [15]:
# Time the same run driven by the closed-form gradient dJ_math, for comparison.
%time theta = gradient_descent(dJ_math, X_b, y, initial_theta, eta)

CPU times: user 4.54 s, sys: 157 ms, total: 4.7 s
Wall time: 1.26 s


In [16]:
# Show the parameters fitted with dJ_math; the data were generated from true_theta = 1, 2, ..., 11.
theta

array([ 1.1251597 ,  2.05312521,  2.91522497,  4.11895968,  5.05002117,
        5.90494046,  6.97383745,  8.00088367,  8.86213468,  9.98608331,
       10.90529198])

总结：  
1. dJ_degub 是可行的：用它求出的 theta 与用 dJ_math 求出的 theta 完全一致  
2. dJ_degub 的速度较慢：本次运行中它的 Wall time 为 8.58 s，而 dJ_math 只需 1.26 s  

如果需要自己推导梯度公式，但又没有把握推导正确，可以先用 dJ_degub 在小规模的数据集上求出 theta（相当于先看了参考答案），然后再用自己推导出的梯度公式求一次 theta，对比两次结果是否一致，以此验证自己的梯度公式是否正确。  
dJ_degub 只通过调用 J 来计算梯度，不依赖 J 的具体形式，因此换一个损失函数 J 之后仍然可以复用。