## Least Angle Regression(最小角回归)
- 适用于高维度数据的线性回归
- 可用于特征选择

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

### 前向选择（Forward Selection）算法
- 最小角回归算法的前导算法，与最小角回归的部分原理相似
    1. 将X看做n个m维向量，即每个属性对应一个向量；
    2. 从n个向量中挑选与y最为接近(夹角最小，余弦最大)的向量记为$X^*$，使用y在$X^*$上的投影$\bar{y}$来逼近y，得到残差$y = y - \bar{y}$，其中$\bar{y} = X^* \times w^*$，$w^* = \frac{X^*y}{{\|X^*\|}^2}$；
    3. 判断是否已经使用了所有的n个X向量或者参差为0，若是，则退出，返回$w$.
- 适用于较高维度的数据

In [99]:
class ForwardSelection(object):
    def __init__(self):
        self.theta = None

    def fit(self, train_x, train_y):
        self.train(train_x, train_y)

    def train(self, train_x, train_y):
        m , n = train_x.shape
        train_x = np.copy(train_x)
        if len(train_y.shape) == 1:
            train_y = train_y.reshape(-1, 1)
        self.theta = np.zeros((1, n))
        # 各个向量的模
        norm_of_vector = np.sqrt(np.sum(np.square(train_x), axis=0))
        columns_sets = set(range(n))
        while len(columns_sets) > 0:
#             print train_y
            if np.all(train_y == 0): # 残差为0
                break
            norm_of_y = np.sqrt(np.sum(np.square(train_y)))
            # 计算余弦距离
            max_cos_dis = 0
            max_index = -1
            for i in columns_sets:
                cur_dis = np.sum(train_x[:, i]*train_y) / (norm_of_vector[i]*norm_of_y)
                if abs(cur_dis) > abs(max_cos_dis):
                    max_cos_dis = cur_dis
                    max_index = i
            self.theta[0][max_index] = max_cos_dis * (norm_of_y / norm_of_vector[max_index])
            train_y -= train_x[:, max_index].reshape(-1, 1) * self.theta[0][max_index]
            columns_sets.remove(max_index)
            print max_index
    def predict(self, x):
        return np.dot(x, self.theta.T)

### 前向梯度（Forward Stagewise）算法
- 最小角回归算法的前导算法，与最小角回归的部分原理相似，不像前向选择算法一样一次性使用投影，而是在最接近(余弦距离最小)的自变量$X^*$的方向上移动一小步，在观察与残差$\bar{y}$最接近的自变量，直到残差足够小。 
    1. 确定一个学习率$\alpha$, 从n个向量中挑选与y最为接近(夹角最小，余弦最大)的向量记为𝑋∗，使用y在$𝑋^∗$上的投影$\bar{y}$的一个等比例缩小$\alpha \bar{y}$来逼近y，得到残差$y = y - \alpha \bar{y}$，其中$\bar{y} = X^* w^*$,$w^*=\alpha \frac{X^*y}{\|X^*\|}$，累加$w^*$；
    2. 当参差非常小的时候，退出程序，返回$w$.

In [101]:
class ForwardStagewise(object):
    def __init__(self, alpha=0.01, epsilon=0.2, epochs = 2000, verbose=False):
        self.theta = None
        self.alpha = alpha
        self.epsilon = epsilon
        self.epochs = epochs
        self.verbose = verbose

    def fit(self, train_x, train_y):
        self.train(train_x, train_y)

    def train(self, train_x, train_y):
        m , n = train_x.shape
        train_x = np.copy(train_x)
        if len(train_y.shape) == 1:
            train_y = train_y.reshape(-1, 1)
        self.theta = np.zeros((1, n))
        # 各个向量的模
        norm_of_vector = np.sqrt(np.sum(np.square(train_x), axis=0))
        i = 0
        while np.max(train_y) > self.epsilon and i < self.epochs:
            norm_of_y = np.sqrt(np.sum(np.square(train_y)))
            cur_dis = np.sum(train_x*train_y, axis=0) / (norm_of_vector*norm_of_y)
            max_index = np.argmax(np.abs(cur_dis))
            max_cos_dis = cur_dis[max_index]
            
            delta_theta = self.alpha * max_cos_dis * (norm_of_y / norm_of_vector[max_index])
            self.theta[0][max_index] += delta_theta
            train_y -= train_x[:, max_index].reshape(-1, 1) * delta_theta
            i += 1
            if self.verbose is True and i % 100 == 0:
                print "the %d epoch, the $\max{y}$ %f" \
                % (i, np.max(train_y))
    def predict(self, x):
        return np.dot(x, self.theta.T)

### 最小角回归
- 前向选择算法与前向梯度算法的结合
- 暂时未完成，待推导公式

In [None]:
class LARS(object):
    def __init__(self):
        self.theta = None
    def fit(self, train_x, train_y):
        self.train(train_x, train_y)
    def train(self, train_x, train_y):
        m , n = train_x.shape
        train_x = np.copy(train_x)
        if len(train_y.shape) == 1:
            train_y = train_y.reshape(-1, 1)
        self.theta = np.zeros((1, n))
        # 各个向量的模
        norm_of_vector = np.sqrt(np.sum(np.square(train_x), axis=0))
        columns_sets = set(range(n))
        while len(columns_sets) > 0:
#             print train_y
            if np.all(train_y == 0): # 残差为0
                break
            norm_of_y = np.sqrt(np.sum(np.square(train_y)))
            # 计算余弦距离
            max_cos_dis = 0
            max_index = -1
            for i in columns_sets:
                cur_dis = np.sum(train_x[:, i]*train_y) / (norm_of_vector[i]*norm_of_y)
                if abs(cur_dis) > abs(max_cos_dis):
                    max_cos_dis = cur_dis
                    max_index = i
            self.theta[0][max_index] = max_cos_dis * (norm_of_y / norm_of_vector[max_index])
            train_y -= train_x[:, max_index].reshape(-1, 1) * self.theta[0][max_index]
            columns_sets.remove(max_index)
            print max_index
    def predict(self,x):
        return np.dot(x, self.theta.T)

In [76]:
def regression(train_x, train_y, test_x, test_y, model):
    lr = model
    lr.train(train_x, train_y)
    pre_y = lr.predict(test_x)
    
    print('Coefficients: \n', lr.theta)
    # The mean squared error
    print("Mean squared error: %.2f"
      % mean_squared_error(test_y, pre_y))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % r2_score(test_y, pre_y))

In [97]:
def regression_sklearn(train_x, train_y, test_x, test_y):
    lr = linear_model.Lars()
    lr.fit(train_x, train_y)
    pre_y = lr.predict(test_x)
    
    print('Coefficients: \n', lr.coef_, lr.intercept_)
    # The mean squared error
    print("Mean squared error: %.2f"
      % mean_squared_error(test_y, pre_y))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % r2_score(test_y, pre_y))

In [78]:
def load_data():
    diabetes = datasets.load_diabetes()
    # Use only one feature
    diabetes_X = diabetes.data

    # Split the data into training/testing sets
    diabetes_X_train = diabetes_X[:-20]
    diabetes_X_test = diabetes_X[-20:]

    # Split the targets into training/testing sets
    diabetes_y_train = diabetes.target[:-20]
    diabetes_y_test = diabetes.target[-20:]
    return diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test

In [94]:
train_x, test_x, train_y, test_y = load_data()

In [100]:
regression(train_x, train_y, test_x, test_y, ForwardSelection())

5
4
6
2
9
0
3
8
1
7
('Coefficients: \n', array([[  8636.31950834,  -3034.07551717,  11485.73831596,   6887.27470672,
        -14440.18471778, -15512.54537759, -13395.61180117,   2049.88273377,
          3428.58066008, -11008.72220713]]))
Mean squared error: 4478218.34
Variance score: -925.94


In [109]:
regression(train_x, train_y, test_x, test_y, 
           ForwardStagewise(alpha=0.0002, epochs=500000, verbose=False))

('Coefficients: \n', array([[  -0.39280859,    1.84270242,    0.        ,   -1.73871215,
         150.60341141, -115.03129434,  -72.89602412,  -28.26894827,
         -54.58247376,    3.14641988]]))
Mean squared error: 20771.14
Variance score: -3.30


In [98]:
regression_sklearn(train_x, train_y, test_x, test_y)

('Coefficients: \n', array([  3.23840759e-03,  -2.37443589e+02,   2.65533652e+02,
         3.27960544e+02,  -8.54144580e+02,   5.21377321e+02,
         1.26005906e+02,   1.96058915e+02,   6.05978036e+02,
         7.59081972e+01]), 152.76679422403396)
Mean squared error: 2476.04
Variance score: 0.49
