### 正规方程

#### 二元一次方程

$W = (X^TX)^{-1}X^Ty$

In [1]:
# What is linear regression, and what kind of problem can it solve?
#
# Example: solving a system of linear equations
#      x + y = 14
#     2x - y = 10
# Note: this system has no intercept term (x plays the role of x1, y of x2).

import numpy as np

X = np.array([[1, 1],
              [2, -1]])   # one row of coefficients per equation
y = np.array([14, 10])    # right-hand sides

# Normal equation w = (X^T X)^-1 X^T y, using numpy.linalg.
# `@` is the matrix-multiplication operator; it is interchangeable with .dot() here.
w = np.linalg.inv(X.T @ X) @ X.T @ y
w

array([8., 6.])

In [2]:
from sklearn.linear_model import LinearRegression  # linear regression estimator

# fit_intercept=True is the default and is what you normally want:
# the model learns a bias term in addition to the coefficients.
model = LinearRegression(fit_intercept=True).fit(X, y)  # build and train the model

# coef_ holds the coefficients (slopes), intercept_ the bias term.
# Question: do the equations above actually have an intercept?
display(model.coef_, model.intercept_)

array([-0.8,  1.6])

13.2

In [3]:
# LinearRegression solves the same least-squares problem as the normal equation,
# so it can be used to solve the system of equations.
from sklearn.linear_model import LinearRegression

# fit_intercept defaults to True and is usually left that way; it is switched off here
# because the two equations have no constant term.
model = LinearRegression(fit_intercept=False).fit(X, y)  # build and train the model

# coef_ holds the coefficients; intercept_ is reported as 0.0 when no intercept is fitted.
display(model.coef_, model.intercept_)

array([8., 6.])

0.0

In [5]:
# The solution found with an intercept (x = -0.8, y = 1.6, b = 13.2) also satisfies both equations:
# x + y + b = 14
# -0.8 + 1.6 + 13.2 = 14

# 2x - y + b = 10
# 2*(-0.8) - 1.6 + 13.2 = 10

#### 八元一次方程

In [4]:
# Coefficient matrix of the eight-variable linear system: one row per equation,
# one column per unknown x1..x8.
X = np.array([
    [  0,  14,   8,   0,   5,  -2,   9,  -3],
    [ -4,  10,   6,   4, -14,  -2, -14,   8],
    [ -1,  -6,   5, -12,   3,  -3,   2,  -2],
    [  5,  -2,   3,  10,   5,  11,   4,  -8],
    [-15, -15,  -8, -15,   7,  -4, -12,   2],
    [ 11, -10,  -2,   4,   3,  -9,  -6,   7],
    [-14,   0,   4,  -3,   5,  10,  13,   7],
    [ -3,  -7,  -2,  -8,   0,  -6,  -5,  -9],
])
# Right-hand side of each equation.
y = np.array([339, -114, 30, 126, -395, -87, 422, -309])
display(X, y)

array([[  0,  14,   8,   0,   5,  -2,   9,  -3],
       [ -4,  10,   6,   4, -14,  -2, -14,   8],
       [ -1,  -6,   5, -12,   3,  -3,   2,  -2],
       [  5,  -2,   3,  10,   5,  11,   4,  -8],
       [-15, -15,  -8, -15,   7,  -4, -12,   2],
       [ 11, -10,  -2,   4,   3,  -9,  -6,   7],
       [-14,   0,   4,  -3,   5,  10,  13,   7],
       [ -3,  -7,  -2,  -8,   0,  -6,  -5,  -9]])

array([ 339, -114,   30,  126, -395,  -87,  422, -309])

In [6]:
# Normal equation on the 8x8 system.
# Expected values of x1..x8: 1., 5., 15., 3., 8., 4., 17., 12.
w = np.linalg.inv(X.T @ X) @ X.T @ y
w

array([ 1.,  5., 15.,  3.,  8.,  4., 17., 12.])

In [7]:
# The same 8x8 system solved with scikit-learn; no intercept is fitted.
model = LinearRegression(fit_intercept=False).fit(X, y)
display(model.coef_, model.intercept_)

array([ 1.,  5., 15.,  3.,  8.,  4., 17., 12.])

0.0

#### 八元一次方程+截距【64】

In [8]:
# Shift every target by the constant 64: the same equations, now with an intercept b = 64.
y_new = y + 64
y_new

array([ 403,  -50,   94,  190, -331,  -23,  486, -245])

In [9]:
# Food for thought: adding an intercept introduces b as one more unknown.
# X has no column for b, so the normal equation on X alone cannot account for it.
w = np.linalg.inv(X.T @ X) @ X.T @ y_new
w

array([-9.53423295, -3.18237714, 32.89293368, 13.94370726, 12.88345187,
       -5.64066992, 14.95379584,  8.62431708])

In [22]:
# Bring the intercept into the system: prepend a column of ones so that b becomes
# one more unknown (the first column). The row count is taken from X rather than hard-coded.
# Equivalent form: np.concatenate([np.ones(shape=(X.shape[0], 1)), X], axis=1)
X_new = np.hstack((np.ones(shape=(X.shape[0], 1)), X))
X_new

array([[  1.,   0.,  14.,   8.,   0.,   5.,  -2.,   9.,  -3.],
       [  1.,  -4.,  10.,   6.,   4., -14.,  -2., -14.,   8.],
       [  1.,  -1.,  -6.,   5., -12.,   3.,  -3.,   2.,  -2.],
       [  1.,   5.,  -2.,   3.,  10.,   5.,  11.,   4.,  -8.],
       [  1., -15., -15.,  -8., -15.,   7.,  -4., -12.,   2.],
       [  1.,  11., -10.,  -2.,   4.,   3.,  -9.,  -6.,   7.],
       [  1., -14.,   0.,   4.,  -3.,   5.,  10.,  13.,   7.],
       [  1.,  -3.,  -7.,  -2.,  -8.,   0.,  -6.,  -5.,  -9.]])

In [13]:
# Shape check: rows are equations, columns are unknowns (bias column + x1..x8).
X_new.shape

(8, 9)

In [14]:
# Food for thought: adding an intercept introduces b as one more unknown.
w = np.linalg.inv(X_new.T.dot(X_new)).dot(X_new.T).dot(y_new)
w

# Note: this solution is still wrong. X_new has 8 rows and 9 columns, i.e. 9 unknowns but
# only 8 equations. X_new.T.dot(X_new) is then a 9x9 matrix of rank at most 8, so it is
# singular and the values returned by inv() are not meaningful.

array([-1388.2421875 ,     3.00732422,    14.62841797,   -14.55664062,
         -41.01953125,    -1.73413086,    16.4140625 ,    19.17218018,
          19.93164062])

In [15]:
# A unique solution needs one more equation (9 unknowns: b and x1..x8).
# Construct an arbitrary one that is consistent with the known true parameters.
w = np.array([ 1.,  5., 15.,  3.,  8.,  4., 17., 12.])  # true coefficients x1..x8
b = 64                                                   # true intercept
# Seeded generator: the extra equation is arbitrary, but identical on every run.
X9 = np.random.default_rng(9).integers(-10, 10, size = 8) # coefficients x1..x8 of equation 9
y9 = w.dot(X9) + b # target value of the ninth equation

# Merge the new target. Slicing to the first 8 entries keeps this cell idempotent:
# re-running it replaces the ninth target instead of appending a tenth one.
y_new = np.concatenate([y_new[:8],[y9]])
y_new # now 9 target values

array([ 403.,  -50.,   94.,  190., -331.,  -23.,  486., -245.,  192.])

In [20]:
# Ninth row of the augmented matrix: a leading 1 for the bias, followed by x1..x8.
row9 = np.concatenate([[1], X9])
display(row9, row9.shape)  # note: the row is 1-D

# Wrapped in a list, it becomes a one-row nested structure.
display([row9])

array([  1,   6,   4, -10,   4,   3,   5,   8,   5])

(9,)

[array([  1,   6,   4, -10,   4,   3,   5,   8,   5])]

In [23]:
# Append the ninth equation as the row [1, x1..x8]. The extra [] makes the 1-D row 2-D so it
# can be concatenated with the 2-D X_new. Slicing to the first 8 rows keeps the cell
# idempotent: re-running it replaces the ninth row instead of stacking another one.
X_new = np.concatenate([X_new[:8],[np.concatenate([[1], X9])]])
display(X_new, X_new.shape)

array([[  1.,   0.,  14.,   8.,   0.,   5.,  -2.,   9.,  -3.],
       [  1.,  -4.,  10.,   6.,   4., -14.,  -2., -14.,   8.],
       [  1.,  -1.,  -6.,   5., -12.,   3.,  -3.,   2.,  -2.],
       [  1.,   5.,  -2.,   3.,  10.,   5.,  11.,   4.,  -8.],
       [  1., -15., -15.,  -8., -15.,   7.,  -4., -12.,   2.],
       [  1.,  11., -10.,  -2.,   4.,   3.,  -9.,  -6.,   7.],
       [  1., -14.,   0.,   4.,  -3.,   5.,  10.,  13.,   7.],
       [  1.,  -3.,  -7.,  -2.,  -8.,   0.,  -6.,  -5.,  -9.],
       [  1.,   6.,   4., -10.,   4.,   3.,   5.,   8.,   5.]])

(9, 9)

In [24]:
# With the intercept as an extra unknown and nine equations, apply the normal equation
# to the augmented 9x9 system.
w = np.linalg.inv(X_new.T @ X_new) @ X_new.T @ y_new
w

array([64.,  1.,  5., 15.,  3.,  8.,  4., 17., 12.])

In [25]:
# X_new already contains a column of ones; fit an intercept on top of it and
# inspect the coefficients and the intercept.
model = LinearRegression(fit_intercept=True).fit(X_new, y_new)

display(model.coef_, model.intercept_)

array([ 0.,  1.,  5., 15.,  3.,  8.,  4., 17., 12.])

64.0

In [26]:
# No separate intercept is fitted here: the column of ones in X_new stands in for the bias,
# so its coefficient (the first entry of coef_) plays the role of b.
model = LinearRegression(fit_intercept=False).fit(X_new, y_new)

display(model.coef_, model.intercept_)

array([64.,  1.,  5., 15.,  3.,  8.,  4., 17., 12.])

0.0

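他们正在讨论：$(X^TX)^{-1}X^TY$ 为什么不能直接写成 $X^{-1}Y$？只有当 $X$ 是可逆方阵时，$(X^TX)^{-1}X^T = X^{-1}(X^T)^{-1}X^T = X^{-1}$，二者才相等；当方程个数与未知数个数不相等（$X$ 不是方阵）时，$X^{-1}$ 不存在，只能使用正规方程。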

$(X^TX)^{-1}X^TY$

$X^{-1}Y$

In [152]:
# Display the coefficient matrix X.
X

array([[  0,  14,   8,   0,   5,  -2,   9,  -3],
       [ -4,  10,   6,   4, -14,  -2, -14,   8],
       [ -1,  -6,   5, -12,   3,  -3,   2,  -2],
       [  5,  -2,   3,  10,   5,  11,   4,  -8],
       [-15, -15,  -8, -15,   7,  -4, -12,   2],
       [ 11, -10,  -2,   4,   3,  -9,  -6,   7],
       [-14,   0,   4,  -3,   5,  10,  13,   7],
       [ -3,  -7,  -2,  -8,   0,  -6,  -5,  -9]])

In [153]:
# Display the transpose of X (rows and columns swapped).
X.T

array([[  0,  -4,  -1,   5, -15,  11, -14,  -3],
       [ 14,  10,  -6,  -2, -15, -10,   0,  -7],
       [  8,   6,   5,   3,  -8,  -2,   4,  -2],
       [  0,   4, -12,  10, -15,   4,  -3,  -8],
       [  5, -14,   3,   5,   7,   3,   5,   0],
       [ -2,  -2,  -3,  11,  -4,  -9,  10,  -6],
       [  9, -14,   2,   4, -12,  -6,  13,  -5],
       [ -3,   8,  -2,  -8,   2,   7,   7,  -9]])

### 正规方程推导

In [158]:
# Random integer vector with 8 entries drawn from [0, 10), named theta.
# NOTE(review): no seed is set, so the values differ on every run.
theta = np.random.randint(0,10,size = 8)
theta

array([2, 5, 4, 8, 0, 3, 7, 0])

In [160]:
# Random 1-D integer vector with 8 entries drawn from [0, 10).
# NOTE(review): this rebinds X, which earlier held the 8x8 coefficient matrix.
X = np.random.randint(0,10,size = 8)
X

array([4, 1, 3, 9, 5, 4, 6, 2])

In [161]:
# Dot product of the two 1-D vectors: a single scalar.
y = X.dot(theta)
y

151

In [162]:
# Same product with the operands swapped: for 1-D vectors the dot product is commutative.
theta.dot(X)

151

### 线性回归-预测房价

In [1]:
from sklearn.linear_model import LinearRegression

from sklearn import datasets

from sklearn.model_selection import train_test_split

In [2]:
import warnings
warnings.filterwarnings('ignore')

加载波士顿房价【数据类型无所谓：癌症、糖尿病、身高】

In [None]:
# data = datasets.load_boston()  load_boston` has been removed from scikit-learn since version 1.2
# data

# In this special case, you can fetch the dataset from the original
# source::

#     import pandas as pd
#     import numpy as np

#     data_url = "http://lib.stat.cmu.edu/datasets/boston"
#     raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
#     data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
#     target = raw_df.values[1::2, 2]

In [4]:
import pandas as pd
import numpy as np

# Boston housing data fetched from the original source (load_boston is no longer in scikit-learn).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regex and must be a raw string: "\s" is not a valid Python string escape,
# so a plain "\s+" triggers an invalid-escape warning. skiprows=22 skips the lines before the data.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two consecutive rows of raw_df: even rows carry the leading feature
# columns, odd rows carry two more features followed by the target in the third column.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

In [11]:
# Labels for the 13 feature columns of `data`.
# NOTE(review): assumed to follow the column order of the source file — confirm.
feature_names = np.array(['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT'])

In [7]:
# Peek at the first two feature rows.
data[:2]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00]])

In [9]:
# Peek at the first five target values.
target[:5]

array([24. , 21.6, 34.7, 33.4, 36.2])

提取数据

In [12]:
# With the removed load_boston() bunch these were data['data'] and data['target'].
# X: features (the indicators that influence the house price); y: the house prices.
X, y = data, target
display(X.shape, y.shape, X[:5], y[:5])

(506, 13)

(506,)

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9283e+02, 4.0300e+00],
       [3.2370e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        6.9980e+00, 4.5800e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9463e+02, 2.9400e+00],
       [6.9050e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        7.1470e+00, 5.4200e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9690e+02, 5.3300e+00]])

array([24. , 21.6, 34.7, 33.4, 36.2])

#### 拆分数据

算法建模，数据结构：**特征数据必须是二维的**
- 第一维表示样本数量
- 第二维表示特征

In [13]:
# Hold out 50 samples for testing. A fixed random_state makes the split, and therefore
# every score computed below, reproducible across runs.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 50,random_state = 42)
display(X_train.shape,X_test.shape)

(456, 13)

(50, 13)

#### 建模和预测

In [14]:
# Ordinary least-squares linear model with an intercept term.
model = LinearRegression(fit_intercept=True)

# Train the model on the training split.
model.fit(X_train,y_train)

In [15]:
# Predict prices for the 50 held-out samples.
y_ = model.predict(X_test)

# Show the predictions (rounded to one decimal) followed by the true prices.
display('算法预测的房价值：', np.round(y_, 1), '真实房价是多少：', y_test)

'算法预测的房价值：'

array([18. , 27.4, 26.7, 38.6,  1. , 40.1, 22.2, 31.9, 13.1, 15.6, 19.8,
       37.3, 11.9, 20.7, 20.6, 29.1,  4.4, 29.8, 27.2, 13.9, 12.4, 21.4,
       24.5, 19. , 32.1, 36.9, 41.9, 23.2, 24.7, 27.1, 12.7, 35.9, 17.4,
       18.6, 27.9, 20. ,  8.5, 30. , 20.7, 23.3, 18.4, 11.8, 21.6, 13.6,
       19.1, 23.9, 18. , 15.7, 29.6, 35.5])

'真实房价是多少：'

array([14.1, 23.7, 24.8, 45.4, 17.9, 46. , 21.1, 31.1, 15.6, 10.4, 15.2,
       44. , 16.3, 18.7, 13.3, 24.3,  8.4, 30.1, 25.2, 13.3, 13.4, 21.2,
       24.7, 18.6, 27.5, 42.3, 50. , 33. , 23.9, 22. ,  7.5, 36.5, 14.9,
       15.4, 36.2, 19.1,  7.2, 25.1, 24.5, 24.4, 16. , 12. , 21.7,  8.3,
       16.4, 23.4, 14.9, 15.6, 23. , 33.4])

#### 模型评估

In [16]:
# Score of the model on the held-out test samples.
model.score(X_test,y_test)  # R^2

0.7949504625135702

In [14]:
# Even on the training data the score is not 1.
# The model does not memorise the samples: it extracts a rule (an equation) from the data,
# and not every sample fits that equation exactly.
model.score(X_train,y_train)

0.7398774710790202

In [11]:
from sklearn.metrics import mean_squared_error # 均方误差

In [12]:
# Mean squared error between true and predicted prices: the smaller, the better.
mean_squared_error(y_test,y_)

31.198469141420972

In [13]:
# Mean squared error computed by hand: the mean of the squared differences.
# It would be exactly 0 only for a perfect model whose predictions all equal
# the true values (y_test == y_).
((y_test - y_)**2).mean()

31.198469141420972

模型上线，模型部署