In [2]:
# 引入相关科学计算包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use("ggplot")      
import seaborn as sns

In [3]:
from sklearn import datasets

# Predictor names, in the column order exposed by load_boston().feature_names.
BOSTON_FEATURE_NAMES = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
]

try:
    boston = datasets.load_boston()     # dict-like Bunch (data, target, feature_names, ...)
except (ImportError, AttributeError):
    # load_boston was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Rebuild the same arrays from the original StatLib file, following the
    # recipe printed in scikit-learn's deprecation notice (needs network access).
    from sklearn.utils import Bunch
    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep=r"\s+", skiprows=22, header=None)
    # Every record is split over two physical lines: 11 values on the first,
    # then 2 more features followed by the target on the second.
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(BOSTON_FEATURE_NAMES),
    )

X = boston.data                  # feature matrix
y = boston.target                # house prices (regression target)
features = boston.feature_names
# Keep features and target together in one frame for the formula-based models below.
boston_data = pd.DataFrame(X, columns=features)
boston_data["Price"] = y
boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [4]:
from sklearn import linear_model      # module that provides LinearRegression

# Fit ordinary least squares on the full data set, then report the fitted
# coefficients and the training-set coefficient of determination (R^2).
lin_reg = linear_model.LinearRegression()
lin_reg.fit(X, y)
coefficients = lin_reg.coef_
r_squared = lin_reg.score(X, y)
print("模型系数：", coefficients)
print("模型得分：", r_squared)

模型系数： [-1.08011358e-01  4.64204584e-02  2.05586264e-02  2.68673382e+00
 -1.77666112e+01  3.80986521e+00  6.92224640e-04 -1.47556685e+00
  3.06049479e-01 -1.23345939e-02 -9.52747232e-01  9.31168327e-03
 -5.24758378e-01]
模型得分： 0.7406426641094094


#### 广义可加模型（GAM）
广义可加模型GAM实际上是线性模型推广至非线性模型的一个框架，在这个框架中，每一个变量都用一个非线性函数来代替，但是模型本身保持整体可加性。GAM模型不仅仅可以用在线性回归的推广，还可以将线性分类模型进行推广。具体的推广形式是：

标准的线性回归模型：

$$y_i = w_0 + w_1 x_{i1} + \cdots + w_p x_{ip} + b_i$$

GAM模型框架：

$$y_i = w_0 + \sum_{j=1}^p f_j(x_{ij}) + b_i$$

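和多项式回归模型的区别在于：多项式回归用固定的多项式基（$x, x^2, \dots$，还可以包含特征之间的交叉项）来刻画非线性；而GAM对每个特征单独拟合一个光滑的非线性函数 $f_j$（通常用样条等基函数表示），再把各特征的贡献直接相加，默认不包含特征之间的交互项。具体内容后续查找学习后补充。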

In [6]:
from pygam import LinearGAM

# Select the predictor columns by name so the Price column is excluded.
gam_inputs = boston_data[boston.feature_names]
gam = LinearGAM().fit(gam_inputs, y)
gam.summary()

LinearGAM                                                                                                 
Distribution:                        NormalDist Effective DoF:                                    103.2423
Link Function:                     IdentityLink Log Likelihood:                                 -1589.7653
Number of Samples:                          506 AIC:                                             3388.0152
                                                AICc:                                            3442.7649
                                                GCV:                                               13.7683
                                                Scale:                                              8.8269
                                                Pseudo R-Squared:                                   0.9168
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
s(0)                              [0.

 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  This is separate from the ipykernel package so we can avoid doing imports until


#### 多项式回归

In [5]:
from sklearn.preprocessing import PolynomialFeatures

# Small 3x2 toy matrix so the generated columns are easy to read.
X_arr = np.arange(6).reshape(3, 2)
print("原始X为：\n", X_arr)

# Full degree-2 expansion: bias, x1, x2, x1^2, x1*x2, x2^2.
poly = PolynomialFeatures(degree=2)
X_degree2 = poly.fit_transform(X_arr)
print("2次转化X：\n", X_degree2)

# Interaction terms only: bias, x1, x2, x1*x2 (no pure powers).
poly = PolynomialFeatures(interaction_only=True)
X_interactions = poly.fit_transform(X_arr)
print("2次转化X：\n", X_interactions)

原始X为：
 [[0 1]
 [2 3]
 [4 5]]
2次转化X：
 [[ 1.  0.  1.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.]
 [ 1.  4.  5. 16. 20. 25.]]
2次转化X：
 [[ 1.  0.  1.  0.]
 [ 1.  2.  3.  6.]
 [ 1.  4.  5. 20.]]


#### 回归树

In [7]:
from sklearn.tree import DecisionTreeRegressor

# Squared error is the regressor's default split criterion, so it is left
# implicit: the old spelling criterion="mse" was deprecated in scikit-learn 1.0
# and removed in 1.2, while the new spelling "squared_error" is rejected by
# releases before 1.0. Omitting the argument behaves the same on all of them.
reg_tree = DecisionTreeRegressor(min_samples_leaf=5)   # every leaf keeps at least 5 samples
reg_tree.fit(X, y)
reg_tree.score(X, y)    # R^2 on the training data

0.9376307599929274

#### SVM

SVM的推导具体可以参考李航老师的《统计学习方法》和刘建平老师的博客

学习SVM时建议学习下SMO算法，加速SVM的计算过程

In [8]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler     # feature standardisation
from sklearn.pipeline import make_pipeline   # chains preprocessing and the model into one estimator

# Standardise the features first, then fit the support vector regressor.
scaler = StandardScaler()
svr = SVR(C=1.0, epsilon=0.2)
reg_svr = make_pipeline(scaler, svr)
reg_svr.fit(X, y)
reg_svr.score(X, y)

0.7024525421955277

#### 特征提取：向前逐步回归

In [10]:
#定义向前逐步回归函数
def forward_select(data,target):
    """Forward stepwise selection of OLS predictors, scored by AIC.

    Starting from no predictors, each round fits one model per remaining
    column (current selection plus that column) and keeps the column whose
    model has the lowest AIC. Selection stops when no remaining column
    lowers the AIC, or when every column has been added.

    Relies on a module-level ``ols(formula=..., data=...)`` callable (the
    notebook imports it from ``statsmodels.formula.api``); it must be in
    scope before this function is called.

    Parameters
    ----------
    data : object exposing ``.columns``; passed unchanged to ``ols``
        Holds the target column and all candidate predictors. Column names
        are pasted into the formula string as-is.
    target : str
        Name of the response column.

    Returns
    -------
    The fitted result returned by ``ols(...).fit()`` for the final formula.
    If no predictor is selected, the intercept-only model ``target~1``.

    Raises
    ------
    KeyError
        If ``target`` is not one of ``data.columns``.
    """
    remaining = set(data.columns)   # candidate predictors (a set, so removal is cheap)
    remaining.remove(target)        # the response is never a candidate
    selected = []
    current_score = float('inf')    # AIC: lower is better, so start from +infinity

    while remaining:
        # Score every remaining column when added to the current selection.
        scored = []
        for candidate in remaining:
            formula = "{}~{}".format(target, "+".join(selected + [candidate]))
            aic = ols(formula=formula, data=data).fit().aic
            scored.append((aic, candidate))
        # Lowest AIC wins; ties fall back to the tuple's second item (the name).
        best_new_score, best_candidate = min(scored)
        if current_score > best_new_score:
            remaining.remove(best_candidate)   # not considered again in later rounds
            selected.append(best_candidate)
            current_score = best_new_score
            print("aic is {},continuing!".format(current_score))
        else:
            print("for selection over!")
            break

    # An empty right-hand side is not a valid formula, so fall back to the
    # intercept-only model when nothing was selected.
    rhs = "+".join(selected) if selected else "1"
    formula = "{}~{}".format(target, rhs)
    print("final formula is {}".format(formula))
    return ols(formula=formula, data=data).fit()

In [12]:
import statsmodels.api as sm   # statsmodels main API
from statsmodels.formula.api import ols   # formula-based OLS, looked up by forward_select at call time

# Run the stepwise search with "Price" as the response column.
forward_select(boston_data, "Price")

aic is 3286.974956900157,continuing!
aic is 3171.5423142992013,continuing!
aic is 3114.0972674193326,continuing!
aic is 3097.359044862759,continuing!
aic is 3069.438633167217,continuing!
aic is 3057.9390497191152,continuing!
aic is 3048.438382711162,continuing!
aic is 3042.274993098419,continuing!
aic is 3040.154562175143,continuing!
aic is 3032.0687017003256,continuing!
aic is 3021.726387825062,continuing!
for selection over!
final formula is Price~LSTAT+RM+PTRATIO+DIS+NOX+CHAS+B+ZN+CRIM+RAD+TAX


<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1e18dc6a888>

In [13]:
# Refit the formula chosen by the forward selection to inspect the full summary.
selected_formula = "Price~LSTAT+RM+PTRATIO+DIS+NOX+CHAS+B+ZN+CRIM+RAD+TAX"
lm = ols(formula=selected_formula, data=boston_data).fit()
lm.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.735
Method:,Least Squares,F-statistic:,128.2
Date:,"Thu, 18 Mar 2021",Prob (F-statistic):,5.54e-137
Time:,21:57:26,Log-Likelihood:,-1498.9
No. Observations:,506,AIC:,3022.0
Df Residuals:,494,BIC:,3072.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,36.3411,5.067,7.171,0.000,26.385,46.298
LSTAT,-0.5226,0.047,-11.019,0.000,-0.616,-0.429
RM,3.8016,0.406,9.356,0.000,3.003,4.600
PTRATIO,-0.9465,0.129,-7.334,0.000,-1.200,-0.693
DIS,-1.4927,0.186,-8.037,0.000,-1.858,-1.128
NOX,-17.3760,3.535,-4.915,0.000,-24.322,-10.430
CHAS,2.7187,0.854,3.183,0.002,1.040,4.397
B,0.0093,0.003,3.475,0.001,0.004,0.015
ZN,0.0458,0.014,3.390,0.001,0.019,0.072

0,1,2,3
Omnibus:,178.43,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,787.785
Skew:,1.523,Prob(JB):,8.6e-172
Kurtosis:,8.3,Cond. No.,14700.0


#### 岭回归

岭回归在最小二乘损失的基础上加入系数的L2惩罚项 $\alpha \|w\|_2^2$，通过收缩系数来缓解多重共线性并降低模型方差；$\alpha$ 越大，收缩越强。

In [15]:
from sklearn import linear_model

# Ridge regression; alpha is the strength of the L2 penalty.
ridge_alpha = 0.5
reg_rid = linear_model.Ridge(alpha=ridge_alpha)
reg_rid.fit(X, y)
reg_rid.score(X, y)

0.739957023371629

#### Lasso回归

In [16]:
from sklearn import linear_model

# Lasso regression; alpha is the strength of the L1 penalty.
lasso_alpha = 0.5
reg_lasso = linear_model.Lasso(alpha=lasso_alpha)
reg_lasso.fit(X, y)
reg_lasso.score(X, y)

0.7140164719858566

#### SVR 结合管道进行调优

In [17]:
# Baseline: score an untuned SVR before searching hyper-parameters.
from sklearn.svm import SVR     # support vector regressor
from sklearn.pipeline import make_pipeline   # chains the steps into one estimator
from sklearn.preprocessing import StandardScaler # SVR is distance based, so standardise the features
from sklearn.model_selection import GridSearchCV  # grid search (used by the next cell)
from sklearn.model_selection import cross_val_score # K-fold cross-validation
from sklearn import datasets

# X and y are the arrays produced by the data-loading cell at the top of the
# notebook. They are reused here instead of calling load_boston() a second
# time: the values are identical, and load_boston is deprecated since
# scikit-learn 1.0 and removed in 1.2.
pipe_SVR = make_pipeline(StandardScaler(), SVR())

# An integer cv with a regressor means plain KFold without shuffling, so the
# folds follow the row order of the data set. NOTE(review): this may explain
# the large fold-to-fold spread in the score reported below.
score1 = cross_val_score(estimator=pipe_SVR,
                         X=X,
                         y=y,
                         scoring='r2',
                         cv=10)       # 10-fold cross-validation
print("CV accuracy: %.3f +/- %.3f" % ((np.mean(score1)),np.std(score1)))

CV accuracy: 0.187 +/- 0.649


In [18]:
# Tune the SVR with an exhaustive grid search.
from sklearn.pipeline import Pipeline

pipeline_steps = [
    ("StandardScaler", StandardScaler()),
    ("svr", SVR()),
]
pipe_svr = Pipeline(pipeline_steps)

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Keys are "<step name>__<parameter>" with a DOUBLE underscore; a single
# underscore raises an error.
linear_grid = {"svr__C": param_range, "svr__kernel": ["linear"]}
rbf_grid = {"svr__C": param_range, "svr__gamma": param_range, "svr__kernel": ["rbf"]}
param_grid = [linear_grid, rbf_grid]

gs = GridSearchCV(
    estimator=pipe_svr,
    param_grid=param_grid,
    scoring='r2',
    cv=10,       # 10-fold cross-validation
)
gs = gs.fit(X, y)
print("网格搜索最优得分：", gs.best_score_)
print("网格搜索最优参数组合：\n", gs.best_params_)

网格搜索最优得分： 0.6081303070817479
网格搜索最优参数组合：
 {'svr__C': 1000.0, 'svr__gamma': 0.001, 'svr__kernel': 'rbf'}


In [19]:
# Tune the SVR with a randomized search.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform  # continuous uniform distribution for sampling parameters

pipe_svr = Pipeline([("StandardScaler", StandardScaler()),
                     ("svr", SVR())])
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
distributions = dict(svr__C=uniform(loc=1.0, scale=4),        # continuous: C in [1, 5]
                     svr__kernel=["linear", "rbf"],           # discrete set of choices
                     svr__gamma=uniform(loc=0, scale=4))      # continuous: gamma in [0, 4]

# random_state fixes which parameter combinations are sampled, so the best
# score and parameters printed below are the same on every run. The number of
# sampled candidates is left at the library default (n_iter is not set).
rs = RandomizedSearchCV(estimator=pipe_svr,
                        param_distributions=distributions,
                        scoring='r2',
                        cv=10,       # 10-fold cross-validation
                        random_state=42)
rs = rs.fit(X, y)
print("随机搜索最优得分：",rs.best_score_)
print("随机搜索最优参数组合：\n",rs.best_params_)

随机搜索最优得分： 0.298837644097963
随机搜索最优参数组合：
 {'svr__C': 4.312255903485075, 'svr__gamma': 0.8691126169055075, 'svr__kernel': 'linear'}
