# 岭回归和Lasso回归预测糖尿病发生
## 岭回归

In [1]:
# 导入包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import Ridge,RidgeCV

In [2]:
# Load the diabetes dataset from a local Excel workbook and preview the first rows.
# NOTE(review): the location below is an absolute, machine-specific path; point
# DATA_PATH at your own copy of the workbook when running elsewhere.
DATA_PATH = r'/Users/jiabaohuang/python/data/diabetes.xlsx'
diabetes = pd.read_excel(DATA_PATH)
diabetes.head()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [3]:
# Select the predictors by position: skip the first two columns and the last one.
# Per the original note this is meant to exclude age, sex and the response Y.
# NOTE(review): the slice is positional, so it depends on the column order of the
# loaded frame. The head() preview lists a leading 'Unnamed: 0' header; confirm
# whether that is a real column, because if it is, this slice keeps SEX.
predictors = diabetes.columns[2:-1]
feature_frame = diabetes[predictors]
response = diabetes['Y']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    feature_frame,
    response,
    test_size=0.2,
    random_state=1234,
)

In [4]:
# Scaling now lives in an explicit pipeline step (see the note on `normalize` below).
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Candidate penalty strengths: 200 values log-spaced between 1e-5 and 1e2.
Lambdas = np.logspace(-5, 2, 200)

# Ridge regression with 10-fold cross-validation over every candidate Lambda,
# scored by negative mean squared error.
#
# The `normalize=True` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises a TypeError on current versions. The documented
# replacement is a StandardScaler(with_mean=False) step in front of the model,
# with the ridge alphas multiplied by the number of training samples so that the
# penalty keeps the same relative strength.
ridge_alpha_scale = len(x_train)
ridge_cv = make_pipeline(
    StandardScaler(with_mean=False),
    RidgeCV(
        alphas=Lambdas * ridge_alpha_scale,
        scoring='neg_mean_squared_error',
        cv=10,
    ),
)
# Fit the scaler and the cross-validated ridge model on the training data.
ridge_cv.fit(x_train, y_train)

# Report the selected penalty on the original Lambda scale (undo the n_samples
# factor) so the value stays comparable with the `Lambdas` grid.
# NOTE: the scaler is fitted once on the full training set rather than inside each
# fold, so the selected value can differ slightly from runs that used `normalize=True`.
ridge_best_Lambda = ridge_cv[-1].alpha_ / ridge_alpha_scale
ridge_best_Lambda

0.014649713983072863

In [5]:
# Imports: the error metric, plus the pipeline pieces that replace `normalize=True`.
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Refit ridge regression with the best Lambda found by cross-validation.
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# documented replacement is StandardScaler(with_mean=False) in a pipeline with
# alpha multiplied by the number of training samples.
ridge = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=ridge_best_Lambda * len(x_train)),
)
ridge.fit(x_train, y_train)

# Ridge coefficients, kept in a variable so they can be inspected (the original
# built this Series mid-cell and discarded it). The model is fitted on features
# divided by `scale_`, so dividing the coefficients by the same factor expresses
# them per unit of the original features; the intercept is unaffected because
# the scaler does not shift the data.
ridge_coefficients = pd.Series(
    index=['Intercept'] + x_train.columns.tolist(),
    data=[ridge[-1].intercept_] + (ridge[-1].coef_ / ridge[0].scale_).tolist(),
)

# Predict on the held-out test set.
ridge_predict = ridge.predict(x_test)
# Evaluate with root mean squared error.
RMSE = np.sqrt(mean_squared_error(y_test, ridge_predict))
RMSE

53.11911788753519

## Lasso回归

In [6]:
# Imports: the lasso estimators, plus the pipeline pieces that replace `normalize=True`.
from sklearn.linear_model import Lasso, LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Lasso regression with 10-fold cross-validation over the same Lambda grid.
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# passing it raises a TypeError on current versions. For lasso the documented
# replacement is StandardScaler(with_mean=False) in a pipeline, with the alphas
# multiplied by sqrt(n_samples).
lasso_alpha_scale = np.sqrt(len(x_train))
lasso_cv = make_pipeline(
    StandardScaler(with_mean=False),
    LassoCV(alphas=Lambdas * lasso_alpha_scale, cv=10, max_iter=10000),
)
lasso_cv.fit(x_train, y_train)

# Report the selected penalty on the original Lambda scale (undo the sqrt(n) factor).
# NOTE: the scaler is fitted once on the full training set rather than inside each
# fold, so the selected value can differ slightly from runs that used `normalize=True`.
lasso_best_Lambda = lasso_cv[-1].alpha_ / lasso_alpha_scale
lasso_best_Lambda

0.06294988990221888

In [7]:
# Pipeline pieces that replace the removed `normalize=True` argument.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Refit lasso regression with the best Lambda found by cross-validation.
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# documented replacement is StandardScaler(with_mean=False) in a pipeline with
# alpha multiplied by sqrt(n_samples).
lasso = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=lasso_best_Lambda * np.sqrt(len(x_train)), max_iter=10000),
)
lasso.fit(x_train, y_train)

# Lasso coefficients, kept in a variable so they can be inspected (the original
# built this Series mid-cell and discarded it). Dividing by the scaler's `scale_`
# expresses each coefficient per unit of the original feature; the intercept is
# unaffected because the scaler does not shift the data.
lasso_coefficients = pd.Series(
    index=['Intercept'] + x_train.columns.tolist(),
    data=[lasso[-1].intercept_] + (lasso[-1].coef_ / lasso[0].scale_).tolist(),
)

# Predict on the held-out test set.
lasso_predict = lasso.predict(x_test)
# Evaluate with root mean squared error.
RMSE1 = np.sqrt(mean_squared_error(y_test, lasso_predict))
RMSE1

53.061437258225745

## 线性回归预测(多元)

In [8]:
import statsmodels.api as sms

# Add a constant column of ones to both feature sets so the model can fit an intercept.
x_train2 = sms.add_constant(x_train)
x_test2 = sms.add_constant(x_test)

# Multiple linear regression by ordinary least squares on the training data.
ols_model = sms.OLS(y_train, x_train2)
linear = ols_model.fit()
# Fitted coefficients (only a cell's last expression is displayed, so this line
# shows nothing unless run on its own).
linear.params

# Predict on the held-out test set and measure the root mean squared error.
linear_predict = linear.predict(x_test2)
linear_mse = mean_squared_error(y_test, linear_predict)
RMSE2 = np.sqrt(linear_mse)
RMSE2

53.42623939722987