# 作业4-线性回归练习
### 红酒质量预测

- winequality-red是一个关于红酒质量的数据集， 该数据集包括1599条红酒数据。
- 文件：winequality-red.csv
- 预测的目标是quality字段。
- 特征值总共11项, 如下:
    1. fixed acidity
    2. volatile acidity
    3. citric acid
    4. residual sugar
    5. chlorides
    6. free sulfur dioxide
    7. total sulfur dioxide
    8. density
    9. pH
    10. sulphates
    11. alcohol

### 作业要求：导出为：作业4-班级-学号-姓名.pdf
1. 参考课堂demo，新建一个ipynb文件。
2. 请完成数据加载和数据预处理，需要自己将数据拆分为训练集、测试集。
3. 数据标准化
4. 使用线性回归模型进行预测。
5. 查看R2决定系数和MSE指标
6. 保存模型和加载模型
7. 个人总结和问题

### (1) 数据加载

In [30]:
# (1) 数据加载
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号


data = pd.read_csv('./data/winequality-red.csv', encoding='gbk')
data.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [31]:
# 查看数据基本情况，数据类型和缺失
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [32]:
# 数据描述性指标
data.describe()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


### (2) 拆分特征和目标

In [49]:
# (2) 拆分特征和目标
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

dataset = load_wine()
X = np.array([data['fixed acidity'], data['volatile acidity'], data['citric acid'], data['residual sugar'], data['chlorides'], data['free sulfur dioxide'], data['total sulfur dioxide'], data['density'], data['pH'], data['sulphates'], data['alcohol']]).reshape(1599, 11)
y = np.array(data['quality'])

X.shape, y.shape

((1599, 11), (1599,))

### （3）拆分训练集和测试集

In [52]:
# （3）拆分训练集和测试集 0.25
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=666 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1199, 11), (400, 11), (1199,), (400,))

### (4) 特征数据标准化 

In [57]:
# (4) 特征数据标准化 
from sklearn.preprocessing import StandardScaler  # z-score标准化

sc = StandardScaler() # 初始化
sc.fit(X_train) # 训练 求解X_train的均值和标准差
X_train_std = sc.transform(X_train) # transform 数据转换
X_test_std = sc.transform(X_test)

In [58]:
# 最后，处理完数据，再确认一遍数据的维度

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1199, 11), (400, 11), (1199,), (400,))

### (5) 构建模型-线性回归

In [61]:
# (5) 构建模型-线性回归

from sklearn.linear_model import LinearRegression
lr2 = LinearRegression()

lr2.fit(X_train_std, y_train)

print("在训练集训练得到参数组合为：\n其中w:\n{}\nb：\n{}".format(lr2.coef_, lr2.intercept_))

在训练集训练得到参数组合为：
其中w:
[-0.06082279  0.05100664  0.12121216 -0.00871569 -0.02731451  0.01852696
 -0.05788926 -0.00299838  0.00145419  0.02772895  0.04966823]
b：
5.619683069224354


### （6）模型评价

In [62]:
# （6）模型评价
from sklearn.metrics import mean_squared_error #MSE均方差错误
from sklearn.metrics import r2_score # R2系数

y_train_pred = lr2.predict(X_train_std) # 评价训练集


print("在训练集上的决定系数r2得分为：", r2_score(y_train, y_train_pred))
print("在训练集上的均方误差MSE为：", mean_squared_error(y_train, y_train_pred))

在训练集上的决定系数r2得分为： 0.023760351941806923
在训练集上的均方误差MSE为： 0.6485809556493713


In [63]:
# 评价测试集
y_test_pred = lr2.predict(X_test_std)


print("在训练集上的决定系数r2得分为：", r2_score(y_test, y_test_pred))
print("在训练集上的均方误差MSE为：", mean_squared_error(y_test, y_test_pred))


在训练集上的决定系数r2得分为： -0.04621141225054326
在训练集上的均方误差MSE为： 0.6389997753173255


### （7） 保存模型

In [65]:
import joblib
joblib.dump(lr2, 'boston_linear_reg.pkl')
print('保存成功')

保存成功
