In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import Ridge
from sklearn.metrics import explained_variance_score as EVS
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [6]:
# Read the experiment records, then separate the three predictor columns
# from the response column.
data = pd.read_csv('data.csv')
x = data.loc[:, ['给料量', '转速', '温度']]
y = data.loc[:, '挥发份']

In [7]:
# Preview the first five rows of the predictor table.
x.head()

Unnamed: 0,给料量,转速,温度
0,1.0,70,650
1,1.0,80,750
2,1.0,90,850
3,1.2,70,650
4,1.2,80,750


In [8]:
# Preview the first five values of the response series.
y.head()

0     9.2
1     8.8
2     7.4
3    11.8
4    10.2
Name: 挥发份, dtype: float64

In [9]:
# Hold out 10% of the rows for testing; random_state=1 pins the shuffle.
# NOTE(review): the recorded training shape is (8, 3), so this split leaves only
# a single row in the test set -- too few for R^2 to be defined.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=1)
x_train.shape

(8, 3)

In [12]:
# Display the held-out test rows of the linear-model split.
x_test

Unnamed: 0,给料量,转速,温度
8,1.5,90,850


In [10]:
# Fit an ordinary least-squares model on the training rows, then predict the
# held-out rows and display the predictions.
reg = LR()
reg.fit(x_train, y_train)
yhat = reg.predict(x_test)
yhat

array([10.65081967])

In [11]:
# Show the fitted coefficients (one per predictor column) and the intercept,
# each on its own line.
# Recorded fit: y = 10.77 + 5.825*x1 + (-1.031e-03)*x2 + (-1.031e-02)*x3,
# where x1, x2, x3 are the columns '给料量', '转速', '温度' in that order.
print(reg.coef_, reg.intercept_, sep='\n')

[ 5.82513661e+00 -1.03121788e-03 -1.03121788e-02]
10.771276307958665


In [13]:
# Mean squared error of the linear model on the held-out rows.
MSE(y_test,yhat)

1.1042219833378093

In [14]:
# Relative RMSE: square root of the test MSE divided by the mean of the test targets.
np.sqrt(MSE(y_test,yhat))/y_test.mean()

0.10946038251366105

In [15]:
# Per-fold MSE from 3-fold cross-validation on the full data set.
# The 'neg_mean_squared_error' scorer yields negated MSE, hence the leading minus.
-cross_val_score(reg,x,y,cv=3,scoring='neg_mean_squared_error')

array([2.62572531, 0.90708333, 5.61166667])

In [16]:
# Mean R^2 across the 3 cross-validation folds.
cross_val_score(reg,x,y,cv=3,scoring='r2').mean()

-1.9408090398952502

In [17]:
# R^2 of the linear model on the held-out rows.
# NOTE(review): the recorded result is nan. The test split displayed earlier holds
# a single row, and R^2 is not defined for fewer than two samples; a larger
# test_size (or relying on the cross-validated scores) would give a usable value.
r2_score(y_test,yhat)

nan

## 多元非线性回归
PolynomialFeatures(degree=2,interaction_only=False,include_bias=False)

参数说明：
degree=2，表示多项式的最高次数为2，即生成不超过二次的项（如 a^2、a*b）

interaction_only=False，表示是否只生成交互项（如 a*b，而不生成 a^2、b^2 等平方项），默认为False

include_bias=False，表示是否添加一列全部等于1的偏置项（sklearn 默认为 True，此处设为 False）

e.g. PolynomialFeatures(degree=2,interaction_only=False,include_bias=False) 就是对特征数据进行多项式转化，当特征为a与b时，相当于多出来了3个特征，即a^2,a*b,b^2

In [18]:
# PolynomialFeatures is imported in the notebook's top import cell so that all
# dependencies are declared in one place.

# Expand the raw predictors into every polynomial term up to degree 2 (the
# original columns, their squares and their pairwise products).
# include_bias=False leaves out the constant all-ones column.
po = PolynomialFeatures(degree=2,interaction_only=False,include_bias=False)
x_poly = po.fit_transform(x)
# Preview the expanded matrix; its columns are still unnamed at this point.
pd.DataFrame(x_poly).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,70.0,650.0,1.0,70.0,650.0,4900.0,45500.0,422500.0
1,1.0,80.0,750.0,1.0,80.0,750.0,6400.0,60000.0,562500.0
2,1.0,90.0,850.0,1.0,90.0,850.0,8100.0,76500.0,722500.0
3,1.2,70.0,650.0,1.44,84.0,780.0,4900.0,45500.0,422500.0
4,1.2,80.0,750.0,1.44,96.0,900.0,6400.0,60000.0,562500.0


In [19]:
# Attach readable labels to the nine expanded columns: the three original
# predictors followed by their squares and pairwise products, in the order
# shown in the unnamed preview.
poly_column_names = [
    '给料量', '转速', '温度',
    '给料量^2', '给料量_转速', '给料量_温度',
    '转速^2', '转速_温度',
    '温度^2',
]
x_poly = pd.DataFrame(x_poly, columns=poly_column_names)
x_poly.head()

Unnamed: 0,给料量,转速,温度,给料量^2,给料量_转速,给料量_温度,转速^2,转速_温度,温度^2
0,1.0,70.0,650.0,1.0,70.0,650.0,4900.0,45500.0,422500.0
1,1.0,80.0,750.0,1.0,80.0,750.0,6400.0,60000.0,562500.0
2,1.0,90.0,850.0,1.0,90.0,850.0,8100.0,76500.0,722500.0
3,1.2,70.0,650.0,1.44,84.0,780.0,4900.0,45500.0,422500.0
4,1.2,80.0,750.0,1.44,96.0,900.0,6400.0,60000.0,562500.0


In [24]:
# Split the polynomial feature table, holding out 20% of the rows for testing;
# random_state=1 pins the shuffle.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x_poly, y, test_size=0.2, random_state=1
)
# Inspect the training set produced by THIS split. The original cell looked at
# x_train (the 3-column linear split), which cannot yield the recorded (7, 9).
x_train2.shape

(7, 9)

In [25]:
# Display the held-out test rows of the polynomial-feature split.
x_test2

Unnamed: 0,给料量,转速,温度,给料量^2,给料量_转速,给料量_温度,转速^2,转速_温度,温度^2
8,1.5,90.0,850.0,2.25,135.0,1275.0,8100.0,76500.0,722500.0
2,1.0,90.0,850.0,1.0,90.0,850.0,8100.0,76500.0,722500.0


In [26]:
# Fit ordinary least squares on the polynomial training features, then predict
# the held-out rows and display the predictions.
reg2 = LR()
reg2.fit(x_train2, y_train2)
yhat2 = reg2.predict(x_test2)
yhat2

array([9.30526316, 8.06315789])

In [27]:
# Show the fitted coefficients (one per polynomial feature column) and the
# intercept, each on its own line.
print(reg2.coef_, reg2.intercept_, sep='\n')

[ 2.18560068e+00  3.36332733e-05  3.36332714e-04 -1.40000000e+01
  1.08176167e+01 -1.10386693e+00  3.34678858e-04  1.66512502e-03
 -1.65385606e-04]
-26.311824611538


In [28]:
# Relative RMSE of the polynomial model: square root of the test MSE divided by
# the mean of the test targets.
np.sqrt(MSE(y_test2,yhat2))/y_test2.mean()

0.060370723186499504

In [29]:
# Mean R^2 of the polynomial model across 3 cross-validation folds.
cross_val_score(reg2,x_poly,y,cv=3,scoring='r2').mean()

-2.3171465033273946

In [30]:
# R^2 of the polynomial model on its held-out rows.
r2_score(y_test2,yhat2)

0.7823767770620631