In [1]:
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series,DataFrame

# 視覺化函式庫
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# 機器學習函式庫
import sklearn


import requests, zipfile
import io

# 顯示到小數點後第3位
%precision 3

'%.3f'

- Lasso、Ridge迴歸有著避免讓迴歸係數變大的機制，迴歸係數越大越易造成過度學習

In [8]:
# Download the UCI automobile price data set.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
# A timeout keeps a stalled server from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response.raise_for_status()
res = response.content

# Parse the downloaded bytes into a DataFrame (the file has no header row).
auto = pd.read_csv(io.StringIO(res.decode('utf-8')),header=None)

# Assign the column labels.
auto.columns = ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors',
                'body-style','drive-wheels','engine-location','wheel-base','length','width','height',
                'curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore',
                'stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']

# Keep only the columns used below, turn the '?' placeholder into NaN and drop those rows.
auto = auto[['price','horsepower','width','height']]
auto = auto.replace('?',np.nan).dropna()
# Convert price and horsepower to numeric values after the '?' rows are gone.
auto = auto.assign(price = pd.to_numeric(auto.price),
                   horsepower = pd.to_numeric(auto.horsepower))

In [9]:
# Preview the first rows of the cleaned frame.
auto.head()

Unnamed: 0,price,horsepower,width,height
0,13495,111,64.1,48.8
1,16500,111,64.1,48.8
2,16500,154,65.5,52.4
3,13950,102,66.2,54.3
4,17450,115,66.4,54.3


In [10]:
from sklearn.model_selection import train_test_split

# Multiple linear regression model
from sklearn.linear_model import LinearRegression

# Ridge regression model
from sklearn.linear_model import Ridge

# price is the target variable; every other column is an explanatory variable.
X = auto.drop('price', axis = 1)
y = auto['price']

# Split into training and test data (half each, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .5, random_state = 0)

# Build the models to compare.
linear = LinearRegression()
ridge = Ridge(random_state=0)

# Fit each model and print its score on the training and the test split.
for model in [linear,ridge]:
    model.fit(X_train,y_train)
    print('{}(train):{:.6f}'.format(model.__class__.__name__,model.score(X_train,y_train)))
    print('{}(test):{:.6f}'.format(model.__class__.__name__,model.score(X_test,y_test)))

LinearRegression(trian):0.733358
LinearRegression(test):0.737069
Ridge(trian):0.733355
Ridge(test):0.737768


- 對於訓練資料，多元線性迴歸的準確度較高，對於測試資料則反之，可推測這是歸因於正則化項的效果。

#### 練習問題8-4

In [11]:
# Download the UCI automobile price data set.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
# A timeout keeps a stalled server from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response.raise_for_status()
res = response.content

# Parse the downloaded bytes into a DataFrame (the file has no header row).
auto = pd.read_csv(io.StringIO(res.decode('utf-8')),header=None)

# Assign the column labels.
auto.columns = ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors',
                'body-style','drive-wheels','engine-location','wheel-base','length','width','height',
                'curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore',
                'stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']

# Keep only the columns used in this exercise, turn the '?' placeholder into NaN and drop those rows.
auto = auto[['price','engine-size','width']]
auto = auto.replace('?',np.nan).dropna()
# Convert price to numeric values after the '?' rows are gone.
auto = auto.assign(price = pd.to_numeric(auto.price))

In [23]:
from sklearn.linear_model import Lasso

# Target and explanatory variables for this exercise.
y = auto['price']
X = auto[['width','engine-size']]

# Hold out half of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = .5, random_state = 0
)

# Candidates: plain linear regression plus Lasso with a small and a large alpha.
models = dict(
    linear = LinearRegression(),
    lasso1 = Lasso(alpha = .1, random_state=0),
    lasso2 = Lasso(alpha = 200, random_state=0),
)

# Fit every model and record its score per split, keyed by (model name, split).
scores = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    for split, (features, target) in {'train': (X_train, y_train),
                                      'test': (X_test, y_test)}.items():
        scores[(model_name, split)] = model.score(features, target)

# Pivot into a table with one row per model and one column per split.
pd.Series(scores).unstack()

Unnamed: 0,test,train
lasso1,0.77829,0.783189
lasso2,0.782421,0.782839
linear,0.778292,0.783189


In [15]:
# Display the cleaned frame used in this exercise.
auto


Unnamed: 0,price,engine-size,width
0,13495,130,64.1
1,16500,130,64.1
2,16500,152,65.5
3,13950,109,66.2
4,17450,136,66.4
...,...,...,...
200,16845,141,68.9
201,19045,141,68.8
202,21485,173,68.9
203,22470,145,68.9
