<h1>重回帰分析を使って自動車価格の予測をする</h1>
<p1>説明変数が複数あるデータを扱うのが重回帰<br>
    これによって各説明変数の係数が推定され予測値を計算できる.<br>
    回帰係数は予測値と目的変数の二乗誤差が最小になるように推定される</body>

In [1]:
import numpy as np 
import numpy.random as random 
import scipy as sp 
from pandas import Series,DataFrame 
import pandas as pd 

import matplotlib.pyplot as plt 
import matplotlib as mpl 
import seaborn as sns 
%matplotlib inline 

import sklearn 

%precision 3

'%.3f'

In [2]:
import requests,zipfile 
import io 

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
res = requests.get(url).content

auto = pd.read_csv(io.StringIO(res.decode('utf-8')),header = None)

auto.columns =['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors','body-style'
              ,'drive-wheels','engine-location','wheel-base','length','width','height',
              'curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio',
              'horsepower','peak-rpm','city-mpg','highway-mpg','price']

In [3]:
auto.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:
print(auto.shape)

(205, 26)


In [5]:
auto = auto[['price','horsepower','width','height']]
auto.isin(['?']).sum()

price         4
horsepower    2
width         0
height        0
dtype: int64

In [6]:
auto = auto.replace('?',np.nan).dropna()
print(auto.shape)

(199, 4)


In [7]:
auto.dtypes

price          object
horsepower     object
width         float64
height        float64
dtype: object

In [8]:
auto.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 204
Data columns (total 4 columns):
price         199 non-null object
horsepower    199 non-null object
width         199 non-null float64
height        199 non-null float64
dtypes: float64(2), object(2)
memory usage: 7.8+ KB


In [9]:
auto = auto.assign(price = pd.to_numeric(auto.price))
auto = auto.assign(horsepower = pd.to_numeric(auto.horsepower))
auto.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 204
Data columns (total 4 columns):
price         199 non-null int64
horsepower    199 non-null int64
width         199 non-null float64
height        199 non-null float64
dtypes: float64(2), int64(2)
memory usage: 7.8 KB


In [10]:
auto.corr()

Unnamed: 0,price,horsepower,width,height
price,1.0,0.810533,0.753871,0.13499
horsepower,0.810533,1.0,0.615315,-0.087407
width,0.753871,0.615315,1.0,0.309223
height,0.13499,-0.087407,0.309223,1.0


In [11]:
#データ分割のためのインポート
from sklearn.model_selection import train_test_split 

#重回帰のモデル構築のためのインポート
from sklearn.linear_model import LinearRegression 

#目的変数　今回はpriceを設定
y=auto['price']

#説明変数にそれ以外を設定
X =auto.drop('price',axis = 1)

#訓練データとテストデータに分ける
X_train,X_test,y_train,y_test =train_test_split(X,y,test_size = 0.5,random_state =0)

#重回帰クラスの初期化と学習
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [12]:
#決定係数の表示
print('決定係数(train):{:.3f}'.format(model.score(X_train,y_train)))
print('決定係数(test):{:.3f}'.format(model.score(X_test,y_test)))

決定係数(train):0.733
決定係数(test):0.737


In [13]:
#回帰係数
pd.Series(model.coef_,index=X.columns)

horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64

In [14]:
#切片の表示
model.intercept_

-128409.04630338575

<h3>次に,目的変数をprice,説明変数をlengthとengine-sizeにする</h3>

In [17]:
data = pd.read_csv(io.StringIO(res.decode('utf-8')),header = None)

data.columns =['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors','body-style'
              ,'drive-wheels','engine-location','wheel-base','length','width','height',
              'curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio',
              'horsepower','peak-rpm','city-mpg','highway-mpg','price']

In [18]:
data= data[['price','length','engine-size']]
data.isin(['?']).sum()

price          4
length         0
engine-size    0
dtype: int64

In [19]:
data = data.replace('?',np.nan).dropna()
print(data.shape)

(201, 3)


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 3 columns):
price          201 non-null object
length         201 non-null float64
engine-size    201 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.3+ KB


In [22]:
data = data.assign(price = pd.to_numeric(auto.price))
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 3 columns):
price          199 non-null float64
length         201 non-null float64
engine-size    201 non-null int64
dtypes: float64(2), int64(1)
memory usage: 6.3 KB


In [30]:
#dataに一つでも欠損値が含まれているのかを返すメソッド
print(data.isnull().any())

price           True
length         False
engine-size    False
dtype: bool


In [27]:


#目的変数　今回はpriceを設定
y=data['price']

#説明変数にそれ以外を設定
X =data.drop('price',axis = 1)

#訓練データとテストデータに分ける
X_train,X_test,y_train,y_test =train_test_split(X,y,test_size = 0.5,random_state =0)

#重回帰クラスの初期化と学習
model = LinearRegression()
model.fit(X_train,y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [24]:
#決定係数の表示
print('決定係数(train):{:.3f}'.format(model.score(X_train,y_train)))
print('決定係数(test):{:.3f}'.format(model.score(X_test,y_test)))

決定係数(train):0.733
決定係数(test):0.737


In [25]:
#回帰係数
pd.Series(model.coef_,index=X.columns)

horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64

In [26]:
#切片の表示
model.intercept_

-128409.04630338575