<a href="https://colab.research.google.com/github/KaitoAMANO/EU_M_Math/blob/main/Chap08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import DataFrame, Series
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import sklearn


In [14]:
import requests, zipfile, io

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
res = requests.get(url).content
auto = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)

auto.columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

print(f'自動車データの形式:{auto.shape}')

自動車データの形式:(205, 26)


In [15]:
auto.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [16]:
auto = auto[['price', 'horsepower', 'width', 'height']]
auto.isin(['?']).sum()

price         4
horsepower    2
width         0
height        0
dtype: int64

In [17]:
auto = auto.replace('?', np.nan).dropna()
print(f'自動車データの形式:{auto.shape}')

自動車データの形式:(199, 4)


In [18]:
print(f'データ型の確認(型変換前)\n{auto.dtypes}')

データ型の確認(型変換前)
price          object
horsepower     object
width         float64
height        float64
dtype: object


In [19]:
auto = auto.assign(price=pd.to_numeric(auto.price))
auto = auto.assign(horsepower=pd.to_numeric(auto.horsepower))
print(f'データ型の確認(型変換後)\n{auto.dtypes}')

データ型の確認(型変換後)
price           int64
horsepower      int64
width         float64
height        float64
dtype: object


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = auto.drop('price', axis=1)
y = auto['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

model = LinearRegression()
model.fit(X_train, y_train)

print(f'決定係数:{model.score(X_test, y_test):.3f}')
print(f'決定係数:{model.score(X_train, y_train):.3f}')

print(f'回帰係数:{pd.Series(model.coef_, index=X.columns)}')
print(f'切片:{model.intercept_:.3f}')

決定係数:0.737
決定係数:0.733
回帰係数:horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64
切片:-128409.046


In [21]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

X = auto.drop('price', axis=1)
y = auto['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

linear = LinearRegression()
ridge = Ridge(random_state=0)

for model in [linear, ridge]:
    model.fit(X_train, y_train)
    print(f'{model.__class__.__name__}(train):{model.score(X_train, y_train):.6f}')
    print(f'{model.__class__.__name__}(test):{model.score(X_test, y_test):.6f}')


LinearRegression(train):0.733358
LinearRegression(test):0.737069
Ridge(train):0.733355
Ridge(test):0.737768


In [22]:
import requests, zipfile, io

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
res = requests.get(url).content
auto = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)

auto.columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

auto = auto[['price', 'width', 'engine-size']]
auto = auto.replace('?', np.nan).dropna()
print(f'自動車データの形式:{auto.shape}')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

model = LinearRegression()
model.fit(X_train, y_train)

print(f'決定係数:{model.score(X_test, y_test):.3f}')
print(f'決定係数:{model.score(X_train, y_train):.3f}')

自動車データの形式:(201, 3)
決定係数:0.737
決定係数:0.733


# ラッソとリッジ回帰で用いられる正規化項式のメリット

・オーバーフィッティングの抑制
・変数選択の促進（ラッソ回帰）
・多重共線性の緩和
・汎化性能の向上