# 선형 회귀식의 계수를 찾는 법 - OLS VS. SGD
- 보스턴 집값 데이터 활용(RM VS Price)

### 필요한 모듈 import

In [1]:
# Display the kernel's current working directory.
import os
os.getcwd()

'/Users/kimminsoo/Desktop/ML/ML/수업자료'

In [2]:
# 필요한 라이브러리 import 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### 데이터 수집 및 분할

In [3]:
from sklearn import datasets

# Download the Boston housing data from OpenML.
# - version=1 pins the dataset explicitly: with the default ('active') the
#   oldest active version is picked, and scikit-learn recommends setting an
#   exact version because versions sharing a name can differ.
# - as_frame=True guarantees pandas objects, which the later attribute-style
#   column access (boston.data.RM) relies on.
boston = datasets.fetch_openml('boston', version=1, as_frame=True)
boston

  warn(


{'data':         CRIM    ZN  INDUS CHAS    NOX     RM   AGE     DIS RAD    TAX  \
 0    0.00632  18.0   2.31    0  0.538  6.575  65.2  4.0900   1  296.0   
 1    0.02731   0.0   7.07    0  0.469  6.421  78.9  4.9671   2  242.0   
 2    0.02729   0.0   7.07    0  0.469  7.185  61.1  4.9671   2  242.0   
 3    0.03237   0.0   2.18    0  0.458  6.998  45.8  6.0622   3  222.0   
 4    0.06905   0.0   2.18    0  0.458  7.147  54.2  6.0622   3  222.0   
 ..       ...   ...    ...  ...    ...    ...   ...     ...  ..    ...   
 501  0.06263   0.0  11.93    0  0.573  6.593  69.1  2.4786   1  273.0   
 502  0.04527   0.0  11.93    0  0.573  6.120  76.7  2.2875   1  273.0   
 503  0.06076   0.0  11.93    0  0.573  6.976  91.0  2.1675   1  273.0   
 504  0.10959   0.0  11.93    0  0.573  6.794  89.3  2.3889   1  273.0   
 505  0.04741   0.0  11.93    0  0.573  6.030  80.8  2.5050   1  273.0   
 
      PTRATIO       B  LSTAT  
 0       15.3  396.90   4.98  
 1       17.8  396.90   9.14  
 2       

In [4]:
from sklearn.model_selection import train_test_split

# Feature matrix: the RM column as an (n_samples, 1) array, the 2-D shape
# scikit-learn estimators expect. Target: the dataset's target column.
x = boston.data["RM"].to_numpy().reshape(-1, 1)
y = boston.target

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

# 1. LinearRegression 모델을 사용한 경우

In [8]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares fit on the raw (unscaled) feature.
reg = LinearRegression().fit(x_train, y_train)

# Learned slope (1-element array) and intercept (scalar).
print(reg.coef_, reg.intercept_)

# Predictions on the held-out split, scored in the next cell.
y_pred = reg.predict(x_test)


[8.46109164] -30.571032410898336


In [10]:
# evaluate_score is imported from the eval_score module (presumably a
# project-local helper; its source is not shown here). The recorded output
# shows it reporting mse, rmse and r2 for the given true/predicted values.
from eval_score import evaluate_score
evaluate_score(y_test, y_pred)

mse: 36.517
rmse: 6.043
r2: 0.602


# 2. SGDRegressor with hyperparameter

In [22]:
# SGD shuffles the training samples between epochs, so the fitted values
# differ from run to run unless random_state is fixed. Note that eta0 is the
# initial learning rate, not a random starting value.
# Even when max_iter and eta0 are tuned, it is hard to reach the desired
# solution on this unscaled feature.
from sklearn.linear_model import SGDRegressor

reg = SGDRegressor(
    loss='squared_error',
    learning_rate='invscaling',
    eta0=0.0001,
    max_iter=100000000,
    random_state=42,
).fit(x_train, y_train)

# coef_ and intercept_ are both ndarrays, so index them to print scalars.
print(reg.coef_[0], reg.intercept_[0])

4.177239731696732 -3.5874778695577123


In [21]:
# Predict on the held-out split with the current `reg` and report the metrics.
y_pred = reg.predict(x_test)
evaluate_score(y_test, y_pred)

mse: 55.130
rmse: 7.425
r2: 0.399


# 3. SGDRegressor with scaling

In [26]:
# Gradient-descent regression on a manually standardized feature.

# Scaling statistics come from the training split only and are reused for the
# test split, so no test information leaks into training.
train_mean = x_train.mean(axis=0)
train_std = x_train.std(axis=0)

x_train_scaled = (x_train - train_mean) / train_std
x_test_scaled = (x_test - train_mean) / train_std

# Same SGD configuration as the unscaled experiment above.
reg = SGDRegressor(
    loss='squared_error',
    learning_rate='invscaling',
    eta0=0.0001,
    max_iter=100000000,
    random_state=42,
)
reg.fit(x_train_scaled, y_train)

# Coefficient and intercept, expressed in the standardized feature space.
print(reg.coef_, reg.intercept_)

# Evaluate on the identically scaled test split.
y_pred = reg.predict(x_test_scaled)
evaluate_score(y_test, y_pred)

[5.62395551] [21.47086473]
mse: 37.668
rmse: 6.137
r2: 0.589


# 4. SGDRegressor with StandardScaler()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the mean/std from the training split and standardize it in one call.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)

# Show the scaled training feature as the cell output.
x_train_scaled


In [28]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both splits so test statistics never influence training.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# 5. Pipeline with StandardScaler, LinearRegression or SGDRegressor

In [34]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Chain scaling and regression into one estimator: fit() learns the scaling
# statistics from the training data, and predict() applies them before the
# regressor runs.
reg = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=1000000, eta0=0.01, tol=0.0001,
                 random_state=42, loss='squared_error'),
)
reg.fit(x_train, y_train)

# Coefficient and intercept: trailing-underscore attributes are set by fit().
# The pipeline stores its steps in order, so index 1 is the SGDRegressor.
print(reg[1].coef_, reg[1].intercept_)

# Regression equation. X here is the standardized feature, because the
# regressor sits behind the StandardScaler step.
# The slope previously used '{:2f}', which is a minimum width of 2 with the
# default 6 decimals; '{:.3f}' gives the same fixed precision as the intercept.
print('y = {:.3f}X + {:.3f}'.format(reg[1].coef_[0], reg[1].intercept_[0]))

# Predict on the raw test split; the pipeline scales it internally.
y_pred = reg.predict(x_test)

# MSE, RMSE, r2_score
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print('MSE:', np.round(mse, 3))
print('RMSE: ', np.round(rmse, 3))
print('R2: ', np.round(r2, 3))

[5.84750366] [22.31897879]
y = 5.847504X + 22.319
MSE: 36.523
RMSE:  6.043
R2:  0.602
