In [1]:
# 0.필요모듈 import
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans

import pandas as pd  

In [2]:
# 1. 데이터셋 로드
diabetes_dataset = datasets.load_diabetes()

# 1.1 변수 타입 및 속성 확인
print(type(diabetes_dataset))

<class 'sklearn.utils.Bunch'>


In [3]:
print(dir(diabetes_dataset))

['DESCR', 'data', 'data_filename', 'data_module', 'feature_names', 'frame', 'target', 'target_filename']


In [4]:
# 2. 데이터 정보 확인
print("Description of data : \n {} \n\n".format(diabetes_dataset.DESCR))
print("feature names of data : \n {}\n\n".format(diabetes_dataset.feature_names))
print("data : \n {}\n\n".format(diabetes_dataset.data))


Description of data : 
 .. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar leve

In [5]:
# 3. 다항속성 생성
# 2차 다항회귀 설정
polynomial_transformer = PolynomialFeatures(2)

# 2차 속성 생성
polynomial_data = polynomial_transformer.fit_transform(diabetes_dataset.data)
print("polynomial_data : \n {} \n\n".format(polynomial_data))

# 2차 속성의 이름
polynomial_feature_names = polynomial_transformer.get_feature_names_out(diabetes_dataset.feature_names)
print("polynomial_feature_names : \n {} \n\n".format(polynomial_feature_names))

polynomial_data : 
 [[ 1.00000000e+00  3.80759064e-02  5.06801187e-02 ...  3.96345222e-04
  -3.51306487e-04  3.11385733e-04]
 [ 1.00000000e+00 -1.88201653e-03 -4.46416365e-02 ...  4.66895386e-03
   6.30027907e-03  8.50158677e-03]
 [ 1.00000000e+00  8.52989063e-02  5.06801187e-02 ...  8.20118159e-06
  -7.42585403e-05  6.72382480e-04]
 ...
 [ 1.00000000e+00  4.17084449e-02  5.06801187e-02 ...  2.19768591e-03
  -7.26197419e-04  2.39962721e-04]
 [ 1.00000000e+00 -4.54724779e-02 -4.46416365e-02 ...  1.98277609e-03
  -1.15463583e-03  6.72382480e-04]
 [ 1.00000000e+00 -4.54724779e-02 -4.46416365e-02 ...  1.78072159e-05
  -1.29313778e-05  9.39060506e-06]] 


polynomial_feature_names : 
 ['1' 'age' 'sex' 'bmi' 'bp' 's1' 's2' 's3' 's4' 's5' 's6' 'age^2'
 'age sex' 'age bmi' 'age bp' 'age s1' 'age s2' 'age s3' 'age s4' 'age s5'
 'age s6' 'sex^2' 'sex bmi' 'sex bp' 'sex s1' 'sex s2' 'sex s3' 'sex s4'
 'sex s5' 'sex s6' 'bmi^2' 'bmi bp' 'bmi s1' 'bmi s2' 'bmi s3' 'bmi s4'
 'bmi s5' 'bmi s6' 'bp^2' 

In [6]:
# 4. DataFrame으로 변환
# Data
X = pd.DataFrame(polynomial_data, columns=polynomial_feature_names)

# Target
y = pd.DataFrame(diabetes_dataset.target, columns=['diabetes'])

In [7]:
# 5. 머신러닝 알고리즘 사용을 위한 train, test set 구성
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [8]:
# 6. 머신러닝을 위한 모델 선정 및 학습
# LinearRegression 사용
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [9]:
# 최적화된 theta값 확인
print("theta 0 : \n", model.intercept_)
print("theta 1 ~ n  : \n",model.coef_)

theta 0 : 
 [-115.37328603]
theta 1 ~ n  : 
 [[-1.52450828e-07  9.10505214e+01 -3.06205123e+02  4.67467078e+02
   3.61582517e+02 -5.84529582e+04  5.14043898e+04  2.14657451e+04
  -1.40195088e+02  1.98245032e+04  1.91613198e+01  1.23203317e+03
   2.20866661e+03 -8.70149982e+02  1.40586771e+03  1.28364692e+03
  -7.01696393e+03  5.24984110e+03  7.80356381e+03  8.91916834e+02
   1.23933345e+03 -1.84901419e+00  8.28798375e+02  1.78140473e+03
   4.24712314e+03 -3.19149460e+03 -2.79840398e+03 -5.62702982e+03
  -3.71764404e+01  1.84343895e+03  2.40735669e+02  4.49244847e+03
  -5.58907360e+03  4.89526278e+03  8.69628355e+02 -6.74079993e+02
   1.50242757e+03  1.45862960e+03 -6.13022041e+02  1.69684979e+04
  -1.23256448e+04 -5.07018127e+03 -5.55266896e+02 -6.34906034e+03
  -3.59928324e+03  2.29649995e+04 -2.43678707e+04 -1.19409302e+04
  -1.80433446e+04  1.65129489e+05 -3.85734734e+03  6.93446815e+03
  -5.86376496e+02  4.11726562e+03 -1.48232008e+05  2.59165327e+03
   4.10474556e+03  1.63750493e+

In [10]:
# 7. 최적화된 모델을 바탕으로 Test data 예측
y_test_predict = model.predict(X_test)

# Test datd와 예측값의 오차를 RMSE 로 나타냄
RMSE = mean_squared_error(y_test, y_test_predict) ** 0.5

RMSE

57.87704902724893