### 導入套件

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Suppress unnecessary warnings
import warnings
warnings.filterwarnings('ignore')

### 取得資料

In [21]:

data_url = "http://lib.stat.cmu.edu/datasets/boston"

# The file is whitespace-delimited and skiprows=22 skips its first 22 lines.
# The separator is a raw string so that "\s" reaches pandas as a regex rather
# than being parsed as an (invalid) string escape sequence.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Each record spans two consecutive rows of raw_df: all columns of the
# even-indexed row followed by the first two columns of the odd-indexed row.
x_org = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])

# The third column of every odd-indexed row is kept separately as yt.
yt = raw_df.values[1::2, 2]

feature_names = np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX',
    'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO','B', 'LSTAT'])

print('陣列型態', x_org.shape, yt.shape)
print('變數名: ', feature_names)

# Narrow the data down to the single column named RM.
# NOTE(review): the printed labels below say "應變數" (dependent variable),
# although these arrays hold input features; the strings are left untouched
# so that the recorded cell output stays valid.
x_data = x_org[:, feature_names == 'RM']
print('應變數陣列型態', x_data.shape)

# Pick two more feature columns (CRIM and LSTAT) and stack them to the right
# of RM, giving the column order RM, LSTAT, CRIM.
x_add1 = x_org[:, feature_names == 'CRIM']
x_add2 = x_org[:, feature_names == 'LSTAT']
x = np.hstack((x_data, x_add2, x_add1))

# Prepend a constant column of 1.0 (the dummy variable) as the new column 0.
x = np.insert(x, 0, 1.0, axis=1)
print('應變數追加後陣列型態', x.shape)

陣列型態 (506, 13) (506,)
變數名:  ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
應變數陣列型態 (506, 1)
應變數追加後陣列型態 (506, 4)


In [22]:
# M: number of rows of x, D: number of columns of x (one weight per column)
M, D = x.shape

# Number of gradient-descent iterations
iters = 2500

# Learning rate
alpha = 1e-4

# Initial weight vector
w = np.array([1.0, 0.6, 0.8, 1.0])

# Log of (iteration, loss) rows; starts with zero rows and two columns
history = np.zeros(shape=(0, 2))

In [23]:
def pred(x, w):
    """Return the matrix product of ``x`` and ``w``.

    Args:
        x: Left operand of the ``@`` operator.
        w: Right operand of the ``@`` operator.

    Returns:
        The result of ``x @ w``.
    """
    product = x @ w
    return product

In [24]:
for k in range(iters):

    # Predictions for the current weights
    yp = pred(x, w)

    # Residuals: prediction minus yt
    yd = yp - yt

    # Gradient-descent step on the weights
    w = w - alpha * (x.T @ yd) / M

    # Only every 100th iteration is logged; skip the rest
    if k % 100 != 0:
        continue

    # Half the mean of the squared residuals. yd was computed before the
    # weight update above, so this is the loss of the pre-update weights.
    loss = np.mean(yd ** 2) / 2

    # Append an (iteration, loss) row to the log and report it
    history = np.vstack([history, [k, loss]])
    print("iter = %d  loss = %f" % (k, loss))

iter = 0  loss = 190.333637
iter = 100  loss = 110.397353
iter = 200  loss = 80.997200
iter = 300  loss = 65.016989
iter = 400  loss = 54.562703
iter = 500  loss = 46.871870
iter = 600  loss = 40.869564
iter = 700  loss = 36.061163
iter = 800  loss = 32.167178
iter = 900  loss = 28.999835
iter = 1000  loss = 26.419002
iter = 1100  loss = 24.314599
iter = 1200  loss = 22.598194
iter = 1300  loss = 21.198095
iter = 1400  loss = 20.055962
iter = 1500  loss = 19.124248
iter = 1600  loss = 18.364181
iter = 1700  loss = 17.744137
iter = 1800  loss = 17.238321
iter = 1900  loss = 16.825687
iter = 2000  loss = 16.489070
iter = 2100  loss = 16.214464
iter = 2200  loss = 15.990447
iter = 2300  loss = 15.807699
iter = 2400  loss = 15.658616


In [25]:
# Report the first and the last loss values recorded in history
first_loss = history[0, 1]
last_loss = history[-1, 1]
print('損失函数初期値: %f' % first_loss)
print('損失函数最終値: %f' % last_loss)

損失函数初期値: 190.333637
損失函数最終値: 15.658616
