In [39]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import math
import csv

# **导入数据**

In [40]:
# Load the training CSV.
path = 'hw1_data/train.csv'
data = pd.read_csv(path)
# Replace the non-numeric marker 'NR' with 0 (in place, across the whole frame).
data[data == 'NR'] = 0
# Keep everything from the 4th column onward as the measurement matrix.
# The values read from the CSV may still be strings (object dtype) after the
# replacement above, so convert to float explicitly here instead of relying on
# implicit element-wise coercion when later cells copy into float arrays.
raw_data = data.iloc[:, 3:].to_numpy().astype(float)

In [41]:
# Sanity check: display the (rows, columns) shape of the measurement matrix.
raw_data.shape

(4320, 24)

# **按月重新分组数据**
將原始 4320 * 24 的資料（4320 列 = 12 個月 × 20 天 × 18 features，24 行 = 每天 24 小時）依照每個月分重組成 12 個 18 (features) * 480 (hours) 的資料。
![圖片說明](hw1_data/换形状1.png)
![圖片說明](hw1_data/换形状2.png)

In [42]:
# month_data is a dict: month index (0-11) -> that month's 18 x 480 matrix
# (18 feature rows, 20 days * 24 hours of columns).
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    first_day = 20 * month  # index of this month's first day across the year
    for day in range(20):
        top = 18 * (first_day + day)  # first raw_data row of this day's 18-row block
        # Place the day's 18 x 24 block side by side with the previous days.
        sample[:, 24 * day : 24 * day + 24] = raw_data[top : top + 18, :]
    month_data[month] = sample
month_data[0].shape  # shape of the first month's matrix

(18, 480)

# **按小时分组**

利用前9个小时预测第10个小时
每個月會有 480hrs，每 9 小時形成一個 data（data之间有重叠,每个data偏移一列, 但每个月之间不重叠），每個月會有 471 個 data，故總資料數為 471 * 12 筆，而每筆 data 有 9 * 18 的 features (一小時 18 個 features * 9 小時)。

對應的 target 則有 471 * 12 個(第 10 個小時的 PM2.5)
![圖片說明](hw1_data/换形状3.png)

In [43]:
# Build the design matrix: every run of 9 consecutive hours inside a month is
# one sample (18 features * 9 hours flattened to 162 values); the target is
# row 9 of the month matrix (PM2.5, per the original notes) at the 10th hour.
x = np.empty([471 * 12, 18 * 9], dtype=float)
y = np.empty([471 * 12, 1], dtype=float)
for month in range(12):
    features = month_data[month]
    # 480 hours per month give 471 window starts (0..470); windows never span
    # two months, so the last 9 hours of a month start no window.
    for start in range(471):
        row = month * 471 + start
        x[row, :] = features[:, start : start + 9].reshape(-1)  # 18x9 block -> 162 values
        y[row, 0] = features[9, start + 9]
print(x)
print(y)

[[14.  14.  14.  ...  2.   2.   0.5]
 [14.  14.  13.  ...  2.   0.5  0.3]
 [14.  13.  12.  ...  0.5  0.3  0.8]
 ...
 [17.  18.  19.  ...  1.1  1.4  1.3]
 [18.  19.  18.  ...  1.4  1.3  1.6]
 [19.  18.  17.  ...  1.3  1.6  1.8]]
[[30.]
 [41.]
 [44.]
 ...
 [17.]
 [24.]
 [29.]]


In [44]:
# Sanity check: print the shapes of the feature matrix and the target vector.
print(x.shape)
print(y.shape)

(5652, 162)
(5652, 1)


# **数据归一化**

In [45]:
# Standardize every feature column to zero mean / unit variance.
# mean_x and std_x are kept because the test features are scaled with the
# same statistics later on.
mean_x = np.mean(x, axis=0)  # per-column mean
std_x = np.std(x, axis=0)    # per-column standard deviation
# Columns with zero standard deviation are left untouched to avoid dividing by 0.
nonconstant = std_x != 0
# Vectorized equivalent of the element-by-element loop; x is still updated in place.
x[:, nonconstant] = (x[:, nonconstant] - mean_x[nonconstant]) / std_x[nonconstant]
# NOTE(review): re-running this cell recomputes mean_x/std_x from the already
# normalized x, which overwrites the statistics needed for the test data.
# Re-run the feature-extraction cell first if this cell must be executed again.
x

array([[-1.35825331, -1.35883937, -1.359222  , ...,  0.26650729,
         0.2656797 , -1.14082131],
       [-1.35825331, -1.35883937, -1.51819928, ...,  0.26650729,
        -1.13963133, -1.32832904],
       [-1.35825331, -1.51789368, -1.67717656, ..., -1.13923451,
        -1.32700613, -0.85955971],
       ...,
       [-0.88092053, -0.72262212, -0.56433559, ..., -0.57693779,
        -0.29644471, -0.39079039],
       [-0.7218096 , -0.56356781, -0.72331287, ..., -0.29578943,
        -0.39013211, -0.1095288 ],
       [-0.56269867, -0.72262212, -0.88229015, ..., -0.38950555,
        -0.10906991,  0.07797893]])

Split Training Data Into "train_set" and "validation_set" 生成比較中用來訓練的 train_set 和不會被放入訓練、只是用來驗證的 validation_set。

In [46]:
# Hold out the last 20% of the samples for validation; the first 80% is the
# training portion. The cut index is computed once per array.
x_cut = math.floor(len(x) * 0.8)
y_cut = math.floor(len(y) * 0.8)
x_train_set, x_validation = x[:x_cut, :], x[x_cut:, :]
y_train_set, y_validation = y[:y_cut, :], y[y_cut:, :]
# Report the sizes of the four pieces.
for part in (x_train_set, y_train_set, x_validation, y_validation):
    print(len(part))

4521
4521
1131
1131


# **训练**
![圖片說明](hw1_data/训练1.png)
![圖片說明](hw1_data/训练2.png)
![圖片說明](hw1_data/训练3.png)
(和上圖不同處: 下面的 code 採用 Root Mean Square Error 均方根差 $ RMSE = \sqrt{\frac{1}{N}\sum_{i=1}^{N}(f(x_i) - Y_i)^2} $)

因為常數項 (bias) 的存在，所以 x 的维度 (dim) 需要多加一列，该列全部为 1。

adagrad （Adaptive Gradient）自适应学习率: 为不同的变量提供不同的学习率,对于某些变量，已经优化到了极小值附近，但是有的变量仍然在梯度很大的地方，这时候一个统一的全局学习率是可能出现问题的。如果学习率太小，则梯度很大的变量会收敛很慢，如果学习率太大，已经趋于收敛的变量可能会不稳定  $$ w = w - {learning\_rate \times {gradient \over {\sqrt{adagrad + eps}}}} $$

eps 項是避免 adagrad 的分母為 0 而加的極小數值。

每一個 dimension (dim) 會對應到各自的 gradient, weight (w)，透過一次次的 iteration (iter_time) 學習。

loss函数 $$ loss = \sqrt{\frac{1}{471\times 12}\sum_{i=1}^{471\times 12}(x_{i}w - y_i)^2} $$

对 w 的梯度（下面的 code 使用的是平方误差和 $\sum_i (x_i w - y_i)^2$ 的梯度，而不是 RMSE 本身的梯度） $$ gradient = 2\,x^{T}(xw - y) $$

In [47]:
# Linear regression trained with Adagrad on the full normalized design matrix.
dim = 18 * 9 + 1  # 162 features plus one bias term
w = np.zeros([dim, 1])  # weight vector, bias weight first
# Prepend the all-ones bias column only when it is not there yet, so that
# re-running this cell does not stack a second bias column onto x.
if x.shape[1] == dim - 1:
    x = np.concatenate((np.ones([len(x), 1]), x), axis=1).astype(float)
learning_rate = 100
iter_time = 1000  # number of gradient steps
adagrad = np.zeros([dim, 1])  # running sum of squared gradients, one entry per weight
eps = 0.0000000001  # keeps the Adagrad denominator away from zero
n_samples = len(x)
for t in range(iter_time):
    # The residual is needed by both the loss and the gradient; compute it once.
    residual = np.dot(x, w) - y
    loss = np.sqrt(np.sum(np.power(residual, 2)) / n_samples)  # RMSE over all samples
    if t % 100 == 0:  # report progress every 100 iterations
        print(str(t) + ":" + str(loss))
    # Gradient of the summed squared error with respect to w.
    gradient = 2 * np.dot(x.transpose(), residual)
    adagrad += gradient ** 2
    # Per-weight step size shrinks as that weight's squared gradients accumulate.
    w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
#np.save('weight.npy', w)
w

0:27.071214829194115
100:33.78905859777453
200:19.913751298197095
300:13.53106819368969
400:10.645466158446172
500:9.277353455475069
600:8.518042045956506
700:8.014061987588425
800:7.636756824775696
900:7.336563740371128


array([[ 2.13740269e+01],
       [ 3.58888909e+00],
       [ 4.56386323e+00],
       [ 2.16307023e+00],
       [-6.58545223e+00],
       [-3.38885580e+01],
       [ 3.22235518e+01],
       [ 3.49340354e+00],
       [-4.60308671e+00],
       [-1.02374754e+00],
       [-3.96791501e-01],
       [-1.06908800e-01],
       [ 2.22488184e-01],
       [ 8.99634117e-02],
       [ 1.31243105e-01],
       [ 2.15894989e-02],
       [-1.52867263e-01],
       [ 4.54087776e-02],
       [ 5.20999235e-01],
       [ 1.60824213e-01],
       [-3.17709451e-02],
       [ 1.28529025e-02],
       [-1.76839437e-01],
       [ 1.71241371e-01],
       [-1.31190032e-01],
       [-3.51614451e-02],
       [ 1.00826192e-01],
       [ 3.45018257e-01],
       [ 4.00130315e-02],
       [ 2.54331382e-02],
       [-5.04425219e-01],
       [ 3.71483018e-01],
       [ 8.46357671e-01],
       [-8.11920428e-01],
       [-8.00217575e-02],
       [ 1.52737711e-01],
       [ 2.64915130e-01],
       [-5.19860416e-02],
       [-2.5

# **Testing**
載入 test data，並且以相似於訓練資料預先處理和特徵萃取的方式處理，使 test data 形成 240 個維度為 18 * 9 + 1 的資料。

In [48]:
# Load the test CSV (it has no header row).
path = 'hw1_data/test.csv'
data = pd.read_csv(path, header=None)
# Replace the non-numeric marker 'NR' with 0 (in place, across the whole frame).
data[data == 'NR'] = 0
# Keep everything from the 3rd column onward as the measurement matrix.
# Convert to float explicitly: the values read from the CSV may still be
# strings (object dtype) after the replacement above.
test_data = data.iloc[:, 2:].to_numpy().astype(float)

In [49]:
# Build the 240 test samples: each consecutive block of 18 rows in test_data
# is flattened into one 162-value feature row.
test_x = np.empty([240, 18 * 9], dtype=float)
for i in range(240):
    test_x[i, :] = test_data[18 * i : 18 * (i + 1), :].reshape(1, -1)
# Scale with the training statistics (mean_x / std_x); columns whose training
# standard deviation is 0 are left unscaled, matching the training cell.
usable = std_x != 0
# Vectorized equivalent of the element-by-element loop.
test_x[:, usable] = (test_x[:, usable] - mean_x[usable]) / std_x[usable]
# Prepend the all-ones bias column so the rows line up with the weight vector.
test_x = np.concatenate((np.ones([240, 1]), test_x), axis=1).astype(float)
test_x

array([[ 1.        , -0.24447681, -0.24545919, ..., -0.67065391,
        -1.04594393,  0.07797893],
       [ 1.        , -1.35825331, -1.51789368, ...,  0.17279117,
        -0.10906991, -0.48454426],
       [ 1.        ,  1.5057434 ,  1.34508393, ..., -1.32666675,
        -1.04594393, -0.57829812],
       ...,
       [ 1.        ,  0.3919669 ,  0.54981237, ...,  0.26650729,
        -0.20275731,  1.20302531],
       [ 1.        , -1.8355861 , -1.8360023 , ..., -1.04551839,
        -1.13963133, -1.14082131],
       [ 1.        , -1.35825331, -1.35883937, ...,  2.98427476,
         3.26367657,  1.76554849]])

# **Prediction**
有了 weight 和測試資料即可預測 target

In [50]:
# Predictions are the matrix product of the test design matrix and the weights.
ans_y = test_x.dot(w)
ans_y

array([[ 5.17496040e+00],
       [ 1.83062143e+01],
       [ 2.04912181e+01],
       [ 1.15239429e+01],
       [ 2.66160568e+01],
       [ 2.05313481e+01],
       [ 2.19065510e+01],
       [ 3.17364687e+01],
       [ 1.33916741e+01],
       [ 6.44564665e+01],
       [ 2.02645688e+01],
       [ 1.53585761e+01],
       [ 6.85894728e+01],
       [ 4.84281137e+01],
       [ 1.87023338e+01],
       [ 1.01885957e+01],
       [ 3.07403629e+01],
       [ 7.11322178e+01],
       [-4.13051739e+00],
       [ 1.82356940e+01],
       [ 3.85789223e+01],
       [ 7.13115197e+01],
       [ 7.41034816e+00],
       [ 1.87179553e+01],
       [ 1.49372503e+01],
       [ 3.67197367e+01],
       [ 1.79616970e+01],
       [ 7.57894629e+01],
       [ 1.23093102e+01],
       [ 5.62953517e+01],
       [ 2.51131609e+01],
       [ 4.61024867e+00],
       [ 2.48377055e+00],
       [ 2.47594223e+01],
       [ 3.04802805e+01],
       [ 3.84639307e+01],
       [ 4.42023106e+01],
       [ 3.00868360e+01],
       [ 4.0

In [51]:
# Write the 240 predictions to submit.csv as (id, value) rows, echoing every
# line that goes into the file.
header = ['id', 'value']
with open('submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    print(header)
    csv_writer.writerow(header)
    for i in range(240):
        row = [f'id_{i}', ans_y[i, 0]]  # ids take the form id_0, id_1, ...
        csv_writer.writerow(row)
        print(row)

['id', 'value']
['id_0', 5.174960398984723]
['id_1', 18.30621425352784]
['id_2', 20.491218094180518]
['id_3', 11.523942869805346]
['id_4', 26.616056752306143]
['id_5', 20.531348081761173]
['id_6', 21.906551018797387]
['id_7', 31.736468747068816]
['id_8', 13.39167405511169]
['id_9', 64.4564665029195]
['id_10', 20.264568836159455]
['id_11', 15.358576077361239]
['id_12', 68.58947276926729]
['id_13', 48.42811374745714]
['id_14', 18.702333824193182]
['id_15', 10.188595737466692]
['id_16', 30.740362859820433]
['id_17', 71.13221776355113]
['id_18', -4.130517391262478]
['id_19', 18.235694016428706]
['id_20', 38.57892227500774]
['id_21', 71.3115197253133]
['id_22', 7.410348162634103]
['id_23', 18.717955330321423]
['id_24', 14.93725026008455]
['id_25', 36.71973669470527]
['id_26', 17.961697005662682]
['id_27', 75.78946287210543]
['id_28', 12.309310248614494]
['id_29', 56.295351739649554]
['id_30', 25.113160865661495]
['id_31', 4.610248674094022]
['id_32', 2.483770554515047]
['id_33', 24.75942226