In [1]:
import numpy as np
import pandas as pd
import sys

# Linear Regression
任务：由前9个小时的18个features(包含“PM2.5”)预测第10个小时的PM2.5。

## 一、Load 'train.csv'
train.csv: 一年中每个月取前20天，记录每个小时的18个faetures(4320 * 18)

In [2]:
data = pd.read_csv("./train.csv", encoding='big5')
#print(data.iloc[:5, :10])

## 二、Preprocessing
    （1）提取有效数据
    （2）数据数值化
    （3）转化为numpy矩阵

In [3]:
data = data.iloc[:, 3:]
#print(data.iloc[:18, :10])

data[data == 'NR'] = 0
#print(data.iloc[:18, :10])

raw_data = data.to_numpy()
#print(raw_data[:18, :10])

## 三、Extract Features (1)
将原始“4320 * 18”的资料依照每个月重组分成12个“18 (features) * 480 (hours)”的资料。

In [4]:
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        sample[:, 24 * day: 24 * (day + 1)] = raw_data[18 * (20 * month + day): 18 * (20 * month + day + 1), :]
    month_data[month] = sample
#print(month_data[0])

## 四、Extract Features (2)
每个月会有480个小时，每9小时形成一个data，每个月会有471个data，故总共有“12 * 471”个data，而每个data有“9 * 18”个features。
对应的target则有“471 * 12”個(第10个小時的PM2.5)。

In [5]:
x = np.empty([12 * 471, 9 * 18], dtype=float)
y = np.empty([12 * 471, 1], dtype=float)
for month in range(12):
    for day in range(20):
        for hour in range(24):
            if day == 19 and hour > 14:
                continue
            x[471 * month + 24 * day + hour, :] = month_data[month][:, 24 * day + hour: 24 * day + hour + 9].reshape(1, -1)
            y[471 * month + 24 * day + hour, 0] = month_data[month][9, 24 * day + hour + 9]
#print(x)
#print(y)

## 五、Normalize (1)
让每一个不同的features具有相同的scaling（正态分布）

In [6]:
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)
for i in range(len(x)):          # 12 * 471
    for j in range(len(x[0])):   # 18
        if std_x[j] != 0:
            x[i][j] = (x[i][j] - mean_x[j]) / std_x[j]
#print(x)

## 选做：将training_set分为training_set和validation_set

In [7]:
import math
x_train_set = x[: math.floor(len(x) * 0.8), :]
y_train_set = y[: math.floor(len(y) * 0.8), :]
x_validation = x[math.floor(len(x) * 0.8):, :]
y_validation = y[math.floor(len(y) * 0.8):, :]
#print(x_train_set)
#print(y_train_set)
#print(x_validation)
#print(y_validation)
#print(len(x_train_set))
#print(len(y_train_set))
#print(len(x_validation))
#print(len(y_validation))

## 六、Training

In [53]:
# 超参数
learning_rate = 200
iter_time = 20000
eps = 0.0000000001  # 避免 adagrad 的分母为 0 而加的极小数值


def train_linear(x_train, y_train):
    dim = 9 * 18 + 1
    w = np.zeros([dim, 1])
    x = np.concatenate((np.ones([len(x_train), 1]), x_train), axis=1).astype(float)

    adagrad = np.zeros([dim, 1])

    for t in range(iter_time):
        loss = np.sqrt(np.sum(np.power(np.dot(x, w) - y_train, 2)) / len(x_train))
        if(t % 1000 == 999):
            print(str(t+1) + ":" + str(loss))
        gradient = 2 * np.dot(x.transpose(), np.dot(x, w) - y_train)
        adagrad += gradient ** 2
        w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
    
    np.save('./weight.npy', w)
    return w


def train_twice(x_train, y_train):
    dim = 2 * 9 * 18 + 1
    w = np.zeros([dim, 1])
    x = np.concatenate((np.ones([len(x_train), 1]), x_train, x_train ** 2), axis=1).astype(float)

    adagrad = np.zeros([dim, 1])

    for t in range(iter_time):
        loss = np.sqrt(np.sum(np.power(np.dot(x, w) - y_train, 2)) / len(x_train))
        if(t % 1000 == 999):
            print(str(t+1) + ":" + str(loss))
        gradient = 2 * np.dot(x.transpose(), np.dot(x, w) - y_train)
        adagrad += gradient ** 2
        w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
    
    np.save('./weight2.npy', w)
    return w

## 七、Validation

In [54]:
def validation_linear(x_val, y_val, model_path='./weigth.npy'):
    w = np.load(model_path)
    x_val = np.concatenate((np.ones([len(x_val), 1]), x_val), axis=1).astype(float)
    loss = np.sqrt(np.sum(np.power(np.dot(x_val, w) - y_val, 2)) / len(x_val))
    return loss


def validation_twice(x_val, y_val, model_path='./weigth2.npy'):
    w = np.load(model_path)
    x_val = np.concatenate((np.ones([len(x_val), 1]), x_val, x_val ** 2), axis=1).astype(float)
    loss = np.sqrt(np.sum(np.power(np.dot(x_val, w) - y_val, 2)) / len(x_val))
    return loss

## 八、Main

In [55]:
#train_linear(x_train_set, y_train_set)
#loss = validation_linear(x_validation, y_validation, './weight.npy')
#print("The loss on the val data is: " + str(loss))

train_twice(x_train_set, y_train_set)
loss = validation_twice(x_validation, y_validation, './weight2.npy')
print("The loss on the val data is: " + str(loss))

1000:14.304373042010026
2000:8.280023798203368
3000:6.845674499117718
4000:6.188042867878256
5000:5.869479086581121
6000:5.711993292202514
7000:5.632223766399255
8000:5.590641127490519
9000:5.568267815245391
10000:5.555814570712218
11000:5.548627536507751
12000:5.544316540018562
13000:5.541622322958631
14000:5.53986405900862
15000:5.538663890291588
16000:5.537806499833292
17000:5.537165915348107
18000:5.5366664941052735
19000:5.536261650896217
20000:5.535921990610281
The loss on the val data is: 5.693512787897238


## Testing

In [12]:
testdata = pd.read_csv('./test.csv', header = None, encoding = 'big5')
test_data = testdata.iloc[:, 2:]
test_data[test_data == 'NR'] = 0
test_data = test_data.to_numpy()
test_x = np.empty([240, 9 * 18], dtype = float)
for i in range(240):
    test_x[i, :] = test_data[18 * i: 18* (i + 1), :].reshape(1, -1)
for i in range(len(test_x)):
    for j in range(len(test_x[0])):
        if std_x[j] != 0:
            test_x[i][j] = (test_x[i][j] - mean_x[j]) / std_x[j]
test_x = np.concatenate((np.ones([240, 1]), test_x), axis = 1).astype(float)
#print(test_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._where(-key, value, inplace=True)


array([[ 1.        , -0.24447681, -0.24545919, ..., -0.67065391,
        -1.04594393,  0.07797893],
       [ 1.        , -1.35825331, -1.51789368, ...,  0.17279117,
        -0.10906991, -0.48454426],
       [ 1.        ,  1.5057434 ,  1.34508393, ..., -1.32666675,
        -1.04594393, -0.57829812],
       ...,
       [ 1.        ,  0.3919669 ,  0.54981237, ...,  0.26650729,
        -0.20275731,  1.20302531],
       [ 1.        , -1.8355861 , -1.8360023 , ..., -1.04551839,
        -1.13963133, -1.14082131],
       [ 1.        , -1.35825331, -1.35883937, ...,  2.98427476,
         3.26367657,  1.76554849]])

## Prediction

In [13]:
w = np.load('./weight.npy')
ans_y = np.dot(test_x, w)
#print(ans_y)

[[ 5.17496040e+00]
 [ 1.83062143e+01]
 [ 2.04912181e+01]
 [ 1.15239429e+01]
 [ 2.66160568e+01]
 [ 2.05313481e+01]
 [ 2.19065510e+01]
 [ 3.17364687e+01]
 [ 1.33916741e+01]
 [ 6.44564665e+01]
 [ 2.02645688e+01]
 [ 1.53585761e+01]
 [ 6.85894728e+01]
 [ 4.84281137e+01]
 [ 1.87023338e+01]
 [ 1.01885957e+01]
 [ 3.07403629e+01]
 [ 7.11322178e+01]
 [-4.13051739e+00]
 [ 1.82356940e+01]
 [ 3.85789223e+01]
 [ 7.13115197e+01]
 [ 7.41034816e+00]
 [ 1.87179553e+01]
 [ 1.49372503e+01]
 [ 3.67197367e+01]
 [ 1.79616970e+01]
 [ 7.57894629e+01]
 [ 1.23093102e+01]
 [ 5.62953517e+01]
 [ 2.51131609e+01]
 [ 4.61024867e+00]
 [ 2.48377055e+00]
 [ 2.47594223e+01]
 [ 3.04802805e+01]
 [ 3.84639307e+01]
 [ 4.42023106e+01]
 [ 3.00868360e+01]
 [ 4.04736750e+01]
 [ 2.92264799e+01]
 [ 5.60645605e+00]
 [ 3.86660161e+01]
 [ 3.46102134e+01]
 [ 4.83896975e+01]
 [ 1.47572477e+01]
 [ 3.44668201e+01]
 [ 2.74831069e+01]
 [ 1.20008794e+01]
 [ 2.13780362e+01]
 [ 2.85444031e+01]
 [ 2.01655138e+01]
 [ 1.07966781e+01]
 [ 2.2171035

## Save Prediction to CSV File

In [14]:
import csv
with open('submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'value']
    print(header)
    csv_writer.writerow(header)
    for i in range(240):
        row = ['id_' + str(i), ans_y[i][0]]
        csv_writer.writerow(row)
        #print(row)

['id', 'value']
['id_0', 5.174960398984738]
['id_1', 18.306214253527898]
['id_2', 20.49121809418055]
['id_3', 11.52394286980536]
['id_4', 26.616056752306125]
['id_5', 20.531348081761216]
['id_6', 21.90655101879739]
['id_7', 31.736468747068834]
['id_8', 13.391674055111721]
['id_9', 64.45646650291954]
['id_10', 20.264568836159434]
['id_11', 15.35857607736122]
['id_12', 68.58947276926722]
['id_13', 48.42811374745719]
['id_14', 18.702333824193225]
['id_15', 10.188595737466695]
['id_16', 30.74036285982045]
['id_17', 71.1322177635511]
['id_18', -4.130517391262442]
['id_19', 18.235694016428685]
['id_20', 38.57892227500775]
['id_21', 71.31151972531332]
['id_22', 7.410348162634072]
['id_23', 18.717955330321423]
['id_24', 14.937250260084577]
['id_25', 36.71973669470532]
['id_26', 17.961697005662707]
['id_27', 75.78946287210537]
['id_28', 12.309310248614473]
['id_29', 56.2953517396496]
['id_30', 25.11316086566149]
['id_31', 4.610248674094032]
['id_32', 2.4837705545150244]
['id_33', 24.75942226132