## ML HW1 手把手教學

In [1]:
import math
import numpy as np
import pandas as pd


For Data Preprocessing, first we deal with anomaly data, basically data with wrong or invalid format.

In [2]:
def readdata(data, k):

	# 把有些數字後面的奇怪符號刪除
	for col in list(data.columns[2:]):
		data[col] = data[col].astype(str).map(lambda x: x.rstrip('x*#A'))
	data = data.values
	
	# 刪除欄位名稱及日期
	data = np.delete(data, [0,1], 1)
	
	# Only consider the k hours before 9
	data = data[::, 10-k-1:10]

	# 特殊值補0
	data[ data == 'NR'] = 0
	data[ data == ''] = 0
	data[ data == 'nan'] = 0
	data = data.astype(np.float)

	return data

We flatten our data to be in such format (col: one hour/ per col, row: one feature/ per row)

In [3]:
def extract(data):
	N = data.shape[0] // 18

	temp = data[:18, :]
    
    # Shape 會變成 (x, 18) x = 取多少hours
	for i in range(1, N):
		temp = np.hstack((temp, data[i*18: i*18+18, :]))
	return temp

Since some data points (PM2.5) have anomaly values, which strongly effect our training result, we decide to abandon them. In our case, we define PM2.5 < 2 or > 100 as anomaly data points. 

In [4]:
def valid(x, y, k):
	if y <= 2 or y > 100:
		return False
	for i in range(k):
		if x[9,i] <= 2 or x[9,i] > 100:
			return False
	return True

def parse2train(data, k):
	x = []
	y = []
	
	# separate the 0 to 8 a.m and 9 a.m.
	total_length = data.shape[1] // (k+1)
	for i in range(total_length):
		x_tmp = data[:,i*(k+1):i*(k+1)+k]
		y_tmp = data[9,i*(k+1)+k]
		if valid(x_tmp, y_tmp, k):
			x.append(x_tmp.reshape(-1,))
			y.append(y_tmp)
	# x 會是一個(n, 18, 9)的陣列， y 則是(n, 1) 
	x = np.array(x)
	y = np.array(y)
	return x,y

def parse2test(data, k):
	x = []
	
	total_length = data.shape[1] // k
	for i in range(total_length):
		x_tmp = data[:,i*k:i*k+k]
		x.append(x_tmp.reshape(-1,))
	# x 會是一個(n, 18, 9)的陣列， y 則是(n, 1) 
	x = np.array(x)
	return x

This is our gradient descent algorithm. **Adam** was implemented.

In [5]:
def minibatch(x, y):
    # 打亂data順序
    index = np.arange(x.shape[0])
    np.random.shuffle(index)
    x = x[index]
    y = y[index]
    
    # 訓練參數以及初始化
    batch_size = 64
    lr = 1e-3
    lam = 0.001
    beta_1 = np.full(x[0].shape, 0.9).reshape(-1, 1)
    beta_2 = np.full(x[0].shape, 0.99).reshape(-1, 1)
    w = np.full(x[0].shape, 0.1).reshape(-1, 1)
    bias = 0.1
    m_t = np.full(x[0].shape, 0).reshape(-1, 1)
    v_t = np.full(x[0].shape, 0).reshape(-1, 1)
    m_t_b = 0.0
    v_t_b = 0.0
    t = 0
    epsilon = 1e-8
    
    for num in range(1000):
        for b in range(int(x.shape[0]/batch_size)):
            t+=1
            x_batch = x[b*batch_size:(b+1)*batch_size]
            y_batch = y[b*batch_size:(b+1)*batch_size].reshape(-1,1)
            loss = y_batch - np.dot(x_batch,w) - bias
            
            # 計算gradient
            g_t = np.dot(x_batch.transpose(),loss) * (-2) +  2 * lam * np.sum(w)
            g_t_b = loss.sum(axis=0) * (2)
            m_t = beta_1*m_t + (1-beta_1)*g_t 
            v_t = beta_2*v_t + (1-beta_2)*np.multiply(g_t, g_t)
            m_cap = m_t/(1-(beta_1**t))
            v_cap = v_t/(1-(beta_2**t))
            m_t_b = 0.9*m_t_b + (1-0.9)*g_t_b
            v_t_b = 0.99*v_t_b + (1-0.99)*(g_t_b*g_t_b) 
            m_cap_b = m_t_b/(1-(0.9**t))
            v_cap_b = v_t_b/(1-(0.99**t))
            w_0 = np.copy(w)
            
            # 更新weight, bias
            w -= ((lr*m_cap)/(np.sqrt(v_cap)+epsilon)).reshape(-1, 1)
            bias -= (lr*m_cap_b)/(math.sqrt(v_cap_b)+epsilon)
            

    return w, bias

**Combine them together!**

In [24]:
if __name__ == "__main__":
    
    # 同學這邊要自己吃csv files
    #uploaded = files.upload()
    
    year1_pd = pd.read_csv('./data/year2-data.csv')
#     year2_pd = pd.read_csv('./data/year2-data.csv')
    
    k = 2
    year1 = readdata(year1_pd, k)
    train_data1 = extract(year1)
#     year2 = readdata(year2_pd, k)
#     train_data2 = extract(year2)

    train_x, train_y = parse2train(train_data1, k)
#     train_x2, train_y2 = parse2train(train_data2, k)

#     train_x = np.vstack((train_x,train_x2))
#     train_y = np.concatenate((train_y,train_y2))
    # print(train_x.shape, train_y.shape)
    
    w, bias = minibatch(train_x, train_y)
    # read testing
    testing_pd = pd.read_csv('./data/testing_data.csv')
    testing = readdata(testing_pd, k)
    testing_data = extract(testing)
    testing_x = parse2test(testing_data, k)
    
    pred_y = testing_x @ w
    output = pd.read_csv('./data/sample_submission.csv')
    output['value'] = pred_y
#     output.to_csv('./data/output/khour_k4_y12.csv', index=False)

In [25]:
train_err = np.mean( (train_y - pred_y)**2 )
train_err

202.43066052573644

y11
k=1
225.0340017774136
k=2
223.27231655997852
k=3
225.27065064004847
k=4
228.75332226656104
k = 5
237.14931478267513
y2
k=1
211.1738501643621
k=2
202.43066052573644
k=3
209.0589548262205


y1y2
k=3
212.72780274772325
k=4
218.8796803550031
k=5
226.79062210580528