## Noise adding for regularization

In [1]:
import math
import numpy as np
import pandas as pd

For Data Preprocessing, first we deal with anomaly data, basically data with wrong or invalid format.

In [2]:
def readdata(data):
    
	# 把有些數字後面的奇怪符號刪除
	for col in list(data.columns[2:]):
		data[col] = data[col].astype(str).map(lambda x: x.rstrip('x*#A'))
	data = data.values
	
	# 刪除欄位名稱及日期
	data = np.delete(data, [0,1], 1)
	
	# 特殊值補0
	data[ data == 'NR'] = 0
	data[ data == ''] = 0
	data[ data == 'nan'] = 0
	data = data.astype(np.float)

	return data

We flatten our data to be in such format (col: one hour/ per col, row: one feature/ per row)

In [3]:
def extract(data):
	N = data.shape[0] // 18

	temp = data[:18, :]
    
    # Shape 會變成 (x, 18) x = 取多少hours
	for i in range(1, N):
		temp = np.hstack((temp, data[i*18: i*18+18, :]))
	return temp

Since some data points (PM2.5) have anomaly values, which strongly effect our training result, we decide to abandon them. In our case, we define PM2.5 < 2 or > 100 as anomaly data points. 

In [4]:
def valid(x, y):
	if y <= 2 or y > 100:
		return False
	for i in range(9):
		if x[9,i] <= 2 or x[9,i] > 100:
			return False
	return True

def parse2train(data):
	x = []
	y = []
	
	# 用前面9筆資料預測下一筆PM2.5 所以需要-9
	total_length = data.shape[1] - 9
	for i in range(total_length):
		x_tmp = data[:,i:i+9]
		y_tmp = data[9,i+9]
		if valid(x_tmp, y_tmp):
			x.append(x_tmp.reshape(-1,))
			y.append(y_tmp)
	# x 會是一個(n, 18, 9)的陣列， y 則是(n, 1) 
	x = np.array(x)
	y = np.array(y)
	return x,y

def parse2test(data):
	x = []
	
	total_length = data.shape[1] // 9
	for i in range(total_length):
		x_tmp = data[:,i*9:i*9+9]
		x.append(x_tmp.reshape(-1,))
	# x 會是一個(n, 18, 9)的陣列， y 則是(n, 1) 
	x = np.array(x)
	return x

This is our gradient descent algorithm. **Adam** was implemented.

In [5]:
def minibatch(x, y):
    # 打亂data順序
    index = np.arange(x.shape[0])
    np.random.shuffle(index)
    x = x[index]
    y = y[index]
    
    # 訓練參數以及初始化
    batch_size = 64
    lr = 1e-3
    lam = 0.001
    beta_1 = np.full(x[0].shape, 0.9).reshape(-1, 1)
    beta_2 = np.full(x[0].shape, 0.99).reshape(-1, 1)
    w = np.full(x[0].shape, 0.1).reshape(-1, 1)
    bias = 0.1
    m_t = np.full(x[0].shape, 0).reshape(-1, 1)
    v_t = np.full(x[0].shape, 0).reshape(-1, 1)
    m_t_b = 0.0
    v_t_b = 0.0
    t = 0
    epsilon = 1e-8
    
    for num in range(1000):
        for b in range(int(x.shape[0]/batch_size)):
            t+=1
            x_batch = x[b*batch_size:(b+1)*batch_size]
            y_batch = y[b*batch_size:(b+1)*batch_size].reshape(-1,1)
            loss = y_batch - np.dot(x_batch,w) - bias
            
            # 計算gradient
            g_t = np.dot(x_batch.transpose(),loss) * (-2) +  2 * lam * np.sum(w)
            g_t_b = loss.sum(axis=0) * (2)
            m_t = beta_1*m_t + (1-beta_1)*g_t 
            v_t = beta_2*v_t + (1-beta_2)*np.multiply(g_t, g_t)
            m_cap = m_t/(1-(beta_1**t))
            v_cap = v_t/(1-(beta_2**t))
            m_t_b = 0.9*m_t_b + (1-0.9)*g_t_b
            v_t_b = 0.99*v_t_b + (1-0.99)*(g_t_b*g_t_b) 
            m_cap_b = m_t_b/(1-(0.9**t))
            v_cap_b = v_t_b/(1-(0.99**t))
            w_0 = np.copy(w)
            
            # 更新weight, bias
            w -= ((lr*m_cap)/(np.sqrt(v_cap)+epsilon)).reshape(-1, 1)
            bias -= (lr*m_cap_b)/(math.sqrt(v_cap_b)+epsilon)
            

    return w, bias

In [6]:
def read_once(path):
    year1_pd = pd.read_csv(path)
    year1 = readdata(year1_pd)
    train_data = extract(year1)
    return parse2train(train_data)
    

def read_once2(path1, path2):
    # read y1
    year1_pd = pd.read_csv(path1)
    year1 = readdata(year1_pd)
    train_data = extract(year1)
    train_x, train_y = parse2train(train_data)
    # Read y2
    year2_pd = pd.read_csv(path2)    
    year2 = readdata(year2_pd)
    train_data2 = extract(year2)
    train_x2, train_y2 = parse2train(train_data2)
    
    # concate
    train_x = np.vstack((train_x,train_x2))
    train_y = np.concatenate((train_y,train_y2))
    return train_x, train_y
# **Combine them together!**

def read_test(path3):
    testing_pd = pd.read_csv(path3)
    testing = readdata(testing_pd)
    testing_data = extract(testing)
    return parse2test(testing_data)

def read_item_list(path):
    from collections import OrderedDict
    year1_pd = pd.read_csv(path)
    return list(OrderedDict.fromkeys(year1_pd.iloc[::,1]))

In [7]:
def valid2(x, y):
	if y <= 2 or y > 100:
		return False
	for i in range(9):
		if x[i] <= 2 or x[i] > 100:
			return False
	return True

def parse2train2(data):
	x = []
	y = []
	
	# 用前面9筆資料預測下一筆PM2.5 所以需要-9
	total_length = data.shape[0] - 9
	for i in range(total_length):
		x_tmp = data[i:i+9]
		y_tmp = data[i+9]
		if valid2(x_tmp, y_tmp):
			x.append(x_tmp.reshape(-1,))
			y.append(y_tmp)
	# x 會是一個(n, 18, 9)的陣列， y 則是(n, 1) 
	x = np.array(x)
	y = np.array(y)
	return x,y

# Adding noise in features for regularization
**Noise in features!**

### Case 1:抽全部9小時內的污染源feature當作一次項(加bias)
Adding noise before parsing.

### Case 2:抽全部9小時內pm2.5的一次項當作feature(加bias)
只使用PM2.5當feature


In [10]:
case = '1'

np.random.seed(0)
path = './data/year2-data.csv'
item_list = read_item_list(path)

year_pd = pd.read_csv(path)
year = readdata(year_pd)
train_data = extract(year)
test_path = './data/testing_data.csv'
test_x = read_test(test_path)

if case == '2':
    train_data = train_data[9,::]
    test_x = test_x[::,9*9:9*9+9]
    train_x, train_y = parse2train2(train_data)
else:
    train_x, train_y = parse2train(train_data)
# read testing and pred


# Add noise here
noise_level = 0.1
for i in range(train_x.shape[1]):
    train_x[::,i] = train_x[::,i] + np.random.normal(0, (np.var(train_x[::,i])*noise_level)**(1/2), len(train_x))

# training
w, bias = minibatch(train_x, train_y)
pred_y = test_x @ w + bias

id_list = [ "id_"+str(i) for i in range(len(test_x))]
output_pd = pd.DataFrame(zip(id_list, pred_y.ravel()), columns=['id', 'value'])

# output_name = './output/case' + case +'_y2_feature_noise0.1.csv'
# output_pd.to_csv(output_name, index=False)

In [12]:
train_x.shape

(7100, 162)

In [11]:
output_pd

Unnamed: 0,id,value
0,id_0,19.030662
1,id_1,17.898701
2,id_2,29.605728
3,id_3,13.527441
4,id_4,24.370929
...,...,...
495,id_495,8.198283
496,id_496,15.824962
497,id_497,11.367833
498,id_498,8.271553
