## Noise adding for regularization

In [1]:
import math
import numpy as np
import pandas as pd



For data preprocessing, we first handle anomalous entries, i.e. values that are in a wrong or invalid format.

In [2]:
def readdata(data):
    """Turn the raw measurement table into a float matrix.

    The first two columns of ``data`` are dropped; every remaining cell is
    converted to text, stripped of trailing ``x``, ``*``, ``#`` and ``A``
    marker characters, and parsed as a float. Cells equal to ``'NR'``, the
    empty string or ``'nan'`` (and any still-missing cell) become 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table as returned by ``pd.read_csv``. It is NOT modified.

    Returns
    -------
    numpy.ndarray
        2-D float array with one column per value column of ``data``.
    """
    # Work on a copy of the value columns so the caller's frame is untouched.
    cleaned = data.iloc[:, 2:].astype(str)

    # Remove the odd marker characters that trail some numbers.
    for col in cleaned.columns:
        cleaned[col] = cleaned[col].str.rstrip('x*#A')
    values = cleaned.to_numpy(dtype=object)

    # Special / missing values are replaced by 0.
    values[values == 'NR'] = 0
    values[values == ''] = 0
    values[values == 'nan'] = 0
    values[pd.isna(values)] = 0

    # The builtin float replaces the np.float alias removed from NumPy.
    return values.astype(float)

We concatenate the data horizontally into a single wide matrix in which each row is one feature and each column is one hour.

In [3]:
def extract(data):
    """Lay consecutive 18-row chunks of ``data`` side by side.

    ``data`` is cut into ``data.shape[0] // 18`` chunks of 18 rows each
    (trailing rows that do not fill a whole chunk are ignored) and the chunks
    are concatenated along the column axis.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose rows come in groups of 18.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(18, n_chunks * data.shape[1])``. When ``data`` has
        fewer than 18 rows, all of its rows are returned unchanged in value.
    """
    n_chunks = data.shape[0] // 18

    # Always keep the first chunk (even if short), as the original loop did.
    chunks = [data[k * 18:(k + 1) * 18, :] for k in range(max(n_chunks, 1))]

    # One concatenation instead of re-copying the growing result every step.
    return np.hstack(chunks)

Some PM2.5 data points have anomalous values that strongly affect the training result, so we discard them. In our case, a PM2.5 value is treated as anomalous when it is ≤ 2 or > 100.

In [4]:
def valid(x, y):
    """Return True when the target and the nine row-9 inputs are in range.

    A value is rejected when it is ``<= 2`` or ``> 100``. ``y`` is checked
    first; then ``x[9, 0]`` through ``x[9, 8]`` are checked in order.
    """
    def out_of_range(value):
        return value <= 2 or value > 100

    # Reject on the target before touching the window at all.
    if out_of_range(y):
        return False
    return not any(out_of_range(x[9, hour]) for hour in range(9))

def parse2train(data):
    """Build sliding-window training samples from the wide feature matrix.

    Every window of 9 consecutive columns is a candidate input; its target is
    the row-9 value of the column that follows the window (the notebook text
    refers to this quantity as PM2.5). Candidates rejected by ``valid`` are
    skipped.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``x`` holds one flattened window per row, ``y`` the matching targets.
    """
    features = []
    targets = []

    # Nine columns are consumed as input, so the last 9 cannot start a window.
    n_windows = data.shape[1] - 9
    for start in range(n_windows):
        stop = start + 9
        window = data[:, start:stop]
        target = data[9, stop]
        if not valid(window, target):
            continue
        features.append(window.reshape(-1,))
        targets.append(target)

    # Each row of x is a flattened window; y is one-dimensional.
    return np.array(features), np.array(targets)

def parse2test(data):
    """Cut the wide matrix into non-overlapping 9-column test samples.

    ``data.shape[1] // 9`` samples are produced; each is the flattened block
    of 9 consecutive columns. Leftover columns are ignored.

    Returns
    -------
    numpy.ndarray
        One flattened sample per row.
    """
    n_samples = data.shape[1] // 9
    samples = [
        data[:, 9 * k:9 * (k + 1)].reshape(-1,)
        for k in range(n_samples)
    ]
    return np.array(samples)

This is our mini-batch gradient descent routine, implemented with the **Adam** optimizer.

In [5]:
def minibatch(x, y, batch_size=64, lr=1e-3, lam=0.001, epochs=1000):
    """Fit a linear model ``y ~ x @ w + bias`` with mini-batch Adam.

    The objective per batch is the summed squared residual plus an L2 penalty
    ``lam * sum(w ** 2)`` on the weights (the bias is not penalised).

    Parameters
    ----------
    x : numpy.ndarray
        2-D array, one sample per row.
    y : numpy.ndarray
        Targets, one per row of ``x``.
    batch_size : int, optional
        Samples per update. A trailing partial batch is skipped.
    lr : float, optional
        Adam step size.
    lam : float, optional
        L2 regularisation strength.
    epochs : int, optional
        Number of passes over the (once-shuffled) data.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``w`` with shape ``(n_features, 1)`` and ``bias`` with shape ``(1,)``.
    """
    # Shuffle the samples once, using NumPy's global RNG.
    order = np.arange(x.shape[0])
    np.random.shuffle(order)
    x = x[order]
    y = y[order]

    # Adam constants and state.
    beta_1 = 0.9
    beta_2 = 0.99
    epsilon = 1e-8
    n_features = x.shape[1]
    w = np.full((n_features, 1), 0.1)
    bias = np.full(1, 0.1)
    m_w = np.zeros((n_features, 1))
    v_w = np.zeros((n_features, 1))
    m_b = np.zeros(1)
    v_b = np.zeros(1)
    step = 0

    n_batches = x.shape[0] // batch_size
    for _ in range(epochs):
        for b in range(n_batches):
            step += 1
            rows = slice(b * batch_size, (b + 1) * batch_size)
            x_batch = x[rows]
            y_batch = y[rows].reshape(-1, 1)
            residual = y_batch - np.dot(x_batch, w) - bias

            # Gradients of sum(residual**2) + lam * sum(w**2).
            # The penalty gradient is element-wise in w, and the bias gradient
            # carries the same minus sign as the weight gradient.
            grad_w = -2 * np.dot(x_batch.transpose(), residual) + 2 * lam * w
            grad_b = -2 * residual.sum(axis=0)

            # Biased first/second moment estimates.
            m_w = beta_1 * m_w + (1 - beta_1) * grad_w
            v_w = beta_2 * v_w + (1 - beta_2) * grad_w * grad_w
            m_b = beta_1 * m_b + (1 - beta_1) * grad_b
            v_b = beta_2 * v_b + (1 - beta_2) * grad_b * grad_b

            # Bias-corrected estimates.
            m_w_hat = m_w / (1 - beta_1 ** step)
            v_w_hat = v_w / (1 - beta_2 ** step)
            m_b_hat = m_b / (1 - beta_1 ** step)
            v_b_hat = v_b / (1 - beta_2 ** step)

            # Parameter update.
            w -= lr * m_w_hat / (np.sqrt(v_w_hat) + epsilon)
            bias -= lr * m_b_hat / (np.sqrt(v_b_hat) + epsilon)

    return w, bias

In [6]:
def read_once(path):
    """Load one CSV file and return its ``(x, y)`` training arrays.

    The file is read with ``pd.read_csv`` and passed through ``readdata``,
    ``extract`` and ``parse2train`` in that order.
    """
    return parse2train(extract(readdata(pd.read_csv(path))))
    

def read_once2(path1, path2):
    """Load two CSV files and return their stacked ``(x, y)`` training arrays.

    Each file goes through ``pd.read_csv`` -> ``readdata`` -> ``extract`` ->
    ``parse2train``; the samples of ``path1`` come first in the result.
    """
    x_parts = []
    y_parts = []
    for csv_path in (path1, path2):
        wide = extract(readdata(pd.read_csv(csv_path)))
        x_part, y_part = parse2train(wide)
        x_parts.append(x_part)
        y_parts.append(y_part)

    # Concatenate the two years sample-wise.
    return np.vstack(x_parts), np.concatenate(y_parts)
# **Combine them together!**

def read_test(path3):
    """Load the test CSV at ``path3`` and return its sample matrix.

    The file is read with ``pd.read_csv`` and passed through ``readdata``,
    ``extract`` and ``parse2test`` in that order.
    """
    return parse2test(extract(readdata(pd.read_csv(path3))))

def read_item_list(path):
    """Return the distinct values of the CSV's second column, first-seen order."""
    second_column = pd.read_csv(path).iloc[:, 1]
    # A dict keeps insertion order, so its keys are the de-duplicated values.
    return list(dict.fromkeys(second_column))

# Read data

In [7]:
# Seed NumPy's global random generator so that steps relying on it are repeatable.
np.random.seed(0)
path = './data/year2-data.csv'
# Distinct values of the CSV's second column, in order of first appearance.
item_list = read_item_list(path)

# Same steps as read_once(path), written out so that the intermediate
# `train_data` matrix stays available to later cells.
year_pd = pd.read_csv(path)
year = readdata(year_pd)
train_data = extract(year)
train_x, train_y = parse2train(train_data)


# Adding noise in data as regularization

In [8]:
# Variance of every row of the wide matrix (one value per feature row).
[np.var(feature_row) for feature_row in train_data]

[38.43047691861617,
 0.027521032583765977,
 0.059790400636715244,
 0.01164623556118513,
 36.49385222355142,
 102.65542733666418,
 202.25939268103252,
 369.1745158769626,
 590.4693430078604,
 7717.303667550197,
 4.2946946263630865,
 197.6025015319739,
 5.525599986317008,
 0.05338167208773795,
 6141.835352129335,
 6413.486185131539,
 1.0198066240017931,
 0.8722428033975521]

# Tree

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Default decision tree with a fixed seed.
Tree_regressor = DecisionTreeRegressor(random_state=0)
# cross_val_score(Tree_regressor, train_x, train_y, cv=10)
Tree_regressor.fit(train_x, train_y)

# Load the test samples and predict one value per sample.
test_path = './data/testing_data.csv'
test_x = read_test(test_path)
pred_y = Tree_regressor.predict(test_x)
id_list = ["id_{}".format(i) for i in range(len(test_x))]

# Assemble the two-column table and write it out without the index column.
output_pd = pd.DataFrame({'id': id_list, 'value': pred_y.ravel()},
                         columns=['id', 'value'])
output_name = './output/y2_tree_default.csv'
output_pd.to_csv(output_name, index=False)

In [20]:
# Display the table that was just written to `output_name`.
output_pd

Unnamed: 0,id,value
0,id_0,24.0
1,id_1,11.0
2,id_2,41.0
3,id_3,20.0
4,id_4,27.0
...,...,...
495,id_495,11.0
496,id_496,15.0
497,id_497,12.0
498,id_498,9.0


In [17]:
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): make_regression is imported but not used in this cell.
from sklearn.datasets import make_regression

# Shallow forest: 100 trees, each limited to depth 2, fixed seed.
Rand_forest_regr = RandomForestRegressor(max_depth=2, random_state=0,
                             n_estimators=100)

# 10-fold cross-validation on the training set; one score per fold is displayed.
# Relies on cross_val_score having been imported by an earlier cell.
cross_val_score(Rand_forest_regr, train_x, train_y, cv=10)
# Rand_forest_regr.fit(train_x, train_y)


array([0.74935575, 0.75268889, 0.76563788, 0.74492853, 0.66472086,
       0.63473952, 0.69333062, 0.76748318, 0.72358738, 0.80954346])

In [11]:
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost with 100 estimators and a fixed seed.
ada_boost_regr = AdaBoostRegressor(random_state=0, n_estimators=100)
# 10-fold cross-validation on the training set; one score per fold is displayed.
# Relies on cross_val_score having been imported by an earlier cell.
cross_val_score(ada_boost_regr, train_x, train_y, cv=10)


# ada_boost_regr.fit(train_x, train_y)  
# ada_boost_regr.feature_importances_  
# ada_boost_regr.score(X, y)  

array([0.76450798, 0.75357532, 0.76597157, 0.71782487, 0.19144676,
       0.29981248, 0.53668613, 0.65906242, 0.44638515, 0.59183664])

In [12]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
# Base learners: plain linear regression and a 10-tree random forest.
r1 = LinearRegression()
r2 = RandomForestRegressor(n_estimators=10, random_state=1)


# Ensemble that combines the two base learners' predictions.
er = VotingRegressor([('lr', r1), ('rf', r2)])
# 10-fold cross-validation on the training set; one score per fold is displayed.
# Relies on cross_val_score having been imported by an earlier cell.
cross_val_score(er, train_x, train_y, cv=10)

# print(er.fit(X, y).predict(X))

array([0.84972775, 0.82459689, 0.79439089, 0.7972529 , 0.75481918,
       0.7749639 , 0.80292446, 0.85333585, 0.83318711, 0.88902308])

In [16]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Ordinary linear regression with default settings.
lin_reg = LinearRegression()

# 10-fold cross-validation on the training set; one score per fold is displayed.
# Relies on cross_val_score having been imported by an earlier cell.
cross_val_score(lin_reg, train_x, train_y, cv=10)

# lin_reg.score(train_x, y)


array([0.85676701, 0.82525482, 0.79285547, 0.79573328, 0.74869566,
       0.79259722, 0.80740053, 0.85774307, 0.84776547, 0.89180665])

In [13]:
# (number of training samples, number of flattened features per sample)
train_x.shape

(7100, 162)

In [14]:
# from scipy import stats
# stats.describe(train_data[0,::])