In [23]:
import os
from math import sqrt

import numpy as np
import pandas as pd
import pymysql
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

## 取得資料

In [24]:
# Load the Taoyuan air-quality measurements.
# The default is the original author's local path; set the TAOYUAN_CSV
# environment variable to point at the file on any other machine instead of
# editing this cell.
TAOYUAN_CSV_PATH = os.environ.get(
    "TAOYUAN_CSV", "C:/Users/user/OneDrive/桌面/Air/data/Taoyuan.csv"
)
df = pd.read_csv(TAOYUAN_CSV_PATH)

In [25]:
## Remove wind speed and wind direction
# DataFrame.drop returns a new frame, and errors="ignore" makes the cell
# idempotent: running it a second time no longer raises KeyError the way a
# repeated `del df[...]` does.
df = df.drop(columns=["WindDirec", "WindSpeed"], errors="ignore")

## 處理遺失值
> 暫時先將遺失值刪除

In [26]:
# Count the missing values per column *before* dropping them. The counts are
# kept in a variable and placed last in the cell so the notebook actually
# displays them (a bare expression in the middle of a cell is discarded).
missing_counts = df.isna().sum()

# Temporary strategy: discard every row that contains at least one missing value.
df1 = df.dropna()

missing_counts

In [27]:
# Positional indices of the predictor columns in df1.
feature_positions = [3, 4, 5, 8, 9, 10, 12, 13]
X = df1.iloc[:, feature_positions]

# Target: PM2.5 (column position 7). The 7:8 slice keeps the result 2-D,
# i.e. a single-column array of shape (n_samples, 1).
y = df1.iloc[:, 7:8].to_numpy()

In [6]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [29]:
# Display the selected feature matrix (the rendered output elides the middle rows).
X

Unnamed: 0,SO2,CO,O3,NO2,NOx,NO,Temp,Humidity
0,9.6,0.43,17.2,16.5,18.1,1.7,23.0,98.0
1,5.3,0.39,16.6,14.7,16.2,1.5,23.1,98.0
2,4.4,0.48,7.5,23.8,24.3,0.3,23.0,99.0
3,4.5,0.44,6.0,23.0,24.0,1.0,22.8,98.0
5,3.4,0.55,2.8,21.1,27.9,6.8,23.5,97.0
...,...,...,...,...,...,...,...,...
4387,0.8,0.53,45.6,14.7,16.1,1.4,18.2,73.0
4388,0.6,0.58,39.5,12.7,14.0,1.3,17.8,68.0
4389,0.8,0.47,40.7,10.3,11.6,1.2,17.5,66.0
4390,0.7,0.43,37.6,10.5,12.2,1.7,17.0,61.0


## 標準化
> 將所有特徵標準化 (z-score)，使每個特徵的平均值為 0、標準差為 1；標準化只改變尺度，不會改變資料的分布形狀，因此結果不一定是標準常態分配。  
> 適合使用時機：當各特徵的尺度差異很大時，標準化可避免尺度較大的特徵主導模型 (SVR 對特徵尺度特別敏感)，也有助於模型更快收斂；但標準化本身並不會消除離群值的影響。

In [7]:
# One scaler for the features and one for the target, both fitted on the
# training split only.
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)

# Standardized training data used to fit the SVR models.
x_train_svr = sc_x.transform(X_train)
y_train_svr = sc_y.transform(y_train)

## 建模

* C: 決定給誤差/被分錯的資料「多少」懲罰值
> C 越大，代表容錯越小，卻容易overfitting  
> C 越小，代表容錯越大，可以追求更大的margin
* degree: 多項式核函數 (kernel='poly') 的次數，次數越高模型越複雜；3 代表使用三次多項式進行擬合。此參數只對 poly 核有效，其他核函數會忽略它。
* gamma: 數值越大越能做複雜的分類邊界。
> gamma 大，資料點的影響力範圍比較近，對超平面來說，近點的影響力權重較大，容易勾勒出擬合近點的超平面，也容易造成 overfitting  
> gamma 小，資料點的影響力範圍比較遠，對超平面來說，較遠的資料點也有影響力，因此能勾勒出平滑、近似直線的超平面  
* epsilon = margin of tolerance
> 越大，代表容忍區塊越大，越多資料會被忽視，造成模型的準確度越低  
> 越小越接近 0，所有的資料殘差(error)都會被考慮，卻也容易造成 overfitting

## 評估迴歸模型
* RMSLE (Root mean square logarithmic error)  
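> RMSLE 是取對數後計算的 RMSE：$\mathrm{RMSLE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log(\hat{y}_i + 1) - \log(y_i + 1)\right)^2}$。與 MSPE、MAPE 一樣考慮相對誤差，但它的誤差曲線具有不對稱性 (低估所受的懲罰比高估更重)。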

In [8]:
def rmsle(real, predicted):
    """Root mean squared logarithmic error between two sets of values.

    Computes sqrt(mean((log(predicted + 1) - log(real + 1)) ** 2)).

    Pairs in which either value is negative are skipped, because log(v + 1)
    is not meaningful for them. The mean is taken over the pairs that were
    actually used, so skipped pairs no longer count as zero-error samples.

    Parameters
    ----------
    real : array-like
        Observed values. May be 1-D or a column vector of shape (n, 1).
    predicted : array-like
        Predicted values, with the same number of elements as ``real``.

    Returns
    -------
    float
        The RMSLE, or ``nan`` when no valid pair remains (empty input, or
        every pair contains a negative value).

    Raises
    ------
    ValueError
        If ``real`` and ``predicted`` do not contain the same number of
        elements.
    """
    # Flatten so that 1-D arrays, column vectors and plain lists all work.
    real_values = np.asarray(real, dtype=float).ravel()
    predicted_values = np.asarray(predicted, dtype=float).ravel()
    if real_values.shape != predicted_values.shape:
        raise ValueError(
            "real and predicted must contain the same number of elements"
        )

    # Keep exactly the pairs the element-wise `< 0` test does not reject
    # (NaN compares False, so NaN pairs are kept and propagate to the result).
    usable = ~((predicted_values < 0) | (real_values < 0))
    if not usable.any():
        return float("nan")

    log_error = np.log1p(predicted_values[usable]) - np.log1p(real_values[usable])
    return float(np.sqrt(np.mean(log_error ** 2)))

1. test data  
2. training data

### SVR

#### linear

In [9]:
############################################################ Modelling ############################################################
# Support vector regressor with a linear kernel and penalty C = 1.
svr_linear = SVR(kernel="linear", C=1)
# Fit on the standardized training data. The target is flattened to 1-D
# because SVR.fit expects shape (n_samples,); passing the (n, 1) column
# vector triggers a DataConversionWarning.
svr_linear.fit(x_train_svr, y_train_svr.ravel())

# The model predicts in standardized target units. predict() returns a 1-D
# array whereas the scaler operates on 2-D input, so the predictions are
# reshaped into a column, mapped back to the original target scale with
# inverse_transform, and flattened again.
svr_linear_y_predict = sc_y.inverse_transform(
    svr_linear.predict(sc_x.transform(X_test)).reshape(-1, 1)
).ravel()
# Training-set predictions are computed once and shared by all four metrics.
svr_linear_train_y_predict = sc_y.inverse_transform(
    svr_linear.predict(sc_x.transform(X_train)).reshape(-1, 1)
).ravel()

###################################################### Regression evaluation ######################################################
### test data
RMSE_svr_linear = sqrt(metrics.mean_squared_error(y_test, svr_linear_y_predict))
MAE_svr_linear = metrics.mean_absolute_error(y_test, svr_linear_y_predict)
R2_svr_linear = metrics.r2_score(y_test, svr_linear_y_predict)
RMSLE_svr_linear = rmsle(y_test, svr_linear_y_predict)

### training data
svr_train_y_RMSE_linear = sqrt(metrics.mean_squared_error(y_train, svr_linear_train_y_predict))
svr_train_y_MAE_linear = metrics.mean_absolute_error(y_train, svr_linear_train_y_predict)
svr_train_y_R2_linear = metrics.r2_score(y_train, svr_linear_train_y_predict)
svr_train_y_RMSLE_linear = rmsle(y_train, svr_linear_train_y_predict)

  return f(*args, **kwargs)


#### poly

In [10]:
############################################################ Modelling ############################################################
# Support vector regressor with a degree-3 polynomial kernel,
# gamma='auto' and penalty C = 1.
svr_poly = SVR(kernel="poly", degree=3, gamma="auto", C=1)
# Fit on the standardized training data. The target is flattened to 1-D
# because SVR.fit expects shape (n_samples,); passing the (n, 1) column
# vector triggers a DataConversionWarning.
svr_poly.fit(x_train_svr, y_train_svr.ravel())

# The model predicts in standardized target units. predict() returns a 1-D
# array whereas the scaler operates on 2-D input, so the predictions are
# reshaped into a column, mapped back to the original target scale with
# inverse_transform, and flattened again.
svr_poly_y_predict = sc_y.inverse_transform(
    svr_poly.predict(sc_x.transform(X_test)).reshape(-1, 1)
).ravel()
# Training-set predictions are computed once and shared by all four metrics.
svr_poly_train_y_predict = sc_y.inverse_transform(
    svr_poly.predict(sc_x.transform(X_train)).reshape(-1, 1)
).ravel()

###################################################### Regression evaluation ######################################################
### test data
RMSE_svr_poly = sqrt(metrics.mean_squared_error(y_test, svr_poly_y_predict))
MAE_svr_poly = metrics.mean_absolute_error(y_test, svr_poly_y_predict)
R2_svr_poly = metrics.r2_score(y_test, svr_poly_y_predict)
RMSLE_svr_poly = rmsle(y_test, svr_poly_y_predict)

### training data
svr_train_y_RMSE_poly = sqrt(metrics.mean_squared_error(y_train, svr_poly_train_y_predict))
svr_train_y_MAE_poly = metrics.mean_absolute_error(y_train, svr_poly_train_y_predict)
svr_train_y_R2_poly = metrics.r2_score(y_train, svr_poly_train_y_predict)
svr_train_y_RMSLE_poly = rmsle(y_train, svr_poly_train_y_predict)

  return f(*args, **kwargs)


#### rbf

In [11]:
############################################################ Modelling ############################################################
# Support vector regressor with an RBF kernel, gamma = 0.1 and penalty C = 15.
# NOTE: `degree` is only used by the 'poly' kernel, so it has no effect here.
svr_rbf = SVR(kernel="rbf", degree=3, gamma=0.1, C=15)
# Fit on the standardized training data. The target is flattened to 1-D
# because SVR.fit expects shape (n_samples,); passing the (n, 1) column
# vector triggers a DataConversionWarning.
svr_rbf.fit(x_train_svr, y_train_svr.ravel())

# The model predicts in standardized target units. predict() returns a 1-D
# array whereas the scaler operates on 2-D input, so the predictions are
# reshaped into a column, mapped back to the original target scale with
# inverse_transform, and flattened again.
svr_rbf_y_predict = sc_y.inverse_transform(
    svr_rbf.predict(sc_x.transform(X_test)).reshape(-1, 1)
).ravel()
# Training-set predictions are computed once and shared by all four metrics.
svr_rbf_train_y_predict = sc_y.inverse_transform(
    svr_rbf.predict(sc_x.transform(X_train)).reshape(-1, 1)
).ravel()

###################################################### Regression evaluation ######################################################
### test data
RMSE_svr_rbf = sqrt(metrics.mean_squared_error(y_test, svr_rbf_y_predict))
MAE_svr_rbf = metrics.mean_absolute_error(y_test, svr_rbf_y_predict)
R2_svr_rbf = metrics.r2_score(y_test, svr_rbf_y_predict)
RMSLE_svr_rbf = rmsle(y_test, svr_rbf_y_predict)

### training data
svr_train_y_RMSE_rbf = sqrt(metrics.mean_squared_error(y_train, svr_rbf_train_y_predict))
svr_train_y_MAE_rbf = metrics.mean_absolute_error(y_train, svr_rbf_train_y_predict)
svr_train_y_R2_rbf = metrics.r2_score(y_train, svr_rbf_train_y_predict)
svr_train_y_RMSLE_rbf = rmsle(y_train, svr_rbf_train_y_predict)

  return f(*args, **kwargs)


### Random Forest
> random forest with 500 trees

In [14]:
############################################################ Modelling ############################################################
# Random forest of 500 trees, trained on the unscaled features; the fixed
# random_state keeps the fitted forest reproducible.
rf_reg = RandomForestRegressor(n_estimators=500, random_state=0)
# The target is flattened to 1-D because fit expects shape (n_samples,);
# passing the (n, 1) column vector triggers a DataConversionWarning.
rf_reg.fit(X_train, y_train.ravel())

rf_y_predict = rf_reg.predict(X_test)
# Predicting with 500 trees is not free: compute the training-set predictions
# once and reuse them for all four metrics.
rf_train_y_predict = rf_reg.predict(X_train)

###################################################### Regression evaluation ######################################################
### test data
RMSE_rf = sqrt(metrics.mean_squared_error(y_test, rf_y_predict))
MAE_rf = metrics.mean_absolute_error(y_test, rf_y_predict)
R2_rf = metrics.r2_score(y_test, rf_y_predict)
RMSLE_rf = rmsle(y_test, rf_y_predict)

### training data
rf_ytp_RMSE = sqrt(metrics.mean_squared_error(y_train, rf_train_y_predict))
rf_ytp_MAE = metrics.mean_absolute_error(y_train, rf_train_y_predict)
rf_ytp_R2 = metrics.r2_score(y_train, rf_train_y_predict)
rf_ytp_RMSLE = rmsle(y_train, rf_train_y_predict)

  rf_reg.fit(X_train,y_train)


## 比較表

In [15]:
# Test-set comparison table: one column per model, one row per metric,
# rounded to three decimals.
test_data_evalue = {
    "linear": [RMSE_svr_linear, MAE_svr_linear, R2_svr_linear, RMSLE_svr_linear],
    "poly": [RMSE_svr_poly, MAE_svr_poly, R2_svr_poly, RMSLE_svr_poly],
    "rbf": [RMSE_svr_rbf, MAE_svr_rbf, R2_svr_rbf, RMSLE_svr_rbf],
    "Random Forest": [RMSE_rf, MAE_rf, R2_rf, RMSLE_rf],
}
test_evalue = pd.DataFrame(
    test_data_evalue,
    index=["RMSE", "MAE", "R2", "RMSLE"],
).round(3)
test_evalue

Unnamed: 0,linear,poly,rbf,Random Forest
RMSE,5.447,6.219,4.769,4.669
MAE,3.785,3.8,3.312,3.267
R2,0.296,0.082,0.46,0.483
RMSLE,0.438,0.432,0.386,0.389


In [16]:
# Training-set comparison table: one column per model, one row per metric,
# rounded to three decimals.
train_data_evalue = {
    "linear": [
        svr_train_y_RMSE_linear,
        svr_train_y_MAE_linear,
        svr_train_y_R2_linear,
        svr_train_y_RMSLE_linear,
    ],
    "poly": [
        svr_train_y_RMSE_poly,
        svr_train_y_MAE_poly,
        svr_train_y_R2_poly,
        svr_train_y_RMSLE_poly,
    ],
    "rbf": [
        svr_train_y_RMSE_rbf,
        svr_train_y_MAE_rbf,
        svr_train_y_R2_rbf,
        svr_train_y_RMSLE_rbf,
    ],
    "Random Forest": [rf_ytp_RMSE, rf_ytp_MAE, rf_ytp_R2, rf_ytp_RMSLE],
}
train_evalue = pd.DataFrame(
    train_data_evalue,
    index=["RMSE", "MAE", "R2", "RMSLE"],
).round(3)
train_evalue

Unnamed: 0,linear,poly,rbf,Random Forest
RMSE,4.656,4.807,3.63,1.496
MAE,3.461,3.396,2.494,1.103
R2,0.381,0.34,0.624,0.936
RMSLE,0.419,0.42,0.341,0.163
