In [1]:
# 導入所需使用到的套件

# 導入 Python 進行資料處理之套件 Pandas，並把它另為 pd 
import pandas as pd
# 導入 Python 處理數值之套件 mumpy，並把它另為 np
import numpy as np
# 繪圖
import matplotlib.pyplot as plt
# 導入 Python 處理時間套件
import time

# 統計套件，欲作均勻分布之隨機抽取
from scipy.stats import uniform
# 分割訓練集、測試集、驗證集
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold

In [2]:
df = pd.read_csv('dftrain0420.csv')

In [3]:
# 刪除不要用的 columns
df = df.drop(['Unnamed: 0','土地位置建物門牌','交易年月日','建築完成年月',
             'new_date','address','longitude','latitude'],axis=1)

In [4]:
# check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66479 entries, 0 to 66478
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   鄉鎮市區           66479 non-null  int64  
 1   土地移轉總坪數        66479 non-null  float64
 2   都市土地使用分區       66479 non-null  int64  
 3   移轉層次           66479 non-null  int64  
 4   總樓層數           66479 non-null  int64  
 5   建物型態           66479 non-null  int64  
 6   主要建材           66479 non-null  int64  
 7   建物移轉總坪數        66479 non-null  float64
 8   建物現況格局-房       66479 non-null  int64  
 9   建物現況格局-廳       66479 non-null  int64  
 10  建物現況格局-衛       66479 non-null  int64  
 11  建物現況格局-隔間      66479 non-null  int64  
 12  有無管理組織         66479 non-null  int64  
 13  總價元            66479 non-null  float64
 14  主建物坪數          66479 non-null  float64
 15  附屬建物坪數         66479 non-null  float64
 16  陽台坪數           66479 non-null  float64
 17  土地數量           66479 non-null  int64  
 18  建物數量  

In [5]:
# 設定依變數與自變數，即設定權重與標籤

# 權重
x = df.drop(['總價元'],axis=1)

# 標籤
y = df['總價元']

In [6]:
# 先將 x, y 依照 80%、20% 的比例分割成 訓練集與測試集
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# 再將訓練集依照 75%、25% 的比例分割成 訓練集與驗證集 => 訓練集、驗證集、測試集比例為 60%、20%、20%
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42)

# 將 x_train、x_test、x_val 轉換成 numpy 格式
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
x_val = x_val.to_numpy()

In [7]:
# 將 x_train、x_test、x_val 進行資料標準化
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_val = scaler.transform(x_val)

In [8]:
# 計算誤差之 Function，輸出 RMSE 與 R^2
def compute_errors(y,pred):
    
    mse = mean_squared_error(y, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y, pred)
    
    error_names = ['RMSE', 'R^2']
    errors = [rmse, r2]
    
    for name, error in zip(error_names, errors):
        print(f'{name} : {error:.6f}')

In [9]:
# 紀錄開始時間
start = time.time()

# 模型訓練
train_model1 = SVR().fit(x_train, y_train)

# 紀錄結束時間
end = time.time()

# 顯示執行時間
print(f'Execution time : {end - start :.3f} seconds')

# 顯示預設超參數(所使用超參數)
print(train_model1.get_params())

Execution time : 150.983 seconds
{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [10]:
# 預測測試集
train_pred1 = train_model1.predict(x_test)

# 預測驗證集
val_pred1 = train_model1.predict(x_val)

In [11]:
print('----------Experiment 1----------')
print('Training Error')
compute_errors(y_test, train_pred1)
print('--------------------------------')
print('Valid Error')
compute_errors(y_val, val_pred1)

----------Experiment 1----------
Training Error
RMSE : 252.492223
R^2 : 0.677384
--------------------------------
Valid Error
RMSE : 253.881837
R^2 : 0.673741


In [13]:
# 紀錄開始時間
start = time.time()

# 訓練隨機森林迴歸模型，以取得模型權重做特徵選擇
forest_model = rfr(random_state=42).fit(x_train, y_train)

# 紀錄結束時間
end = time.time()

# 顯示執行時間
print(f'Execution time : {end - start :.3f} seconds')

# 顯示預設超參數(所使用超參數)
print(forest_model.get_params())

Execution time : 196.660 seconds
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [14]:
feature_importances = forest_model.feature_importances_

In [15]:
median_fi = np.median(feature_importances)
x_train_new = x_train[:,feature_importances >= median_fi]
x_test_new = x_test[:,feature_importances >= median_fi]
x_val_new = x_val[:,feature_importances >= median_fi]

In [16]:
median_fi

0.007505298375173299

In [17]:
# 紀錄開始時間
start = time.time()

# 模型訓練
train_model2 = SVR().fit(x_train_new, y_train)

# 紀錄結束時間
end = time.time()

# 顯示執行時間
print(f'Execution time : {end - start :.3f} seconds')

# 顯示預設超參數(所使用超參數)
print(train_model2.get_params())

Execution time : 139.821 seconds
{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [18]:
# 預測測試集
train_pred2 = train_model2.predict(x_test_new)
# 預測驗證集
val_pred2 = train_model2.predict(x_val_new)

In [19]:
print('----------Experiment 2----------')
print('Training Error')
compute_errors(y_test, train_pred2)
print('--------------------------------')
print('Valid Error') 
compute_errors(y_val, val_pred2)

----------Experiment 2----------
Training Error
RMSE : 243.192237
R^2 : 0.700712
--------------------------------
Valid Error
RMSE : 243.782654
R^2 : 0.699182


In [251]:
start = time.time()

cv = KFold(n_splits=10, shuffle=True, random_state=42)
scoring='neg_mean_squared_error'
param_dict = {
    'kernel':['linear','poly','rbf','sigmoid'],
    'C': uniform(1,15),
    'epsilon':uniform(0.1, 1),
    'gamma':['scale','auto'],
    'coef0':unifrom(-1,1)
}

rscv = RandomizedSearchCV(SVR(), param_dict, n_iter=20, cv=cv, scoring=scoring, random_state=42)
rscv.fit(x_train_new, y_train)

print('The best model score :', np.sqrt(-rscv.best_score_))
print('The best model param :', rscv.best_params_)

end = time.time()

print(f'Execution time : {end - start :.3f} seconds')

The best model score : 200.60464398923034
The best model param : {'C': 15.233283058799998, 'epsilon': 1.0656320330745594, 'gamma': 'auto', 'kernel': 'rbf'}
Execution time : 14243.901 seconds


In [252]:
kernel, gamma = rscv.best_params_['kernel'], rscv.best_params_['gamma']
c, e = rscv.best_params_['C'], rscv.best_params_['epsilon']

In [253]:
start = time.time()

cv = KFold(n_splits=10, shuffle=True, random_state=42)

param_grid = {
    'kernel':[kernel],
    'C':[2*c/3, c, c*3/2],
    'epsilon':[2*e/3, e, e*3/2],
    'gamma':[gamma]
}


gscv = GridSearchCV(SVR(), param_grid, cv=cv, scoring='neg_mean_squared_error').fit(x_train_new, y_train)

print('The best model score : ', np.sqrt(-gscv.best_score_))
print('The best model param :', gscv.best_params_)

end = time.time()

print(f'Execution time : {end - start :.3f} seconds')

The best model score :  197.37914619398398
The best model param : {'C': 22.849924588199997, 'epsilon': 1.5984480496118392, 'gamma': 'auto', 'kernel': 'rbf'}
Execution time : 7009.077 seconds


In [254]:
svr_param = {'kernel': gscv.best_params_['kernel'],
             'C': gscv.best_params_['C'],
             'epsilon': gscv.best_params_['epsilon'],
             'gamma': gscv.best_params_['gamma']}

In [255]:
# 紀錄開始時間
start = time.time()

train_model3 = SVR(**svr_param).fit(x_train_new, y_train)


# 紀錄結束時間
end = time.time()

# 顯示執行時間
print(f'Execution time : {end - start :.6f} seconds')

Execution time : 82.338584 seconds


In [256]:
# 預測測試集
train_pred3 = train_model3.predict(x_test_new)
# 預測驗證集
val_pred3 = train_model3.predict(x_val_new)

In [257]:
print('----------Experiment 3----------')
print('Training Error')
compute_errors(y_test, train_pred3)
print('-------------------------------- ')         
compute_errors(y_val, val_pred3)
print('--------------------------------')

----------Experiment 3----------
Training Error
RMSE : 197.495173
R^2 : 0.802620
-------------------------------- 
RMSE : 195.629884
R^2 : 0.806282
--------------------------------


In [258]:
df112 = pd.read_csv('dfvalid0420.csv')
df112.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15916 entries, 0 to 15915
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     15916 non-null  int64  
 1   鄉鎮市區           15916 non-null  int64  
 2   土地位置建物門牌       15916 non-null  object 
 3   土地移轉總坪數        15916 non-null  float64
 4   都市土地使用分區       15916 non-null  int64  
 5   交易年月日          15916 non-null  object 
 6   移轉層次           15916 non-null  int64  
 7   總樓層數           15916 non-null  int64  
 8   建物型態           15916 non-null  int64  
 9   主要用途           15916 non-null  object 
 10  主要建材           15916 non-null  int64  
 11  建築完成年月         15916 non-null  object 
 12  建物移轉總坪數        15916 non-null  float64
 13  建物現況格局-房       15916 non-null  int64  
 14  建物現況格局-廳       15916 non-null  int64  
 15  建物現況格局-衛       15916 non-null  int64  
 16  建物現況格局-隔間      15916 non-null  int64  
 17  有無管理組織         15916 non-null  int64  
 18  總價元   

In [259]:
df112 = df112.drop(['Unnamed: 0','土地位置建物門牌','交易年月日','建築完成年月','主要用途','車位數量',
             'new_date','address','longitude','latitude'],axis=1)

In [260]:
new_data = df112.drop(['總價元'],axis=1)

In [261]:
selected_columns = new_data.columns[feature_importances >= median_fi]
new_data = new_data[selected_columns]

In [262]:
new_data = new_data.to_numpy()
scaler = StandardScaler().fit(new_data)
new_data = scaler.transform(new_data)

In [263]:
start = time.time()
new_data_pred = model.predict(new_data)

print('Test new data')
print(compute_errors(df112['總價元'], new_data_pred))

end = time.time()

print(f'Execution time : {end - start :.3f} seconds')

Test new data
RMSE : 290.015641
R^2 : 0.668997
None
Execution time : 59.440 seconds
