In [1]:
%matplotlib inline
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

In [2]:
# Let pandas display up to 120 columns and 120 rows before truncating output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 120)

In [3]:
def clean_and_drop(df):
    """Filter the transaction table to residential sales and drop unused columns.

    Rows must pass every filter below to be kept. Columns that are constant
    after filtering, and columns that are not used as features, are dropped.
    Finally the categorical features are cast to the ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table that still contains every column referenced here.
        Calling this function on its own output raises ``KeyError`` because
        the dropped columns no longer exist.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame. The input frame is not modified.
    """
    # Keep only transactions whose usage includes residential living.
    df = df.loc[df['Main_Usage_Living'] == 1]
    df = df.drop(columns=['Main_Usage_Living'])

    # Dropped because these columns are all 0 (per the original author's note).
    df = df.drop(columns=['Non_City_Land_Usage', 'Main_Usage_Walk',
                          'Main_Usage_Selling',
                          'Main_Usage_SnE'])

    # Only 344 rows include manufacturing usage and none of them is
    # residential, so remove them.
    df = df.loc[df['Main_Usage_Manufacturing'] == 0]
    df = df.drop(columns=['Main_Usage_Manufacturing'])

    # Only 76 rows include parking usage and none of them is residential,
    # so remove them.
    df = df.loc[df['Main_Usage_Parking'] == 0]
    df = df.drop(columns=['Main_Usage_Parking'])

    # Only 78 rows include farm usage and none of them is residential,
    # so remove them.
    df = df.loc[df['Main_Usage_Farm'] == 0]
    df = df.drop(columns=['Main_Usage_Farm'])

    # NOTICE: budget constraint -- only consider properties with fewer than 6 rooms.
    df = df.loc[df['room'] < 6]

    # Keep only transactions that cover exactly one floor. (Original note:
    # a trading_floors_count of 0 is presumably not a house transaction.)
    df = df.loc[df['trading_floors_count'] == 1]

    # 95 samples include a basement, but that is too few to generalize from,
    # so remove them; the column is then all 0 and can be dropped.
    df = df.loc[df['including_basement'] == 0]
    df = df.drop(columns=['including_basement'])

    # No sample includes an arcade (per the original author's note), so the
    # feature carries no information.
    df = df.drop(columns=['including_arcade'])

    # Remove rows whose traded floor height is -1 (originally a single sample).
    df = df.loc[df['min_floors_height'] != -1]

    # Remove rows where the building is recorded as having 0 floors.
    df = df.loc[df['building_total_floors'] != 0]

    # Only 22 rows have a parking area of 50 ping or more, so remove them.
    # A threshold of 49.5 is used rather than comparing floats against 50.0.
    df = df.loc[df['Parking_Area'] < 49.5]

    # Remove farmhouses and factory/office buildings (type codes 8 and above).
    df = df.loc[df['Building_Types'] < 8]

    # Remove transactions with a very large transferred area.
    df = df.loc[df['Transfer_Total_Ping'] < 150]

    # area_m2 is dropped because it appears to mean much the same as
    # area_ping (the original author was unsure whether they differ slightly).
    # manager is always 0 in the future data, so it is dropped as well.
    df = df.drop(columns=['address', 'area_m2', 'manager', 'Building_Material_stone',
                          'TDATE', 'Total_price', '編號'])

    # Convert the categorical features' dtype to 'category'.
    category_columns = ['Type', 'Month', 'Month_raw',
                        'City_Land_Usage', 'Main_Usage_Business',
                        'Building_Material_S', 'Building_Material_R', 'Building_Material_C',
                        'Building_Material_steel', 'Building_Material_B',
                        'Building_Material_W', 'Building_Material_iron',
                        'Building_Material_tile', 'Building_Material_clay',
                        'Building_Material_RC_reinforce',
                        'Parking_Space_Types', 'Building_Types']
    # Assigning through ``df.loc[:, cols] = ...`` writes the values back into
    # the existing columns and can leave their dtype unchanged. ``astype`` with
    # a mapping returns a new frame that really carries the category dtype.
    return df.astype({column: 'category' for column in category_columns})

In [10]:
def simple_drop(df):
    """Return a copy of ``df`` without the columns the baseline does not use.

    Only 'address', 'TDATE', 'Total_price' and '編號' are removed; no rows are
    filtered. Raises ``KeyError`` if any of those columns is missing.
    """
    unused_columns = ['address', 'TDATE', 'Total_price', '編號']
    return df.drop(columns=unused_columns)

In [5]:
def split_features_target(df):
    """Split ``df`` into a feature frame and the 'Unit_Price_Ping' target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the target column and ``y``
        is the target column itself.
    """
    target_column = 'Unit_Price_Ping'
    features = df.drop(columns=[target_column])
    target = df[target_column]
    return features, target

def train(model, X_train, y_train):
    """Fit ``model`` on the given training data and return that same object.

    The return value of ``model.fit`` is ignored; the object passed in is
    what gets returned.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

def eval(model, X_test, y_test):
    """Print the R2, MAE and MSE of ``model``'s predictions on ``X_test``.

    Note that defining this function shadows Python's built-in ``eval`` for
    the rest of the notebook. Nothing is returned; the scores are only printed.
    """
    # Imported inside the function, as in the original cell.
    from sklearn import metrics

    predictions = model.predict(X_test)

    r2 = metrics.r2_score(y_test, predictions)
    print(f'R2 score: {r2}')

    mae = metrics.mean_absolute_error(y_test, predictions)
    print(f'MAE score: {mae}')

    mse = metrics.mean_squared_error(y_test, predictions)
    print(f'MSE score: {mse}')

In [6]:
from sklearn.model_selection import train_test_split

In [11]:
# Read the training and test CSV files into DataFrames.
# Both paths are relative to the notebook's working directory.
df_future = pd.read_csv('../temp_future/output_feature/clean_data_future_train.csv')
df_future_test = pd.read_csv('../temp_future/output_feature/clean_data_future_test.csv')

In [None]:
# Cleaned-data pipeline: filter the training frame, shuffle it, then hold out
# 10% of the rows as a validation set.
# NOTE: this rebinds df_future to the cleaned frame. Re-running the cell, or
# calling simple_drop on df_future afterwards, raises KeyError because the
# dropped columns no longer exist -- reload the CSV first.
df_future = clean_and_drop(df_future)
df_future = df_future.sample(frac=1, random_state=0)
X_train, y_train = split_features_target(df_future)
# Seed the split so the train/validation partition is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=0)

In [12]:
# Baseline pipeline: only drop the unused columns (no row filtering), shuffle,
# then hold out 10% of the training rows as a validation set.
# NOTE: this rebinds df_future and df_future_test, so the cell cannot be
# re-run without reloading the CSVs (simple_drop would raise KeyError on the
# already-dropped columns).
df_future = simple_drop(df_future)
df_future = df_future.sample(frac=1, random_state=0)
X_train, y_train = split_features_target(df_future)
# Seed the split so the train/validation partition is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=0)

# The test set gets the same column drop and is used in full.
df_future_test = simple_drop(df_future_test)
X_test, y_test = split_features_target(df_future_test)

In [None]:
# Cleaned-data pipeline for the test set: apply the same filtering as the
# training frame, then separate the features from the target.
# NOTE: df_future_test is rebound here; clean_and_drop raises KeyError if the
# frame has already been passed through simple_drop or clean_and_drop.
df_future_test = clean_and_drop(df_future_test)
X_test, y_test = split_features_target(df_future_test)

In [None]:
# Inspect the distinct values present in the Month column.
df_future['Month'].unique()

In [None]:
# Check the dimensions of the validation feature matrix.
X_val.shape

In [14]:
from sklearn.tree import DecisionTreeRegressor

# Fit a depth-limited decision tree. random_state is fixed because the tree
# builder permutes features when searching for splits, so the fitted tree
# (and the scores below) can otherwise vary between runs.
model = train(DecisionTreeRegressor(max_depth=20, random_state=0), X_train, y_train)

# Report the same three metrics on the training, validation and test data.
print('Training performance: ')
eval(model, X_train, y_train)
print()
print('Evaluation performance: ')
eval(model, X_val, y_val)
print()
print('Test performance:')
eval(model, X_test, y_test)

Training performance: 
R2 score: 0.9885100686261682
MAE score: 10023.16839742318
MSE score: 353474513.8519085

Evaluation performance: 
R2 score: 0.9501390804092396
MAE score: 17096.804624893495
MSE score: 1486714615.930852

Test performance:
R2 score: 0.7380072256095961
MAE score: 47327.19744163418
MSE score: 9357991635.724712


In [None]:
# from sklearn.svm import LinearSVR

# model = LinearSVR(C=1.0, epsilon=0.0, verbose=True, max_iter=300, random_state=1207)
# train(model, X_train, y_train)

# print('Training performance: ')
# eval(model, X_train, y_train)
# print()
# print('Evaluation performance: ')
# eval(model, X_test, y_test)

In [None]:
# Too slow
# The fit time complexity is more than quadratic
# with the number of samples which makes it hard
# to scale to datasets with more than a couple of
# 10000 samples. For large datasets consider using
# LinearSVR or SGDRegressor instead, possibly
# after a Nystroem transformer.
from sklearn.svm import SVR

model = SVR(C=1.0, epsilon=0.0, verbose=True)
train(model, X_train, y_train)

print('Training performance: ')
eval(model, X_train, y_train)
print()
# 'Evaluation' is scored on the validation split and the test set is reported
# separately, matching the decision-tree cell. Previously the test set was
# printed under the 'Evaluation performance' label.
print('Evaluation performance: ')
eval(model, X_val, y_val)
print()
print('Test performance:')
eval(model, X_test, y_test)