In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Raise both pandas display limits to 120 so wide/long frames are not truncated.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 120)

In [3]:
def simple_drop(df):
    """Return a new frame without the columns that are excluded from modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains all of the columns listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.

    Raises
    ------
    KeyError
        If any of the listed columns is absent from ``df``.
    """
    unused_columns = ['address', 'TDATE', 'Total_price', '編號']
    return df.drop(columns=unused_columns)

In [4]:
def split_features_target(df):
    """Separate a frame into its feature columns and the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that includes the ``Unit_Price_Ping`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without ``Unit_Price_Ping`` and ``y``
        is the ``Unit_Price_Ping`` column itself.
    """
    target_column = 'Unit_Price_Ping'
    # The tuple is evaluated left to right: features first, then the target.
    return df.drop(columns=[target_column]), df[target_column]

def train(model, X_train, y_train):
    """Fit ``model`` on the given data and hand the same object back.

    Parameters
    ----------
    model
        Any object exposing ``fit(X, y)``.
    X_train, y_train
        Features and targets, passed positionally to ``model.fit``.

    Returns
    -------
    The ``model`` argument itself (whatever ``fit`` returns is ignored).
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

def eval(model, X_test, y_test):
    """Print the R2, MAE and MSE of ``model`` on the supplied data.

    Note that this function's name shadows the built-in ``eval`` wherever it
    is defined.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)``.
    X_test, y_test
        Features to predict on and the true targets to score against.

    Returns
    -------
    None
        The three scores are only printed, one per line.
    """
    # Function-scope import: the metrics are only needed by this helper.
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    predictions = model.predict(X_test)

    r2 = r2_score(y_test, predictions)
    print(f'R2 score: {r2}')

    mae = mean_absolute_error(y_test, predictions)
    print(f'MAE score: {mae}')

    mse = mean_squared_error(y_test, predictions)
    print(f'MSE score: {mse}')

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Read the two CSV exports: the training table and the test table.
df_future = pd.read_csv(
    '../temp_future/output_feature/clean_data_future_train.csv'
)
df_future_test = pd.read_csv(
    '../temp_future/output_feature/clean_data_future_test.csv'
)

In [7]:
# Build the baseline train / validation / test sets.
# NOTE: `df_future` and `df_future_test` are rebound to their reduced versions
# here, so reload the CSVs before re-running this cell (the columns removed by
# `simple_drop` would otherwise be missing on the second pass).
df_future = simple_drop(df_future)
df_future = df_future.sample(frac=1, random_state=0)  # shuffle rows with a fixed seed
X_train, y_train = split_features_target(df_future)
# Hold out 10% of the training rows for validation. The split is seeded so the
# partition, and every metric computed from it, is identical on each run.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=0
)

# The test table gets the same column handling but is not split.
df_future_test = simple_drop(df_future_test)
X_test, y_test = split_features_target(df_future_test)

In [8]:
from sklearn.tree import DecisionTreeRegressor

# Seed the estimator so that repeated runs fit the same tree; without a fixed
# random_state the fitted model (and the scores below) can vary between runs.
model = train(
    DecisionTreeRegressor(max_depth=20, random_state=0),
    X_train,
    y_train,
)

# Report the same three metrics on each split, separated by a blank line.
print('Training performance: ')
eval(model, X_train, y_train)
print()
print('Evaluation performance: ')
eval(model, X_val, y_val)
print()
print('Test performance:')
eval(model, X_test, y_test)

Training performance: 
R2 score: 0.9874002296105632
MAE score: 10186.981359720226
MSE score: 387398904.9551694

Evaluation performance: 
R2 score: 0.9504147220289036
MAE score: 17051.481455372603
MSE score: 1486229471.298085

Test performance:
R2 score: 0.7494452997759204
MAE score: 46273.5103883766
MSE score: 8949440664.705328
