In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide display settings: let pandas show up to 999 columns of a frame
# and use seaborn's white-grid theme for plots.
pd.set_option('display.max_columns', 999)
sns.set_style('whitegrid')

# https://metadata.phila.gov/#home/datasetdetails/5543865f20583086178c4ee5/representationdetails/55d624fdad35c7e854cb21a4/

In [3]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import RobustScaler, OneHotEncoder

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [4]:
# Read the dataset and discard the 'Unnamed: 0' column in one step
# (presumably an index column written out by an earlier export — verify).
df = pd.read_csv('PHL_Building_Dataset_ML_Regression.csv').drop(columns=['Unnamed: 0'])
df

Unnamed: 0,basements,building_code_description,central_air,depth,exterior_condition,fireplaces,frontage,garage_type,interior_condition,market_value,number_of_bathrooms,number_of_bedrooms,number_of_rooms,number_stories,parcel_shape,street_designation,topography,total_area,total_livable_area,type_heater,view_type,year_built,zoning,building_description,segment
0,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,29.17,A,4.0,257500.0,2.0,3.0,6.0,2.0,E,ST,6,2625.30,1266.0,H,I,1960.0,1,MASONRY,2
1,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,18.03,A,4.0,249400.0,2.0,3.0,6.0,2.0,E,ST,6,1622.70,1266.0,A,I,1960.0,1,MASONRY,2
2,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,18.05,A,4.0,249500.0,2.0,3.0,6.0,2.0,E,ST,6,1624.50,1266.0,A,I,1960.0,1,MASONRY,2
3,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,18.08,A,4.0,249500.0,2.0,3.0,6.0,2.0,E,ST,6,1627.20,1266.0,H,I,1960.0,1,MASONRY,2
4,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,18.71,A,4.0,253800.0,2.0,3.0,6.0,2.0,E,ST,6,1683.90,1310.0,H,I,1960.0,1,MASONRY,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44479,D,ROW 3 STY MASONRY,N,100.00,4.0,0.0,21.87,0,4.0,117500.0,0.0,3.0,6.0,3.0,A,ST,6,1651.00,2907.0,H,I,1929.0,1,MASONRY,1
44480,D,ROW 2 STY MASONRY,Y,102.19,3.0,0.0,18.24,0,3.0,70800.0,2.0,3.0,9.0,3.0,A,ST,6,1811.00,2026.0,A,I,2005.0,1,MASONRY,1
44481,D,ROW 3 STY MASONRY,Y,103.26,3.0,0.0,16.00,0,3.0,23600.0,1.0,3.0,6.0,3.0,A,ST,6,1604.00,2082.0,A,I,2005.0,1,MASONRY,1
44482,D,ROW 3 STY MASONRY,Y,90.00,3.0,0.0,16.00,0,3.0,23600.0,1.0,3.0,6.0,3.0,E,ST,6,1440.00,1387.0,A,I,2005.0,1,MASONRY,1


In [5]:
# Replace the numeric segment codes 1-5 with readable labels.
df['segment'] = df['segment'].replace({
    1: 'Bottom',
    2: 'Lower Middle',
    3: 'Middle',
    4: 'Upper Middle',
    5: 'Top',
})

# Splitting Data

In [6]:
# Target is market_value; the features are every other column except
# building_code_description, which is removed together with the target.
y = df['market_value']
X = df.drop(['building_code_description', 'market_value'], axis=1)

In [7]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Evaluation Metrics

In [8]:
# Regression evaluation metrics (R2, MAE, MSE, RMSE) for a fitted model
def Eva_Matrix(Model, X_train, y_train, X_test, y_test, Name):
    """Tabulate regression metrics of a fitted model on the train and test data.

    Parameters
    ----------
    Model : object
        Already-fitted estimator; only its ``predict`` method is used.
    X_train, y_train
        Training features and the matching true targets.
    X_test, y_test
        Test features and the matching true targets.
    Name : str
        Label inserted into the two column headers.

    Returns
    -------
    pandas.DataFrame
        Rows ``R2``, ``MAE``, ``MSE``, ``RMSE``; columns ``"Training {Name}"``
        and ``"Test {Name}"``.
    """
    def _scores(features, target):
        # One predict call per split; RMSE is derived from the MSE.
        predicted = Model.predict(features)
        r2 = r2_score(target, predicted)
        mae = mean_absolute_error(target, predicted)
        mse = mean_squared_error(target, predicted)
        return [r2, mae, mse, np.sqrt(mse)]

    summary = {
        f"Training {Name}": _scores(X_train, y_train),
        f"Test {Name}": _scores(X_test, y_test),
    }
    return pd.DataFrame(summary, index=['R2', 'MAE', 'MSE', 'RMSE'])

# Pipeline

In [298]:
# Columns of X that are fed to the models. A column of X that appears in neither
# list is dropped, because ColumnTransformer defaults to remainder='drop'.
num_columns = ['fireplaces', 'number_of_rooms', 'number_stories', 'total_area',
               'total_livable_area']

cat_columns = ['basements', 'central_air', 'exterior_condition', 'garage_type',
               'interior_condition', 'parcel_shape', 'street_designation', 'topography',
               'type_heater', 'view_type', 'building_description']

# Numeric features: fill missing values with the column mean, then scale with
# RobustScaler.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
])

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' encodes a category that was not seen
# during fit as all zeros instead of raising at predict time.
categoric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Template preprocessor. Model pipelines built below receive their own clone so
# that fitting one pipeline never refits the preprocessing of another.
preprocessor = ColumnTransformer([
    ('numeric', numeric_pipeline, num_columns),
    ('categoric', categoric_pipeline, cat_columns)
])


def make_model_pipeline(algo):
    """Return a Pipeline: an unfitted copy of ``preprocessor`` followed by ``algo``.

    Parameters
    ----------
    algo : estimator
        Regressor placed in the final ``"algo"`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``"prep"`` (clone of ``preprocessor``) and ``"algo"``.
    """
    return Pipeline([
        ("prep", clone(preprocessor)),
        ("algo", algo)
    ])


# NOTE(review): max_iter=400 caps the SVR solver's iterations. The recorded
# metrics for this model show a negative R2, so the solver may be stopping
# before convergence (warnings are silenced at the top of the notebook) — confirm.
pipeSVM = make_model_pipeline(SVR(max_iter=400))

pipeLR = make_model_pipeline(LinearRegression())

pipeLasso = make_model_pipeline(Lasso())

pipeRidge = make_model_pipeline(Ridge())

pipeElasticNet = make_model_pipeline(ElasticNet())

pipeKNN = make_model_pipeline(KNeighborsRegressor())

# Tree-based models are randomized; seed them so the baseline metrics are
# reproducible from run to run (same seed as the train/test split).
pipeDT = make_model_pipeline(DecisionTreeRegressor(random_state=42))

pipeRF = make_model_pipeline(RandomForestRegressor(random_state=42))

# Base Model KNN

In [10]:
# Fit the KNN pipeline (preprocessing + default KNeighborsRegressor) on the training split
pipeKNN.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces',
                                                   'number_of_rooms',
                                                   'number_stories',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEnco

In [11]:
# Predictions of the fitted KNN pipeline for the test features
y_KNN_test = pipeKNN.predict(X_test)

In [12]:
# Train/test R2, MAE, MSE and RMSE for the KNN pipeline, tabulated by Eva_Matrix;
# the bare name on the last line displays the table
df_KNN = Eva_Matrix(pipeKNN, X_train, y_train, X_test, y_test, "Pipeline KNN")
df_KNN

Unnamed: 0,Training Pipeline KNN,Test Pipeline KNN
R2,0.7648403,0.6382117
MAE,31033.23,40920.95
MSE,4277652000.0,7529401000.0
RMSE,65403.76,86772.12


# Base Model Linear Regression

In [13]:
# Fit the linear-regression pipeline (preprocessing + LinearRegression) on the training split
pipeLR.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces',
                                                   'number_of_rooms',
                                                   'number_stories',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEnco

In [14]:
# Predictions of the fitted linear-regression pipeline for the test features
y_LR_test = pipeLR.predict(X_test)

In [15]:
# Train/test R2, MAE, MSE and RMSE for the linear-regression pipeline, tabulated by Eva_Matrix
df_LR = Eva_Matrix(pipeLR, X_train, y_train, X_test, y_test, "Pipeline LR")
df_LR

Unnamed: 0,Training Pipeline LR,Test Pipeline LR
R2,0.751802,0.752884
MAE,41246.12,42543.37
MSE,4514825000.0,5142883000.0
RMSE,67192.45,71713.9


# Base Model Ridge

In [16]:
# Fit the Ridge pipeline (preprocessing + default Ridge) on the training split
pipeRidge.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces',
                                                   'number_of_rooms',
                                                   'number_stories',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEnco

In [17]:
# Predictions of the fitted Ridge pipeline for the test features
y_Ridge_test = pipeRidge.predict(X_test)

In [18]:
# Train/test R2, MAE, MSE and RMSE for the Ridge pipeline, tabulated by Eva_Matrix
df_Ridge = Eva_Matrix(pipeRidge, X_train, y_train, X_test, y_test, "Pipeline Ridge")
df_Ridge

Unnamed: 0,Training Pipeline Ridge,Test Pipeline Ridge
R2,0.6237616,0.636953
MAE,53842.31,55415.33
MSE,6843932000.0,7555597000.0
RMSE,82728.06,86922.93


# Base Model Lasso

In [19]:
# Fit the Lasso pipeline (preprocessing + default Lasso) on the training split
pipeLasso.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces',
                                                   'number_of_rooms',
                                                   'number_stories',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEnco

In [20]:
# Predictions of the fitted Lasso pipeline for the test features
y_Lasso_test = pipeLasso.predict(X_test)

In [21]:
# Train/test R2, MAE, MSE and RMSE for the Lasso pipeline, tabulated by Eva_Matrix
df_Lasso = Eva_Matrix(pipeLasso, X_train, y_train, X_test, y_test, "Pipeline Lasso")
df_Lasso

Unnamed: 0,Training Pipeline Lasso,Test Pipeline Lasso
R2,0.751838,0.7531919
MAE,41242.44,42514.25
MSE,4514169000.0,5136477000.0
RMSE,67187.57,71669.22


# Base Model ElasticNet

In [22]:
# Fit the ElasticNet pipeline (preprocessing + default ElasticNet) on the training split
pipeElasticNet.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces',
                                                   'number_of_rooms',
                                                   'number_stories',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEnco

In [23]:
# Predictions of the fitted ElasticNet pipeline for the test features
y_ElasticNet_test = pipeElasticNet.predict(X_test)

In [24]:
# Train/test R2, MAE, MSE and RMSE for the ElasticNet pipeline, tabulated by Eva_Matrix
df_ElasticNet = Eva_Matrix(pipeElasticNet, X_train, y_train, X_test, y_test, "Pipeline ElasticNet")
df_ElasticNet

Unnamed: 0,Training Pipeline ElasticNet,Test Pipeline ElasticNet
R2,0.7014703,0.7073878
MAE,44870.03,46400.47
MSE,5430379000.0,6089734000.0
RMSE,73691.1,78036.75


# Base Model SVM

In [25]:
# Fit the SVR pipeline (preprocessing + SVR limited to max_iter=400) on the training split
pipeSVM.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces',
                                                   'number_of_rooms',
                                                   'number_stories',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEnco

In [26]:
# Predictions of the fitted SVR pipeline for the test features
y_SVM_test = pipeSVM.predict(X_test)

In [27]:
# Train/test R2, MAE, MSE and RMSE for the SVR pipeline, tabulated by Eva_Matrix.
# NOTE(review): the recorded R2 is negative on both splits, i.e. the fit is worse
# than always predicting the mean — possibly due to the max_iter=400 cap; confirm.
df_SVM = Eva_Matrix(pipeSVM, X_train, y_train, X_test, y_test, "Pipeline SVM")
df_SVM

Unnamed: 0,Training Pipeline SVM,Test Pipeline SVM
R2,-2.141573,-1.832491
MAE,220162.8,221629.9
MSE,57146510000.0,58948720000.0
RMSE,239053.4,242793.6


# Base Model DecisionTreeRegressor

In [32]:
# Fit the decision-tree pipeline (preprocessing + default DecisionTreeRegressor) on the training split
pipeDT.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces',
                                                   'number_of_rooms',
                                                   'number_stories',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEnco

In [33]:
# Predictions of the fitted decision-tree pipeline for the test features
y_DT_test = pipeDT.predict(X_test)

In [41]:
# Train/test R2, MAE, MSE and RMSE for the decision-tree pipeline, tabulated by Eva_Matrix.
# The recorded training R2 is far above the test R2, which points to overfitting.
df_DT = Eva_Matrix(pipeDT, X_train, y_train, X_test, y_test, "Pipeline DT")
df_DT

Unnamed: 0,Training Pipeline DT,Test Pipeline DT
R2,0.9997697,0.6868985
MAE,262.762,31113.64
MSE,4190052.0,6516150000.0
RMSE,2046.962,80722.67


# Base Model RandomForestRegressor

In [35]:
# Fit the random-forest pipeline (preprocessing + default RandomForestRegressor) on the training split
pipeRF.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces',
                                                   'number_of_rooms',
                                                   'number_stories',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEnco

In [36]:
# Predictions of the fitted random-forest pipeline for the test features
y_RF_test = pipeRF.predict(X_test)

In [42]:
# Train/test R2, MAE, MSE and RMSE for the random-forest pipeline, tabulated by Eva_Matrix
df_RF = Eva_Matrix(pipeRF, X_train, y_train, X_test, y_test, "Pipeline RF")
df_RF

Unnamed: 0,Training Pipeline RF,Test Pipeline RF
R2,0.9773417,0.8406967
MAE,8468.079,23843.08
MSE,412164600.0,3315360000.0
RMSE,20301.84,57579.17


# Fine Tuning

## DecisionTreeRegressor

In [319]:
# Decision-tree pipeline with explicitly chosen hyperparameters
# (presumably the outcome of a hyperparameter search that is not shown here — TODO confirm).
pipeDT_Tuning = Pipeline(steps=[
    ("prep", preprocessor),
    ("algo", DecisionTreeRegressor(
        max_depth=None,
        max_features=0.9,
        min_samples_leaf=25,
        min_samples_split=24,
        random_state=42,
    )),
])

In [320]:
# Fit the tuned decision-tree pipeline on the training split
pipeDT_Tuning.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['fireplaces',
                                                   'number_of_rooms',
                                                   'number_stories',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequen

In [321]:
# Predictions of the tuned decision-tree pipeline for the test features
y_DT_Fine_Tuned_test = pipeDT_Tuning.predict(X_test)

In [322]:
# Train/test R2, MAE, MSE and RMSE for the tuned decision-tree pipeline, tabulated by Eva_Matrix
df_DT_Fine_Tuned = Eva_Matrix(pipeDT_Tuning, X_train, y_train, X_test, y_test, "Pipeline DT Tuned")
df_DT_Fine_Tuned

Unnamed: 0,Training Pipeline DT Tuned,Test Pipeline DT Tuned
R2,0.8071473,0.7769997
MAE,29310.71,33450.81
MSE,3508071000.0,4640997000.0
RMSE,59228.97,68124.86


In [None]:
# TODO: make a residual plot for the test set (optionally for the training set as well)

In [None]:
# Absolute prediction error of the tuned decision tree on the test split.
# The original referenced `ytest`, which is never defined; the test target is `y_test`.
# y_test is a pandas Series, so the result is a Series carrying y_test's index.
error = (y_test - y_DT_Fine_Tuned_test).abs()

In [None]:
# Test features and the true target side by side, one row per test observation.
# pd.concat takes a list of objects as its first argument, and axis=1 joins them
# column-wise; the test features are named `X_test` (capital X).
dftest = pd.concat([X_test, y_test], axis=1)

In [None]:
# Store the error values in a new 'error' column of dftest
dftest['error'] = error

In [None]:
# TODO: plot a histogram of dftest['error']

In [None]:
# TODO: scatter plot of dftest['error'] against the y_test values in dftest

In [None]:
# The shape (pattern) of the errors should then become visible