# **Installation of required libraries**

In [27]:
# pip install boruta

In [28]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from boruta import BorutaPy as bp
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib

In [29]:
df = pd.read_csv('../data/ResaleFlatPrice_Formatted.csv')
df.head()

Unnamed: 0,year,month,town,flat_type,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,town_le,flat_type_le,storey_range_le,flat_model_le,price_category
0,1990,1,ANG MO KIO,1 ROOM,10 TO 12,31.0,IMPROVED,1977,86.0,9000.0,0,0,5,7,low
1,1990,1,ANG MO KIO,1 ROOM,04 TO 06,31.0,IMPROVED,1977,86.0,6000.0,0,0,2,7,low
2,1990,1,ANG MO KIO,1 ROOM,10 TO 12,31.0,IMPROVED,1977,86.0,8000.0,0,0,5,7,low
3,1990,1,ANG MO KIO,1 ROOM,07 TO 09,31.0,IMPROVED,1977,86.0,6000.0,0,0,4,7,low
4,1990,1,ANG MO KIO,3 ROOM,04 TO 06,73.0,NEW GENERATION,1976,85.0,47200.0,0,2,2,20,low


# Feature Selection

In [30]:
le = LabelEncoder()

df['town_le'] = le.fit_transform(df['town'])
df['flat_type_le'] = le.fit_transform(df['flat_type'])
df['storey_range_le'] = le.fit_transform(df['storey_range'])
df['flat_model_le'] = le.fit_transform(df['flat_model'])

In [31]:
new_df = df[['town_le', 'flat_type_le', 'storey_range_le', 'flat_model_le', 'floor_area_sqm', 'remaining_lease', 'resale_price', 'price_category']].copy()
new_df.columns = ['town', 'flat_type', 'storey_range', 'flat_model', 'floor_area_sqm', 'remaining_lease', 'resale_price', 'price_category']

# Modelling

## Regression

In [32]:
regression_df = new_df[['town', 'flat_type', 'storey_range', 'floor_area_sqm', 'flat_model', 'remaining_lease', 'resale_price']]

X = regression_df.drop(columns=['resale_price'])
y = regression_df['resale_price']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train Set :", X_train.shape, y_train.shape)
print("Test Set  :", X_test.shape, y_test.shape)

Train Set : (732407, 6) (732407,)
Test Set  : (183102, 6) (183102,)


In [34]:
rf_regression = RandomForestRegressor().fit(X_train, y_train)
y_pred_rf = rf_regression.predict(X_test)

lr_regression = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_regression.predict(X_test)



In [None]:
# To save the trained model
joblib.dump(rf_regression, '../models/rf_regression.sav')
joblib.dump(lr_regression, '../models/lr_regression.sav')

In [None]:
def evaluationRegressionModel(model, y_test, y_pred):
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_test, y_pred)

    print("-----" + model + "-----")
    print("Mean Absolute Error: ", round(mae,2))
    print("Mean Square Error: ", round(mse,2))
    print("Root Mean Square Error: ", round(rmse,2))
    print("R-squared: ", round(r2,4))


In [37]:
evaluationRegressionModel("Random Forest Regression", y_test, y_pred_rf)

-----Random Forest Regression-----
Mean Absolute Error:  31760.17
Mean Square Error:  2414530881.87
Root Mean Square Error:  49137.88
R-squared:  0.9142


In [38]:
evaluationRegressionModel("Linear Regression", y_test, y_pred_lr)

-----Linear Regression-----
Mean Absolute Error:  89085.32
Mean Square Error:  14991437102.01
Root Mean Square Error:  122439.52
R-squared:  0.4673


## Classification

In [42]:
classification_df = new_df[['town', 'flat_type', 'storey_range', 'floor_area_sqm', 'flat_model', 'remaining_lease', 'price_category']]

price_category = {'low': 0, 'medium': 1, 'high': 2}
classification_df['price_category'] = [price_category[item] for item in classification_df['price_category']]

X = classification_df.drop(columns=['price_category'])
y = classification_df['price_category']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classification_df['price_category'] = [price_category[item] for item in classification_df['price_category']]


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print("Train Set :", X_train.shape, y_train.shape)
print("Test Set  :", X_test.shape, y_test.shape)

Train Set : (732407, 6) (732407,)
Test Set  : (183102, 6) (183102,)


In [44]:
rf_classification = RandomForestClassifier().fit(X_train, y_train)
y_pred_rfc = rf_classification.predict(X_test)

xgb_classification = XGBClassifier().fit(X_train, y_train)
y_pred_xgb = xgb_classification.predict(X_test)

In [45]:
# To save the trained model
joblib.dump(rf_classification, '../models/rf_classification.sav')
joblib.dump(xgb_classification, '../models/xgb_classification.sav')

['../models/xgb_classification.sav']

In [46]:
def evaluationClassificationModel(model, y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    print("-----" + model + "-----")
    print("Accuracy Score: ", round(accuracy,4))
    print("Precision: ", round(precision,4))
    print("Recall: ", round(recall,4))
    print("F1-Score: ", round(f1,4))

In [47]:
evaluationClassificationModel("Random Forest Classifier", y_test, y_pred_rfc)

-----Random Forest Classifier-----
Accuracy Score:  0.9525
Precision:  0.8415
Recall:  0.7455
F1-Score:  0.7843


In [48]:
evaluationClassificationModel("XG-Boosting Classifier", y_test, y_pred_xgb)

-----XG-Boosting Classifier-----
Accuracy Score:  0.9474
Precision:  0.8612
Recall:  0.7145
F1-Score:  0.7713
