In [4]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide default matplotlib figure size, given as (width, height).
plt.rcParams["figure.figsize"] = (16,10)

ImportError: DLL load failed: The specified module could not be found.

# Data cleaning

In [None]:
# Load the combined dataset from a CSV file located next to this notebook.
data = pd.read_csv("./combined-data.csv")

In [None]:
# (rows, columns) of the table as loaded, before any cleaning.
data.shape

In [None]:
# Preview the first rows to see which columns are present.
data.head()

In [None]:
# Discard every row that has a missing value in any column.
# NOTE(review): this rebinds `data`, so the unfiltered table is only recoverable by re-running the load cell.
data = data.dropna()

In [None]:
# Separate the prediction target from the feature columns.
target_column = 'resale_price'
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Sanity check: rows with missing values were dropped from `data` before X was
# derived from it, so no feature column should contain a null.
# One vectorized count per column replaces the per-column print loop; the
# assertion makes a violation fail loudly instead of hiding in printed output.
null_counts = X.isna().sum()
assert not null_counts.any(), f"unexpected missing values:\n{null_counts[null_counts > 0]}"
null_counts

In [None]:
# Categorical features to numeric features
categorical_columns = ['month', 'town', 'flat_type', 'block', 'storey_range', 'flat_model', 'mrt_station']

# Numeric lease feature: the token before the first space of `remaining_lease`.
# NOTE(review): assumes every value starts with the number of years — confirm against the data.
years_left = pd.to_numeric(X['remaining_lease'].str.split(" ", n=1, expand=True)[0])

# One-hot encode all categorical columns in a single call. Passing `columns=`
# prefixes each indicator with its source column (e.g. `town_<value>`), so
# indicators stay distinguishable and are always string-named. Unprefixed
# dummies are named by the raw category value and would collide whenever two
# columns share a label. The encoded source columns are removed by
# `get_dummies` itself; `street_name` is dropped without being encoded and
# `remaining_lease` is replaced by `years_left`.
# NOTE: like the load/clean cells, this rebinds X, so re-running it requires
# re-running the cell that derives X from `data` first.
X = (
    pd.get_dummies(X, columns=categorical_columns)
    .assign(years_left=years_left)
    .drop(columns=['street_name', 'remaining_lease'])
)

In [None]:
# Preview the feature matrix after encoding.
X.head()

In [None]:
# Train, validation, and test split
train_split = 0.8
validation_split = 0.1
test_split = 0.1

# The test slice is "whatever remains" after train + validation, so the three
# fractions must add up to 1 for `test_split` to describe the real test size.
assert np.isclose(train_split + validation_split + test_split, 1.0), "split fractions must sum to 1"
assert len(X) == len(y), "features and target must have the same number of rows"

# Compute the two cut points once and reuse them for X and y.
n_rows = len(X)
train_end = round(n_rows * train_split)
validation_end = round(n_rows * train_split + n_rows * validation_split)

# NOTE(review): the split is positional (rows are taken in file order, no
# shuffling). That is only appropriate if the row order is meaningful,
# e.g. chronological — confirm.
train_X, train_y = X.iloc[:train_end], y.iloc[:train_end]
validation_X, validation_y = X.iloc[train_end:validation_end], y.iloc[train_end:validation_end]
test_X, test_y = X.iloc[validation_end:], y.iloc[validation_end:]

# Every row must land in exactly one split.
assert len(train_X) + len(validation_X) + len(test_X) == n_rows

In [None]:
# Shapes of all three splits (the validation split was previously left out).
(
    train_X.shape, train_y.shape,
    validation_X.shape, validation_y.shape,
    test_X.shape, test_y.shape,
)

# Modeling

In [None]:
# Random forests are stochastic (bootstrap samples and feature sub-sampling);
# a fixed seed makes the fitted model and the metrics below reproducible
# across Restart & Run All.
model = RandomForestRegressor(random_state=42, verbose=3)
model.fit(train_X, train_y)

# Validation on the held-out test split

In [None]:
# Predict the target for the test split, which took no part in fitting.
test_predictions = model.predict(test_X)

In [None]:
# Mean absolute error in target units, shown together with the same error
# expressed as a fraction of the mean test target.
mae = mean_absolute_error(test_y, test_predictions)
mae_fraction_of_mean = mae / np.mean(test_y)
mae, mae_fraction_of_mean

In [None]:
# Coefficient of determination on the test split; reused in the plot title below.
r2 = r2_score(y_true=test_y, y_pred=test_predictions)

In [None]:
# Scatter of predicted vs. actual test prices with a fitted regression line.
fig, ax = plt.subplots(figsize=(16, 10))

# x and y must be passed by keyword: recent seaborn releases no longer accept
# them positionally. `ax=ax` draws on the axes created above instead of relying
# on pyplot's implicit "current axes".
sns.regplot(x=test_predictions, y=test_y, ax=ax)

# Title and labels are set after plotting because regplot relabels an axis
# from the name of a pandas Series passed to it, which would otherwise
# overwrite the label chosen here.
ax.set_title(f"Correlation between target and predictions | R^2: {round(r2, ndigits=2)}", fontsize=20)
ax.set_xlabel("Predicted prices")
ax.set_ylabel("Target prices")

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False);