In [137]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [138]:
# Load the Boston housing data and keep every feature except INDUS and AGE.
# NOTE(review): load_boston is not available in recent scikit-learn releases —
# confirm the installed version still provides it.
boston_dataset = load_boston()
data = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
features = data.drop(columns=["INDUS", "AGE"])

# The model is fitted on the natural log of the prices, not the raw prices.
log_prices = np.log(boston_dataset.target)
# Wrap the 1-D log prices in a single-column DataFrame so the target is 2-D.
target = pd.DataFrame({"PRICE": log_prices})

In [139]:
# Column positions inside `features` for the values that can be overridden.
# They are looked up by column name instead of being hard-coded, so they stay
# correct if the set of columns dropped from the dataset ever changes.
CRIME_IDX = features.columns.get_loc("CRIM")
ZN_IDX = features.columns.get_loc("ZN")
CHAS_IDX = features.columns.get_loc("CHAS")
RM_IDX = features.columns.get_loc("RM")
PTRATIO_IDX = features.columns.get_loc("PTRATIO")

# Baseline property: a single row holding the mean of every feature column.
# reshape(1, -1) turns the 1-D array of means into a one-row 2-D array without
# hard-coding the number of features.
property_stats = features.mean().values.reshape(1, -1)

In [140]:
# Fit a linear regression of the target on the selected features.
regr = LinearRegression()
regr.fit(features, target)

# Score the model on the same data it was fitted on and keep the root mean
# squared error of those fitted values.
fitted_values = regr.predict(features)
MSE = mean_squared_error(y_true=target, y_pred=fitted_values)
RMSE = np.sqrt(MSE)

In [141]:
def get_log_estimate(room_number, pupil_teacher_ratio, beside_river = False, high_confidence = True):
    """Predict the log price of a property together with a range around it.

    The prediction starts from the module-level ``property_stats`` row and
    overrides three of its columns with the caller's values; every other
    feature keeps the value stored in that row.

    Parameters
    ----------
    room_number
        Value written to the RM column.
    pupil_teacher_ratio
        Value written to the PTRATIO column.
    beside_river : bool, default False
        When truthy the CHAS column is set to 1, otherwise to 0.
    high_confidence : bool, default True
        When truthy the range is the estimate +/- 2 * RMSE and is labelled 95;
        otherwise it is the estimate +/- RMSE and is labelled 68.

    Returns
    -------
    tuple
        ``(log_estimate, upper_bound, lower_bound, interval)`` where the first
        three are on the log scale and ``interval`` is 95 or 68.
    """
    # Work on a copy: writing into the shared row would replace its stored
    # values with whatever the most recent call passed in.
    stats = property_stats.copy()
    stats[0][RM_IDX] = room_number
    stats[0][PTRATIO_IDX] = pupil_teacher_ratio
    stats[0][CHAS_IDX] = 1 if beside_river else 0

    # predict returns a 2-D result for the single row; take its only value.
    log_estimate = regr.predict(stats)[0][0]

    # Width of the range: two RMSE for the wide band, one for the narrow band.
    if high_confidence:
        spread, interval = 2*RMSE, 95
    else:
        spread, interval = RMSE, 68
    return log_estimate, log_estimate + spread, log_estimate - spread, interval

In [142]:
# Scale factor from the dataset's prices to today's prices: today's median
# price (taken from the Zillow website) over the median of the dataset target.
ZILLOW_MEDIAN_PRICE = 583.3
SCALE_FACTOR = ZILLOW_MEDIAN_PRICE / np.median(boston_dataset.target)

log_estimate, upper_bound, lower_bound, confidence = get_log_estimate(
    9, 15, beside_river=False, high_confidence=False
)

# Convert each log value to today's value: undo the log, apply the scale
# factor, multiply by 1000 and round to the nearest thousand.
estimate_today, estimate_high, estimate_low = (
    np.around(np.e**log_value * SCALE_FACTOR * 1000, -3)
    for log_value in (log_estimate, upper_bound, lower_bound)
)

print(f"Today's estimate: {estimate_today}, Upper Bound: {estimate_high}, Lower Bound: {estimate_low}, Confidence: {confidence}%")

Today's estimate: 827000.0, Upper Bound: 997000.0, Lower Bound: 685000.0, Confidence: 68%


In [143]:
def get_today_dollar_estimate(room_number, pupil_teacher_ratio, beside_river = False, high_confidence = True):
    """Print an estimate of today's property price and the range around it.

    Parameters
    ----------
    room_number
        Number of rooms; must be at least 1.
    pupil_teacher_ratio
        Pupil-teacher ratio; must be at least 1.
    beside_river : bool, default False
        Passed through to ``get_log_estimate``.
    high_confidence : bool, default True
        Passed through to ``get_log_estimate``.

    Returns
    -------
    str or None
        The message "Information provided is not valid." when either numeric
        argument is below 1. Otherwise the estimate is printed and nothing is
        returned.
    """
    inputs_invalid = room_number < 1 or pupil_teacher_ratio < 1
    if inputs_invalid:
        return "Information provided is not valid."

    def _to_today_value(log_value):
        # Undo the log, rescale to today's prices, round to the nearest thousand.
        return np.around(np.e**log_value * SCALE_FACTOR * 1000, -3)

    log_mid, log_high, log_low, confidence = get_log_estimate(
        room_number, pupil_teacher_ratio, beside_river, high_confidence
    )
    estimate_today = _to_today_value(log_mid)
    estimate_high = _to_today_value(log_high)
    estimate_low = _to_today_value(log_low)
    print(f"Today's estimate: {estimate_today}, Upper Bound: {estimate_high}, Lower Bound: {estimate_low}, Confidence: {confidence}%")
    

In [144]:
# Example: 1 room, pupil-teacher ratio of 20, beside the river, default range.
get_today_dollar_estimate(1, 20, beside_river=True)

Today's estimate: 369000.0, Upper Bound: 537000.0, Lower Bound: 253000.0, Confidence: 95%


In [7]:
# The kernel must be restarted for edits to the imported package to take effect.
# NOTE(review): boston_property_valuation is presumably a module containing the
# functions defined in this notebook — confirm; its source is not shown here.
import boston_property_valuation as valuation
valuation.get_today_dollar_estimate(1, 20, True)

Today's estimate: 369000.0, Upper Bound: 537000.0, Lower Bound: 253000.0, Confidence: 95%
