In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

In [2]:
# Gather data
#
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs scikit-learn < 1.2 (or another source for the same
# data) to run on a fresh kernel.
boston_dataset = load_boston()
data = pd.DataFrame(data=boston_dataset.data, columns=boston_dataset.feature_names)

# Model inputs: every column except INDUS and AGE.
features = data.drop(columns=["INDUS", "AGE"])

# The model is trained on the natural log of the target, stored in a
# single-column frame named PRICE.
log_prices = np.log(boston_dataset.target)
target = pd.DataFrame(log_prices, columns=["PRICE"])

In [3]:
# Positions of selected columns inside `features` (and therefore inside the
# single-row `property_stats` array). They are looked up by column name so they
# stay correct if the set of dropped columns ever changes; with INDUS and AGE
# dropped they are expected to resolve to 0, 1, 2, 4 and 8 - verify with
# `list(features.columns)`.
CRIME_IDX = features.columns.get_loc("CRIM")
ZN_IDX = features.columns.get_loc("ZN")
CHAS_IDX = features.columns.get_loc("CHAS")
RM_IDX = features.columns.get_loc("RM")
PTRATIO_IDX = features.columns.get_loc("PTRATIO")

# The template row itself is not filled in one index at a time: the following
# cell builds it in a single step from `features.mean()`.

In [4]:
# Template "average property": the mean of every feature as a single row.
# reshape(1, -1) lets NumPy infer the column count, so the row keeps matching
# `features` even if columns are added or dropped.
property_stats = features.mean().values.reshape(1, -1)

In [5]:
# Fit an ordinary linear regression of the log-price target on the features,
# then predict on the same rows the model was fitted on (in-sample).
regr = LinearRegression()
regr.fit(features, target)
fitted_vals = regr.predict(features)

In [6]:
# In-sample error of the fitted model, in the same (log) units as `target`.
MSE = mean_squared_error(y_true=target, y_pred=fitted_vals)
RMSE = np.sqrt(MSE)

In [7]:
def get_log_estimate(nr_rooms,
                     students_per_classroom,
                     next_to_river=False,
                     high_confidence=True):
    """Predict the log price of a property together with a range around it.

    The prediction starts from a copy of the module-level ``property_stats``
    row, overrides the room count, pupil-teacher ratio and river flag, and
    passes the result to the fitted ``regr`` model.

    Parameters
    ----------
    nr_rooms : number
        Value written into the RM column.
    students_per_classroom : number
        Value written into the PTRATIO column.
    next_to_river : bool, default False
        Sets the CHAS column to 1 when true, otherwise to 0.
    high_confidence : bool, default True
        When true the range is the estimate +/- 2 * RMSE and is labelled 95;
        otherwise it is the estimate +/- 1 * RMSE and is labelled 68.

    Returns
    -------
    tuple
        ``(log_estimate, upper_bound, lower_bound, interval)`` - note that the
        upper bound comes before the lower bound.
    """
    # Work on a private copy: writing into the shared `property_stats` array
    # would leave it holding the last caller's values instead of the averages.
    house = property_stats.copy()
    house[0][RM_IDX] = nr_rooms
    house[0][PTRATIO_IDX] = students_per_classroom
    house[0][CHAS_IDX] = 1 if next_to_river else 0

    # The model was fitted against a one-column target, so predict() returns a
    # 2-D array; [0][0] extracts the single scalar.
    log_estimate = regr.predict(house)[0][0]

    # Width of the range in RMSE units, and the label reported with it.
    if high_confidence:
        half_width = 2 * RMSE
        interval = 95
    else:
        half_width = RMSE
        interval = 68

    upper_bound = log_estimate + half_width
    lower_bound = log_estimate - half_width

    return log_estimate, upper_bound, lower_bound, interval

In [8]:
# Example: 3 rooms, 20 pupils per teacher, next to the river, default wide range.
get_log_estimate(nr_rooms=3, students_per_classroom=20, next_to_river=True)

(2.776758191480399, 3.1517824618746597, 2.4017339210861386, 95)

In [9]:
# Median of the raw (un-logged) target values; used below as the baseline for
# rescaling to current prices.
np.median(boston_dataset["target"])

21.2

In [23]:
# Present-day reference price (presumably in thousands of dollars, matching the
# "* 1000" conversion below - confirm). The scale factor is derived from this
# single constant instead of repeating a slightly different literal.
ZILLOW_MEAN_PRICE = 715.469
SCALE_FACTOR = ZILLOW_MEAN_PRICE / np.median(boston_dataset.target)

# get_log_estimate returns (estimate, upper, lower, interval label).
log_est, upper, lower, conf = get_log_estimate(9,
                                               students_per_classroom=15,
                                               next_to_river=False,
                                               high_confidence=False)

# Undo the log transform, convert to dollars and rescale to current prices.
dollar_est = np.e**log_est * 1000 * SCALE_FACTOR
dollar_high = np.e**upper * 1000 * SCALE_FACTOR
dollar_low = np.e**lower * 1000 * SCALE_FACTOR

# Round each dollar value to the nearest 1000.
rounded_est = np.around(dollar_est, -3)
upper_est = np.around(dollar_high, -3)
lower_est = np.around(dollar_low, -3)

In [29]:
# Report the rounded range and point estimate computed in the previous cell.
print(
    f"Prices for boston house prices have a upper value of ${upper_est} and a lower bound value of ${lower_est}, avreging ${rounded_est} acounting for inflation"
)

Prices for boston house prices have a upper value of $1223000.0 and a lower bound value of $841000.0, avreging $1014000.0 acounting for inflation


NameError: name 'conf' is not defined

In [44]:
def get_dollar_estimate(rm, ptratio, chas=False, large_range=True):
    """Print an estimated present-day price range for a house in Boston.

    Parameters
    ----------
    rm : number
        Rooms in the property; values below 1 are rejected.
    ptratio : number
        Pupils per teacher; values below 1 or above 30 are rejected.
    chas : bool, default False
        Whether the property is next to the river.
    large_range : bool, default True
        Forwarded to ``get_log_estimate`` as ``high_confidence``; selects the
        wider (labelled 95) or narrower (labelled 68) range.

    Returns
    -------
    None
        The estimate is printed, not returned. For rejected input a message is
        printed and nothing else happens.
    """
    # Reject inputs outside a plausible range before touching the model.
    if rm < 1 or ptratio < 1 or ptratio > 30:
        print("That is unrealistic, please try again")
        return

    # Present-day reference price (presumably in thousands of dollars, matching
    # the "* 1000" conversion below - confirm). The scale factor is derived
    # from this single constant rather than a second, slightly different literal.
    ZILLOW_MEAN_PRICE = 715.469
    SCALE_FACTOR = ZILLOW_MEAN_PRICE / np.median(boston_dataset.target)

    # get_log_estimate returns (estimate, upper, lower, interval label).
    log_est, upper, lower, conf = get_log_estimate(rm, ptratio,
                                                   next_to_river=chas,
                                                   high_confidence=large_range)

    # Undo the log transform, convert to dollars and rescale to current prices.
    dollar_est = np.e**log_est * 1000 * SCALE_FACTOR
    dollar_high = np.e**upper * 1000 * SCALE_FACTOR
    dollar_low = np.e**lower * 1000 * SCALE_FACTOR

    # Round each dollar value to the nearest 1000.
    rounded_est = np.around(dollar_est, -3)
    upper_est = np.around(dollar_high, -3)
    lower_est = np.around(dollar_low, -3)

    print(f"The estimated property value is {rounded_est}")
    print(f"At confidence {conf} the value range is :")
    print(f"${lower_est} at the lower end, ${upper_est} at the high end")

In [47]:
# Example: 3 rooms, 14 pupils per teacher, next to the river, default wide range.
get_dollar_estimate(3, 14, chas=True)

The estimated property value is 679000.0
At confidence 95 the value range is :
$466000.0 at the lower end, $988000.0 at the high end
