In [2]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd

In [36]:
# Data collection
boston_dataset = load_boston()

data = pd.DataFrame( data = boston_dataset.data , columns= boston_dataset.feature_names )

features = data.drop(['INDUS' , 'AGE'] , axis = 1)

log_prices = np.log( boston_dataset.target )

target = pd.DataFrame( log_prices , columns=['PRICE'] )

features.mean()

CRIM         3.613524
ZN          11.363636
CHAS         0.069170
NOX          0.554695
RM           6.284634
DIS          3.795043
RAD          9.549407
TAX        408.237154
PTRATIO     18.455534
B          356.674032
LSTAT       12.653063
dtype: float64

In [35]:
# Making a template for data input
property_stats = np.ndarray( shape=(1,11) )

property_stats = features.mean().values.reshape(1,11)
property_stats

array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

In [31]:
regr = LinearRegression().fit( features , target )

fitted_vals = regr.predict( features )
MSE = mean_squared_error( target , fitted_vals )
RMSE = np.sqrt( MSE )


In [41]:
def get_log_estimate( nr_of_rooms ,
                      students_per_classroom ,
                      next_to_river = False,
                      high_confidence = True  ):
    
    # Configuring Property
    property_stats[0][4] = nr_of_rooms
    property_stats[0][8] = students_per_classroom
    
    if next_to_river:
        property_stats[0][2] = 1
    else:
        property_stats[0][2] = 0
        
    if high_confidence:
        bound = 2
        interval =95
    else:
        bound = 1
        interval = 68
    
        
    # Make Prediction
    log_estimate = regr.predict( property_stats )[0][0]
    
    upper_bound = log_estimate + bound * RMSE
    lower_bound = log_estimate - bound * RMSE
    
    return log_estimate, upper_bound, lower_bound , interval
    

In [43]:
get_log_estimate( 3 , 20 , next_to_river=True , high_confidence=False)

(2.776758191480399, 2.9642703266775294, 2.589246056283269, 68)

In [55]:
ZILLOW_MEDIAN_PRICE = 583.3

SCALE_FACTOR = ZILLOW_MEDIAN_PRICE / np.median( boston_dataset.target )


827000.0
1824000.0
142000.0


In [62]:
def get_dollar_estimate( rm , ptratio , chas = False , hf = True ):
    
    """
        Estimate the price of a property in Boston in current day and age
        
        Keyword argments:
        rm -- number of rooms in the property
        ptratio -- pupil teacher ratio
        chas -- close to Charles river
        hf -- True for 95% , False for 68% prediction interval
    """
    
    if rm < 1 or ptratio < 1 :
        return
    
    log_estiamte , upper , lower , conf = get_log_estimate( rm , students_per_classroom=ptratio , 
                                                           next_to_river=chas , high_confidence=hf)

    # Convert to todays dollars
    dollar_est = np.e** log_estiamte * 1000 * SCALE_FACTOR
    dollar_hi = np.e** upper * 1000 * SCALE_FACTOR
    dollar_low = np.e** lower * 1000 * SCALE_FACTOR


    # Round the use to nearest thousand
    rounded_est = np.around( dollar_est , -3)
    rounded_hi = np.around( dollar_hi , -3)
    rounded_low = np.around( dollar_low , -3)


    print( rounded_est )
    print( rounded_hi + rounded_est )
    print( rounded_est - rounded_low )

In [59]:
get_dollar_estimate( 2 , 20 , True)

404000.0
991000.0
127000.0
