In [38]:
from sklearn.datasets import load_boston
# from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# import boston_valuation as val

%matplotlib inline

In [39]:
# Gather Data 
boston_data = load_boston()
data = pd.DataFrame(data=boston_data.data, columns=boston_data.feature_names)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [40]:
data.head(6)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21


In [41]:
# Dropping INDUS and AGES
features = data.drop(["INDUS", "AGE"], axis=1)
features

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,0.0,0.573,6.593,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,0.0,0.573,6.120,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,0.0,0.573,6.976,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,0.0,0.573,6.794,2.3889,1.0,273.0,21.0,393.45,6.48


In [42]:
# taking log of the prices
log_prices = np.log(boston_data.target)
target = pd.DataFrame(log_prices, columns=["PRICE"])

In [43]:
#  Preparing to make predictions, values for prediction 
property_stats = features.mean().values.reshape(1, 11)

In [44]:
property_stats

array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

In [45]:
# Building models and making predictions
regr = LinearRegression().fit(features, target)
fitted_vals = regr.predict(features) # Prediction 

In [46]:
#  MSE, RMSE
MSE  = mean_squared_error(target, fitted_vals)
RMSE = np.sqrt(MSE)

In [47]:
RMSE

0.18751213519713034

In [63]:
def get_log_estimate(nr_rooms, student_per_class, next_to_river=False, high_confidence=True):
#     configure property 
    property_stats[0][3] = nr_rooms
    property_stats[0][4] = student_per_class
    
    property_stats[0][2] = 1 if next_to_river else 0
    log_estimate = regr.predict(property_stats) [0][0]
    
#     calculate range 
    if high_confidence:
        upper_bound = log_estimate + 2*RMSE
        lower_bound = log_estimate - 2*RMSE
        interval = 95
    else:
        upper_bound = log_estimate + RMSE
        lower_bound = log_estimate - RMSE
        interval = 68
            
    return log_estimate, upper_bound, lower_bound, interval

In [73]:
get_log_estimate(3, 10, next_to_river=True, high_confidence=True)



(1.9006129684666497, 2.2756372388609103, 1.5255886980723892, 95)

In [77]:
# convert log dollar prices into prices for both the upper and lower bound to the nearest whole number 
ZILLOW_MIDIAN_PRICE = 583
SCALE_FACTOR = ZILLOW_MIDIAN_PRICE / np.median(boston_data.target)
log_est, upper, lower, conf = get_log_estimate(9, student_per_class=15, next_to_river=False, high_confidence=False)

# convert to todays dollars
dollar_est = np.e**log_est * 1000 * SCALE_FACTOR
round(dollar_est, -2)

converted_prices = [ round(np.e**val * 1000 * SCALE_FACTOR, -3) for val in get_log_estimate(9, student_per_class=15, next_to_river=False, high_confidence=False) ]
converted_prices

print(f"The estimated property value is $ {converted_prices[0]} \n At confidence of {conf}% \n with upper and lower bound of {converted_prices[1]}, {converted_prices[2]} respectively")

The estimated property value is $ 3000.0 
 At confidence of 68% 
 with upper and lower bound of 4000.0, 3000.0 respectively


