In [89]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

In [90]:
# Gather Data

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header = None)
np_data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2] 
data = pd.DataFrame(np_data, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO','B','LSTAT'])
data['PRICE'] = target
features = data.drop(['INDUS', 'AGE', 'PRICE'], axis=1)

log_prices = np.log(target)
log_prices
target = pd.DataFrame(log_prices, columns = ['PRICE'])
target.shape

(506, 1)

In [91]:
CRIM_IDX = 0
ZN_IDX = 1
CHAS_IDX =2
NOX_IDX = 3
RM_IDX = 4
DIS_IDX = 5
RAD_IDX = 6
TAX_IDX = 7
PTRAIO_IDX = 8 
B_IDX = 9
LSTAT_IDX = 10
property_stats = np.ndarray(shape = (1, 11))

In [92]:
features.mean()

CRIM         3.613524
ZN          11.363636
CHAS         0.069170
NOX          0.554695
RM           6.284634
DIS          3.795043
RAD          9.549407
TAX        408.237154
PTRATIO     18.455534
B          356.674032
LSTAT       12.653063
dtype: float64

In [93]:
features.mean().values.reshape(1,11)

array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

In [94]:
property_stats = features.mean().values.reshape(1,11)

In [95]:
regr = LinearRegression().fit(features, target)
fitted_vals = regr.predict(features)

MSE = mean_squared_error(target, fitted_vals)
RMSE = np.sqrt(MSE)
RMSE

0.18751213519713034

In [104]:
def get_log_estimate(nr_rooms, students_per_classroom, 
                     next_to_river=False, high_confidence=True):
    # Configure property
    property_stats[0][RM_IDX] = nr_rooms
    property_stats[0][PTRAIO_IDX] = students_per_classroom
    
    if next_to_river:  # Corrected 'nex_to_river' to 'next_to_river'
        property_stats[0][CHAS_IDX] = 1
    else:
        property_stats[0][CHAS_IDX] = 0
    
    # Make Predictions
    log_estimate = regr.predict(property_stats)[0]  # Corrected variable name from 'Log_estimate' to 'log_estimate'
    
    # Calculate Range
    if high_confidence:
        upper_bound = log_estimate + 2 * RMSE  # 'RSME' should be defined elsewhere in your code
        lower_bound = log_estimate - 2 * RMSE
        interval = 95
    else:
        upper_bound = log_estimate + RMSE
        lower_bound = log_estimate - RMSE
        interval = 68
    
    return log_estimate, upper_bound, lower_bound, interval  # Corrected 'iterval' to 'interval'


In [106]:
get_log_estimate(3, 20, next_to_river=True)



(array([2.77675819]), array([3.15178246]), array([2.40173392]), 95)

In [146]:
np.median(target)
np.median(data['PRICE'])
np.e**np.median(target)

ZILLOW_MEDIAM_PRICE = 583.3
SCALE_FACTOR = ZILLOW_MEDIAM_PRICE/ np.median(data['PRICE'])

log_est, upper , lower, confi = get_log_estimate(nr_rooms=9, students_per_classroom = 15, next_to_river= False, high_confidence=False)
# Convert today's dollar
dollar_est = np.e**log_est*1000*SCALE_FACTOR
dollar_hi = np.e**upper*1000*SCALE_FACTOR
dollar_low = np.e**lower*1000*SCALE_FACTOR

rounded_dollar= np.around(dollar_est, -3)
rounded_dollar_hi = np.around(dollar_hi, -3)
rounded_dollar_low = np.around(dollar_low, -3)

print(f'the estimated property values is {dollar_est}')
print(f'At {confi}% confidence the valuation range is')
print(f'USD{rounded_dollar_low} at the lower end to USD{rounded_dollar_hi} at the high end.') 

the estimated property values is [826728.58924189]
At 68% confidence the valuation range is
USD [685000.]at the lower end to USD[997000.] at the high end.




In [162]:
def get_dollar_estimate(rm, ptratio, chas=False, large_range=True):
        """Estimate the price of a property in Boston.
        
        Keyword arguments:
        rm -- number of rooms in the property.
        ptratio -- number of students per teacher in the classroom for the school in the area.
        chas -- True if the property is next to the river, False otherwise.
        large_range -- True for a 95% prediction interval, False for a 68% interval.
        
        """
        
        
        if rm < 1 or ptratio < 1:
            print('That is unrealistic. Try again.')
            return
            
        SCALE_FACTOR = ZILLOW_MEDIAM_PRICE/ np.median(data['PRICE'])

        log_est, upper , lower, confi = get_log_estimate(nr_rooms=rm, students_per_classroom = ptratio, next_to_river= chas, high_confidence=large_range)
        # Convert today's dollar
        dollar_est = np.e**log_est*1000*SCALE_FACTOR
        dollar_hi = np.e**upper*1000*SCALE_FACTOR
        dollar_low = np.e**lower*1000*SCALE_FACTOR

        rounded_dollar= np.around(dollar_est, -3)
        rounded_dollar_hi = np.around(dollar_hi, -3)
        rounded_dollar_low = np.around(dollar_low, -3)

        print(f'the estimated property values is {dollar_est}')
        print(f'At {confi}% confidence the valuation range is')
        print(f'USD{rounded_dollar_low} at the lower end to USD{rounded_dollar_hi} at the high end.') 

In [164]:
get_dollar_estimate(2, 16)

the estimated property values is [422141.68786146]
At 95% confidence the valuation range is
USD[290000.] at the lower end to USD[614000.] at the high end.


