In [41]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [19]:
# Source file for the housing data; the first 22 lines are skipped and the
# remaining whitespace-separated values are read without a header row.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string so that "\s" reaches the regex engine unchanged instead of being
# treated as an (invalid) string escape sequence.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Each record spans two physical rows: all fields of the even rows are joined
# with the first three fields of the odd rows.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :3]])

column_names = [
    'crime', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'pt_ratio', 'b', 'l_stat', 'price',
]
bst_df = pd.DataFrame(data, columns=column_names)

# Model inputs: every column except the target ('price') and the two
# predictors left out of the model ('age', 'indus').
features = bst_df.drop(['age', 'indus', 'price'], axis=1)

In [22]:
# Regression target: natural log of the 'price' column, held as a one-column
# DataFrame that keeps the original row index.
log_prices = np.log(bst_df['price'])
target = pd.DataFrame({'price': log_prices})

In [49]:
# Column positions inside `features`, looked up by name so they stay correct
# if the set or order of feature columns ever changes.
crime_idx = features.columns.get_loc('crime')
zn_idx = features.columns.get_loc('zn')
chas_idx = features.columns.get_loc('chas')
rm_idx = features.columns.get_loc('rm')
pt_ratio = features.columns.get_loc('pt_ratio')  # a column position, not the column's values

# One-row template holding the mean of every feature; -1 lets NumPy infer the
# number of columns instead of hard-coding it.
property_stats = features.mean().values.reshape(1, -1)

In [40]:
# Fit a linear regression of the target on all feature columns.
regr = LinearRegression()
regr.fit(features, target)

# In-sample predictions for the rows the model was trained on.
fitted_vals = regr.predict(features)

In [47]:
# In-sample error of the fit. The metric's documented argument order is
# (y_true, y_pred); MSE is symmetric, so the value itself is unaffected.
mse = mean_squared_error(target, fitted_vals)
# Root of the MSE, i.e. the error expressed in the units of the target.
rmse = np.sqrt(mse)

In [71]:
def get_log_est(nr_rooms, students_per_classroom, next_to_river=False, high_confidence=True):
    """Predict the model output for a property described by three features.

    The remaining features are taken unchanged from the module-level
    ``property_stats`` row.

    Args:
        nr_rooms: value written to the ``rm_idx`` column.
        students_per_classroom: value written to the ``pt_ratio`` column.
        next_to_river: if truthy the ``chas_idx`` column is set to 1,
            otherwise to 0.
        high_confidence: if truthy the range is the estimate +/- 2 * rmse and
            the reported interval is 95; otherwise +/- 1 * rmse and 68.

    Returns:
        Tuple ``(log_est, upper_bound, lower_bound, interval)`` where the
        first three are scalars taken from position ``[0][0]`` of the
        prediction and ``interval`` is the integer 95 or 68.
    """
    # Work on a copy so the shared template keeps its original values after
    # the call instead of being overwritten in place.
    stats = property_stats.copy()
    stats[0][rm_idx] = nr_rooms
    stats[0][pt_ratio] = students_per_classroom
    stats[0][chas_idx] = 1 if next_to_river else 0

    log_est = regr.predict(stats)

    if high_confidence:
        spread, interval = 2 * rmse, 95
    else:
        spread, interval = rmse, 68

    upper_bound = log_est + spread
    lower_bound = log_est - spread

    return log_est[0][0], upper_bound[0][0], lower_bound[0][0], interval


In [85]:
# Ratio between an external median price figure and the dataset's own median
# price; get_dolar_est multiplies its estimates by this factor.
zillow_median_price = 583.3  # presumably in the same units as bst_df.price; TODO confirm source and date
dataset_median_price = np.median(bst_df['price'])
scale_factor = zillow_median_price / dataset_median_price

In [91]:
def get_dolar_est(rm, ptratio, chas=False, large_range=True):
    """Print a rescaled price estimate and its range for a property.

    Args:
        rm: forwarded to ``get_log_est`` as the number of rooms; must be >= 1.
        ptratio: forwarded to ``get_log_est`` as students per classroom;
            must be >= 1.
        chas: forwarded to ``get_log_est`` as ``next_to_river``.
        large_range: forwarded to ``get_log_est`` as ``high_confidence``.

    Returns:
        The string 'This is unrealistic! Try again' when ``rm`` or
        ``ptratio`` is below 1. Otherwise the results are printed and the
        function returns ``None``.
    """
    if rm < 1 or ptratio < 1:
        return 'This is unrealistic! Try again'

    log_est, upper_bound, lower_bound, interval = get_log_est(rm, ptratio, chas, large_range)

    def _to_rounded_price(log_value):
        # Undo the log transform, multiply by 1000 and by scale_factor, then
        # round to the nearest thousand.
        return np.round(np.e**log_value * 1000 * scale_factor, -3)

    rounded_est = _to_rounded_price(log_est)
    rounded_upper_bound = _to_rounded_price(upper_bound)
    rounded_lower_bound = _to_rounded_price(lower_bound)

    # NOTE(review): the header says "Thousands", but the values below have
    # been multiplied by 1000 and rounded to the nearest thousand; confirm
    # the intended unit.
    print("Price in Thousands")
    print(f"Estimated Price: {rounded_est}")
    print(f"Estimated upper bound Price: {rounded_upper_bound}")
    print(f"Estimated lower bound Price: {rounded_lower_bound}")
    # Report the interval returned by get_log_est (previously this printed a
    # row of the global dataset array).
    print(f"interval percentage: {interval}%")

In [92]:
# A room count below 1 trips the input guard, so the call returns the
# rejection message instead of printing an estimate.
get_dolar_est(rm=0, ptratio=30, chas=True)

'This is unrealistic! Try again'