# Valuation Tool 

In [69]:
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

In [70]:
# Load the Boston housing dataset that ships with scikit-learn.
data = load_boston()

# Feature matrix: one column per predictor, labelled with data.feature_names.
bos_feature = pd.DataFrame(data.data, columns=data.feature_names)

# Full table: the same features plus the target as an extra PRICE column.
# .assign returns a new frame, so bos_feature itself stays free of PRICE.
boston_data = bos_feature.assign(PRICE=data.target)

In [71]:
# Earlier analysis found INDUS and AGE to be undesirable predictors, so they are removed.
# errors='ignore' keeps this cell idempotent: re-running it after the columns are already
# gone is a no-op instead of a KeyError.
bos_feature = bos_feature.drop(columns=['INDUS', 'AGE'], errors='ignore')

In [72]:
# Preview the first five rows of the reduced feature table.
bos_feature.head(n=5)

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.9,5.33


In [73]:
# Target values: the model is trained on the natural log of the price, not the raw price.
# boston_data['PRICE'] holds data.target, so .values yields the same NumPy array contents.
log_prices = np.log(boston_data['PRICE'].values)
type(log_prices)

numpy.ndarray

In [74]:
# Sanity check: target and features must have the same number of rows.
target_shape = log_prices.shape
feature_shape = bos_feature.shape
print("Shape of target values ", target_shape, " and shape of features", feature_shape)

Shape of target values  (506,)  and shape of features (506, 11)


In [75]:
# log_prices is 1-D; wrapping it in a single-column DataFrame named PRICE gives a 2-D target
# with one row per observation and one column.
target = pd.Series(log_prices, name='PRICE').to_frame()
print(target.shape)

(506, 1)


## Explanation 

To build a tool that accepts custom data, we have to create an array and supply the data in the same format as the features.

By training our model we obtain the $\theta$ values; after that we only have to supply the values of the features, that is NOX, CHAS, RM and so on. They must be supplied in the same order as the columns of bos_feature.

Since the prediction depends on this: $\hat{y} = \theta_0 + \theta_1 \cdot CRIM + \theta_2 \cdot ZN + \theta_3 \cdot CHAS + \dots + \theta_{11} \cdot LSTAT$

In [76]:
# To value one property we need an array laid out exactly like one row of bos_feature.
# Its shape is (1, 11) rather than (506, 11) because we evaluate a single set of values at a
# time: 506 is the number of rows in the dataset, and we need only 1 row per query.
# np.zeros gives a defined starting value for every slot (np.ndarray(shape=...) would leave
# the memory uninitialised).
property_stats = np.zeros(shape=(1, 11))

# Now we supply the values of the parameters.
# We use the mean of each feature as a stand-in, since e.g. the crime rate of an area is not
# always known; if it is known, replace the mean with that value. Each feature goes into the
# column position it occupies in bos_feature (CRIM -> 0, ZN -> 1, CHAS -> 2, ...).
property_stats[0][0] = bos_feature['CRIM'].mean()
property_stats[0][1] = bos_feature['ZN'].mean()
property_stats[0][2] = bos_feature['CHAS'].mean()

# Filling the slots one by one is tedious, so we supply all of them at once instead.
# bos_feature.mean() is a Series holding the mean of every feature; its .values attribute
# (an attribute, not a method) is the underlying 1-D NumPy array of shape (11,).
# Reshaping turns that 1-D array into 1 row x 11 columns, matching the layout of the features.
property_stats = bos_feature.mean().values.reshape(1, 11)

## What are we doing ?

We have just pre-populated the property_stats array with the mean value of each feature, because a person cannot be expected to know the exact values of all the parameters. Someone looking for a house and wanting to know its value usually knows only a handful of features, say the number of rooms or whether the house is near a river bed.

We'll simply overwrite the mean values already present in the property_stats array with the values provided by the person using our valuation tool.

In [77]:
# First we need the theta values, so we train the model: a linear regression of the log price
# on the retained features.
regr = LinearRegression().fit(bos_feature, target)
fitted_vals = regr.predict(bos_feature)

# In-sample error of the fit, in log-price units. mean_squared_error takes (y_true, y_pred),
# so the observed targets go first and the fitted values second.
mse = mean_squared_error(target, fitted_vals)

# The RMSE serves as the standard deviation of the residuals when building price ranges.
rmse = np.sqrt(mse)

0.03516080084618688

In [78]:
def get_log_estimates(nr_of_rooms, student_teacher_ratio, near_river_bed=False, high_confidence=False):
    """
        Estimate the log price of a property together with a price range.

        Every feature the caller does not supply keeps the value stored in property_stats.

        Parameters :
        ------------------------------------------
        nr_of_rooms : value used for the RM feature
        student_teacher_ratio : value used for the PTRATIO feature
        near_river_bed : True sets the CHAS dummy variable to 1, False sets it to 0 (Optional)
        high_confidence : True gives a range of +/- 2 RMSE (reported as 95%), False gives
                          +/- 1 RMSE (reported as 68%) (Optional)

        Returns :
        ------------------------------------------
        (log_esti, log_esti_hi, log_esti_low, interval) : the predicted log price, the upper
        and lower ends of the range (also in log units) and the interval label (95 or 68).
    """

    # Work on a copy so the shared property_stats template is never modified by a query.
    # Labelling the columns lets us address features by name instead of by position and
    # gives the model the same column names it was fitted with.
    query = pd.DataFrame(property_stats.copy(), columns=bos_feature.columns)

    # Overwrite the defaults with the values supplied by the querent.
    query.loc[0, 'RM'] = nr_of_rooms
    query.loc[0, 'PTRATIO'] = student_teacher_ratio
    query.loc[0, 'CHAS'] = 1 if near_river_bed else 0

    # The target was a one-column DataFrame, so predict returns a (1, 1) array.
    # Predict once and reuse the value for both ends of the range.
    log_esti = regr.predict(query)[0][0]

    # The range is built from the RMSE of the fit: 2 RMSE on either side is reported as the
    # 95% interval and 1 RMSE as the 68% interval (this treats the residuals as roughly
    # normally distributed).
    if high_confidence:
        margin = 2 * rmse
        interval = 95
    else:
        margin = 1 * rmse
        interval = 68

    log_esti_hi = log_esti + margin
    log_esti_low = log_esti - margin

    return log_esti, log_esti_hi, log_esti_low, interval

In [79]:
# Smoke test of get_log_estimates: 3 rooms, a student-teacher ratio of 20, near the river,
# default (68%) interval.
log_esti, log_esti_hi, log_esti_low, interval = get_log_estimates(
    nr_of_rooms=3,
    student_teacher_ratio=20,
    near_river_bed=True,
)

### The data used here is from the 1970s, so the prices we get back are 1970s prices too. We therefore need to scale them up to the current year, 2021.

In [80]:
# Current median price according to a valuation website, in the same units as PRICE.
zillow_price_21 = 583.3

# Median price in the dataset.
median = np.median(boston_data['PRICE'].values)

# Multiplier that carries a dataset-era price over to a present-day price.
scale_factor = zillow_price_21 / median

In [81]:
def current_valuation(rm, ptrt, chas=False, hi_co=False):
    '''
        Print a present-day price estimate, with a range, for a property in Boston.

        Parameters :
        ------------------------------------------
        rm : No. of rooms, must be at least 1 (Not optional)
        ptrt : Student to teacher ratio, must be at least 1 (Not optional)
        chas : dummy variable , whether or not the house is near a river bed (Optional)
        hi_co : True asks for the wider, high-confidence price range (Optional)

        Nothing is returned; the figures are printed, rounded to the nearest thousand.
        Inputs below 1 only produce a warning message.
    '''

    # Guard clause: refuse implausible inputs before doing any work.
    if rm < 1 or ptrt < 1:
        print('Unrealistic figures !!!')
        return

    def to_current_price(log_value):
        # Undo the log transform, multiply by 1000 and apply the present-day scale factor.
        return np.e**log_value * 1000 * scale_factor

    log_point, log_upper, log_lower, _ = get_log_estimates(
        rm, ptrt, near_river_bed=chas, high_confidence=hi_co
    )

    # Convert each figure and round it to the nearest thousand.
    point = np.around(to_current_price(log_point), -3)
    upper = np.around(to_current_price(log_upper), -3)
    lower = np.around(to_current_price(log_lower), -3)

    print(f'Current valuation of the property is: {point}')
    print(f'Property upper side range:{upper}, lower side:{lower}')
    

In [83]:
# Zero rooms is not a realistic input, so this call demonstrates the validation message.
current_valuation(rm=0, ptrt=30, chas=True, hi_co=True)

Unrealistic figures !!!
