# Estimating House Prices with a Support Vector Regressor

In [13]:
import numpy
from sklearn import datasets
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import ipywidgets as widgets
from IPython.display import display

import sys

sys.path.append("../")

from common import common_widgets

## Data Set

In [14]:
# Load the Boston house-price dataset bundled with scikit-learn; `data.data`
# and `data.target` are consumed by the training and widget cells below.
# NOTE(review): `load_boston` was deprecated and later removed from newer
# scikit-learn releases — confirm the pinned version still provides it.
data = datasets.load_boston()

# Print the dataset's own description text for reference.
print(data.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

## Initializing the Model

In [22]:
def train(random_state=7, test_size=0.2, C=1.0, epsilon=0.1, kernel="linear"):
    """Shuffle the dataset, split it, and fit a support vector regressor.

    Args:
        random_state: Seed handed to both `shuffle` and `train_test_split`.
        test_size: Passed through to `train_test_split` as `test_size`.
        C: Passed through to `SVR` as `C`.
        epsilon: Passed through to `SVR` as `epsilon`.
        kernel: Passed through to `SVR` as `kernel`.

    Returns:
        A 7-tuple ``(features, target, feature_train, feature_test,
        target_train, target_test, regressor)`` where the first two entries
        are the shuffled copies of ``data.data`` / ``data.target`` and the
        last entry is the fitted `SVR`.
    """
    shuffled_x, shuffled_y = shuffle(
        data.data, data.target, random_state=random_state
    )

    # train_test_split yields (X_train, X_test, y_train, y_test) in that order.
    x_train, x_test, y_train, y_test = train_test_split(
        shuffled_x, shuffled_y, test_size=test_size, random_state=random_state
    )

    # fit() returns the estimator itself, so construction and fitting chain.
    model = SVR(kernel=kernel, C=C, epsilon=epsilon).fit(x_train, y_train)

    return shuffled_x, shuffled_y, x_train, x_test, y_train, y_test, model

# Interactive controls for the training hyper-parameters.
# 'precomputed' is deliberately not offered as a kernel choice: with that
# setting SVR interprets its input as a square kernel (Gram) matrix, so
# fitting it on the raw feature matrix inside `train` raises an error.
model_widget = widgets.interact(
    train,
    random_state=common_widgets.random_state,
    test_size=common_widgets.test_size,
    C=widgets.FloatSlider(value=1.0, min=1.0, max=100.0),
    epsilon=widgets.FloatLogSlider(value=0.1, base=10, min=-5, max=1, step=0.5),
    kernel=['linear', 'poly', 'rbf', 'sigmoid']
)

# Bind the split data and the fitted regressor to notebook-level names for
# the cells below.
# NOTE(review): `widgets.interact(f, ...)` appears to return `f` itself, so
# this call runs `train` with its default arguments rather than the values
# currently selected in the controls — confirm this is the intent.
features, target, feature_train, feature_test, target_train, target_test, regressor = model_widget()

(array([[  6.72400000e-02,   0.00000000e+00,   3.24000000e+00, ...,
           1.69000000e+01,   3.75210000e+02,   7.34000000e+00],
        [  9.23230000e+00,   0.00000000e+00,   1.81000000e+01, ...,
           2.02000000e+01,   3.66150000e+02,   9.53000000e+00],
        [  1.14250000e-01,   0.00000000e+00,   1.38900000e+01, ...,
           1.64000000e+01,   3.93740000e+02,   1.05000000e+01],
        ..., 
        [  1.50980000e-01,   0.00000000e+00,   1.00100000e+01, ...,
           1.78000000e+01,   3.94510000e+02,   1.03000000e+01],
        [  2.29270000e-01,   0.00000000e+00,   6.91000000e+00, ...,
           1.79000000e+01,   3.92740000e+02,   1.88000000e+01],
        [  1.39140000e-01,   0.00000000e+00,   4.05000000e+00, ...,
           1.66000000e+01,   3.96900000e+02,   1.46900000e+01]]),
 array([ 22.6,  50. ,  23. ,   8.3,  21.2,  19.9,  20.6,  18.7,  16.1,
         18.6,   8.8,  17.2,  14.9,  10.5,  50. ,  29. ,  23. ,  33.3,
         29.4,  21. ,  23.8,  19.1,  20.4,  29.1, 

## Test Data

In [18]:
# Score the fitted regressor on the held-out test split.
predictions = regressor.predict(feature_test)
mse = mean_squared_error(target_test, predictions)
evs = explained_variance_score(target_test, predictions)

# Emit the three report lines in one call; sep="\n" puts each on its own line.
print(
    "Performance:",
    "\tMean Squared Error\t= {:.2f}".format(mse),
    "\tExplained Variance Score\t= {:.2f}".format(evs),
    sep="\n",
)

Performance:
	Mean Squared Error	= 29.90
	Explained Variance Score	= 0.74


## New Data Point

In [None]:
def predict(crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat):
    """Print the regressor's price estimate for one set of feature values.

    The thirteen arguments are packed, in signature order, into a single
    sample row and passed to the notebook-level `regressor`. The predicted
    value is multiplied by 1000 before being printed as a dollar amount.
    """
    sample = [crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat]
    # A single-row input yields a single-element prediction; unpack it.
    (estimate,) = regressor.predict([sample])
    print("Predicted Price\t= ${:.2f}".format(estimate * 1000))

def slider(data, label):
    """Build a FloatSlider covering the observed range of `data`.

    Args:
        data: Object exposing ``.min()`` and ``.max()`` (the callers in this
            notebook pass a column of the dataset).
        label: Text used as the slider's description.

    Returns:
        A `widgets.FloatSlider` whose bounds are the minimum and maximum of
        `data` and whose initial value is the midpoint of those bounds.
    """
    # Local names avoid shadowing the built-in min()/max() functions.
    lower = data.min()
    upper = data.max()
    return widgets.FloatSlider(
        min=lower,
        max=upper,
        value=(lower + upper) / 2,
        description=label,
        # Let the label take its natural width so long descriptions are
        # shown in full instead of being cut off at the default width.
        style={"description_width": "initial"},
    )

# One control per `predict` argument, built in signature order.
# Each spec is (keyword, column index in data.data, label); a column of None
# marks the one boolean feature, which gets a checkbox instead of a slider.
# Slider ranges come from the rounded values of the corresponding column.
feature_controls = {
    keyword: (
        widgets.Checkbox(description=text)
        if column is None
        else slider(data.data[:, column].round(), text)
    )
    for keyword, column, text in (
        ("crim", 0, "Crime per Capita"),
        ("zn", 1, "Land Zoned for 25000ft^2"),
        ("indus", 2, "Non-Retail Business Acres Per Town"),
        ("chas", None, "On Charles River"),
        ("nox", 4, "NOX concentration (pp10m)"),
        ("rm", 5, "Avg. rooms per home"),
        ("age", 6, "Prop. units built before 1940"),
        ("dis", 7, "Distance to 5 employment centers"),
        ("rad", 8, "Index of accessibility to radial highways"),
        ("tax", 9, "Property tax per $10,000"),
        ("ptratio", 10, "Students per teacher by town"),
        ("b", 11, "Prop. blacks by town"),
        ("lstat", 12, "% Lower status of population"),
    )
}

widget = widgets.interactive(predict, **feature_controls)

# Bare last expression so the notebook renders the interactive panel.
widget