In [50]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

import visuals as vs 
%matplotlib inline





In [51]:
# Load the housing table, then separate the 'MEDV' target column from the
# remaining predictor columns.
data = pd.read_csv('housing.csv')
features = data.drop('MEDV', axis=1)  # every column except the target
price = data['MEDV']                  # the target column on its own
data.head()

Unnamed: 0,RM,LSTAT,PTRATIO,MEDV
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0
2,7.185,4.03,17.8,728700.0
3,6.998,2.94,18.7,701400.0
4,7.147,5.33,18.7,760200.0


In [52]:
# Summary statistics of the 'MEDV' column.
medv_column = data['MEDV']
min_price, max_price = medv_column.min(), medv_column.max()
mean_price, median_price = medv_column.mean(), medv_column.median()
std_price = medv_column.std()  # pandas Series.std(): sample standard deviation (ddof=1)

print("Minimum price: ${}".format(min_price))
print("Maximum price: ${}".format(max_price))
print("Mean price: ${}".format(mean_price))
print("Median price: ${}".format(median_price))
print("Standard Deviation: ${}".format(std_price))


Minimum price: $105000.0
Maximum price: $1024800.0
Mean price: $454342.9447852761
Median price: $438900.0
Standard Deviation: $165340.27765266786


In [53]:
def performance_metric(y_true, y_predict):
    """Score predictions against true values with the R^2 coefficient.

    Parameters
    ----------
    y_true : true target values.
    y_predict : predicted target values, aligned with ``y_true``.

    Returns
    -------
    The value returned by ``r2_score(y_true, y_predict)``.
    """
    return r2_score(y_true, y_predict)
    

In [54]:
# Spot-check the metric on a small hand-written example.
performance_metric(
    y_true=[3, -0.5, 2, 7, 4.2],
    y_predict=[2.5, 0.0, 2.1, 7.8, 5.3],
)

0.9228556485355649

In [72]:

# Hold out 33% of the rows for testing; the fixed seed keeps the shuffle
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    price,
    test_size=0.33,
    random_state=0,
)

print("Training and testing split was successful.")

Training and testing split was successful.


In [73]:
def fit_model(X, y):
    """Grid-search a decision tree regressor over 'max_depth' and return the best one.

    Candidate depths 1 through 10 are each scored with ``performance_metric``
    on 10 shuffled train/validation splits (20% validation) of ``X``/``y``.

    Parameters
    ----------
    X : training features, passed unchanged to ``GridSearchCV.fit``.
    y : training targets aligned with ``X``.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.
    """
    # Imported locally on purpose: the notebook-level ShuffleSplit and
    # GridSearchCV names are bound from sklearn.cross_validation and
    # sklearn.grid_search, which were deprecated in scikit-learn 0.18 and
    # removed in 0.20. The model_selection versions are the supported API.
    from sklearn.model_selection import GridSearchCV, ShuffleSplit

    # The model_selection ShuffleSplit no longer takes the sample count up
    # front, and 'n_iter' is now called 'n_splits'; the number of rows is
    # picked up from X when the search calls split().
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

    # Seed the tree so repeated runs of the search select the same model.
    regressor = DecisionTreeRegressor(random_state=0)

    params = {'max_depth': list(range(1, 11))}
    scoring_fnc = make_scorer(performance_metric)

    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)
    grid.fit(X, y)
    return grid.best_estimator_

In [74]:
# Run the grid search on the training split and keep the winning estimator.
reg = fit_model(X_train, y_train)

# Report the 'max_depth' value the search settled on.
optimal_depth = reg.get_params()['max_depth']
print("Parameter 'max_depth' is {} for the optimal model.".format(optimal_depth))

Parameter 'max_depth' is 4 for the optimal model.
