In [None]:
#Import libraries necessary for this project
import numpy as np
import pandas as pd 
from sklearn.model_selection import ShuffleSplit

#Import supplementary visualizations code visuals.py
import visuals as vs

#pretty displays for notebooks
%matplotlib inline

# Read the housing CSV, then separate the 'MEDV' column (used as the prices)
# from the remaining columns (used as the features).
data = pd.read_csv('housing.csv')
features = data.drop(columns=['MEDV'])
prices = data['MEDV']

# Confirm the load by reporting the frame's row and column counts.
print("Boston housing dataset has {} data points with {} variables each."
      .format(data.shape[0], data.shape[1]))

Data exploration

In the first section of the project, we will perform an exploratory analysis of the dataset and
record some initial observations.

Calculate Statistics

In [None]:
# Summary statistics of the prices, each computed with a NumPy reduction.
minimum_price = np.min(prices)
maximum_price = np.max(prices)
mean_price = np.mean(prices)
median_price = np.median(prices)
std_price = np.std(prices)  # np.std defaults to ddof=0 (population std)

# Print a header, then one line per statistic in the order computed above.
print("Statistics of housing dataset:\n")
print("\n".join(
    template.format(value)
    for template, value in [
        ("Minimum price: ${}", minimum_price),
        ("Maximum price: ${}", maximum_price),
        ("Mean price: ${}", mean_price),
        ("Median price: ${}", median_price),
        ("standard deviation of prices: ${}", std_price),
    ]
))

Feature Observation

Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Draw pairwise relationship plots for every column in `data`.
# `height` sets the height (in inches) of each facet; it replaces the old
# `size` keyword, which seaborn renamed in 0.9 and no longer accepts.
sns.pairplot(data, height=2.5)
plt.tight_layout()

Correlation matrix

In [None]:
# Column names, used as the tick labels on both heatmap axes.
cols = data.columns.tolist()

# Correlation matrix of all columns. np.corrcoef treats each ROW as one
# variable, so the value array is transposed to put columns on the rows.
cm = np.corrcoef(data.values.T)

# Enlarge fonts, then draw the annotated heatmap (values shown to 2 decimals).
sns.set(font_scale=1.5)
hm = sns.heatmap(cm,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={'size': 15},
                 yticklabels=cols,
                 xticklabels=cols)

Developing a model

In [None]:
# import 'r2_score'
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """Score predictions against true values using r2_score.

    Parameters
    ----------
    y_true : true target values, passed to r2_score unchanged.
    y_predict : predicted target values, passed to r2_score unchanged.

    Returns
    -------
    The value returned by sklearn.metrics.r2_score(y_true, y_predict).
    """
    return r2_score(y_true, y_predict)
    

Shuffle and split data

In [None]:
#import 'train_test_split'
from sklearn.model_selection import train_test_split

# Shuffle and split the data into training and testing subsets:
# 20% of the rows are held out for testing, and random_state is fixed so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    features, prices, test_size=0.2, random_state=42)

#success
print("Training and testing split was successful.")

Analyzing Model Performance

In [None]:
# Call the visuals.py helper on the training split (presumably it draws
# diagnostic plots; its implementation is not visible here).
# NOTE(review): the earlier comment described learning curves over varying
# training set sizes and maximum depths, but the function called is
# `modelComplexity` - confirm which plot is intended, and that the name and
# casing match what visuals.py actually defines.
vs.modelComplexity(x_train, y_train)

Evaluating Model Performance

Grid Search

In [None]:
# Example parameter grid in grid-search form: a list of dicts, each mapping a
# parameter name to the candidate values to try. The first dict covers a
# 'linear' kernel over C; the second covers an 'rbf' kernel over C and gamma.
# Keys and values use plain ASCII quotes; typographic quotes are not valid
# Python string delimiters.
param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

Cross validation

Fitting a model

In [None]:
# Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def fit_model(X, y):
    """Grid-search 'max_depth' for a decision tree regressor fit on [X, y].

    Depths 1 through 10 are compared using 10 shuffled splits that each
    hold out 20% of the data, scored with 'performance_metric'. Returns
    the search's best estimator.
    """
    # Ten shuffled partitions with a 20% hold-out each; seed fixed at 0
    splitter = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

    # Candidate tree depths: 1, 2, ..., 10
    depth_grid = {'max_depth': list(range(1, 11))}

    # Wrap 'performance_metric' so the grid search can use it as a scorer
    scorer = make_scorer(performance_metric)

    # Build the grid search over a fresh regressor and fit it to the data;
    # fit() returns the fitted search object itself
    search = GridSearchCV(
        estimator=DecisionTreeRegressor(),
        param_grid=depth_grid,
        scoring=scorer,
        cv=splitter,
    ).fit(X, y)

    # Hand back the best-scoring model found by the search
    return search.best_estimator_

Making predictions

Optimal model

In [None]:
# Fit the training data to the model using grid search
reg = fit_model(x_train, y_train)

# Report the 'max_depth' of the returned estimator. The message is written as
# a single string literal, because a plain quoted string cannot contain a raw
# line break.
print("parameter 'max_depth' is {} for the optimal model."
      .format(reg.get_params()['max_depth']))

Predicting selling prices

In [None]:
# Input rows for three clients, one inner list of feature values per client
client_data = [
    [5, 17, 15],  # Client 1
    [4, 32, 22],  # Client 2
    [8, 3, 12],   # Client 3
]

# Predict a price for each row and print it, numbering the clients from 1
for client_number, price in enumerate(reg.predict(client_data), start=1):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(client_number, price))

Model Sensitivity

In [None]:
# Pass the full feature/price data, the `fit_model` function itself, and the
# client rows to the visuals.py helper. Presumably it refits the model several
# times to show how much the predictions vary; the helper's implementation is
# not visible here.
# NOTE(review): confirm the helper is really named `predictTrails` in
# visuals.py - the spelling looks like a possible typo of "Trials".
vs.predictTrails(features, prices, fit_model, client_data)