[View in Colaboratory](https://colab.research.google.com/github/Mahsa-Kiani/lectures/blob/master/ValueEstimate.ipynb)

In [4]:
def estimate_home_value(size_in_sqft, number_of_bedrooms,
                        base_value=50000, price_per_sqft=92,
                        price_per_bedroom=10000):
    """Estimate a home's value with a simple hand-written linear formula.

    The estimate is ``base_value + size_in_sqft * price_per_sqft
    + number_of_bedrooms * price_per_bedroom``.

    Args:
        size_in_sqft: Size of the house in square feet.
        number_of_bedrooms: Number of bedrooms in the house.
        base_value: Starting value given to every home (default 50000).
        price_per_sqft: Amount added per square foot (default 92).
        price_per_bedroom: Amount added per bedroom (default 10000).

    Returns:
        The estimated value, in the same units as ``base_value``.
    """
    # Start from the value every home is assumed to be worth at minimum
    value = base_value

    # Adjust the value estimate based on the size of the house
    value += size_in_sqft * price_per_sqft

    # Adjust the value estimate based on the number of bedrooms
    value += number_of_bedrooms * price_per_bedroom

    return value

# Try the estimator on a house whose actual value is known:
#   size:         3800 sq ft
#   bedrooms:     5
#   actual value: $450,000
value = estimate_home_value(size_in_sqft=3800, number_of_bedrooms=5)

# Label on one line, estimate on the next
print("Estimated valued:", value, sep="\n")

Estimated valued:
449600


In [5]:
from google.colab import files
# Upload file(s) into this Colab session. Later cells look entries up in
# `uploaded` by file name and decode the stored content as UTF-8 text.
uploaded = files.upload()

Saving ml_house_data_set.csv to ml_house_data_set.csv


In [6]:
# Report the name and size (in bytes) of every uploaded file
for fn, payload in uploaded.items():
    message = 'User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload))
    print(message)

User uploaded file "ml_house_data_set.csv" with length 4560349 bytes


In [7]:
import pandas as pd
import webbrowser
import os
import io

# Decode the uploaded content to text, then parse that text as CSV with Pandas
csv_text = uploaded['ml_house_data_set.csv'].decode('utf-8')
df = pd.read_csv(io.StringIO(csv_text))

# Preview the first rows of the data table
print(df.head())



   year_built  stories  num_bedrooms  full_bathrooms  half_bathrooms  \
0        1978        1             4               1               1   
1        1958        1             3               1               1   
2        2002        1             3               2               0   
3        2004        1             4               2               0   
4        2006        1             4               2               0   

   livable_sqft  total_sqft garage_type  garage_sqft  carport_sqft  \
0          1689        1859    attached          508             0   
1          1984        2002    attached          462             0   
2          1581        1578        none            0           625   
3          1829        2277    attached          479             0   
4          1580        1749    attached          430             0   

   has_fireplace  has_pool  has_central_heating  has_central_cooling  \
0           True     False                 True                 True   
1 

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib

# Load the data set
df = pd.read_csv("ml_house_data_set.csv")

# Remove the fields from the data set that we don't want to include in our model
df = df.drop(columns=['house_number', 'unit_number', 'street_name', 'zip_code'])

# Replace categorical data with one-hot encoded data
features_df = pd.get_dummies(df, columns=['garage_type', 'city'])

# Remove the sale price from the feature data (it is the value being predicted)
del features_df['sale_price']

# Create the X and y arrays.
# NOTE: .as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# the .values attribute yields the same NumPy array on old and new versions.
X = features_df.values
y = df['sale_price'].values
print(X)
print(y)


[[1978 1 4 ... 0 0 0]
 [1958 1 3 ... 0 0 0]
 [2002 1 3 ... 0 0 0]
 ...
 [1983 1 1 ... 0 0 0]
 [1981 1 3 ... 0 0 0]
 [1980 1 3 ... 0 0 0]]
[ 270897.  302404. 2519996. ...   98280.   98278.  186480.]


In [0]:
# Split the data set in a training set (70%) and a test set (30%).
# The shuffle is seeded so the same rows land in each set on every run,
# which keeps the trained model and its error figures reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [10]:
# Hyperparameters for the gradient boosting regressor
hyperparameters = {
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_samples_leaf': 9,
    'max_features': 0.1,
    'loss': 'huber',
    'random_state': 0,
}

# Build the regressor and fit it to the training split
model = ensemble.GradientBoostingRegressor(**hyperparameters)
model.fit(X_train, y_train)

# Write the fitted model to a file so other programs can load it
joblib.dump(model, 'trained_house_classifier_model.pkl')

['trained_house_classifier_model.pkl']

In [11]:

# Mean absolute error of the model's predictions on the training set
train_predictions = model.predict(X_train)
train_mae = mean_absolute_error(y_train, train_predictions)
print("Training Set Mean Absolute Error: %.4f" % train_mae)

# Mean absolute error of the model's predictions on the test set
test_predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_test, test_predictions)
print("Test Set Mean Absolute Error: %.4f" % test_mae)


Training Set Mean Absolute Error: 47130.6706
Test Set Mean Absolute Error: 60415.2605


In [12]:
import numpy as np
from sklearn.externals import joblib

# Feature labels from our data set. They are looked up below by the model's
# feature positions, so their order has to match the training columns.
feature_labels = np.array([
    'year_built', 'stories', 'num_bedrooms', 'full_bathrooms', 'half_bathrooms',
    'livable_sqft', 'total_sqft', 'garage_sqft', 'carport_sqft',
    'has_fireplace', 'has_pool', 'has_central_heating', 'has_central_cooling',
    'garage_type_attached', 'garage_type_detached', 'garage_type_none',
    'city_Amystad', 'city_Brownport', 'city_Chadstad', 'city_Clarkberg',
    'city_Coletown', 'city_Davidfort', 'city_Davidtown', 'city_East Amychester',
    'city_East Janiceville', 'city_East Justin', 'city_East Lucas',
    'city_Fosterberg', 'city_Hallfort', 'city_Jeffreyhaven', 'city_Jenniferberg',
    'city_Joshuafurt', 'city_Julieberg', 'city_Justinport', 'city_Lake Carolyn',
    'city_Lake Christinaport', 'city_Lake Dariusborough', 'city_Lake Jack',
    'city_Lake Jennifer', 'city_Leahview', 'city_Lewishaven', 'city_Martinezfort',
    'city_Morrisport', 'city_New Michele', 'city_New Robinton',
    'city_North Erinville', 'city_Port Adamtown', 'city_Port Andrealand',
    'city_Port Daniel', 'city_Port Jonathanborough', 'city_Richardport',
    'city_Rickytown', 'city_Scottberg', 'city_South Anthony',
    'city_South Stevenfurt', 'city_Toddshire', 'city_Wendybury', 'city_West Ann',
    'city_West Brittanyview', 'city_West Gerald', 'city_West Gregoryview',
    'city_West Lydia', 'city_West Terrence',
])

# Load the trained model from its pickle file
model = joblib.load('trained_house_classifier_model.pkl')

# One importance score per feature, as computed by the fitted model
importance = model.feature_importances_

# argsort() yields feature indexes in ascending order of importance
ascending_order = importance.argsort()

# Print each feature label with its importance as a percentage,
# from least important to most important
for label, score in zip(feature_labels[ascending_order], importance[ascending_order]):
    print("{} - {:.2f}%".format(label, (score * 100.0)))

city_Martinezfort - 0.00%
city_Julieberg - 0.00%
city_New Michele - 0.00%
city_New Robinton - 0.00%
city_Davidtown - 0.03%
city_Rickytown - 0.08%
city_West Terrence - 0.08%
city_Port Daniel - 0.11%
city_Toddshire - 0.11%
city_Amystad - 0.11%
city_West Brittanyview - 0.12%
city_Fosterberg - 0.13%
city_East Justin - 0.14%
city_Lake Jennifer - 0.14%
city_Jenniferberg - 0.16%
city_Leahview - 0.16%
city_South Stevenfurt - 0.17%
city_Clarkberg - 0.18%
city_Joshuafurt - 0.18%
city_West Gerald - 0.19%
city_Brownport - 0.19%
city_West Lydia - 0.20%
city_Lake Carolyn - 0.21%
city_Port Adamtown - 0.21%
city_Wendybury - 0.24%
city_Port Jonathanborough - 0.26%
city_Davidfort - 0.27%
city_East Amychester - 0.28%
city_Scottberg - 0.29%
city_East Janiceville - 0.29%
city_Lake Dariusborough - 0.29%
city_West Gregoryview - 0.30%
city_East Lucas - 0.32%
city_Lake Christinaport - 0.37%
city_Justinport - 0.38%
city_Hallfort - 0.38%
city_Richardport - 0.41%
city_Morrisport - 0.49%
city_South Anthony - 0.50%

In [15]:
from sklearn.externals import joblib

# Load the model we trained previously
model = joblib.load('trained_house_classifier_model.pkl')

# The features of the house we want to value must be supplied in the exact
# same arrangement as the training data set: the plain house attributes first,
# then the one-hot garage type entries, then the one-hot city entries.
house_attributes = [
    2006,   # year_built
    1,      # stories
    4,      # num_bedrooms
    3,      # full_bathrooms
    0,      # half_bathrooms
    2200,   # livable_sqft
    2350,   # total_sqft
    0,      # garage_sqft
    0,      # carport_sqft
    True,   # has_fireplace
    False,  # has_pool
    True,   # has_central_heating
    True,   # has_central_cooling
]

# Garage type: exactly one entry is 1 (here: none)
garage_type_names = ['attached', 'detached', 'none']
garage_one_hot = [1 if name == 'none' else 0 for name in garage_type_names]

# City: exactly one entry is 1 (here: Brownport)
city_names = [
    'Amystad', 'Brownport', 'Chadstad', 'Clarkberg', 'Coletown', 'Davidfort',
    'Davidtown', 'East Amychester', 'East Janiceville', 'East Justin',
    'East Lucas', 'Fosterberg', 'Hallfort', 'Jeffreyhaven', 'Jenniferberg',
    'Joshuafurt', 'Julieberg', 'Justinport', 'Lake Carolyn',
    'Lake Christinaport', 'Lake Dariusborough', 'Lake Jack', 'Lake Jennifer',
    'Leahview', 'Lewishaven', 'Martinezfort', 'Morrisport', 'New Michele',
    'New Robinton', 'North Erinville', 'Port Adamtown', 'Port Andrealand',
    'Port Daniel', 'Port Jonathanborough', 'Richardport', 'Rickytown',
    'Scottberg', 'South Anthony', 'South Stevenfurt', 'Toddshire', 'Wendybury',
    'West Ann', 'West Brittanyview', 'West Gerald', 'West Gregoryview',
    'West Lydia', 'West Terrence',
]
city_one_hot = [1 if name == 'Brownport' else 0 for name in city_names]

# Full feature vector for the single house
house_to_value = house_attributes + garage_one_hot + city_one_hot

# predict() takes a collection of houses, so wrap our single house in a list
homes_to_value = [house_to_value]

# Run the model; it returns one prediction per house in homes_to_value
predicted_home_values = model.predict(homes_to_value)

# Only one house was passed in, so the first prediction is the one we want
predicted_value = predicted_home_values[0]

print("This house has an estimated value of ${:,.2f}".format(predicted_value))



This house has an estimated value of $594,639.98


In [0]:
from google.colab import files
# Download the pickled model file written by the training cell from the
# Colab runtime.
files.download('trained_house_classifier_model.pkl')

In [14]:
import pandas
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

# Load the data set
df = pandas.read_csv("ml_house_data_set.csv")

# Remove the fields from the data set that we don't want to include in our model
df = df.drop(columns=['house_number', 'unit_number', 'street_name', 'zip_code'])

# Replace categorical data with one-hot encoded data
features_df = pandas.get_dummies(df, columns=['garage_type', 'city'])
del features_df['sale_price']

# Create the X and y arrays.
# NOTE: .as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# the .values attribute yields the same NumPy array on old and new versions.
X = features_df.values
y = df['sale_price'].values

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create the model
model = ensemble.GradientBoostingRegressor()

# Parameters we want to try.
# Loss functions:
#   ls:    least squares error
#   lad:   least absolute deviation
#   huber: Huber loss function
# NOTE(review): scikit-learn >= 1.0 renames 'ls' to 'squared_error' and 'lad'
# to 'absolute_error'; update these values when upgrading scikit-learn.
param_grid = {
    'n_estimators': [500, 1000, 3000],
    'max_depth': [4, 6],
    'min_samples_leaf': [3, 5, 9, 17],
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
    'max_features': [1.0, 0.3, 0.1],
    'loss': ['ls', 'lad', 'huber']
}

# Define the grid search we want to run. Run it with four cpus in parallel.
# The grid above has 3 * 2 * 4 * 4 * 3 * 3 = 864 parameter combinations.
gs_cv = GridSearchCV(model, param_grid, n_jobs=4)

# Run the grid search - on only the training data!
gs_cv.fit(X_train, y_train)

# Print the parameters that gave us the best result!
print(gs_cv.best_params_)

# After running for a long time, the output will be something like
# {'loss': 'huber', 'learning_rate': 0.1, 'min_samples_leaf': 9, 'n_estimators': 3000, 'max_features': 0.1, 'max_depth': 6}

# That is the combination that worked best.

# Find the mean absolute error on the training set using the best parameters
train_mae = mean_absolute_error(y_train, gs_cv.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % train_mae)

# Find the mean absolute error on the test set using the best parameters
test_mae = mean_absolute_error(y_test, gs_cv.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % test_mae)



KeyboardInterrupt: ignored