In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib

In [None]:
df = pd.read_csv('ml_house_data_set.csv')

In [None]:
#Display first five columns
head = df.head()
head 

In [None]:
del df['house_number']
del df['street_name']
del df['unit_number']

In [None]:
#Replace categorical data with one-hote encoded
feature_df = pd.get_dummies(df, columns=['garage_type', 'has_fireplace', 'has_pool', 'has_central_heating', 'has_central_cooling', 'city'])

#Remove the sale price from the feature data
del feature_df['sale_price']
feature_df

In [None]:
#Create the X and y arrays.
X = feature_df.to_numpy()
y = df['sale_price'].to_numpy()

In [None]:
#Split the data set in a training set (70%) and a test set (30%)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3, random_state=0)

In [None]:
#Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_samples_leaf=9,
    max_features=0.1,
    loss='huber', 
    random_state=0)

model.fit(X_train, y_train)

In [None]:
#Save the trained model to a file so we can use it in other programs
joblib.dump(model,'house_classifier.pkl')

In [None]:
#Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
print(mse)

In [None]:
#Find the error rate on the test set
mse = mean_absolute_error(y_test, model.predict(X_test))
print(mse)

In [None]:
#These are the feature labels from our data set
import numpy as np
feature_labels = np.array(['year_built', 'stories', 'num_bedrooms', 'full_bathrooms', 'half_bathrooms', 'livable_sqft', 'total_sqft', 'garage_sqft', 'carport_sqft', 'has_fireplace', 'has_pool', 'has_central_heating', 'has_central_cooling', 'garage_type_attached', 'garage_type_detached', 'garage_type_none', 'city_Amystad', 'city_Brownport', 'city_Chadstad', 'city_Clarkberg', 'city_Coletown', 'city_Davidfort', 'city_Davidtown', 'city_East Amychester', 'city_East Janiceville', 'city_East Justin', 'city_East Lucas', 'city_Fosterberg', 'city_Hallfort', 'city_Jeffreyhaven', 'city_Jenniferberg', 'city_Joshuafurt', 'city_Julieberg', 'city_Justinport', 'city_Lake Carolyn', 'city_Lake Christinaport', 'city_Lake Dariusborough', 'city_Lake Jack', 'city_Lake Jennifer', 'city_Leahview', 'city_Lewishaven', 'city_Martinezfort', 'city_Morrisport', 'city_New Michele', 'city_New Robinton', 'city_North Erinville', 'city_Port Adamtown', 'city_Port Andrealand', 'city_Port Daniel', 'city_Port Jonathanborough', 'city_Richardport', 'city_Rickytown', 'city_Scottberg', 'city_South Anthony', 'city_South Stevenfurt', 'city_Toddshire', 'city_Wendybury', 'city_West Ann', 'city_West Brittanyview', 'city_West Gerald', 'city_West Gregoryview', 'city_West Lydia', 'city_West Terrence'])

In [None]:
#Load the trained model created with train model
model = joblib.load('house_classifier.pkl')

In [None]:
#Create a numpy array based on the model's feature
importance = model.feature_importances_

In [None]:
# Sort the feature lables based on the feature importance rankings from the model
feature_indexes_by_importance = importance.argsort()

In [122]:
# Print each feature label, from most important to least important (reverse order)
for index in feature_indexes_by_importance:
    print("{} - {:.2f}%".format(feature_labels[index], (importance[index] * 100.0)))

city_Lake Jack - 0.00%
city_Port Adamtown - 0.00%
city_Port Daniel - 0.00%
city_Port Jonathanborough - 0.00%
city_Fosterberg - 0.00%
city_New Michele - 0.00%
city_Wendybury - 0.01%


IndexError: index 63 is out of bounds for axis 0 with size 63