# Housing Prices Competition

In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

iowa_file_path = 'train.csv'
home_data = pd.read_csv(iowa_file_path)

test_data_path = 'test.csv'
test_data = pd.read_csv(test_data_path)

In [20]:
# Keep only intuitive features
features = ['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'ExterQual', 'ExterCond', 'BsmtQual',
       'BsmtCond', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'FullBath','HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'GarageArea', 'GarageQual',
       'GarageCond']
X = home_data[features].copy()
y = home_data.SalePrice.copy()
test_X = test_data[features].copy()

X = X.dropna()
y = y[y.index.isin(X.index)]

In [21]:
one_hot_encoded_training_predictors = pd.get_dummies(X)
one_hot_encoded_test_predictors = pd.get_dummies(test_X)
X, test_X = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors, join='inner', axis=1)

In [22]:
# Find missing values
missing_val_count_by_column = (X.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

missing_val_count_by_column = (X.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Series([], dtype: int64)
Series([], dtype: int64)


In [23]:
# Find best features using a correlation matrix
correlation_dataframe = X.copy()

corr = correlation_dataframe.corr()
corr.style.background_gradient()
corr.style.background_gradient().set_precision(2)

Unnamed: 0,MSSubClass,OverallQual,OverallCond,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,2ndFlrSF,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,GarageArea,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,ExterQual_Ex,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,ExterCond_Ex,ExterCond_Fa,ExterCond_Gd,ExterCond_TA,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_Po,BsmtCond_TA,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA
MSSubClass,1.0,0.05,-0.1,0.06,0.04,-0.27,-0.26,0.31,0.11,0.2,-0.07,0.23,0.01,-0.07,0.01,0.19,0.02,-0.34,0.27,0.16,0.1,0.27,-0.01,-0.01,-0.11,0.01,0.0,0.01,-0.04,0.24,0.01,-0.19,0.17,-0.07,-0.01,0.04,0.07,-0.0,-0.1,0.0,0.11,0.09,-0.07,0.01,-0.03,0.01,0.07,-0.06,0.02,0.06,-0.05,0.03,-0.05,0.01,0.18,-0.15,0.0,-0.0,-0.02,0.0,-0.04,-0.0,0.05,-0.02,-0.02,0.06,0.04,-0.01,-0.03,0.01,0.0,0.03,-0.02
OverallQual,0.05,1.0,-0.16,0.57,0.55,0.51,0.48,0.28,0.58,0.25,0.08,-0.13,0.44,0.53,-0.13,0.17,-0.05,0.13,-0.21,0.08,-0.01,-0.04,-0.11,-0.02,0.12,0.01,-0.16,0.07,-0.16,-0.12,-0.08,-0.25,-0.01,0.02,0.23,0.38,-0.17,-0.07,-0.21,0.03,0.22,0.2,0.12,0.03,0.4,-0.09,0.54,-0.67,0.07,-0.1,-0.1,0.12,0.48,-0.14,0.33,-0.56,-0.14,0.09,-0.03,0.02,0.45,-0.12,0.38,-0.58,-0.15,0.04,-0.03,0.11,-0.02,-0.15,-0.02,-0.06,0.16
OverallCond,-0.1,-0.16,1.0,-0.42,0.03,-0.23,-0.15,0.01,-0.22,-0.07,0.02,-0.08,-0.07,-0.2,-0.05,-0.12,-0.03,-0.05,0.15,-0.06,0.01,-0.01,0.14,0.02,-0.11,0.18,-0.01,-0.11,-0.01,-0.04,-0.02,0.1,-0.0,0.08,-0.06,-0.13,0.21,0.02,0.06,-0.08,-0.13,-0.08,-0.07,0.06,-0.03,-0.07,-0.2,0.22,0.12,-0.1,0.35,-0.31,-0.17,0.06,-0.22,0.3,-0.04,0.08,-0.09,-0.03,-0.06,0.01,-0.02,0.05,0.05,0.1,-0.03,-0.1,0.01,0.02,0.06,-0.02,-0.04
YearBuilt,0.06,0.57,-0.42,1.0,0.62,0.38,0.26,0.01,0.49,0.23,-0.08,-0.18,0.1,0.44,-0.12,0.24,-0.06,0.26,-0.4,0.12,0.01,-0.01,-0.28,-0.03,0.29,-0.21,-0.1,0.21,-0.22,0.0,0.05,-0.19,0.01,0.02,0.13,0.27,-0.48,-0.19,-0.07,0.11,0.28,0.12,0.11,0.03,0.18,-0.08,0.56,-0.6,-0.08,-0.15,-0.2,0.25,0.32,-0.26,0.55,-0.65,-0.23,0.07,-0.05,0.1,0.2,-0.19,0.43,-0.48,-0.32,-0.06,-0.09,0.33,-0.03,-0.26,-0.03,-0.15,0.29
YearRemodAdd,0.04,0.55,0.03,0.62,1.0,0.26,0.24,0.12,0.46,0.17,-0.08,-0.16,0.17,0.39,-0.11,0.21,-0.09,0.07,-0.17,0.11,-0.01,-0.06,-0.15,-0.01,0.23,-0.06,-0.13,0.17,-0.17,-0.05,-0.01,-0.28,-0.04,-0.05,0.09,0.25,-0.15,-0.11,-0.08,0.05,0.25,0.09,0.06,0.02,0.2,-0.12,0.54,-0.59,0.03,-0.16,-0.01,0.06,0.29,-0.13,0.42,-0.55,-0.11,0.09,-0.05,0.0,0.24,-0.19,0.54,-0.61,-0.15,0.02,-0.08,0.15,-0.05,-0.13,-0.04,-0.1,0.17
TotalBsmtSF,-0.27,0.51,-0.23,0.38,0.26,1.0,0.89,-0.24,0.35,-0.13,0.03,0.04,0.29,0.51,-0.07,-0.06,-0.05,0.28,-0.26,0.06,-0.04,-0.13,-0.12,0.05,0.05,-0.03,-0.03,-0.13,-0.11,-0.1,0.03,-0.02,-0.03,0.03,0.17,0.3,-0.18,-0.08,-0.06,0.0,0.02,0.14,0.11,0.05,0.37,-0.05,0.25,-0.38,-0.01,-0.06,-0.08,0.1,0.42,-0.14,0.08,-0.28,-0.09,0.02,-0.01,0.04,0.38,-0.08,0.13,-0.31,-0.12,-0.02,-0.02,0.12,-0.03,-0.11,-0.03,-0.04,0.12
1stFlrSF,-0.26,0.48,-0.15,0.26,0.24,0.89,1.0,-0.23,0.37,-0.15,0.1,0.07,0.39,0.5,-0.07,-0.11,-0.06,0.3,-0.25,0.07,-0.05,-0.16,-0.11,0.04,-0.01,0.04,-0.03,-0.15,-0.12,-0.1,0.01,0.01,-0.05,0.09,0.16,0.26,-0.14,-0.04,-0.04,-0.0,-0.03,0.13,0.13,0.05,0.33,-0.05,0.2,-0.32,-0.01,-0.05,-0.02,0.04,0.38,-0.09,0.03,-0.22,-0.06,0.03,-0.01,0.02,0.35,-0.09,0.12,-0.28,-0.11,0.02,-0.03,0.09,-0.04,-0.1,-0.01,-0.04,0.11
2ndFlrSF,0.31,0.28,0.01,0.01,0.12,-0.24,-0.23,1.0,0.42,0.62,0.51,0.06,0.62,0.12,0.0,0.11,0.0,-0.1,0.04,-0.09,0.03,0.05,-0.01,0.03,-0.03,0.07,-0.04,0.19,-0.03,0.02,-0.11,-0.21,-0.0,0.03,0.25,-0.01,0.09,0.07,-0.14,0.03,0.06,-0.01,-0.03,-0.05,0.04,-0.02,0.17,-0.18,0.05,-0.01,0.0,-0.01,0.03,0.01,0.15,-0.17,0.03,0.04,-0.02,-0.04,0.05,-0.0,0.15,-0.18,0.03,0.04,-0.0,-0.05,-0.03,0.02,0.0,0.02,-0.02
FullBath,0.11,0.58,-0.22,0.49,0.46,0.35,0.37,0.42,1.0,0.13,0.36,0.09,0.55,0.42,-0.08,0.17,-0.02,0.11,-0.22,0.06,-0.01,-0.07,-0.16,-0.01,0.11,-0.03,-0.11,0.18,-0.16,-0.07,-0.03,-0.3,0.06,0.13,0.17,0.21,-0.12,0.0,-0.17,0.06,0.2,0.09,0.1,-0.04,0.19,-0.08,0.43,-0.49,0.06,-0.08,-0.06,0.08,0.24,-0.07,0.4,-0.52,-0.07,0.05,-0.03,0.01,0.17,-0.11,0.38,-0.44,-0.12,0.04,-0.02,0.1,-0.04,-0.11,-0.02,-0.04,0.12
HalfBath,0.2,0.25,-0.07,0.23,0.17,-0.13,-0.15,0.62,0.13,1.0,0.24,-0.06,0.36,0.13,-0.04,0.14,-0.07,0.05,-0.12,-0.06,0.05,0.13,-0.1,-0.02,-0.01,0.0,-0.08,0.2,-0.07,0.03,-0.1,-0.11,0.02,0.09,0.17,0.01,-0.11,-0.03,-0.04,0.05,0.09,-0.0,-0.01,0.06,0.06,-0.04,0.15,-0.17,0.01,-0.05,-0.08,0.09,0.06,-0.08,0.18,-0.19,-0.07,0.01,-0.02,0.04,0.09,-0.03,0.11,-0.15,-0.04,0.0,-0.04,0.04,-0.03,-0.06,-0.01,-0.06,0.08


In [24]:
# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42)

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(train_X, train_y)
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

Validation MAE for Random Forest Model: 19,551


In [25]:
# To improve accuracy, create a new Random Forest model which you will train on all training data
rf_model_on_full_data = RandomForestRegressor(random_state=42)

# fit rf_model_on_full_data on all data from the 
rf_model_on_full_data.fit(X, y)

RandomForestRegressor(random_state=42)

In [27]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
test_X = my_imputer.fit_transform(test_X)

# make predictions which we will submit. 
test_preds = rf_model_on_full_data.predict(test_X)

# The lines below shows you how to save your data in the format needed to score it in the competition
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})

output.to_csv('submission.csv', index=False)