In [3]:
# Lets do our general imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error,r2_score
import joblib

df = pd.read_csv('../data/encoded_real_estate_data.csv')

df.head()

Unnamed: 0,text,year_built,beds,baths,baths_full,baths_half,garage,lot_sqft,sqft,stories,lastSoldPrice,soldOn,listPrice,type_condos,type_land,type_mobile,type_multi_family,type_single_family,type_townhomes,status_ready_to_build
0,"Come check out this amazing, move-in ready, 5 ...",1951,4,4,3,1,1,5615.0,3000.0,2,0.0,Not Sold,750000.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,Move in ready 2 story Mount Greenwood home on ...,1922,4,3,2,1,2,6566.0,2900.0,2,105000.0,1991-04-16,499900.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Investor Special MONEY MAKING Gem. This is you...,1947,4,2,2,0,3,5375.0,1170.0,2,60000.0,2019-10-28,325600.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,Renovated Brick 1.5 story house with finished ...,1945,4,3,2,1,2,6138.0,2511.0,2,170000.0,1996-05-30,620000.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,This almost 3000sq foot home was thoughtfully ...,1907,4,4,2,2,2,8476.0,2870.0,3,0.0,Not Sold,850000.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Lets drop unused columns
drop_cols = ['text', 'soldOn']
df_model = df.drop(columns = drop_cols)

# Lets split up our features and targets
X = df_model.drop(columns = ['listPrice'])  # Our features
y = df_model['listPrice']                   # Our target variable

# Setting up our test train split for the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

# Training our model
model = RandomForestRegressor(n_estimators = 100, random_state = 42)
model.fit(X_train, y_train)

# Lets evaluate our model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Lets see our models accuracy to see if this is a good fit
print(f"MAE: ${mae:,.0f}")
print(f"R^2 Score: {r2:.2f}")

MAE: $187,452
R^2 Score: 0.67


In [22]:
# We got an r^2 of 0.67, which isnt too bad, but we can do better.
# Using feature engineering we can make this better, so lets take it back to the basics

# We should add some new columns to get better results
# Starting off I believe that we should add house age to the mix.
from datetime import datetime
import numpy as np

# Gets our current year
current_year = datetime.now().year

# Create a column that shows the age of the house
df_model['house_age'] = current_year - df_model['year_built']

# Adding a price per square feet column
df_model["price_per_sqft"] = df_model["listPrice"] / df_model["sqft"].replace(0, np.nan)

# Lastly lets try adding a lot ratio to see if this affects pricing
df_model["lot_ratio"] = df_model["lot_sqft"] / df_model["sqft"].replace(0, np.nan)

# Take a peek at our new glorious dataset
df_model.head()

Unnamed: 0,year_built,beds,baths,baths_full,baths_half,garage,lot_sqft,sqft,stories,lastSoldPrice,...,type_condos,type_land,type_mobile,type_multi_family,type_single_family,type_townhomes,status_ready_to_build,house_age,price_per_sqft,lot_ratio
0,1951,4,4,3,1,1,5615.0,3000.0,2,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,74,250.0,1.871667
1,1922,4,3,2,1,2,6566.0,2900.0,2,105000.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,103,172.37931,2.264138
2,1947,4,2,2,0,3,5375.0,1170.0,2,60000.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,78,278.290598,4.594017
3,1945,4,3,2,1,2,6138.0,2511.0,2,170000.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,80,246.91358,2.444444
4,1907,4,4,2,2,2,8476.0,2870.0,3,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,118,296.167247,2.95331


In [None]:
# Lets try our new model after engineering the new features
X = df_model.drop(columns=['listPrice'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
model = RandomForestRegressor(n_estimators = 100, random_state = 42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae_revised = mean_absolute_error(y_test, y_pred)
r2_revised = r2_score(y_test, y_pred)

print(f"MAE: ${mae_revised:,.0f}")
print(f"R^2 Score: {r2_revised:.2f}")

MAE: $41,185
R^2 Score: 0.88


In [25]:
# Our model improved greatly, and I believe 88% is satisfactory, lets export our model now.
joblib.dump(model, "house_price_model_v2.pkl")

['house_price_model_v2.pkl']