In [66]:
# Import your packages used to explore the data
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import seaborn as sns

In [67]:
# Load the house data CSV into a DataFrame (relative path, so it is resolved against the working directory)
df = pd.read_csv(filepath_or_buffer="kc_house_data.csv")

In [113]:
# Preview the first 5 rows to get a feel for the columns
df.head(n=5)

In [69]:
# We can already see that we do not need some of these columns.
# I would drop the date and id columns up front, as they should have no impact on the prediction.

In [111]:
# Before dropping anything, explore a little more: count the missing values in each column.
df.isna().sum()

In [112]:
# Let's look at the data type of each column.
df.dtypes

In [72]:
# Great. We can already see which columns need amending for this model: price, bedrooms, bathrooms and floors.

In [105]:
# Before we start cleaning the data, let's look at the correlations.
# There is no point wasting time amending columns that we will later delete.
# Only numeric columns can be correlated: df still holds every column at this point
# (date and id are only dropped later), and pandas >= 2.0 raises on a non-numeric
# column instead of silently skipping it, so select the numeric columns explicitly.
df.select_dtypes(include="number").corr()

In [106]:
# Let's make the correlation table more legible with an annotated heatmap.
plt.figure(figsize=(15, 15))
# Correlate numeric columns only: pandas >= 2.0 raises on non-numeric columns
# instead of silently skipping them.
sns.heatmap(df.select_dtypes(include="number").corr(), cmap="BuGn", annot=True)
# The trailing semicolon keeps the matplotlib object repr out of the cell output.
plt.title("Correlation between numeric columns");

In [109]:
# Change price to an integer so we have a whole number to predict, and make bathrooms
# an integer as well. Both casts are done in a single astype call (one copy instead of two).
# Note: casting float -> int64 truncates toward zero, so for non-negative values any
# fractional part is already rounded down by the cast itself.
df = df.astype({'price': np.int64, 'bathrooms': np.int64})
# Rich display of the first rows to confirm the casts (instead of a plain-text print).
df.head()

In [77]:
# If we had any categorical (text) columns, we could one-hot encode them into numeric columns with df_new = pd.get_dummies(df, columns=['column name'])

In [102]:
# Build the inputs for the train/test split: the target (price) and the feature matrix.
# The features exclude the target itself plus the columns we do not want to model on.
# In the .py file I selected the most probable search factors, as these would be presented
# to an end user to fill in. Details such as condition or grade, although impactful on
# price, would not be as important in an initial valuation.
y = df['price']
X = df.drop(columns=['price', 'id', 'date', 'lat', 'long'])

In [103]:
# Check that the columns have been dropped correctly: a short preview shows every
# remaining column without dumping the whole frame.
X.head()

In [104]:
#print(y)

In [89]:
# Hold out part of the data so each candidate model is scored on rows it was not fitted on,
# before we choose the best model for the .py files.
import sklearn.model_selection
from sklearn.model_selection import train_test_split

# 30% of the rows go to the test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=50,
    test_size=0.3,
)



In [90]:
# Let's try each candidate model before progressing to the .py files.

# First up: linear regression.

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
linear_model = LinearRegression().fit(X_train, y_train)
# Score on the held-out test split (R^2 for scikit-learn regressors).
linear_model.score(X_test, y_test)

0.6603523350206355

In [92]:
# Second up is Decision Tree regression.

from sklearn.tree import DecisionTreeRegressor

# Fix the seed (same value as the train/test split) so the fitted tree, and therefore
# the score, is the same on every run; without it the result can change between runs.
dec_model = DecisionTreeRegressor(random_state=50)
dec_model.fit(X_train, y_train)
# Score on the held-out test split (R^2 for scikit-learn regressors).
dec_model.score(X_test, y_test)

0.5662796002447262

In [93]:
# Third up is Random Forest regression.

from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic (bootstrap samples and random feature choices), so fix the
# seed (same value as the train/test split) to make the score reproducible between runs.
random_model = RandomForestRegressor(random_state=50)
random_model.fit(X_train, y_train)
# Score on the held-out test split (R^2 for scikit-learn regressors).
random_model.score(X_test, y_test)

0.7821620708180751

In [101]:
# Finally, XGBoost regression.
# xgboost is an optional extra that may not be installed, so the import is guarded:
# without the guard a missing package raises ImportError and stops "Run All" at this cell.
try:
    from xgboost import XGBRegressor
except ImportError:
    xgb_score = None
    print("xgboost is not installed - skipping the XGBoost model")
else:
    xgb_model = XGBRegressor()
    xgb_model.fit(X_train, y_train)
    # Score on the same held-out test split as the models above.
    xgb_score = xgb_model.score(X_test, y_test)

# Display the score; nothing is shown when the model was skipped (value is None).
xgb_score

In [None]:
# I did not run the XGBoost model, as the package was not installed on my personal machine.
# Instead, I used the Random Forest regression for my project.