In [2]:
# Import libraries
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import osmnx as ox 
import networkx as nx
import folium


# Load the Seattle house-price table from CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
house_prices_csv = "C:\\Users\\lucin\\OneDrive\\Desktop\\GeospatialDataScience\\geospatial-data-science\\labs\\lab5\\lab5data\\seattle_house_prices.csv"
df = pd.read_csv(house_prices_csv)

In [13]:
from shapely.geometry.polygon import Polygon
from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry import LineString, MultiLineString

In [3]:
# Build one point per house from its longitude/latitude columns and attach
# the points to the table as the geometry column
house_points = gpd.points_from_xy(x=df['long'], y=df['lat'])
gdf = gpd.GeoDataFrame(df, geometry=house_points)

# Declare the coordinates as EPSG:4326 (WGS84 longitude/latitude)
gdf = gdf.set_crs(crs=4326, allow_override=True)

# Projected copy in UTM zone 10N (EPSG:32610) for the rest of the analysis
gdf_utm = gdf.to_crs(crs='EPSG:32610')

### Question 1

1. There are _19451_ houses in the dataset.

2. There are 7 features for predicting house price.

3. There are no null values in any column of the dataset.

In [4]:
# Print the column summary (dtypes and non-null counts).
# DataFrame.info() writes to stdout and returns None, so its result is not
# worth storing; call it for the printout only.
gdf.info()

# Keep an explicit per-column count of missing values so the "no nulls"
# answer can be checked programmatically (every entry should be 0).
null_gdf = gdf.isna().sum()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 19451 entries, 0 to 19450
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   price        19451 non-null  int64   
 1   bedrooms     19451 non-null  int64   
 2   bathrooms    19451 non-null  float64 
 3   sqft_living  19451 non-null  int64   
 4   sqft_lot     19451 non-null  int64   
 5   yr_built     19451 non-null  int64   
 6   lat          19451 non-null  float64 
 7   long         19451 non-null  float64 
 8   geometry     19451 non-null  geometry
dtypes: float64(3), geometry(1), int64(5)
memory usage: 1.3 MB


4. sqft_living (correl = 0.702), bathrooms (correl = 0.524), and bedrooms (correl = 0.316) are the features most correlated with housing price.

5. sqft_lot (correl = 0.090), yr_built (correl = 0.052), and long (correl = 0.020) are the features least correlated with housing price.

In [5]:
# Compute the correlation matrix over the numeric columns only.
# The geometry column is not numeric; older pandas dropped such columns from
# corr() silently, but pandas 2.x no longer does, so select them explicitly.
corr_matrix = gdf_utm.select_dtypes(include='number').corr()

# Display just the correlations with house price, strongest first
corr_matrix["price"].sort_values(ascending=False)

price          1.000000
sqft_living    0.702296
bathrooms      0.524395
bedrooms       0.315804
lat            0.308082
sqft_lot       0.090125
yr_built       0.052453
long           0.020092
Name: price, dtype: float64

### Question 2

#### Add extra features

In [46]:
# Specify type of data: OSM features tagged as universities
tags = {'amenity': 'university'}

# Download the university features for Seattle from OSM.
# Newer OSMnx releases expose this as features_from_place and drop
# geometries_from_place, so prefer the new name and fall back to the old one.
download_features = getattr(ox, 'features_from_place', None) or ox.geometries_from_place
colleges = download_features('Seattle, WA, USA', tags)

# Reproject to UTM 10N so centroids are computed in a projected CRS
colleges = colleges.to_crs('EPSG:32610')

# Get one representative point per university. GeoSeries.centroid handles
# every geometry type (a Point's centroid is the point itself), so no
# per-type branching is needed and every row ends up with .x / .y.
# The points are converted back to WGS84 because folium expects lat/lon.
colleges['centroid'] = colleges.geometry.centroid.to_crs('EPSG:4326')

# Interactive map centred on Seattle
m = folium.Map(location=[47.6, -122.348], zoom_start=11)

# One marker per university, labelled with its OSM name
for college_name, point in zip(colleges['name'], colleges['centroid']):
    my_string = 'name: {}'.format(college_name)
    folium.Marker([point.y, point.x], popup=my_string).add_to(m)
m

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))


#### Create model

In [6]:
# Predictor columns used by the model
feature_list = [
    'sqft_living',
    'bathrooms',
    'bedrooms',
    'lat',
    'sqft_lot',
    'yr_built',
    'long',
]

# Feature matrix and target vector
y = gdf_utm['price']
X = gdf_utm[feature_list]

# Standardize every feature to zero mean and unit variance.
# NOTE(review): the scaler is fit on all rows here, i.e. before any
# train/test split is made.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [7]:
# Split the raw (unscaled) features first: 80% train, 20% test, fixed seed
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both sets.
# Fitting on all rows before splitting would let test-set means and variances
# leak into the training features.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [8]:
# Define model: a random forest of 30 trees.
# random_state is fixed so that the bootstrap samples and feature choices,
# and therefore the reported RMSE, are identical on every re-run.
forest_reg = RandomForestRegressor(n_estimators=30, random_state=42)

# Fit model on the training split
forest_reg.fit(X_train, y_train)

RandomForestRegressor(n_estimators=30)

In [9]:
# Predict house prices for the held-out test rows
predictions = forest_reg.predict(X_test)

# Score the predictions: mean squared error, then its square root (RMSE),
# which is in the same units as price
final_mse = mean_squared_error(y_true=y_test, y_pred=predictions)
final_rmse = np.sqrt(final_mse)

# Show the RMSE as the cell output
final_rmse

157146.06863451775

#### Lucy's notepad:
##### Data to include:
* Parks
* Walkability
* Neighborhood
* Distance to beaches
* Distance to downtown
* Distance to university
