# Explore predictor variables for floor height

Explore the correlation between existing floor height data, the DEM, and other building attributes.

In [9]:
import geopandas as gpd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os

In [10]:
# Show every column when a DataFrame is displayed (no truncation of wide tables)
pd.options.display.max_columns = None

In [11]:
# Surveyed floor-height points that were matched to a building footprint;
# each record carries the footprint geometry.
footprint_path = 'launceston_FFH_footprint_geometry.geojson'
df_footprint = gpd.read_file(footprint_path)

## Match to DEM

In [12]:
import rioxarray
from rioxarray.exceptions import NoDataInBounds

# 5 m DEM, stored as a GeoTIFF inside a ZIP archive.
# The folder defaults to the original author's OneDrive location; set the
# LAUNCESTON_DEM_DIR environment variable to run the notebook on another machine.
dem_dir = os.environ.get(
    'LAUNCESTON_DEM_DIR',
    '/Users/madeleineseehaber/Library/CloudStorage/OneDrive-FrontierSI/127 Residential Dwelling Floor Height/4 Executing/GA_data_documentation/Launceston DEM',
)
dempath = os.path.join(dem_dir, '5m_DEM_70570.zip')

# Load the DEM raster from the ZIP file
dem_raster = rioxarray.open_rasterio(f'zip://{dempath}!5m_DEM.tif')
dem_nodata = dem_raster.rio.nodata

# Footprints must be in the raster CRS before they can be used as clip masks
df_footprint = df_footprint.to_crs(dem_raster.rio.crs)

# Per-footprint minimum and maximum ground elevation
min_values = []
max_values = []

# Iterate over the geometries themselves rather than using the index label as a
# position, so the loop stays correct when the index is not 0..n-1.
for geom in df_footprint.geometry:
    dem_values = np.array([])
    if geom is not None and not geom.is_empty:
        try:
            # Mask the DEM raster with the polygon geometry
            dem_values = dem_raster.rio.clip([geom], drop=True).values
        except NoDataInBounds:
            # The footprint selects no DEM cell: keep dem_values empty so the
            # row gets NaN below instead of aborting the whole loop.
            pass

    # Keep valid cells only. Cells outside the polygon are NaN when the raster
    # has no usable nodata value, otherwise they carry the nodata value.
    valid = ~np.isnan(dem_values)
    if dem_nodata is not None and not np.isnan(dem_nodata):
        valid &= dem_values != dem_nodata
    dem_values = dem_values[valid]

    # NaN (not None) marks footprints without any valid cell, keeping the columns numeric
    min_values.append(float(dem_values.min()) if dem_values.size > 0 else np.nan)
    max_values.append(float(dem_values.max()) if dem_values.size > 0 else np.nan)

# Add min and max values to df_footprint
df_footprint['min_dem_5m'] = min_values
df_footprint['max_dem_5m'] = max_values



In [13]:
from rasterio.enums import Resampling
from rioxarray.exceptions import NoDataInBounds

# 5 m DEM, stored as a GeoTIFF inside a ZIP archive.
# The folder defaults to the original author's OneDrive location; set the
# LAUNCESTON_DEM_DIR environment variable to run the notebook on another machine.
dem_dir = os.environ.get(
    'LAUNCESTON_DEM_DIR',
    '/Users/madeleineseehaber/Library/CloudStorage/OneDrive-FrontierSI/127 Residential Dwelling Floor Height/4 Executing/GA_data_documentation/Launceston DEM',
)
dempath = os.path.join(dem_dir, '5m_DEM_70570.zip')

# Load the DEM raster from the ZIP file
dem_raster_5m = rioxarray.open_rasterio(f'zip://{dempath}!5m_DEM.tif')

# Scaling factor to go from 5 m to 1 m cells
scale_factor = 5

# Resample onto a grid with 5x as many rows and columns in the same CRS.
# Bilinear interpolation gives a smoother surface than repeating each 5 m cell.
dem_raster = dem_raster_5m.rio.reproject(
    dem_raster_5m.rio.crs,
    shape=(dem_raster_5m.shape[1] * scale_factor, dem_raster_5m.shape[2] * scale_factor),
    resampling=Resampling.bilinear
)
dem_nodata = dem_raster.rio.nodata

# Footprints must be in the raster CRS before they can be used as clip masks
df_footprint = df_footprint.to_crs(dem_raster.rio.crs)

# Per-footprint minimum and maximum ground elevation
min_values = []
max_values = []

# Iterate over the geometries themselves rather than using the index label as a
# position, so the loop stays correct when the index is not 0..n-1.
for geom in df_footprint.geometry:
    dem_values = np.array([])
    if geom is not None and not geom.is_empty:
        try:
            # Mask the DEM raster with the polygon geometry
            dem_values = dem_raster.rio.clip([geom], drop=True).values
        except NoDataInBounds:
            # The footprint selects no DEM cell: keep dem_values empty so the
            # row gets NaN below instead of aborting the whole loop.
            pass

    # Keep valid cells only. Cells outside the polygon are NaN when the raster
    # has no usable nodata value, otherwise they carry the nodata value.
    valid = ~np.isnan(dem_values)
    if dem_nodata is not None and not np.isnan(dem_nodata):
        valid &= dem_values != dem_nodata
    dem_values = dem_values[valid]

    # NaN (not None) marks footprints without any valid cell, keeping the columns numeric
    min_values.append(float(dem_values.min()) if dem_values.size > 0 else np.nan)
    max_values.append(float(dem_values.max()) if dem_values.size > 0 else np.nan)

# Add min and max values to df_footprint
df_footprint['min_dem_1m_from_5m'] = min_values
df_footprint['max_dem_1m_from_5m'] = max_values

In [14]:

from rioxarray.exceptions import NoDataInBounds

# Path of the 1 m DEM virtual raster (VRT) that is read below.
# The folder defaults to the original author's OneDrive location; set the
# LAUNCESTON_DEM_DIR environment variable to run the notebook on another machine.
dem_dir = os.environ.get(
    'LAUNCESTON_DEM_DIR',
    '/Users/madeleineseehaber/Library/CloudStorage/OneDrive-FrontierSI/127 Residential Dwelling Floor Height/4 Executing/GA_data_documentation/Launceston DEM',
)
vrt_path = os.path.join(dem_dir, '1m_DEM.vrt')

# Load the 1 m DEM raster from the VRT
dem_raster = rioxarray.open_rasterio(vrt_path)
dem_nodata = dem_raster.rio.nodata

# Footprints must be in the raster CRS before they can be used as clip masks
df_footprint = df_footprint.to_crs(dem_raster.rio.crs)

# Per-footprint minimum and maximum ground elevation
min_values = []
max_values = []

# Iterate over the geometries themselves rather than using the index label as a
# position, so the loop stays correct when the index is not 0..n-1.
for geom in df_footprint.geometry:
    dem_values = np.array([])
    if geom is not None and not geom.is_empty:
        try:
            # Mask the DEM raster with the polygon geometry
            dem_values = dem_raster.rio.clip([geom], drop=True).values
        except NoDataInBounds:
            # The footprint selects no DEM cell: keep dem_values empty so the
            # row gets NaN below instead of aborting the whole loop.
            pass

    # Keep valid cells only. Cells outside the polygon are NaN when the raster
    # has no usable nodata value, otherwise they carry the nodata value.
    valid = ~np.isnan(dem_values)
    if dem_nodata is not None and not np.isnan(dem_nodata):
        valid &= dem_values != dem_nodata
    dem_values = dem_values[valid]

    # NaN (not None) marks footprints without any valid cell, keeping the columns numeric
    min_values.append(float(dem_values.min()) if dem_values.size > 0 else np.nan)
    max_values.append(float(dem_values.max()) if dem_values.size > 0 else np.nan)

# Add min and max values to df_footprint
df_footprint['min_dem_1m'] = min_values
df_footprint['max_dem_1m'] = max_values


In [15]:
# Keep only records with a positive surveyed floor level (removes LCC_FLOOR == 0;
# rows where LCC_FLOOR is missing are dropped too, since NaN > 0 is False).
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not act on a slice of the unfiltered data (SettingWithCopyWarning).
df_footprint = df_footprint[df_footprint.LCC_FLOOR>0].copy()

In [16]:
# Persist the footprints together with the per-footprint DEM min/max columns
dem_range_path = 'launceston_FFH_footprint_geometry_dem_range.geojson'
df_footprint.to_file(dem_range_path)

### Compare 5m and 1m DEM

In [None]:
# Preview the first five records, including the newly added DEM columns
df_footprint.head(n=5)

In [18]:
# Elevation range (max - min) within each footprint, once per DEM variant
for dem_suffix in ('5m', '1m', '1m_from_5m'):
    df_footprint[f'range_dem_{dem_suffix}'] = (
        df_footprint[f'max_dem_{dem_suffix}'] - df_footprint[f'min_dem_{dem_suffix}']
    )

In [None]:
# Overlay the within-footprint elevation range distributions of the three DEM variants
hist_ax = df_footprint['range_dem_5m'].plot.hist(bins=50, label='range_dem_5m')
for range_col in ('range_dem_1m', 'range_dem_1m_from_5m'):
    # Semi-transparent so the histograms drawn earlier remain visible
    df_footprint[range_col].plot.hist(ax=hist_ax, bins=50, label=range_col, alpha=0.7)
hist_ax.legend()

In [None]:
# 5 m vs 1 m DEM: per-footprint minimum (first series) and maximum (second series)
for stat in ('min', 'max'):
    plt.scatter(df_footprint[f'{stat}_dem_5m'], df_footprint[f'{stat}_dem_1m'], alpha=0.6)

In [21]:
# Floor height above ground: surveyed floor level minus the lowest DEM value
# inside the footprint, once per DEM variant
for dem_suffix in ('1m', '5m', '1m_from_5m'):
    df_footprint[f'FFH_{dem_suffix}'] = df_footprint['LCC_FLOOR'] - df_footprint[f'min_dem_{dem_suffix}']

In [None]:
# Restrict to footprints whose 1 m-based floor height is positive, then compare the
# 1 m DEM range (x) against the resampled and the native 5 m DEM range (y)
positive_ffh = df_footprint[df_footprint['FFH_1m']>0]
for other_range_col in ('range_dem_1m_from_5m', 'range_dem_5m'):
    plt.scatter(positive_ffh['range_dem_1m'], positive_ffh[other_range_col], alpha=0.6)

In [None]:
# Positive 1 m-based floor heights against the 1 m DEM range within the footprint
positive_ffh = df_footprint[df_footprint['FFH_1m']>0]
plt.scatter(positive_ffh['range_dem_1m'], positive_ffh['FFH_1m'], alpha=0.6)

In [None]:
# Positive 1 m-based floor heights against the NEXIS_FLOO attribute
positive_ffh = df_footprint[df_footprint['FFH_1m']>0]
plt.scatter(positive_ffh['NEXIS_FLOO'], positive_ffh['FFH_1m'], alpha=0.6)

In [None]:
# Compare the footprint-minimum elevation from the 1 m and 5 m DEMs with the
# DEM value already stored in the source data.
# Legend labels name the plotted column (the 5 m minimum was labelled 'max_dem').
df_footprint['min_dem_1m'].plot.hist(bins=50, label='min_dem_1m')
df_footprint['min_dem_5m'].plot.hist(bins=50, alpha=0.7, label='min_dem_5m')
df_footprint['DEM'].plot.hist(bins=50, alpha=0.7, label='DEM')
plt.legend()

### Clean up categorical fields 

Convert all categorical values to lower case, then keep only the first letter of selected survey fields so that variants such as y/yes and n/no are combined.

In [None]:
# Non-numeric columns are treated as categorical text. The geometry column is
# excluded by name rather than by assuming it is the last column, so a text
# column in last position is no longer skipped by accident.
geometry_col = df_footprint.geometry.name
cate_cols = df_footprint.select_dtypes(exclude=[float, int]).columns.drop(geometry_col, errors='ignore')
print(cate_cols)
for col in cate_cols:
    # Normalise case so that e.g. 'Yes' and 'yes' become one category
    df_footprint[col] = df_footprint[col].str.lower()

In [27]:
# Keep only the first character of these survey fields so that spelling
# variants (y/yes, n/no, ...) collapse into a single category
for survey_col in ('Survey_C_1', 'Survey_Par', 'Survey_Gab'):
    df_footprint[survey_col] = df_footprint[survey_col].str.slice(0, 1)

In [None]:
# Distinct LCC_TYPE categories remaining after the clean-up
df_footprint.loc[:, 'LCC_TYPE'].unique()

## Explore regression model using different features


In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [None]:


# Target: surveyed floor level (LCC_FLOOR); every row of df_footprint is used
ffh_col = 'LCC_FLOOR'
df_r = df_footprint

# Predictors, grouped by the preprocessing they receive
cat_features = [
    'Survey_G_1', 'LCC_TYPE', 'Survey_C_1',
    'NEXIS_ROOF', 'NEXIS_WALL', 'NEXIS_CONS',
]
num_features = [
    'min_dem_1m', 'max_dem_1m', 'range_dem_1m',
    'Survey_Grd', 'Survey_Chi', 'NEXIS_FOOT', 'NEXIS_FLOO', 'LOCAL_YEAR',
]

# Feature table and target vector
X = df_r[cat_features + num_features]
y = df_r[ffh_col]

# Categorical columns -> dense one-hot indicator matrix
encoder = OneHotEncoder()
X_encoded_categorical = encoder.fit_transform(X[cat_features]).toarray()

# Numeric columns -> zero mean, unit variance
scaler = StandardScaler()
X_normalized_numeric = scaler.fit_transform(X[num_features])

# Design matrix: one-hot columns first, scaled numeric columns after them
X_encoded = np.concatenate([X_encoded_categorical, X_normalized_numeric], axis=1)

# Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training rows and predict the held-out rows
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# R^2 on the held-out rows
explained_variance = r2_score(y_test, y_pred)
print(f"Explained Variance (R^2): {explained_variance:.4f}")

# Mean score over a 5-fold cross-validation on the full design matrix
cv_scores = cross_val_score(model, X_encoded, y, cv=5)
mean_cv_score = cv_scores.mean()
print(f"Cross-Validation Accuracy (R^2): {mean_cv_score:.4f}")

# Root-mean-square error on the held-out rows
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Actual vs predicted, with the 1:1 line as reference
plt.scatter(y_test, y_pred, alpha=0.7)
one_to_one = [y_test.min(), y_test.max()]
plt.plot(one_to_one, one_to_one, 'r--')
plt.title(f'Actual vs Predicted {ffh_col}')
plt.xlabel(f'Actual {ffh_col}')
plt.ylabel(f'Predicted {ffh_col}')
plt.grid(True)

# Names in the same column order as X_encoded: one-hot names, then numeric names
importances = model.feature_importances_
feature_names = np.array(encoder.get_feature_names_out())
all_feature_names = np.concatenate((feature_names, num_features))

# Feature positions sorted from most to least important; show the top ten names
indices = np.argsort(importances)[::-1]
print(all_feature_names[indices[:10]])

**Predict LCC_FLOOR using DEM and Building attributes**

As expected, floor elevation is mostly driven by DEM

In [None]:

# Target: floor height above the lowest 1 m DEM cell; only positive heights are modelled
ffh_col = 'FFH_1m'
df_r = df_footprint[df_footprint[ffh_col]>0]

# Reduced predictor set used for this experiment
cat_features = ['Survey_G_1', 'NEXIS_ROOF', 'NEXIS_WALL', 'NEXIS_YEAR']
num_features = ['min_dem_1m', 'range_dem_1m', 'LOCAL_YEAR']

# Feature table and target vector
X = df_r[cat_features + num_features]
y = df_r[ffh_col]

# Categorical columns -> dense one-hot indicator matrix
encoder = OneHotEncoder()
X_encoded_categorical = encoder.fit_transform(X[cat_features]).toarray()

# Numeric columns -> zero mean, unit variance
scaler = StandardScaler()
X_normalized_numeric = scaler.fit_transform(X[num_features])

# Design matrix: one-hot columns first, scaled numeric columns after them
X_encoded = np.concatenate([X_encoded_categorical, X_normalized_numeric], axis=1)

# Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training rows and predict the held-out rows
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# R^2 on the held-out rows
explained_variance = r2_score(y_test, y_pred)
print(f"Explained Variance (R^2): {explained_variance:.4f}")

# Per-fold scores and their mean over a 5-fold cross-validation
cv_scores = cross_val_score(model, X_encoded, y, cv=5)
print(cv_scores)
mean_cv_score = cv_scores.mean()
print(f"Cross-Validation Accuracy (R^2): {mean_cv_score:.4f}")

# Root-mean-square error on the held-out rows
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Actual vs predicted, with the 1:1 line as reference
plt.scatter(y_test, y_pred, alpha=0.7)
one_to_one = [y_test.min(), y_test.max()]
plt.plot(one_to_one, one_to_one, 'r--')
plt.title(f'Actual vs Predicted {ffh_col}')
plt.xlabel(f'Actual {ffh_col}')
plt.ylabel(f'Predicted {ffh_col}')
plt.grid(True)

# Names in the same column order as X_encoded: one-hot names, then numeric names
importances = model.feature_importances_
feature_names = np.array(encoder.get_feature_names_out())
all_feature_names = np.concatenate((feature_names, num_features))

# Ten most important features with their importance values
indices = np.argsort(importances)[::-1]
top_ten = indices[:10]
print(all_feature_names[top_ten], importances[top_ten])

In [None]:

# Target: floor height above the lowest 5 m DEM cell; only positive heights are modelled
ffh_col = 'FFH_5m'
df_r = df_footprint[df_footprint[ffh_col]>0]

cat_features = ['Survey_G_1','LCC_TYPE', 'Survey_C_1', 'Survey_Roo', 'Survey_Wal']
# The DEM predictors come from the same 5 m DEM as the target. The copied cell used
# the *_dem_1m columns here, which made the 1 m vs 5 m comparison not like-for-like.
num_features = ['min_dem_5m', 'max_dem_5m', 'range_dem_5m', 'Survey_Grd', 'Survey_Chi', 'NEXIS_FOOT', 'NEXIS_FLOO', 'LOCAL_YEAR']

# Prepare the features and target variable
X = df_r[cat_features+num_features]
y = df_r[ffh_col]

# One-hot encode categorical variables (dense array so it can be stacked with numpy)
encoder = OneHotEncoder()
X_encoded_categorical = encoder.fit_transform(X[cat_features]).toarray()

# Normalize numeric features
scaler = StandardScaler()
X_normalized_numeric = scaler.fit_transform(X[num_features])

# Concatenate the encoded categorical features with the normalized numeric features
X_encoded = np.hstack((X_encoded_categorical, X_normalized_numeric))

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Create and fit a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions for the held-out rows
y_pred = model.predict(X_test)

# R^2 on the held-out rows
explained_variance = r2_score(y_test, y_pred)
print(f"Explained Variance (R^2): {explained_variance:.4f}")

# Per-fold scores and their mean over a 5-fold cross-validation
cv_scores = cross_val_score(model, X_encoded, y, cv=5)
print(cv_scores)
mean_cv_score = np.mean(cv_scores)
print(f"Cross-Validation Accuracy (R^2): {mean_cv_score:.4f}")

# Root-mean-square error on the held-out rows
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Actual vs predicted, with the 1:1 line as reference
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # Diagonal line
plt.title(f'Actual vs Predicted {ffh_col}')
plt.xlabel(f'Actual {ffh_col}')
plt.ylabel(f'Predicted {ffh_col}')
plt.grid(True)

# Names in the same column order as X_encoded: one-hot names, then numeric names
importances = model.feature_importances_
feature_names = np.array(encoder.get_feature_names_out())
all_feature_names = np.concatenate((feature_names, num_features))
# Sort feature importances in descending order and show the ten most important names
indices = np.argsort(importances)[::-1]

print(all_feature_names[indices][:10])

In [None]:

# Target: floor height above the lowest cell of the 5 m DEM resampled to 1 m;
# only positive heights are modelled
ffh_col = 'FFH_1m_from_5m'
df_r = df_footprint[df_footprint[ffh_col]>0]

cat_features = ['Survey_G_1','LCC_TYPE', 'Survey_C_1', 'Survey_Roo', 'Survey_Wal']
# The DEM predictors come from the same resampled DEM as the target. The copied cell
# used the native *_dem_1m columns here, which made the DEM comparison not like-for-like.
num_features = ['min_dem_1m_from_5m', 'max_dem_1m_from_5m', 'range_dem_1m_from_5m', 'Survey_Grd', 'Survey_Chi', 'NEXIS_FOOT', 'NEXIS_FLOO', 'LOCAL_YEAR']

# Prepare the features and target variable
X = df_r[cat_features+num_features]
y = df_r[ffh_col]

# One-hot encode categorical variables (dense array so it can be stacked with numpy)
encoder = OneHotEncoder()
X_encoded_categorical = encoder.fit_transform(X[cat_features]).toarray()

# Normalize numeric features
scaler = StandardScaler()
X_normalized_numeric = scaler.fit_transform(X[num_features])

# Concatenate the encoded categorical features with the normalized numeric features
X_encoded = np.hstack((X_encoded_categorical, X_normalized_numeric))

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Create and fit a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions for the held-out rows
y_pred = model.predict(X_test)

# R^2 on the held-out rows
explained_variance = r2_score(y_test, y_pred)
print(f"Explained Variance (R^2): {explained_variance:.4f}")

# Per-fold scores and their mean over a 5-fold cross-validation
cv_scores = cross_val_score(model, X_encoded, y, cv=5)
print(cv_scores)
mean_cv_score = np.mean(cv_scores)
print(f"Cross-Validation Accuracy (R^2): {mean_cv_score:.4f}")

# Root-mean-square error on the held-out rows
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Actual vs predicted, with the 1:1 line as reference
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # Diagonal line
plt.title(f'Actual vs Predicted {ffh_col}')
plt.xlabel(f'Actual {ffh_col}')
plt.ylabel(f'Predicted {ffh_col}')
plt.grid(True)

# Names in the same column order as X_encoded: one-hot names, then numeric names
importances = model.feature_importances_
feature_names = np.array(encoder.get_feature_names_out())
all_feature_names = np.concatenate((feature_names, num_features))
# Sort feature importances in descending order and show the ten most important names
indices = np.argsort(importances)[::-1]

print(all_feature_names[indices][:10])

**Predict FFH using DEM, DEM range and building attributes**

DEM range is the most important feature; 1m DEM works better than 5m.

### Check negative FFH

Negative FFH values tend to occur on steep slopes.

In [None]:

# Records whose FFH value is negative, with the columns needed to compare the
# stored DEM/FFH values against the 1 m DEM-derived ones
negative_ffh_cols = ['LID', 'ADDRESS', 'Survey_G_1', 'DEM', 'min_dem_1m', 'max_dem_1m', 'LCC_FLOOR', 'FFH', 'FFH_1m']
df_footprint.loc[df_footprint['FFH'] < 0, negative_ffh_cols]


In [None]:
# Split the footprints by the sign of the 1 m-based floor height and plot both
# groups against the DEM range within the footprint.
ffh_positive = df_footprint[df_footprint['FFH_1m']>0]
ffh_non_positive = df_footprint[df_footprint['FFH_1m']<=0]

plt.scatter(ffh_positive['range_dem_1m'], ffh_positive['FFH_1m'], alpha=0.6, label='FFH>0')
# The second group uses a <= 0 mask, so its legend entry says so (it was labelled 'FFH<0')
plt.scatter(ffh_non_positive['range_dem_1m'], ffh_non_positive['FFH_1m'], alpha=0.6, label='FFH<=0')
plt.xlabel('DEM Range within footprint')
plt.ylabel('FFH_1m')
plt.legend()