# Oak Wilt Radius Model

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

cluster_features_path = Path('../data/oak_wilt_cluster_features.csv')
cluster_members_path = Path('../data/oak_wilt_cluster_members.csv')

cluster_features = pd.read_csv(cluster_features_path)
cluster_members = pd.read_csv(cluster_members_path)

cluster_features['radius_ft'] = cluster_features['radius_km'] * 3280.84
cluster_features['spread_ft_per_year'] = cluster_features['spread_rate_km_per_year'] * 3280.84
cluster_features = cluster_features[cluster_features['radius_km'] >= 0.01].reset_index(drop=True)
cluster_features

Unnamed: 0,cluster_id,original_label,point_count,unique_years,start_year,end_year,year_span,annual_counts,centroid_lat,centroid_lon,...,point_density_per_km2,spread_rate_km_per_year,dominant_species,dominant_landowner,eps_m,temporal_scale_m,min_samples,min_unique_years,radius_ft,spread_ft_per_year
0,1,0,5,2,2007,2008,1,"{""2007"": 2, ""2008"": 3}",30.263611,-97.805261,...,1257.981433,0.035569,Live Oak,Unknown,50.0,20.0,5,2,116.696546,116.696546
1,2,1,24,6,2014,2019,5,"{""2014"": 5, ""2015"": 4, ""2016"": 4, ""2017"": 8, ""...",30.249808,-97.780018,...,1213.951129,0.015866,Live Oak,Private,50.0,20.0,5,2,260.26462,52.052924
2,3,2,12,2,2014,2016,2,"{""2014"": 4, ""2016"": 8}",30.386146,-97.677552,...,2216.122834,0.020758,Live Oak,Municipal - ROW,50.0,20.0,5,2,136.208379,68.104189
3,4,3,14,3,2015,2017,2,"{""2015"": 9, ""2016"": 1, ""2017"": 4}",30.200778,-97.815266,...,1103.215659,0.031778,Live Oak,Private,50.0,20.0,5,2,208.518179,104.25909
4,5,4,25,2,2015,2017,2,"{""2015"": 14, ""2017"": 11}",30.413399,-97.682038,...,3230.609809,0.024815,Live Oak,Municipal - ROW,50.0,20.0,5,2,162.831291,81.415645
5,6,5,10,2,2015,2017,2,"{""2015"": 4, ""2017"": 6}",30.414475,-97.683011,...,1139.693674,0.026424,Live Oak,Municipal - ROW,50.0,20.0,5,2,173.386806,86.693403
6,7,6,22,3,2016,2018,2,"{""2016"": 10, ""2017"": 4, ""2018"": 8}",30.229197,-97.848443,...,1794.736932,0.031232,Live Oak,Private,50.0,20.0,5,2,204.937321,102.468661
7,8,9,19,2,2017,2018,1,"{""2017"": 4, ""2018"": 15}",30.181995,-97.912295,...,13510.996948,0.021157,Live Oak,Municipal - Parkland,50.0,20.0,5,2,69.413384,69.413384
8,9,11,110,3,2017,2019,2,"{""2017"": 24, ""2018"": 45, ""2019"": 41}",30.211066,-97.798941,...,1059.176155,0.090909,Live Oak,Municipal - Parkland,50.0,20.0,5,2,596.516361,298.258181
9,10,12,5,3,2016,2018,2,"{""2016"": 1, ""2017"": 2, ""2018"": 2}",30.346969,-97.758388,...,857.246706,0.021544,Live Oak,Private,50.0,20.0,5,2,141.3651,70.68255


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

cluster_features = cluster_features.dropna(subset=['spread_rate_km_per_year']).copy()
cluster_features['log_density'] = np.log1p(cluster_features['point_density_per_km2'])
feature_cols = [
    'point_count',
    'unique_years',
    'year_span',
    'area_km2',
    'log_density',
    'spread_rate_km_per_year'
 ]
X = cluster_features[feature_cols].astype(np.float32)
y_radius = cluster_features['radius_ft'].astype(np.float32)

X_train, X_val, y_train, y_val = train_test_split(
    X, y_radius, test_size=0.2, random_state=42
 )

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
y_train = y_train.to_numpy(dtype=np.float32)
y_val = y_val.to_numpy(dtype=np.float32)

X_train_scaled.shape, X_val_scaled.shape

((8, 6), (3, 6))

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

tf.random.set_seed(42)

inputs = keras.Input(shape=(X_train_scaled.shape[1],), name='features')
x = layers.Dense(64, activation='relu')(inputs)
x = layers.Dropout(0.2)(x)
x = layers.Dense(32, activation='relu')(x)
outputs = layers.Dense(1, name='radius_ft')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='mse',
    metrics=[keras.metrics.RootMeanSquaredError(name='rmse')]
 )
model.summary()

In [4]:
callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_rmse', patience=20, restore_best_weights=True)
 ]

history = model.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=500,
    batch_size=8,
    callbacks=callbacks,
    verbose=0
 )

model.evaluate(X_val_scaled, y_val, verbose=0)

[961.8093872070312, 31.013051986694336]

In [5]:
preds_val = model.predict(X_val_scaled, verbose=0).flatten()
comparison = pd.DataFrame({
    'pred_radius_ft': preds_val,
    'true_radius_ft': y_val
}).assign(error_ft=lambda df: df['pred_radius_ft'] - df['true_radius_ft'])
comparison.head()

Unnamed: 0,pred_radius_ft,true_radius_ft,error_ft
0,129.051208,173.38681,-44.335602
1,145.121033,116.696548,28.424484
2,151.940125,141.365097,10.575027


In [6]:
comparison['abs_error_ft'] = comparison['error_ft'].abs()
rmse_ft = float(np.sqrt(np.mean(np.square(comparison['error_ft']))))
mae_ft = float(comparison['abs_error_ft'].mean())
metrics = {
    'rmse_ft': rmse_ft,
    'val_rmse_ft': float(history.history['val_rmse'][-1]),
    'mae_ft': mae_ft
}
metrics, comparison[['pred_radius_ft', 'true_radius_ft', 'abs_error_ft']].describe()

({'rmse_ft': 31.013051986694336,
  'val_rmse_ft': 37.01430892944336,
  'mae_ft': 27.778371810913086},
        pred_radius_ft  true_radius_ft  abs_error_ft
 count        3.000000        3.000000      3.000000
 mean       142.037460      143.816162     27.778372
 std         11.751892       28.424500     16.889559
 min        129.051208      116.696548     10.575027
 25%        137.086121      129.030823     19.499756
 50%        145.121033      141.365097     28.424484
 75%        148.530579      157.375954     36.380043
 max        151.940125      173.386810     44.335602)