In [11]:
import pyodbc
import pandas as pd
import geopandas as gpd
import rioxarray
import xarray as xr
import rasterio
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial import cKDTree




In [12]:

# Input files, given relative to the directory the notebook is run from.
elevation_path = r"../local_dataset/dataset/elevation_full.tif"
soil_path = r"../local_dataset/dataset/soil_full.parquet"
fire_path = r"../local_dataset/dataset/fire_full.parquet"

# Elevation raster: opened with masked=True, then reduced to the first entry
# along its leading dimension so the rest of the notebook works with a 2-D grid.
print("Loading elevation dataset...")
elevation_dataset = rioxarray.open_rasterio(elevation_path, masked=True)[0]

# Soil table: one row per (lon, lat) location with its soil attributes.
print("Loading soil dataset...")
soil_dataset = pd.read_parquet(soil_path, engine="fastparquet")

# Fire table: (lon, lat) points together with a `fire` column.
print("Loading fire dataset...")
fire_dataset = pd.read_parquet(fire_path, engine="fastparquet")




Loading elevation dataset...
Loading soil dataset...
Loading fire dataset...


In [2]:
# Quick sanity report for the three inputs.
# The raster reports shape, CRS and dtype; it has no per-column dtypes.
print("Elevation dataset shape:", elevation_dataset.shape)
print(elevation_dataset.rio.crs)
print(elevation_dataset.dtype)

# The two tabular inputs share one report layout: a three-line separator,
# then the shape, then the per-column dtypes.
for shape_label, table in (
    ("Soil dataset shape:", soil_dataset),
    ("Fire dataset shape:", fire_dataset),
):
    print("=============================\n" * 3, end='')
    print(shape_label, table.shape)
    print(table.dtypes)

Elevation dataset shape: (4392, 4965)
EPSG:4326
float32
Soil dataset shape: (7109938, 24)
lon              float64
lat              float64
COARSE             int64
SAND               int64
SILT               int64
CLAY               int64
TEXTURE_USDA     float64
TEXTURE_SOTER     object
BULK             float64
REF_BULK         float64
ORG_CARBON       float64
PH_WATER         float64
TOTAL_N          float64
CN_RATIO         float64
CEC_SOIL           int64
CEC_CLAY           int64
CEC_EFF          float64
TEB              float64
BSAT               int64
ALUM_SAT           int64
ESP                int64
TCARBON_EQ       float64
GYPSUM           float64
ELEC_COND          int64
dtype: object
Fire dataset shape: (56864, 3)
lon     float64
lat     float64
fire      int64
dtype: object


# Preprocessing before merging

In [None]:

# Start the merged table from an independent copy of the fire points.
# A plain `merged_dataset = fire_dataset` only binds a second name to the same
# DataFrame, so columns added to `merged_dataset` afterwards (e.g. `elevation`)
# would silently appear on `fire_dataset` as well and make re-runs depend on
# leftover state.
merged_dataset = fire_dataset.copy()

# NOTE(review): optional coordinate snapping, currently disabled. It rounds
# lon/lat to a regular grid of `grid_size` degrees before merging. Any output
# still shown under this cell appears to come from an earlier run in which the
# code was active -- confirm and clear it, or re-enable the code.
# def snap_to_grid(values, grid_size):
#     return np.round(values / grid_size) * grid_size

# grid_size = 0.01

# print("Unique values before snapping:")
# print(fire_dataset["lon"].nunique(), fire_dataset["lat"].nunique())

# fire_dataset["lon"] = snap_to_grid(fire_dataset["lon"], grid_size)
# fire_dataset["lat"] = snap_to_grid(fire_dataset["lat"], grid_size)

# print("Unique values after snapping:")
# print(fire_dataset["lon"].nunique(), fire_dataset["lat"].nunique())


Unique values before snapping:
14440 14002
Unique values after snapping:
1924 1775


# Merge dataset with elevation

In [15]:
# Look up the elevation for every row of the merged table.
# Both indexers are 1-D arrays over the same "points" dimension, so the raster
# is sampled once per row (one value per lon/lat pair) rather than on the full
# lon x lat outer product.
point_lon = xr.DataArray(merged_dataset["lon"], dims="points")
point_lat = xr.DataArray(merged_dataset["lat"], dims="points")

# method="nearest" picks the raster cell whose x/y coordinate is closest to each
# point. No tolerance is given, so a match is returned however far away it is.
nearest_cells = elevation_dataset.sel(x=point_lon, y=point_lat, method="nearest")
merged_dataset["elevation"] = nearest_cells.values

print("Merged dataset shape:", merged_dataset.shape)
print(merged_dataset.head())

Merged dataset shape: (56864, 4)
        lon       lat  fire  elevation
0   9.48947  31.49290     1      248.0
1   9.49053  31.49524     1      248.0
2   9.49368  31.49449     1      274.0
3   9.49154  31.49420     1      274.0
4  10.09115  36.93407     1        6.0


# Merge dataset with soil

In [16]:

# Attach soil attributes to every row of the merged table by nearest-neighbour
# lookup on the (lon, lat) coordinate values.

# Left table: the points to enrich.
merged_coords = merged_dataset[["lon", "lat"]].to_numpy()

# dataset B = grid points (right table)
soil_coords = soil_dataset[["lon", "lat"]].to_numpy()

# Build KDTree on grid (right table)
tree = cKDTree(soil_coords)
print("KDTree built.")

# Query nearest neighbor for each fire point.
# `dist` is the distance to the match, measured on the raw lon/lat values.
# It is not used in this cell but is kept available for later inspection.
dist, idx = tree.query(merged_coords, k=1)
print("Nearest neighbors queried.")

# Soil attributes without their own coordinates; the merged table keeps the
# point coordinates instead.
soil_attributes = soil_dataset.drop(columns=["lon", "lat"])

# Make the cell safe to re-run: if a previous execution already appended the
# soil columns, drop them first so they are replaced rather than duplicated.
merged_base = merged_dataset.drop(columns=soil_attributes.columns, errors="ignore")

# idx = index of nearest grid point (positional, hence .iloc).
# With k=1 it is already one-dimensional, one entry per query point.
# Both sides are re-indexed 0..n-1 so the column-wise concat aligns row by row.
merged_dataset = pd.concat([
    merged_base.reset_index(drop=True),
    soil_attributes.iloc[idx].reset_index(drop=True)
], axis=1)
print("Merged dataset with soil.")
print("Final merged dataset shape:", merged_dataset.shape)
print(merged_dataset.head())

KDTree built.
Nearest neighbors queried.
Merged dataset with soil.
Final merged dataset shape: (56864, 26)
        lon       lat  fire  elevation  COARSE  SAND  SILT  CLAY  \
0   9.48947  31.49290     1      248.0       6    90     5     5   
1   9.49053  31.49524     1      248.0       6    90     5     5   
2   9.49368  31.49449     1      274.0       6    90     5     5   
3   9.49154  31.49420     1      274.0       6    90     5     5   
4  10.09115  36.93407     1        6.0       2    17    33    50   

   TEXTURE_USDA TEXTURE_SOTER  ...  CEC_SOIL  CEC_CLAY  CEC_EFF   TEB  BSAT  \
0          12.0             C  ...         4        61      4.0   4.0    89   
1          12.0             C  ...         4        61      4.0   4.0    89   
2          12.0             C  ...         4        61      4.0   4.0    89   
3          12.0             C  ...         4        61      4.0   4.0    89   
4           3.0             F  ...        40        68     55.0  55.0   100   

   ALUM_S