### This notebook demonstrates the basic workflow for training a machine learning model on the invasive species dataset and hyperspectral data. 

- Step 1: Load the data
- Step 2: Preprocess the data (extract spectral signatures)
- Step 3: Train the model
- Step 4: Evaluate the model
- Step 5: Make predictions
- Step 6: Visualize the results
- Step 7: Save the model
- Step 8: Load the model
- Step 9: Make predictions on new data
- Step 10: Visualize the results

In [1]:
import xarray as xr
import xvec
import matplotlib.pyplot as plt
import geopandas as gpd
from dask.diagnostics import ProgressBar

In [2]:
# Step 1 - Load the data

# Load hyperspectral data
# This is a cleaned version of the data, after removing the atmospheric contaminated regions.
ds = xr.open_zarr('/mnt/hdd1/fran/fran_sep2018_clean.zarr')
ds

Unnamed: 0,Array,Chunk
Bytes,60.66 GiB,3.85 MiB
Shape,"(202, 16310, 9885)","(202, 10, 1000)"
Dask graph,16310 chunks in 2 graph layers,16310 chunks in 2 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 60.66 GiB 3.85 MiB Shape (202, 16310, 9885) (202, 10, 1000) Dask graph 16310 chunks in 2 graph layers Data type int16 numpy.ndarray",9885  16310  202,

Unnamed: 0,Array,Chunk
Bytes,60.66 GiB,3.85 MiB
Shape,"(202, 16310, 9885)","(202, 10, 1000)"
Dask graph,16310 chunks in 2 graph layers,16310 chunks in 2 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray


In [3]:
# Load invasive species point data
gdf = gpd.read_file('/home/geethen/invasives/hypinvalimap/data/aliens_sep2018.shp')
gdf.head()

Unnamed: 0,fid,class,group,layer,path,geometry
0,1.0,0,2,point,/Users/glennmoncrieff/Documents/qgis/point.gpk...,POINT (19.16467 -33.99555)
1,2.0,0,2,point,/Users/glennmoncrieff/Documents/qgis/point.gpk...,POINT (19.14254 -33.99885)
2,3.0,0,2,point,/Users/glennmoncrieff/Documents/qgis/point.gpk...,POINT (19.15589 -33.98977)
3,4.0,0,2,point,/Users/glennmoncrieff/Documents/qgis/point.gpk...,POINT (19.13976 -33.99951)
4,5.0,0,2,point,/Users/glennmoncrieff/Documents/qgis/point.gpk...,POINT (19.15018 -33.99258)


In [4]:
import xarray as xr
import matplotlib.pyplot as plt
import random
import numpy as np
import leafmap.foliumap as leafmap
from pyproj import Transformer
import folium
import geopandas as gpd

def plot_spectral_and_map(ds, gdf=None, crs_from='EPSG:32610', crs_to='EPSG:4326'):
    """
    Plot points on a map with the extent of `ds` using a bounding box and optionally visualize a GeoDataFrame.

    Parameters:
    ds: xarray Dataset containing spectral data
    gdf: GeoDataFrame to visualize on the map, colored by the 'class' column
    crs_from: The CRS of your input coordinates (default UTM Zone 10N)
    crs_to: The target CRS (default WGS84)
    """
    # Convert coordinates to lat/lon
    transformer = Transformer.from_crs(crs_from, crs_to, always_xy=True)
    
    # Create leafmap
    m = leafmap.Map(
        zoom=15,
        draw_control=False,
        measure_control=False,
        fullscreen_control=False,
        basemap='Esri.WorldImagery'
    )
    
    # Add satellite basemap
    m.add_basemap('SATELLITE')
    m.add_basemap('Esri.WorldImagery')
    m.add_basemap('ESA WorldCover 2021')
    
    # Add a bounding box around the dataset in red
    x_min, x_max = ds.x.min(), ds.x.max()
    y_min, y_max = ds.y.min(), ds.y.max()

    # Convert bounding box corners to lat/lon
    lon_min, lat_min = transformer.transform(x_min, y_min)
    lon_max, lat_max = transformer.transform(x_max, y_max)

    # Create a rectangle for the bounding box
    folium.Rectangle(
        bounds=[[lat_min, lon_min], [lat_max, lon_max]],
        color='red',
        weight=1,
        fill=False
    ).add_to(m)
    
    # If a GeoDataFrame is provided
    if gdf is not None:
        # Ensure the GeoDataFrame is in the target CRS
        if gdf.crs != crs_to:
            gdf = gdf.to_crs(crs_to)
        
        # Generate unique colors for each class
        classes = gdf['class'].unique()
        colors = {cls: f"#{random.randint(0, 0xFFFFFF):06x}" for cls in classes}
        
        # Add points to the map
        for _, row in gdf.iterrows():
            point_coords = row.geometry.coords[0]
            folium.CircleMarker(
                location=(point_coords[1], point_coords[0]),
                radius=5,
                color=colors[row['class']],
                fill=True,
                fill_color=colors[row['class']],
                fill_opacity=0.7,
                popup=f"Class: {row['class']}"
            ).add_to(m)
        
        # Center map on the centroid of the GeoDataFrame's bounding box
        bbox_centroid = gdf.geometry.unary_union.centroid
        m.set_center(bbox_centroid.x, bbox_centroid.y, zoom=12)
    
    # Display both plots
    plt.tight_layout()
    plt.show()
    display(m)

# Example usage:
plot_spectral_and_map(ds, gdf=gdf, crs_from='EPSG:22234')


<Figure size 640x480 with 0 Axes>

### Step 2: Preprocess data (extract spectral signatures at points)

In [None]:
def extract_points(ds, points):
    """
    Extracts data values at specified points from a locally stored dataset.

    Parameters:
    - ds: str, path to the dataset (e.g., netCDF or GeoTIFF).
    - points: GeoDataFrame, point locations to extract data.

    Returns:
    - DataFrame containing extracted data values and point indices.
    """
    # Extract data at points
    extracted = ds.xvec.extract_points(
        points['geometry'], 
        x_coords="x", 
        y_coords="y", 
        index=True
    )
    
    return extracted

df = extract_points(ds, gdf)
df.head()

Unnamed: 0,Array,Chunk
Bytes,50 B,50 B
Shape,"(5, 5)","(5, 5)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 50 B 50 B Shape (5, 5) (5, 5) Dask graph 1 chunks in 5 graph layers Data type int16 numpy.ndarray",5  5,

Unnamed: 0,Array,Chunk
Bytes,50 B,50 B
Shape,"(5, 5)","(5, 5)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray


In [6]:
# Add class and group columns to spectral signatures
class_xr = gdf[['class','group']].to_xarray()
ds = ds.merge(class_xr.astype(int),join='left')
ds

Unnamed: 0,Array,Chunk
Bytes,60.66 GiB,3.85 MiB
Shape,"(202, 16310, 9885)","(202, 10, 1000)"
Dask graph,16310 chunks in 2 graph layers,16310 chunks in 2 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 60.66 GiB 3.85 MiB Shape (202, 16310, 9885) (202, 10, 1000) Dask graph 16310 chunks in 2 graph layers Data type int16 numpy.ndarray",9885  16310  202,

Unnamed: 0,Array,Chunk
Bytes,60.66 GiB,3.85 MiB
Shape,"(202, 16310, 9885)","(202, 10, 1000)"
Dask graph,16310 chunks in 2 graph layers,16310 chunks in 2 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray


In [7]:
with ProgressBar():
    dsp = ds.persist()

[########################################] | 100% Completed | 351.46 s


In [9]:
dsp

Unnamed: 0,Array,Chunk
Bytes,60.66 GiB,3.85 MiB
Shape,"(202, 16310, 9885)","(202, 10, 1000)"
Dask graph,16310 chunks in 1 graph layer,16310 chunks in 1 graph layer
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 60.66 GiB 3.85 MiB Shape (202, 16310, 9885) (202, 10, 1000) Dask graph 16310 chunks in 1 graph layer Data type int16 numpy.ndarray",9885  16310  202,

Unnamed: 0,Array,Chunk
Bytes,60.66 GiB,3.85 MiB
Shape,"(202, 16310, 9885)","(202, 10, 1000)"
Dask graph,16310 chunks in 1 graph layer,16310 chunks in 1 graph layer
Data type,int16 numpy.ndarray,int16 numpy.ndarray


In [8]:
dsp.head()

Unnamed: 0,Array,Chunk
Bytes,250 B,250 B
Shape,"(5, 5, 5)","(5, 5, 5)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 250 B 250 B Shape (5, 5, 5) (5, 5, 5) Dask graph 1 chunks in 2 graph layers Data type int16 numpy.ndarray",5  5  5,

Unnamed: 0,Array,Chunk
Bytes,250 B,250 B
Shape,"(5, 5, 5)","(5, 5, 5)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray


In [14]:
dtrain = dsp.where(dsp['group']==1,drop=True)
dtest = dsp.where(dsp['group']==2,drop=True)

#create separte datasets for labels and features
y_train = dtrain['class'].values.astype(int)
y_test = dtest['class'].values.astype(int)
X_train = dtrain['reflectance'].values
X_test = dtest['reflectance'].values

: 

In [13]:
dsp.head().xvec.to_geodataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,reflectance,class,group
wl,x,y,index,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.40661,313990.11,6266452.947,0,381,0,2
0.40661,313990.11,6266452.947,1,381,0,2
0.40661,313990.11,6266452.947,2,381,0,2
0.40661,313990.11,6266452.947,3,381,0,2
0.40661,313990.11,6266452.947,4,381,0,2
...,...,...,...,...,...,...
0.43367,314002.11,6266440.947,0,848,0,2
0.43367,314002.11,6266440.947,1,848,0,2
0.43367,314002.11,6266440.947,2,848,0,2
0.43367,314002.11,6266440.947,3,848,0,2


In [17]:
import hvplot.xarray
import warnings
warnings.filterwarnings('ignore')
hvplot.extension('bokeh')

# Sanity Check (plot spectral signatures for a class)
dsp_plot = dsp.where(dsp['class']==5, drop=True)
h = dsp_plot['reflectance'].hvplot.line(x='wavelength',by='index',
                                    color='green',ylim=(0,0.5),alpha=0.5,legend=False)
h

: 