In [1]:
# ***You need to run this GEE script for your state and homerange first***
# https://code.earthengine.google.com/b5d49bb675cc2d6583e866dc1dfb440b
# Look up the GBIF taxon key for your aquatic invasive species (AIS), e.g.:
#species.name_backbone(name='Dreissena polymorpha', kingdom='animal')
my_training_state = 'MN' # Postal abbreviation of the state the environmental raster was created for; also used to build my_path.
my_nas_id = 5 # USGS NAS database species ID (e.g., 5 = Zebra Mussels; 237 = Eurasian watermilfoil; 551 = Bighead carp)
# All input rasters are read from data/<state>/.
my_path = 'data/' + my_training_state + '/'
homerange_raster = my_path + "homerange_2003_2022.tif"
invaded_raster = my_path + "inv_rsd_2003_2022.tif"
my_countries = ["RU", "UA", "BG", "RO", "GE", "AZ", "TM", "KZ"] # Country codes of the endemic (home) range of your taxon; one GBIF query is issued per code.
my_taxon = 2287072  # GBIF taxon key. Eurasian watermilfoil = 2362486; Zebra mussels = 2287072 (presumably, since it pairs with my_nas_id = 5 -- verify with species.name_backbone)
limit = 10000 # Maximum GBIF records requested per country. NOTE(review): the GBIF search API is documented to serve at most 300 records per request -- confirm gbif_api_call pages through results if more are needed.
my_scale = 1000  # Default `resolution` (raster cell size, in CRS units) used by export_mess.

In [2]:
#Import required packages
import pandas as pd
import numpy as np
import geopandas as gpd
import json
import requests
import glob
from pygbif import occurrences as occ 
from pygbif import species
import rasterio
from matplotlib import pyplot as plt
from rasterio.features import rasterize
from rasterio.mask import mask
from shapely.geometry import Point

# Functions
def gbif_api_call(taxon, country, limit):
    """Fetch GBIF occurrences for a given taxon and country.

    The GBIF occurrence search endpoint serves at most 300 records per
    request, so a single call with a large ``limit`` silently returns only
    the first 300. This function pages through the results with ``offset``
    until ``limit`` records have been collected or GBIF reports
    ``endOfRecords``.

    Parameters:
    - taxon (int | str): GBIF taxon key.
    - country (str): Two-letter country code (e.g. "RU").
    - limit (int): Maximum total number of records to return.

    Returns:
    - dict: JSON payload of the last page fetched, with its "results" list
      replaced by the records accumulated over all pages (so
      ``payload["results"]`` / ``payload.get("results", [])`` keep working).

    Raises:
    - requests.HTTPError: If GBIF answers with an error status code.
    """
    URL_BASE = 'https://api.gbif.org/v1/'
    PAGE_SIZE = 300  # hard per-request cap of the GBIF occurrence search API

    records = []
    payload = {}
    while len(records) < limit:
        params = {
            "taxonKey": taxon,
            "country": country,
            "limit": min(PAGE_SIZE, limit - len(records)),
            "offset": len(records),
        }
        # `params` lets requests URL-encode the values instead of string-building the query.
        response = requests.get(f"{URL_BASE}occurrence/search", params=params, timeout=30)
        response.raise_for_status()
        payload = response.json()

        page = payload.get("results", [])
        records.extend(page)
        # Stop on the last page; an empty page also guards against an endless loop.
        if not page or payload.get("endOfRecords", True):
            break

    payload["results"] = records
    return payload

def nas_api_call(nas_id, state):
    """Fetch USGS NAS occurrence records for one species in one state.

    Parameters:
    - nas_id (int): Species ID in the USGS Nonindigenous Aquatic Species database.
    - state (str): Postal abbreviation of the state to query (e.g. "MN").

    Returns:
    - DataFrame with one row per record in the response's "results" list.

    Raises:
    - requests.HTTPError: If the NAS API answers with an error status code.
    - requests.Timeout: If the server does not respond within the timeout.
    """
    URL_BASE = 'https://nas.er.usgs.gov/api/v2/'
    # Query the state that was passed in, not the notebook-level global.
    params = {"species_ID": nas_id, "state": state}
    # A finite timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(f"{URL_BASE}occurrence/search", params=params, timeout=60)
    response.raise_for_status()
    results = pd.json_normalize(response.json(), 'results')
    return results


def sample_multiband_geotiff_with_names(raster_path, gdf):
    """
    Samples a multi-band GeoTIFF at specified point locations from a GeoDataFrame,
    using band names from the raster.

    The input GeoDataFrame is never modified: the band columns are added to a
    reprojected copy (when the CRS differs from the raster's) or to a plain copy
    (when it already matches).

    Parameters:
    - raster_path (str): Path to the GeoTIFF file.
    - gdf (GeoDataFrame): GeoDataFrame containing point geometries.

    Returns:
    - GeoDataFrame in the raster's CRS with one additional column per band.
      Columns are named after the raster band descriptions; if any description
      is missing, all bands fall back to "band_1", "band_2", ...
    """

    with rasterio.open(raster_path) as src:
        # Work on a new object in the raster's CRS so the caller's frame stays untouched.
        if gdf.crs != src.crs:
            sampled_gdf = gdf.to_crs(src.crs)
        else:
            sampled_gdf = gdf.copy()

        # (x, y) map coordinates of every point, in the raster's CRS
        coords = [(geom.x, geom.y) for geom in sampled_gdf.geometry]

        # One sequence of per-band values for each point
        sampled_values = list(src.sample(coords))

        # Use the band descriptions only when every band has one
        if all(src.descriptions):
            band_names = src.descriptions
        else:
            band_names = [f"band_{i+1}" for i in range(src.count)]

        for band_idx, band_name in enumerate(band_names):
            sampled_gdf[band_name] = [val[band_idx] for val in sampled_values]

    return sampled_gdf

def filter_dataframe_columns(df, feature_choices):
    """Return `df` restricted to the columns listed in `feature_choices`.

    The "geometry" column is always kept when present, and the original
    column order of `df` is preserved.
    """
    kept = []
    for name in df.columns:
        if name in feature_choices or name == "geometry":
            kept.append(name)
    return df[kept]

def extract_fields(data):
    """Reduce GBIF occurrence records to the handful of fields used downstream.

    Each record (a dict) becomes a new dict holding only the keys below;
    a key that is absent from the record is filled with None.
    """
    wanted = (
        'key',
        'species',
        'decimalLatitude',
        'decimalLongitude',
        'countryCode',
        'year',
    )
    return [{field: record.get(field) for field in wanted} for record in data]

def MESS(ref_df, pred_df):
    """
    Multivariate Environmental Similarity Surface (MESS) of `pred_df` against `ref_df`.

    For every predictor column and every row of `pred_df`, a similarity score is
    derived from the fraction `f` of reference values strictly below the
    prediction value:

    - f == 0      : (value - ref_min) / (ref_max - ref_min)   (<= 0, below the reference range)
    - 0 < f <= .5 : 2 * f
    - .5 < f < 1  : 2 * (1 - f)
    - f == 1      : (ref_max - value) / (ref_max - ref_min)   (< 0, above the reference range)

    Scores are on a fraction scale (at most 1); negative values mean the
    prediction value lies outside the range sampled by the reference data.
    The MESS score of a row is its smallest per-predictor similarity.

    Parameters:
    - ref_df (DataFrame): Reference sample; a "geometry" column, if any, is ignored.
      Must contain every predictor column of `pred_df`.
    - pred_df (DataFrame): Rows to score. "geometry" and "predID" columns, if
      present, are not treated as predictors.

    Returns:
    - DataFrame indexed like `pred_df` with columns "MESS_Score" and
      "Least_Similar_Variable". If `pred_df` has a "geometry" column the result
      is a GeoDataFrame carrying that geometry, in the geometry's CRS
      (EPSG:5070 is assumed when the geometry has no CRS).

    Raises:
    - ValueError: If `ref_df` has no rows.
    """
    # Keep geometry (and its CRS) aside; only numeric predictors enter the calculation.
    geometry = None
    crs = None
    if "geometry" in pred_df.columns:
        geometry = pred_df["geometry"].copy()
        crs = getattr(geometry, "crs", None)
    pred_numeric = pred_df.drop(columns=["geometry", "predID"], errors="ignore")
    ref_numeric = ref_df.drop(columns=["geometry"], errors="ignore")

    if len(ref_numeric) == 0:
        raise ValueError("ref_df has no rows; cannot compute MESS.")

    def calculate_s(column):
        ref_values = np.sort(ref_numeric[column].to_numpy(dtype=float))
        points = pred_numeric[column].to_numpy(dtype=float)
        n_ref = ref_values.size
        lo = np.nanmin(ref_values)
        hi = np.nanmax(ref_values)

        # Number of reference values strictly below each point. On the sorted
        # array this is the left insertion index, which replaces the original
        # per-pixel scan of the whole reference sample.
        n_below = np.searchsorted(ref_values, points, side="left")
        frac_below = n_below / n_ref

        # A constant reference variable (hi == lo) yields inf/NaN for
        # out-of-range points, as plain division would; just silence the warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            below_range = (points - lo) / (hi - lo)
            above_range = (hi - points) / (hi - lo)

        # frac_below is a fraction in [0, 1], so the branch limits are 0.5 and 1
        # (expressed on the integer counts to avoid float comparisons).
        sims = np.select(
            [n_below == 0, 2 * n_below <= n_ref, n_below < n_ref],
            [below_range, 2 * frac_below, 2 * (1 - frac_below)],
            default=above_range,
        )
        sims[np.isnan(points)] = np.nan  # a missing predictor value has no similarity
        return sims

    # Keep pred_df's index so scores line up with the geometry reattached below
    # (pred_df typically has gaps in its index after dropna()).
    sim_df = pd.DataFrame(
        {column: calculate_s(column) for column in pred_numeric.columns},
        index=pred_numeric.index,
    )

    min_similarity = sim_df.min(axis=1)  # Least similar predictor's score
    least_similar = sim_df.idxmin(axis=1)  # Least similar predictor's name

    result = pd.concat([min_similarity, least_similar], axis=1)
    result.columns = ["MESS_Score", "Least_Similar_Variable"]

    if geometry is not None:
        result["geometry"] = geometry
        result = gpd.GeoDataFrame(
            result, geometry="geometry", crs=crs if crs is not None else 5070
        )

    return result

def export_mess(joined_gdf: gpd.GeoDataFrame, resolution: int = my_scale):
    """
    Rasterize the "MESS_Score" of a GeoDataFrame, write it as a GeoTIFF, and plot it.

    The frame is reprojected to EPSG:5070 if needed. The output grid is padded by
    half a cell around the data bounds so that points on the bounding box
    (e.g. pixel centres of the source raster) land inside a cell instead of on or
    beyond the raster edge. The caller's GeoDataFrame is not modified.

    Parameters:
    - joined_gdf (GeoDataFrame): Geometries with a numeric "MESS_Score" column.
    - resolution (int): Cell size of the output raster in CRS units.

    Output:
    - Writes "<my_path><my_training_state>_MESS_Score.tif" (single float32 band;
      cells without data and missing scores are written as 0) and shows a plot.

    Raises:
    - ValueError: If the frame is empty, has no CRS, or `resolution` is not positive.
    - KeyError: If the "MESS_Score" column is missing.
    """
    column_name = "MESS_Score"
    if column_name not in joined_gdf.columns:
        raise KeyError(f"Column '{column_name}' is missing from the GeoDataFrame!")
    if joined_gdf.empty:
        raise ValueError("Cannot export an empty GeoDataFrame.")
    if joined_gdf.crs is None:
        raise ValueError("GeoDataFrame has no CRS; set one before exporting.")
    if resolution <= 0:
        raise ValueError(f"Invalid raster resolution: {resolution}")

    # Ensure CRS is projected (use EPSG:5070 or appropriate for your region)
    if joined_gdf.crs.to_epsg() != 5070:
        joined_gdf = joined_gdf.to_crs(epsg=5070)

    # Get bounds
    bounds = joined_gdf.total_bounds  # [minx, miny, maxx, maxy]
    print(f"Bounds in projected CRS: {bounds}")
    minx, miny, maxx, maxy = bounds

    # Shift the grid origin out by half a cell and size the grid so that it
    # covers maxx / miny as well; without the padding the easternmost column
    # and southernmost row of points fall outside the raster.
    half_cell = resolution / 2
    left = minx - half_cell
    top = maxy + half_cell
    width = int(np.floor((maxx - minx) / resolution + 0.5)) + 1
    height = int(np.floor((maxy - miny) / resolution + 0.5)) + 1
    transform = rasterio.transform.from_origin(left, top, resolution, resolution)

    # Numeric scores on a separate Series, so the input frame is left untouched.
    scores = joined_gdf[column_name].fillna(0).astype(float)

    # Prepare (geometry, value) pairs for rasterization
    shapes = list(zip(joined_gdf.geometry, scores))

    # Create raster
    raster = rasterize(
        shapes=shapes,
        out_shape=(height, width),
        transform=transform,
        fill=0,
        dtype=np.float32
    )

    # Save to file
    output_filename = f"{my_path}{my_training_state}_{column_name}.tif"
    with rasterio.open(
        output_filename, "w",
        driver="GTiff",
        height=height,
        width=width,
        count=1,
        dtype=rasterio.float32,
        crs=joined_gdf.crs,  # Use the same projected CRS
        transform=transform
    ) as dst:
        dst.write(raster, 1)
        dst.set_band_description(1, column_name)

    # Plot the raster using the true extent of the padded grid
    right = left + width * resolution
    bottom = top - height * resolution
    plt.figure(figsize=(10, 6))
    plt.imshow(raster, cmap="viridis", extent=[left, right, bottom, top])
    plt.colorbar(label=f'{column_name}')
    plt.title('Rasterized MESS')
    plt.xlabel('X (meters)')
    plt.ylabel('Y (meters)')
    plt.show()

In [3]:
# Collect GBIF occurrences of the taxon from every home-range country
gbif_result = []
for country in my_countries:
    result = gbif_api_call(my_taxon, country, limit)
    gbif_result.extend(result.get("results", []))  # Append results directly

if not gbif_result:
    raise ValueError("GBIF returned no occurrences for the configured taxon and countries.")

# Keep only complete records (any missing field, including a coordinate, drops the row)
homerange_records = pd.DataFrame(extract_fields(gbif_result)).dropna()

# GBIF publishes decimalLatitude/decimalLongitude in WGS84 (EPSG:4326);
# reproject to EPSG:5070 to match the rest of the notebook.
homerange_points = gpd.GeoDataFrame(
    homerange_records,
    geometry=gpd.points_from_xy(homerange_records.decimalLongitude, homerange_records.decimalLatitude),
    crs=4326,
).to_crs(5070)

In [6]:
# Load raster dataset (assume multiband raster where each band is a predictor)
raster_path = invaded_raster
with rasterio.open(raster_path) as src:
    out_image = src.read()  # Read all bands without masking
    meta = src.meta  # Store metadata for later use
    transform = src.transform  # Affine transform for georeferencing
    raster_crs = src.crs  # CRS the pixel coordinates below are expressed in

    # Extract band names or fallback to generic names
    band_names = [src.descriptions[i] if src.descriptions and src.descriptions[i] else f"Band_{i+1}" 
                  for i in range(src.count)]
    print("Extracted Band Names:", band_names)  # Debugging step

# Convert extracted raster data to a DataFrame
bands, height, width = out_image.shape
pixels = out_image.reshape(bands, -1).T  # Flatten to (num_pixels, num_bands)
pred_data = pd.DataFrame(pixels, columns=band_names)

# Handle NoData values (if applicable)
if meta.get("nodata") is not None:
    pred_data = pred_data.replace(meta["nodata"], np.nan)

# Generate pixel-centre coordinates for each pixel (row-major, matching `pixels`)
row_indices, col_indices = np.indices((height, width))
x_coords, y_coords = rasterio.transform.xy(transform, row_indices.flatten(), col_indices.flatten())

# Create point geometries in one vectorized call instead of one Point per pixel
geometries = gpd.points_from_xy(x_coords, y_coords)

# Convert DataFrame to GeoDataFrame, labelled with the raster's own CRS
# (EPSG:5070 is only assumed when the file carries no CRS).
my_pred_data = gpd.GeoDataFrame(
    pred_data,
    geometry=geometries,
    crs=raster_crs.to_wkt() if raster_crs is not None else 5070,
).reset_index().rename(columns ={'index':'predID'})

# The rest of the notebook works in EPSG:5070; reproject only if the raster differs.
if my_pred_data.crs.to_epsg() != 5070:
    my_pred_data = my_pred_data.to_crs(5070)

Extracted Band Names: ['NDTI', 'NDBI', 'NDCI', 'NDVI', 'GPP_Annual', 'GPP_Summer', 'Precip_Winter', 'Precip_Spring', 'Precip_Summer', 'Precip_Fall', 'Heat_Insolation', 'Topo_Diversity', 'gHM', 'NDSI', 'Flashiness', 'Runoff', 'Drawdown', 'LST_Annual', 'LST_Summer', 'LST_Winter', 'LST_Spring', 'LST_Fall']


In [8]:
# Columns carried into the MESS calculation: the chosen predictor bands plus the
# bookkeeping columns 'geometry' and 'predID'. Of the 22 raster bands,
# GPP_Annual and LST_Annual are not selected.
feature_choices = [
    'NDBI', 'NDTI', 'NDSI', 'NDCI',
    'GPP_Summer', 'gHM',
    'Heat_Insolation', 'Topo_Diversity', 'Flashiness',
    'LST_Summer', 'LST_Winter',
    'NDVI',
    'LST_Spring', 'LST_Fall',
    'Precip_Winter', 'Precip_Spring', 'Precip_Summer', 'Precip_Fall',
    'Drawdown', 'Runoff',
    'geometry', 'predID',
]

# Reference sample: home-range raster values at the GBIF occurrence points
ref_data = sample_multiband_geotiff_with_names(homerange_raster, homerange_points)
my_ref_data = filter_dataframe_columns(ref_data, feature_choices)
my_ref_data = my_ref_data.dropna()

# Prediction sample: invaded-range pixels, restricted to the same columns
my_pred_data = filter_dataframe_columns(my_pred_data, feature_choices)
my_pred_data = my_pred_data.dropna()

In [None]:
# Score every row of the prediction sample against the home-range reference sample.
my_mess = MESS(my_ref_data, my_pred_data)
# Drop rows with any missing value before rasterizing.
my_mess_clean = my_mess.dropna()
# Rasterize MESS_Score, write the GeoTIFF under my_path, and plot the result.
export_mess(my_mess_clean)