# Set the Working Directory

In [3]:
from pyhere import here
import os

# Switch to the directory returned by pyhere's here() so that the relative
# data paths used throughout the notebook resolve from one fixed location.
os.chdir(here())

# Load Libraries

In [4]:
import elapid as ela
import geopandas as gpd
import pandas as pd
import xarray as xr
import rioxarray as rxr
from sdm.geo import generate_model_raster
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from elapid import MaxentModel, GeographicKFold, distance_weights
from sdm.maxent import prepare_occurence_data, filter_bats, extract_split, cv_maxent, train_maxent, filter_gdf_to_grid
from sdm.geo import model_point_grid

from pathlib import Path

from sdm.features import load_evs, interpolate_nas, calculate_multiscale_variables

# Grid

I only want to keep one point for each species in each 100m grid square. I'm going to do this by loading the model raster, a null raster on which all the predictors are modelled. I'll convert it to a GeoDataFrame and then take an inner spatial join with the nearest points, using a threshold of 100m.

# Bat Records & Background Points


In [5]:
# Load the tidied bat records and preview the first rows
bats = gpd.read_file('data/processed/bats-tidy.geojson')
bats.head()

Unnamed: 0,unique_id,latin_name,activity_type,date,x,y,accuracy,source_info,geometry
0,NYBG_2,Nyctalus noctula,In flight,1996-06-10,465350.0,457750.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (465350.000 457750.000)
1,NYBG_9,Nyctalus noctula,In flight,1997-04-09,429550.0,477250.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (429550.000 477250.000)
2,NYBG_11,Nyctalus noctula,In flight,1997-06-09,446450.0,435450.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (446450.000 435450.000)
3,NYBG_14,Nyctalus noctula,In flight,1997-06-24,439650.0,467050.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (439650.000 467050.000)
4,NYBG_37,Nyctalus noctula,In flight,1997-10-17,429550.0,477250.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (429550.000 477250.000)


In [6]:
# Keep only records whose reported accuracy is 100 or better
# (presumably metres, matching the 100m grid — TODO confirm the unit).
bats = bats[bats.accuracy <= 100]

In [7]:
# Distinct species names and activity types present in the filtered records
latin_name = bats["latin_name"].unique().tolist()
activity_type = bats["activity_type"].unique().tolist()

print(f"Training models for {latin_name} and {activity_type}...")

Training models for ['Nyctalus noctula', 'Pipistrellus pipistrellus', 'Pipistrellus pygmaeus', 'Myotis daubentonii', 'Plecotus auritus', 'Myotis nattereri', 'Pipistrellus nathusii', 'Eptesicus serotinus', 'Nyctalus leisleri', 'Myotis brandtii', 'Myotis mystacinus', 'Vespertilio murinus'] and ['In flight', 'Roost']...


This function will be used once the data has been filtered to the right combination of species and behaviour to keep only the records which are unique to each 100m grid square.

In [8]:
# Read the background points and keep only their geometry column
background = gpd.read_file('data/processed/background-points.geojson')[["geometry"]]
background.head()

Unnamed: 0,geometry
0,POINT (519381.092 439000.590)
1,POINT (497328.840 447950.479)
2,POINT (468894.011 424350.104)
3,POINT (470246.381 445047.555)
4,POINT (483191.294 451016.796)


I'm going to build a series of models based upon different taxonomic and behaviour classifications. I will build a model for:
- Each species and any behaviour type
- Each species and roosting or foraging
- Each genus and any behaviour type
- Each genus and roosting or foraging

# Environmental Variables

In [12]:
ev_raster = Path("data/evs/evs-to-model.tif")

evs_to_model = rxr.open_rasterio(ev_raster, masked=True, band_as_variable=True).squeeze()
# Rename every band variable to its long name with a single rename call
evs_to_model = evs_to_model.rename(
    {band: evs_to_model[band].attrs["long_name"] for band in evs_to_model.data_vars}
)

evs_to_model

## Grid Points

We need a grid of points that represents the centre of each raster cell. This is used for extracting annotated points for model training.

In [10]:
import geopandas as gpd
# Load the pre-computed grid points from parquet
grid_points = gpd.read_parquet("data/evs/grid-points.parquet")

In [13]:
# Annotate points
# Attach the raster values to the bat records and the background points,
# labelling the new columns with the environmental variable names.
ev_columns = [name for name in evs_to_model.data_vars]
bats_ant = ela.annotate(bats, str(ev_raster), labels=ev_columns)
background = ela.annotate(background, str(ev_raster), labels=ev_columns)

Raster:   0%|                              | 0/1 [00:00<?, ?it/s]

Sample:   0%|                              | 0/13876 [00:00<?, ?it/s]

Raster:   0%|                              | 0/1 [00:00<?, ?it/s]

Sample:   0%|                              | 0/33647 [00:00<?, ?it/s]

In [21]:
# Preview the annotated bat records
bats_ant.head()

Unnamed: 0,unique_id,latin_name,activity_type,date,x,y,accuracy,source_info,geometry,climate_stats_temp_ann_var,...,ceh_landcover_grassland_500m,ceh_landcover_arable_500m,vom_vegetation_height_max_500m,ceh_landcover_broadleaved_woodland_500m,vom_vegetation_height_mean_500m,ceh_landcover_suburban_500m,os_cover_water_500m,ceh_landcover_coniferous_woodland_500m,ceh_landcover_urban_500m,ceh_landcover_improved_grassland_500m
0,NYBG_2,Nyctalus noctula,In flight,1996-06-10,465350.0,457750.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (465350.000 457750.000),4.370765,...,0.0,1236.0,10.552817,1450.0,2.058665,139.0,2.73,391.0,20.0,6621.0
1,NYBG_9,Nyctalus noctula,In flight,1997-04-09,429550.0,477250.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (429550.000 477250.000),4.276229,...,2.0,2161.0,11.823381,2267.0,1.776723,49.0,25.67,73.0,140.0,2726.0
2,NYBG_11,Nyctalus noctula,In flight,1997-06-09,446450.0,435450.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (446450.000 435450.000),4.431757,...,0.0,8152.0,4.328964,312.0,0.395401,129.0,0.99,0.0,122.0,1210.0
3,NYBG_14,Nyctalus noctula,In flight,1997-06-24,439650.0,467050.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (439650.000 467050.000),4.357553,...,0.0,1258.0,12.764629,339.0,1.336935,4514.0,5.96,0.0,1527.0,2002.0
4,NYBG_37,Nyctalus noctula,In flight,1997-10-17,429550.0,477250.0,100,"{'source': 'NYBG', 'source_file': 'All Yorkshi...",POINT (429550.000 477250.000),4.276229,...,2.0,2161.0,11.823381,2267.0,1.776723,49.0,25.67,73.0,140.0,2726.0


# Modelling

Modelling process:
1. Filter to the species and behaviour type
2. Get the unique points in each 100m grid square
3. Calculate the distance weights for the points
4. Define the model and fit using Cross Validation
5. Save the model

In [22]:
# Show the default MaxentModel parameters for reference
ela.MaxentModel().get_params()

{'beta_categorical': 1.0,
 'beta_hinge': 1.0,
 'beta_lqp': 1.0,
 'beta_multiplier': 1.5,
 'beta_threshold': 1.0,
 'clamp': True,
 'class_weights': 100,
 'convergence_tolerance': 2e-06,
 'feature_types': ['linear', 'hinge', 'product'],
 'n_cpus': 10,
 'n_hinge_features': 10,
 'n_lambdas': 100,
 'n_threshold_features': 10,
 'scorer': 'roc_auc',
 'tau': 0.5,
 'transform': 'cloglog',
 'use_lambdas': 'best',
 'use_sklearn': True}

In [23]:
def maxent_model(n_jobs=1) -> Pipeline:
    """Build an unfitted scaler + Maxent pipeline.

    The pipeline standardises the inputs with ``StandardScaler`` and then
    passes them to a ``MaxentModel`` configured with linear, hinge and
    product features, ``beta_multiplier=6`` and balanced class weights.

    Args:
        n_jobs: Value forwarded to ``MaxentModel`` as ``n_cpus``.

    Returns:
        A new ``Pipeline`` with the steps ``"scaler"`` and ``"maxent"``.
    """
    scaler = StandardScaler()
    maxent = MaxentModel(
        feature_types=["linear", "hinge", "product"],
        beta_multiplier=6,
        n_cpus=n_jobs,
        class_weights="balanced",
    )
    return Pipeline([("scaler", scaler), ("maxent", maxent)])

In [24]:
from itertools import product

# Generate every (latin name, activity type) pair; each pair is a candidate
# model to train.
filter_combinations = list(product(latin_name, activity_type))

In [25]:
# Minimum number of presence records a species/activity pair needs before a
# model is trained for it.
MIN_PRESENCE_RECORDS = 15

training_data = []
# The loop names are deliberately different from the `latin_name` and
# `activity_type` lists: reusing those names would overwrite the lists with
# single strings and make the combinations cell fail on a re-run.
for species_name, activity in filter_combinations:
    presence = filter_bats(bats_ant, latin_name=species_name, activity_type=activity)

    # Skip combinations with too few presence records
    if len(presence) < MIN_PRESENCE_RECORDS:
        continue

    occurrence = prepare_occurence_data(
        presence, background, grid_points, input_vars=ev_columns
    )
    training_data.append(
        {
            "latin_name": species_name,
            "activity_type": activity,
            "occurrence": occurrence,
        }
    )

In [26]:
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import as_completed
from sdm.maxent import eval_train_model
import pandas as pd
from tqdm import tqdm

# Get the number of cpus
import multiprocessing

num_cpus = multiprocessing.cpu_count()
fit_in_parallel = False

# `results` and `fitted_data` are filled in lockstep so that the i-th result
# always belongs to the i-th training set. Later cells join the two by
# position, so a failed fit must be dropped from both.
results = []
fitted_data = []

if fit_in_parallel:
    # The context manager shuts the pool down even if an error is raised.
    with ProcessPoolExecutor(num_cpus) as executor:
        # Submit tasks to the executor
        futures = [
            executor.submit(eval_train_model, data["occurrence"], maxent_model())
            for data in training_data
        ]

        # Collect in submission order so each result can be paired with its input
        for data, future in tqdm(zip(training_data, futures), total=len(futures)):
            try:
                # Call result() once only; it re-raises any worker exception.
                result = future.result()
            except Exception as e:
                print(f"Error processing {data['latin_name']} - {data['activity_type']}: {e}")
                continue
            if result is None:
                print(f"No model returned for {data['latin_name']} - {data['activity_type']}")
                continue
            fitted_data.append(data)
            results.append(result)
else:
    for data in tqdm(training_data):
        try:
            result = eval_train_model(data["occurrence"], maxent_model(n_jobs=num_cpus))
        except Exception as e:
            print(f"Error processing {data['latin_name']} - {data['activity_type']}: {e}")
            continue
        if result is None:
            print(f"No model returned for {data['latin_name']} - {data['activity_type']}")
            continue
        fitted_data.append(data)
        results.append(result)

# Keep only the training sets that produced a model so that `training_data`
# and `results` stay aligned for the cells below.
training_data = fitted_data

 12%|█▏        | 2/17 [00:49<05:31, 22.10s/it]Process SpawnProcess-48:
Process SpawnProcess-51:
Process SpawnProcess-47:
Process SpawnProcess-49:
Process SpawnProcess-50:
Process SpawnProcess-46:
Process SpawnProcess-43:
Process SpawnProcess-42:
Process SpawnProcess-44:
Process SpawnProcess-45:
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/envs/bats/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/homebrew/anaconda3/envs/bats/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/homebrew/anaconda3/envs/bats/lib/python3.11/concurrent/futures/process.py", line 249, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/envs/bats/lib/python3.11/multiprocessing/queues.py", line 103, in get
    res = self._recv_bytes()
          ^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3

### Prepare the results dataframe

In [27]:
import numpy as np

# The inputs (`training_data`) and outputs (`results`) are joined by position.
# If a fit was skipped in one list but not the other, every later row would be
# paired with the wrong model, so fail loudly instead of silently misaligning.
if len(results) != len(training_data):
    raise ValueError(
        f"Got {len(results)} model results for {len(training_data)} training sets; "
        "they are joined by position and must have the same length."
    )

# Convert the inputs and outputs to a dataframe
modelling_df = pd.DataFrame(
    [
        {
            "final_model": final_model,
            "cv_models": cv_models,
            "cv_scores": np.array(cv_scores),
        }
        for final_model, cv_models, cv_scores in results
    ]
)
inputs_df = pd.DataFrame(training_data)
# Combine them column-wise (both frames share the same default index)
results_df = pd.concat([inputs_df, modelling_df], axis=1)


# Mutate some columns
def count_presence(occurrence):
    """Return the number of rows in `occurrence` whose "class" equals 1."""
    return (occurrence["class"] == 1).sum()


def count_background(occurrence):
    """Return the number of rows in `occurrence` whose "class" equals 0."""
    return (occurrence["class"] == 0).sum()


results_df["n_presence"] = results_df.occurrence.apply(count_presence)
results_df["n_background"] = results_df.occurrence.apply(count_background)

# Mean and standard deviation of the cross-validation scores, to 3 decimals
results_df["mean_cv_score"] = results_df.cv_scores.apply(np.mean).round(3)
results_df["std_cv_score"] = results_df.cv_scores.apply(np.std).round(3)

# Number of cross-validation scores recorded for each model
results_df["folds"] = results_df.cv_scores.apply(len)

# A missing activity type is labelled "All"
results_df["activity_type"] = results_df.activity_type.fillna("All")


In [46]:
# Display the combined inputs, fitted models and cross-validation summary
results_df

Unnamed: 0,latin_name,activity_type,occurrence,final_model,cv_models,cv_scores,n_presence,n_background,mean_cv_score,std_cv_score,folds,prediction_path
0,Nyctalus noctula,In flight,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.759127577579017, 0.6254603083949487, 0.7585...",364,20976,0.714,0.063,3,data/sdm_predictions/Nyctalus noctula_In fligh...
1,Nyctalus noctula,Roost,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.5776765275576685, 0.6975078802206461, 0.568...",80,20976,0.614,0.059,3,data/sdm_predictions/Nyctalus noctula_Roost.tif
2,Pipistrellus pipistrellus,In flight,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.5602067053520624, 0.6800190487185407, 0.704...",1411,20976,0.648,0.063,3,data/sdm_predictions/Pipistrellus pipistrellus...
3,Pipistrellus pipistrellus,Roost,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.7113270982534977, 0.6917829074421908, 0.763...",1879,20976,0.722,0.03,3,data/sdm_predictions/Pipistrellus pipistrellus...
4,Pipistrellus pygmaeus,In flight,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.5364680725055583, 0.7314956000955304, 0.673...",354,20976,0.647,0.082,3,data/sdm_predictions/Pipistrellus pygmaeus_In ...
5,Pipistrellus pygmaeus,Roost,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.6270221455531966, 0.7717927805715051, 0.754...",382,20976,0.718,0.065,3,data/sdm_predictions/Pipistrellus pygmaeus_Roo...
6,Myotis daubentonii,In flight,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.8773018130110204, 0.8139770374552983, 0.722...",152,20976,0.805,0.064,3,data/sdm_predictions/Myotis daubentonii_In fli...
7,Myotis daubentonii,Roost,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.7295239309641312, 0.7271209972822875, 0.625...",146,20976,0.694,0.049,3,data/sdm_predictions/Myotis daubentonii_Roost.tif
8,Plecotus auritus,In flight,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.6306035799972692, 0.7610738762346301, 0.754...",163,20976,0.715,0.06,3,data/sdm_predictions/Plecotus auritus_In fligh...
9,Plecotus auritus,Roost,climate_stats_temp_ann_var climate_bio...,"(StandardScaler(), MaxentModel(beta_multiplier...","[(StandardScaler(), MaxentModel(beta_multiplie...","[0.7830885057471263, 0.7377089808574693, 0.783...",695,20976,0.768,0.021,3,data/sdm_predictions/Plecotus auritus_Roost.tif


# Make Predictions

In [None]:
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import as_completed
from pathlib import Path
# Iterate through the df and make predictions using the final model,
# saving each prediction to a tif named after the latin name and activity type.

prediction_paths = []
# Each task keeps the arguments for `ela.apply_model_to_rasters` separate from
# the descriptive label. Extra keyword arguments to that function are
# forwarded to `model.predict`, so species metadata must not be splatted in.
tasks = []
for _, row in results_df.iterrows():
    path_predict = Path(f"data/sdm_predictions/{row.latin_name}_{row.activity_type}.tif")
    # Make sure the output directory exists before anything is written to it
    path_predict.parent.mkdir(parents=True, exist_ok=True)
    prediction_paths.append(path_predict)
    tasks.append({
        "label": f"{row.latin_name} - {row.activity_type}",
        "kwargs": {
            "model": row.final_model,
            "raster_paths": [ev_raster],
            "output_path": path_predict,
        },
    })

predict_in_parallel = False
if predict_in_parallel:
    # The context manager shuts the pool down even if an error is raised.
    with ProcessPoolExecutor() as executor:
        # Map each future to its label so failures can be reported by name
        futures = {
            executor.submit(ela.apply_model_to_rasters, **task["kwargs"]): task["label"]
            for task in tasks
        }

        # Use a progress bar to track progress as the tasks complete
        for future in tqdm(as_completed(futures), total=len(futures)):
            try:
                future.result()
            except Exception as e:
                print(f"Error applying model to raster for {futures[future]}: {e}")
else:
    # Iterate through the tasks and apply the model to the raster
    for task in tqdm(tasks):
        try:
            ela.apply_model_to_rasters(**task["kwargs"])
        except Exception as e:
            print(f"Error applying model to raster for {task['label']}: {e}")
            continue


  0%|          | 0/17 [00:00<?, ?it/s]

Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

 18%|█▊        | 3/17 [00:00<00:00, 24.36it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

 35%|███▌      | 6/17 [00:00<00:00, 27.09it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

 53%|█████▎    | 9/17 [00:00<00:00, 24.84it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

 71%|███████   | 12/17 [00:00<00:00, 26.22it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

 88%|████████▊ | 15/17 [00:00<00:00, 26.30it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.


Window:   0%|                              | 0/1403 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:00<00:00, 26.65it/s]

Error applying model to raster: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required by StandardScaler.





In [30]:
# Record the output raster path for each row; `prediction_paths` was built in
# the same row order as `results_df`.
results_df["prediction_path"] = prediction_paths

In [31]:
# ML Flow logging
import mlflow
from mlflow import log_metric, log_param, log_params, log_artifacts, log_artifact
import json
from pathlib import Path
from tempfile import NamedTemporaryFile

input_var_json_path = 'models/input_variables.json'

# The models/ directory may not exist yet; create it before writing
Path(input_var_json_path).parent.mkdir(parents=True, exist_ok=True)
with open(input_var_json_path, 'w') as f:
    json.dump(ev_columns, f)

# NOTE(review): the recorded run of this cell failed with a PermissionError on
# a path containing a space — confirm that this relative tracking URI resolves
# to the intended directory.
mlflow.set_tracking_uri("./mlruns")
mlflow.set_experiment("Sheffield Bat Group - SDM - Maxent")


# Iterate over the results dataframe to log models, parameters and metrics
for _, row in tqdm(results_df.iterrows(), total=len(results_df)):
    with mlflow.start_run(run_name = f"Model_{row['latin_name']}_{row['activity_type']}"):
        mlflow.set_tag("model", "Maxent")
        # Generate a species code from the first 3 letters of the genus and species
        # This makes it easier to identify the species in mlflow
        name_parts = row["latin_name"].split(" ")
        genus = name_parts[0]
        species = name_parts[1]
        species_code = genus[:3] + "_" + species[:3]
        mlflow.set_tag("species_code", species_code)

        mlflow.set_tag("latin_name", row["latin_name"])
        mlflow.set_tag("activity_type", row["activity_type"])
        # Log the parameters
        log_params(row[["n_presence", "n_background", "folds"]].to_dict())
        # Log model parameters
        log_params(row["final_model"].get_params())

        # Log the input variables which exceed the param length limit
        log_artifact(input_var_json_path, "input_variables")

        # Log the training data via a temporary parquet file that is removed
        # when the `with` block exits
        with NamedTemporaryFile(suffix = ".parquet") as f:
            occurence_gdf = row["occurrence"]
            occurence_gdf.to_parquet(f.name)
            log_artifact(f.name, "training_data")

        # Log the metrics
        log_metric("mean_cv_score", row["mean_cv_score"])
        log_metric("std_cv_score", row["std_cv_score"])

        # Log the predictions as an artifact. A failed prediction leaves no
        # raster on disk, so skip it rather than abort the whole run.
        prediction_path = Path(row["prediction_path"])
        if prediction_path.exists():
            log_artifact(str(prediction_path), "predictions_raster")
        else:
            print(f"No prediction raster found at {prediction_path}; skipping artifact")

        # Log the model
        mlflow.sklearn.log_model(row["final_model"], "model")



0it [00:00, ?it/s]


PermissionError: [Errno 13] Permission denied: '/Users/matthewwhittle/Data Science'

In [None]:
# Save the results table as CSV, without the index column
results_df.to_csv("data/sdm_predictions/results.csv", index=False)

# Pickle the results dataframe
results_df.to_pickle("data/sdm_predictions/results.pkl")

# Unnest the occurence dataframe and save as a parquet file

def extract_occurrence_df(row):
    """Return a row's occurrence frame labelled with its species and activity.

    Args:
        row: A mapping-like row with the keys ``"occurrence"`` (a dataframe),
            ``"latin_name"`` and ``"activity_type"``.

    Returns:
        A new dataframe: the occurrence data plus ``latin_name`` and
        ``activity_type`` columns. The frame stored in ``row`` is not modified.
    """
    # `assign` returns a new frame, so the occurrence data held in the results
    # table is not mutated as a side effect of exporting it.
    return row["occurrence"].assign(
        latin_name=row["latin_name"],
        activity_type=row["activity_type"],
    )

# Stack the labelled occurrence frame of every result row and write one file
occurrence_frames = []
for _, row in results_df.iterrows():
    occurrence_frames.append(extract_occurrence_df(row))
occurrence_gdf = pd.concat(occurrence_frames)
occurrence_gdf.to_parquet("data/sdm_predictions/training-occurrence-data.parquet")
