# Step 3: Format of the inputs 📄

...


#### 📚 Required Libraries 
To format the inputs of the MaxEnt model, I need the following libraries:
- **`pandas`**: to read, manipulate, and analyze the tabular data.
- **`geopandas`**: to load the input variables as geospatial data and join the occurrence points to the grid cells.
- **`shapely`**: to parse the WKT geometries of the grid cells.
____

### 📄 Reformatting to merge the Zosteraceae points with the grid cells and the environmental values

In [1]:
import pandas as pd
import geopandas as gpd
from shapely import wkt

In [19]:
# Load the three inputs of this step from the local data/ folder.
# Clipped grid, read from GeoJSON directly into a GeoDataFrame.
clipped_grid_4326 = gpd.read_file("data/clipped_grid_4326.geojson")
# Filtered Zosteraceae occurrence records, read as a plain table.
zosteraceae_points = pd.read_csv("data/01_filtered_Zosteraceae.csv")
# Environmental variables table; its 'geometry' column is still WKT text
# at this point and is parsed in a later cell.
inputs_variables = pd.read_csv("data/02_inputs_environmental_variables.csv")

In [20]:
# Inspect the column names of the occurrence table (the coordinate columns
# 'decimalLongitude' / 'decimalLatitude' are used below to build point geometries).
zosteraceae_points.columns

Index(['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'species', 'infraspecificEpithet',
       'taxonRank', 'scientificName', 'verbatimScientificName',
       'verbatimScientificNameAuthorship', 'countryCode', 'locality',
       'stateProvince', 'occurrenceStatus', 'individualCount',
       'publishingOrgKey', 'decimalLatitude', 'decimalLongitude',
       'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation',
       'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day',
       'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord',
       'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber',
       'identifiedBy', 'dateIdentified', 'license', 'rightsHolder',
       'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted',
       'mediaType', 'issue', 'geometry'],
      dtype='object')

In [21]:
# Convert species points to a GeoDataFrame: one point geometry per occurrence,
# built from the longitude/latitude columns (EPSG:4326).
zosteraceae_gdf = gpd.GeoDataFrame(
    zosteraceae_points,
    geometry=gpd.points_from_xy(zosteraceae_points['decimalLongitude'], zosteraceae_points['decimalLatitude']),
    crs="EPSG:4326"
)

# Convert grid cells to a GeoDataFrame: the CSV stores each cell geometry as
# WKT text, so it is parsed into shapely geometries first.
inputs_variables_gdf = gpd.GeoDataFrame(
    inputs_variables,
    geometry=inputs_variables['geometry'].apply(wkt.loads),
    crs="EPSG:4326"
)

# Spatial join to match points to cells. how='right' keeps every grid cell
# (indexed by the grid table's own index), including cells without any point.
zosteraceae_with_env = gpd.sjoin(zosteraceae_gdf, inputs_variables_gdf, how='right', predicate='within')

# The join returns one row per (point, cell) match, so a cell that contains
# several occurrence points is repeated once per point. Keep a single row per
# grid cell so that 'presence' is a per-cell label and cells are not
# over-counted. This relies on the grid table having a unique index, which is
# the case for the default index produced by pd.read_csv.
zosteraceae_with_env = zosteraceae_with_env[
    ~zosteraceae_with_env.index.duplicated(keep='first')
].copy()

# Add a presence column: 1 if a Zosteraceae point falls within the grid cell, 0 otherwise
zosteraceae_with_env['presence'] = zosteraceae_with_env['gbifID'].notna().astype(int)

In [22]:
# Display the joined table to inspect the result of the spatial join.
zosteraceae_with_env

Unnamed: 0,index_left,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,...,bottomT,siconc,sob,thetao,uo,vo,VTM01_SW2,VSDX,geometry,presence
0,,,,,,,,,,,...,6.310916,0.0,35.101162,10.721393,0.054501,0.065077,,,"POLYGON ((10.03035 58.25918, 10.03459 58.25935...",0
1,,,,,,,,,,,...,6.310916,0.0,35.101162,10.721393,0.054501,0.065077,,,"POLYGON ((10.03003 58.26142, 10.03428 58.26158...",0
2,,,,,,,,,,,...,6.310916,0.0,35.101162,10.721393,0.054501,0.065077,,,"POLYGON ((10.02972 58.26365, 10.03396 58.26382...",0
3,,,,,,,,,,,...,6.310916,0.0,35.101162,10.721393,0.054501,0.065077,,,"POLYGON ((10.0294 58.26589, 10.03365 58.26606,...",0
4,,,,,,,,,,,...,6.310916,0.0,35.101162,10.721393,0.054501,0.065077,,,"POLYGON ((10.02909 58.26813, 10.03333 58.26829...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240351,,,,,,,,,,,...,11.165191,0.0,13.510467,10.322074,-0.006674,0.015324,2.790750,0.024698,"POLYGON ((13.05616 55.68358, 13.06013 55.68364...",0
240352,,,,,,,,,,,...,11.199465,0.0,13.806524,10.325938,0.010994,0.012605,3.482053,0.027726,"POLYGON ((13.06113 55.66344, 13.0651 55.6635, ...",0
240353,,,,,,,,,,,...,11.199465,0.0,13.806524,10.325938,0.010994,0.012605,3.482053,0.027726,"POLYGON ((13.06102 55.66568, 13.06499 55.66575...",0
240354,,,,,,,,,,,...,11.199465,0.0,13.806524,10.325938,0.010994,0.012605,3.482053,0.027726,"POLYGON ((13.06091 55.66793, 13.06488 55.66799...",0


In [23]:
# List the columns of the joined table (occurrence metadata, environmental
# variables, geometry and presence) before selecting the ones to keep.
zosteraceae_with_env.columns

Index(['index_left', 'gbifID', 'datasetKey', 'occurrenceID', 'kingdom',
       'phylum', 'class', 'order', 'family', 'genus', 'species',
       'infraspecificEpithet', 'taxonRank', 'scientificName',
       'verbatimScientificName', 'verbatimScientificNameAuthorship',
       'countryCode', 'locality', 'stateProvince', 'occurrenceStatus',
       'individualCount', 'publishingOrgKey', 'decimalLatitude',
       'decimalLongitude', 'coordinateUncertaintyInMeters',
       'coordinatePrecision', 'elevation', 'elevationAccuracy', 'depth',
       'depthAccuracy', 'eventDate', 'day', 'month', 'year', 'taxonKey',
       'speciesKey', 'basisOfRecord', 'institutionCode', 'collectionCode',
       'catalogNumber', 'recordNumber', 'identifiedBy', 'dateIdentified',
       'license', 'rightsHolder', 'recordedBy', 'typeStatus',
       'establishmentMeans', 'lastInterpreted', 'mediaType', 'issue', 'chl',
       'no3', 'ph', 'po4', 'bottomT', 'siconc', 'sob', 'thetao', 'uo', 'vo',
       'VTM01_SW2', 'VSDX

In [24]:
# Keep only the grid-cell geometry, the environmental variables and the
# presence label; every occurrence-metadata column is left out.
columns_of_interest = [
    'geometry',
    'chl', 'no3', 'ph', 'po4',
    'bottomT', 'siconc', 'sob', 'thetao', 'uo', 'vo',
    'VTM01_SW2', 'VSDX',
    'presence',
]
# Select those columns, then discard every row that has at least one NaN value.
zosteraceae_with_env = zosteraceae_with_env[columns_of_interest].dropna()
zosteraceae_with_env

Unnamed: 0,geometry,chl,no3,ph,po4,bottomT,siconc,sob,thetao,uo,vo,VTM01_SW2,VSDX,presence
14,"POLYGON ((10.03915 58.25727, 10.04339 58.25744...",1.319354,1.591813,8.212095,0.453248,6.350818,0.0,35.100361,10.726984,0.060392,0.065536,6.414486,0.036752,0
15,"POLYGON ((10.03883 58.25951, 10.04308 58.25968...",1.313369,1.584467,8.212026,0.452352,6.332529,0.0,35.100712,10.723794,0.056117,0.064474,6.485106,0.036403,0
16,"POLYGON ((10.03852 58.26175, 10.04277 58.26191...",1.313369,1.584467,8.212026,0.452352,6.332529,0.0,35.100712,10.723794,0.056117,0.064474,6.485106,0.036403,0
17,"POLYGON ((10.03821 58.26399, 10.04245 58.26415...",1.313369,1.584467,8.212026,0.452352,6.332529,0.0,35.100712,10.723794,0.056117,0.064474,6.485106,0.036403,0
18,"POLYGON ((10.03789 58.26622, 10.04214 58.26639...",1.313369,1.584467,8.212026,0.452352,6.332529,0.0,35.100712,10.723794,0.056117,0.064474,6.485106,0.036403,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240351,"POLYGON ((13.05616 55.68358, 13.06013 55.68364...",2.017805,4.840452,8.171505,0.337169,11.165191,0.0,13.510467,10.322074,-0.006674,0.015324,2.790750,0.024698,0
240352,"POLYGON ((13.06113 55.66344, 13.0651 55.6635, ...",1.999801,4.590124,8.169117,0.341800,11.199465,0.0,13.806524,10.325938,0.010994,0.012605,3.482053,0.027726,0
240353,"POLYGON ((13.06102 55.66568, 13.06499 55.66575...",1.999801,4.590124,8.169117,0.341800,11.199465,0.0,13.806524,10.325938,0.010994,0.012605,3.482053,0.027726,0
240354,"POLYGON ((13.06091 55.66793, 13.06488 55.66799...",1.999801,4.590124,8.169117,0.341800,11.199465,0.0,13.806524,10.325938,0.010994,0.012605,3.482053,0.027726,0


In [25]:
# Verify the presence column: count how many rows carry each value (0 or 1)
count_presence = zosteraceae_with_env['presence'].value_counts()
# Print the tally so both classes can be checked at a glance
print("Presence counts:\n", count_presence)

Presence counts:
 presence
0    232170
1       980
Name: count, dtype: int64


That is the intended result: both classes are present, with far more absence (0) rows than presence (1) rows.

In [26]:
# Save the final input file (geometry, environmental variables and presence label).
# index=False leaves the DataFrame index out of the CSV.
zosteraceae_with_env.to_csv("data/03_inputs_environmental_variables.csv", index=False)