# Step 3: Format of the inputs 📄

...


#### 📚 Required Libraries 
To reformat the input variables, I need the following libraries:
...
____

### ...

In [82]:
import pandas as pd
import geopandas as gpd
from shapely import wkt

In [83]:
# Read the three input files from the working directory.
# Grid clipped to the study area, stored as GeoJSON.
clipped_grid_4326 = gpd.read_file("clipped_grid_4326.geojson")
# Zosteraceae occurrence records (one row per record).
zosteraceae_points = pd.read_csv("filtered_Artportalen-Zosteraceae.csv")
# Input variables per grid polygon; its 'geometry' column is parsed from WKT
# text further down.
inputs_variables = pd.read_csv("inputs_variables_polygons.csv")

In [84]:
# List the columns of the occurrence table (the coordinate fields
# 'decimalLongitude' / 'decimalLatitude' are used to build the points below).
zosteraceae_points.columns

Index(['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'species', 'infraspecificEpithet',
       'taxonRank', 'scientificName', 'verbatimScientificName',
       'verbatimScientificNameAuthorship', 'countryCode', 'locality',
       'stateProvince', 'occurrenceStatus', 'individualCount',
       'publishingOrgKey', 'decimalLatitude', 'decimalLongitude',
       'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation',
       'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day',
       'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord',
       'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber',
       'identifiedBy', 'dateIdentified', 'license', 'rightsHolder',
       'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted',
       'mediaType', 'issue', 'geometry'],
      dtype='object')

In [94]:
# Build a point layer from the occurrence records: one point per record,
# taken from the longitude/latitude columns and declared as EPSG:4326.
zosteraceae_gdf = gpd.GeoDataFrame(
    zosteraceae_points,
    geometry=gpd.points_from_xy(
        zosteraceae_points['decimalLongitude'],
        zosteraceae_points['decimalLatitude'],
    ),
    crs="EPSG:4326",
)

# Build the grid layer: the 'geometry' column read from the CSV holds WKT text,
# so it is parsed into shapely geometries before being used as the geometry.
inputs_variables_gdf = gpd.GeoDataFrame(
    inputs_variables,
    geometry=inputs_variables['geometry'].apply(wkt.loads),
    crs="EPSG:4326",
)

# Spatial join to match points to cells. how='right' keeps every grid cell:
# cells without any point get NaN in all point columns.
zosteraceae_with_env = gpd.sjoin(
    zosteraceae_gdf,
    inputs_variables_gdf,
    how='right',
    predicate='within',
)

# Add a presence column: 1 if a Zosteraceae point falls within the grid cell,
# 0 otherwise (unmatched cells have no gbifID).
zosteraceae_with_env['presence'] = zosteraceae_with_env['gbifID'].notna().astype(int)

# The join emits one row per (cell, point) match, so a cell containing several
# points is repeated. The result is indexed by the grid-cell index, so keeping
# the first row per index value leaves exactly one row per cell; the grid
# columns and the presence flag are the same on every repeat of a cell.
zosteraceae_with_env = zosteraceae_with_env[
    ~zosteraceae_with_env.index.duplicated(keep='first')
]
assert zosteraceae_with_env.index.is_unique, "expected one row per grid cell"

In [95]:
# Inspect the joined table: grid cells with the point columns, the input
# variables and the presence flag.
zosteraceae_with_env

Unnamed: 0,index_left,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,...,siconc,so,sob,thetao,uo,vo,VTM01_SW2,VSDX,geometry,presence
0,,,,,,,,,,,...,0.000002,29.058784,35.130917,10.248790,-0.027613,0.013473,,,"POLYGON ((10.03035 58.25918, 10.03459 58.25935...",0
1,,,,,,,,,,,...,0.000002,29.058784,35.130917,10.248790,-0.027613,0.013473,,,"POLYGON ((10.03003 58.26142, 10.03428 58.26158...",0
2,,,,,,,,,,,...,0.000002,29.058784,35.130917,10.248790,-0.027613,0.013473,,,"POLYGON ((10.02972 58.26365, 10.03396 58.26382...",0
3,,,,,,,,,,,...,0.000002,29.058784,35.130917,10.248790,-0.027613,0.013473,,,"POLYGON ((10.0294 58.26589, 10.03365 58.26606,...",0
4,,,,,,,,,,,...,0.000002,29.058784,35.130917,10.248790,-0.027613,0.013473,,,"POLYGON ((10.02909 58.26813, 10.03333 58.26829...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240351,,,,,,,,,,,...,0.000301,8.760614,14.669808,10.083274,-0.005724,0.010729,3.607306,0.012438,"POLYGON ((13.05616 55.68358, 13.06013 55.68364...",0
240352,,,,,,,,,,,...,0.000416,8.715675,14.971891,10.116260,0.013035,0.005482,3.834101,0.014327,"POLYGON ((13.06113 55.66344, 13.0651 55.6635, ...",0
240353,,,,,,,,,,,...,0.000416,8.715675,14.971891,10.116260,0.013035,0.005482,3.834101,0.014327,"POLYGON ((13.06102 55.66568, 13.06499 55.66575...",0
240354,,,,,,,,,,,...,0.000416,8.715675,14.971891,10.116260,0.013035,0.005482,3.834101,0.014327,"POLYGON ((13.06091 55.66793, 13.06488 55.66799...",0


In [96]:
# List the columns of the joined table to pick the ones kept in the next step.
zosteraceae_with_env.columns

Index(['index_left', 'gbifID', 'datasetKey', 'occurrenceID', 'kingdom',
       'phylum', 'class', 'order', 'family', 'genus', 'species',
       'infraspecificEpithet', 'taxonRank', 'scientificName',
       'verbatimScientificName', 'verbatimScientificNameAuthorship',
       'countryCode', 'locality', 'stateProvince', 'occurrenceStatus',
       'individualCount', 'publishingOrgKey', 'decimalLatitude',
       'decimalLongitude', 'coordinateUncertaintyInMeters',
       'coordinatePrecision', 'elevation', 'elevationAccuracy', 'depth',
       'depthAccuracy', 'eventDate', 'day', 'month', 'year', 'taxonKey',
       'speciesKey', 'basisOfRecord', 'institutionCode', 'collectionCode',
       'catalogNumber', 'recordNumber', 'identifiedBy', 'dateIdentified',
       'license', 'rightsHolder', 'recordedBy', 'typeStatus',
       'establishmentMeans', 'lastInterpreted', 'mediaType', 'issue', 'chl',
       'no3', 'ph', 'po4', 'siconc', 'so', 'sob', 'thetao', 'uo', 'vo',
       'VTM01_SW2', 'VSDX', 'g

In [97]:
# Keep only the cell geometry, the input variables and the presence label,
# then discard every row that has a NaN in any of the kept columns.
zosteraceae_with_env = zosteraceae_with_env[
    [
        'geometry',
        'chl', 'no3', 'ph', 'po4', 'siconc', 'so', 'sob', 'thetao', 'uo', 'vo',
        'VTM01_SW2', 'VSDX',
        'presence',
    ]
].dropna()
zosteraceae_with_env

Unnamed: 0,geometry,chl,no3,ph,po4,siconc,so,sob,thetao,uo,vo,VTM01_SW2,VSDX,presence
38,"POLYGON ((10.03995 58.28205, 10.04419 58.28221...",1.291090,1.944273,8.170156,0.446924,4.143011e-08,28.985157,35.130569,10.238485,-0.029739,0.008287,5.449118,0.027333,0
39,"POLYGON ((10.03963 58.28428, 10.04388 58.28445...",1.291090,1.944273,8.170156,0.446924,4.143011e-08,28.985157,35.130569,10.238485,-0.029739,0.008287,5.449118,0.027333,0
40,"POLYGON ((10.03932 58.28652, 10.04357 58.28669...",1.291090,1.944273,8.170156,0.446924,4.143011e-08,28.985157,35.130569,10.238485,-0.029739,0.008287,5.449118,0.027333,0
41,"POLYGON ((10.039 58.28876, 10.04325 58.28892, ...",1.291090,1.944273,8.170156,0.446924,4.143011e-08,28.985157,35.130569,10.238485,-0.029739,0.008287,5.449118,0.027333,0
54,"POLYGON ((10.04419 58.28221, 10.04844 58.28238...",1.290995,1.946527,8.170217,0.446902,6.275653e-08,28.972874,35.130547,10.236970,-0.026636,0.007502,5.449118,0.027333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240351,"POLYGON ((13.05616 55.68358, 13.06013 55.68364...",2.066166,4.442574,8.116159,0.476782,3.005194e-04,8.760614,14.669808,10.083274,-0.005724,0.010729,3.607306,0.012438,0
240352,"POLYGON ((13.06113 55.66344, 13.0651 55.6635, ...",2.093574,4.126968,8.116018,0.473815,4.159018e-04,8.715675,14.971891,10.116260,0.013035,0.005482,3.834101,0.014327,0
240353,"POLYGON ((13.06102 55.66568, 13.06499 55.66575...",2.093574,4.126968,8.116018,0.473815,4.159018e-04,8.715675,14.971891,10.116260,0.013035,0.005482,3.834101,0.014327,0
240354,"POLYGON ((13.06091 55.66793, 13.06488 55.66799...",2.093574,4.126968,8.116018,0.473815,4.159018e-04,8.715675,14.971891,10.116260,0.013035,0.005482,3.834101,0.014327,0


In [98]:
# Tally how many rows are labelled 0 (absence) and 1 (presence)
count_presence = zosteraceae_with_env.loc[:, 'presence'].value_counts()
print("Presence counts:\n", count_presence)

Presence counts:
 presence
0    220219
1       663
Name: count, dtype: int64


That is the intended result!

In [99]:
# Write the final table to CSV in the working directory, leaving the index out
zosteraceae_with_env.to_csv(
    "zosteraceae_with_env.csv",
    index=False,
)