# Import Required Libraries
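Import the libraries used in this notebook: pandas and NumPy for data handling, and pyinaturalist for querying the iNaturalist API (requests and json are imported as well).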

In [0]:
# Importing the required libraries
import requests
import pandas as pd
from pyinaturalist import get_observations
import numpy as np
import json

# Send GET Request to iNaturalist API
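Use pyinaturalist's `get_observations` to query the iNaturalist API for research-grade observations in Laval, Montreal, and Gatineau, then combine the results and save them to a Parquet file.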

In [0]:
# Numeric place identifiers passed as `place_id` to the iNaturalist observation
# queries below; each one is paired with a city label ("Laval", "Montreal",
# "Gatineau") when the observations are fetched.
laval_place_id = 27655
montreal_place_id = 187355
gatineau_place_id = 142292

def get_observations_dataframe(place_id, place_name, max_pages=9, per_page=200):
    """Fetch research-grade observations for one place into a DataFrame.

    Pages 1..``max_pages`` are requested through ``get_observations`` and the
    ``"results"`` records of every page are collected into a single frame.
    Paging stops early at the first page that returns no results.

    Args:
        place_id: Place identifier forwarded as ``place_id`` to the query.
        place_name: Label written to the ``location`` column of every row.
        max_pages: Maximum number of pages to request (default 9, matching the
            previous hard-coded ``range(1, 10)``).
        per_page: Number of records requested per page (default 200).

    Returns:
        pandas.DataFrame with one row per observation record plus a
        ``location`` column. When nothing is returned the frame is empty but
        still has the ``location`` column.
    """
    records = []
    for page in range(1, max_pages + 1):
        response = get_observations(
            place_id=place_id,
            verifiable=True,
            per_page=per_page,
            page=str(page),
            quality_grade="research",
        )
        results = response["results"]
        if not results:
            # Nothing left to fetch; avoid issuing further empty requests.
            break
        records.extend(results)

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame inside a loop copies all previous rows on every page.
    df = pd.DataFrame(records)
    df["location"] = place_name
    return df

# Placeholder only: this empty frame is overwritten by the pd.concat below.
df_naturalist = pd.DataFrame()
# Fetch the observations for each of the three cities (one API query series per city).
data_laval = get_observations_dataframe(laval_place_id,"Laval")
data_montreal = get_observations_dataframe(montreal_place_id,"Montreal")
data_gatineau = get_observations_dataframe(gatineau_place_id,"Gatineau")


# Stack the per-city frames into one frame with a fresh 0..n-1 index.
df_naturalist = pd.concat([data_laval, data_montreal, data_gatineau], ignore_index=True)
# Parse 'observed_on' into timezone-aware UTC timestamps.
df_naturalist['observed_on'] = pd.to_datetime(df_naturalist['observed_on'], utc=True)

# Save the combined observations to a Parquet file in the working directory.
df_naturalist.to_parquet("naturalist_sample.parquet")

In [0]:

def get_observations_dataframe_by_geojson(latitude, longitude, radius=50, max_pages=1, per_page=200):
    """Fetch research-grade observations around a point into a DataFrame.

    Despite the name, the query is driven by a latitude/longitude pair and a
    radius, which are forwarded to ``get_observations`` as ``lat``, ``lng``
    and ``radius``.

    Args:
        latitude: Latitude of the search centre.
        longitude: Longitude of the search centre.
        radius: Search radius forwarded to the query (default 50, the value
            that was previously hard-coded).
        max_pages: Maximum number of pages to request (default 1, matching the
            previous hard-coded ``range(1, 2)``).
        per_page: Number of records requested per page (default 200).

    Returns:
        pandas.DataFrame with one row per observation record; empty when the
        query returns no results.
    """
    records = []
    for page in range(1, max_pages + 1):
        response = get_observations(
            verifiable=True,
            per_page=per_page,
            page=str(page),
            quality_grade="research",
            lat=latitude,
            lng=longitude,
            radius=radius,
        )
        results = response["results"]
        if not results:
            # Nothing left to fetch; avoid issuing further empty requests.
            break
        records.extend(results)

    # Build the frame once: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records)

# Sample point query around latitude 46.67892, longitude -72.876228.
# NOTE(review): data_shawinigan is not used by any later cell visible in this notebook.
data_shawinigan = get_observations_dataframe_by_geojson(46.67892,-72.876228)

# Building Observations Dataset

In [0]:
# Reload the combined observations from the Parquet file written by the fetch
# cell above (presumably so the following cells can run without re-querying
# the API), then show the frame.
df_naturalist = pd.read_parquet("naturalist_sample.parquet")
display(df_naturalist)

In [0]:
import os

# Load the species list from 'sentinelle_liste_sp.csv' (path relative to the
# working directory). Later cells read its 'Nom_francais' column to flag
# observations as invasive.
file_path = 'sentinelle_liste_sp.csv'
df_sentinelle = pd.read_csv(file_path)

In [0]:
import numpy as np

def add_isInvasive_column(df_inaturalist, df_sentinelle):
    """Return the observations with a boolean ``isInvasive`` column appended.

    An observation is flagged when its lowercased ``species_guess`` equals the
    lowercased ``Nom_francais`` of at least one row of ``df_sentinelle``.
    Membership is tested against the set of reference names rather than with a
    merge, so repeated names in ``df_sentinelle`` cannot duplicate observation
    rows, and missing values on either side never match each other.

    Args:
        df_inaturalist: Observations frame with a ``species_guess`` column.
            It is not modified.
        df_sentinelle: Reference frame with a ``Nom_francais`` column. As in
            the previous implementation, this column is lowercased in place
            (missing values are left missing).

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the same columns as
        ``df_inaturalist`` (``species_guess`` converted to lowercase strings)
        followed by ``isInvasive``.
    """
    # reset_index returns a new frame, so the caller's observations stay untouched.
    result = df_inaturalist.reset_index(drop=True)
    # Remember which guesses are real values before astype(str) turns missing
    # entries into the literal strings "nan"/"none".
    has_guess = result["species_guess"].notna()
    result["species_guess"] = result["species_guess"].astype(str).str.lower()

    reference_names = df_sentinelle["Nom_francais"]
    # Lowercase the reference names in place (side effect kept from the
    # original), but keep missing names missing instead of the string "nan".
    df_sentinelle["Nom_francais"] = (
        reference_names.astype(str).str.lower().where(reference_names.notna())
    )

    invasive_names = set(df_sentinelle["Nom_francais"].dropna())
    result["isInvasive"] = has_guess & result["species_guess"].isin(invasive_names)
    return result

# joined_df = add_isInvasive_column(df_naturalist, df_sentinelle)
# joined_df[joined_df["isInvasive"] == True]

In [0]:
def _geojson_coordinate(geojson, position):
    """Return ``geojson['coordinates'][position]``, or None when unavailable."""
    if not isinstance(geojson, dict) or "coordinates" not in geojson:
        return None
    coordinates = geojson["coordinates"]
    if coordinates is None or len(coordinates) <= position:
        return None
    return coordinates[position]


def _large_photo_url(photos):
    """Return the first photo's URL with ``square.jpeg`` swapped for ``large.jpeg``, or None."""
    if photos is None or isinstance(photos, float) or len(photos) < 1:
        return None
    url = photos[0]["photo"]["url"]
    if not url:
        return None
    return url.replace("square.jpeg", "large.jpeg")


def get_community_observations_df(df_naturalist, df_sentinelle):
    """Reshape the iNaturalist observations into the common observation schema.

    Works on a copy of ``df_naturalist``: ids are prefixed with ``c_``,
    ``observed_on`` is formatted as ``YYYY-MM-DD`` and renamed to
    ``observation_date``, ``species_guess`` is lowercased, the invasive flag
    is added through ``add_isInvasive_column``, and longitude/latitude and a
    photo URL are extracted from the nested ``geojson`` and
    ``observation_photos`` values. Rows whose ``geojson`` or photo list is
    missing get empty coordinates / URL instead of raising.

    Args:
        df_naturalist: Observations frame with the columns ``id``,
            ``observed_on`` (datetime), ``species_guess``, ``location``,
            ``geojson`` and ``observation_photos``.
        df_sentinelle: Reference frame forwarded to ``add_isInvasive_column``.

    Returns:
        DataFrame with the columns ``id``, ``species_guess``, ``location``,
        ``observation_date``, ``isInvasive``, ``latitude``, ``longitude``,
        ``source`` and ``image_url``.
    """
    df = df_naturalist.copy()
    df["id"] = "c_" + df["id"].astype(str)
    df["observed_on"] = df["observed_on"].dt.strftime("%Y-%m-%d")
    df = df.rename(columns={'observed_on': 'observation_date'})
    df['species_guess'] = df['species_guess'].astype(str).str.lower()
    df = add_isInvasive_column(df, df_sentinelle)
    # The coordinates are read as [longitude, latitude], i.e. positions 0 and 1.
    df['longitude'] = df['geojson'].apply(lambda geojson: _geojson_coordinate(geojson, 0))
    df['latitude'] = df['geojson'].apply(lambda geojson: _geojson_coordinate(geojson, 1))
    df["source"] = "Community"
    df["image_url"] = df['observation_photos'].apply(_large_photo_url)
    return df[['id', 'species_guess', 'location', 'observation_date', 'isInvasive', 'latitude', 'longitude', 'source', 'image_url']]

In [0]:
def get_government_observations(df):
    """Reshape the government records into the common observation schema.

    Each row of ``df`` carries a ``properties`` dict. The relevant keys are
    pulled out into flat columns (a key that is absent yields a missing
    value), the region names "Montréal" and "Outaouais" are relabelled
    "Montreal" and "Gatineau", and only rows located in Montreal, Laval or
    Gatineau are kept. Every kept row is marked invasive with source
    "Government".

    Args:
        df: DataFrame with a ``properties`` column of dicts. It is not
            modified.

    Returns:
        DataFrame with the columns ``id``, ``species_guess``, ``location``,
        ``observation_date``, ``isInvasive``, ``latitude``, ``longitude``,
        ``source`` and ``image_url``, keeping the index labels of the
        retained input rows.
    """
    def extract(key, convert=None):
        """Read ``key`` from every ``properties`` dict; None when it is absent."""
        def pick(props):
            if not isinstance(props, dict) or key not in props:
                return None
            value = props[key]
            return value if convert is None else convert(value)
        return df["properties"].apply(pick)

    def lowercase(name):
        # A key that is present but holds no string must not crash the export.
        return name.lower() if isinstance(name, str) else None

    observations = pd.DataFrame(
        {
            "id": "g_" + extract("OBJECTID", str),
            "species_guess": extract("Nom_espece_français", lowercase),
            "location": extract("Nom_region_administrative")
            .replace("Montréal", "Montreal")
            .replace("Outaouais", "Gatineau"),
            "observation_date": extract("Date_observation"),
            "isInvasive": True,
            "latitude": extract("Latitude"),
            "longitude": extract("Longitude"),
            "source": "Government",
            "image_url": extract("Lien_photo"),
        },
        index=df.index,
    )
    return observations[observations["location"].isin(['Montreal', 'Laval', 'Gatineau'])]

In [0]:
# Load the government records from JSON and reshape both sources into the
# common observation schema.
gov_df = pd.read_json("especes_exo_envahissantes.json")
gov_df = get_government_observations(gov_df)
community_df = get_community_observations_df(df_naturalist, df_sentinelle)

# Stack community and government observations into one frame with a fresh index.
union_df = pd.concat([community_df, gov_df], ignore_index=True)

In [0]:
# Show the reshaped government observations for a visual check.
display(gov_df)

In [0]:
from pyspark.sql.functions import explode_outer, col
# Load the `animaux_precaire` table from the Hive metastore; its `features`
# array column is unpacked by clean_animaux_precaires below.
# NOTE(review): `spark` is not defined in the visible cells — presumably the
# session object provided by the notebook runtime; confirm.
df = spark.table("hive_metastore.default.animaux_precaire")

def clean_animaux_precaires(df):
    """Flatten the table's features into a de-duplicated pandas species frame.

    Every element of the ``features`` array becomes its own row, the nested
    ``properties`` struct of that element is expanded into top-level columns,
    and the result is converted to pandas. Only the six species-describing
    columns are kept, with exact duplicate rows removed.

    Args:
        df: Spark DataFrame with a ``features`` array column whose elements
            contain a ``properties`` struct.

    Returns:
        pandas.DataFrame with the columns COSEWIC, GGROUPE, GROUPE, LOIEMV,
        SCOMNAME and SNAME, without duplicate rows.
    """
    wanted_columns = ["COSEWIC", "GGROUPE", "GROUPE", "LOIEMV", "SCOMNAME", "SNAME"]

    # explode_outer names its output column "col"; one row per feature.
    exploded_features = df.select(explode_outer(col("features")))
    flattened_properties = exploded_features.select("col.properties").select("properties.*")

    species_frame = flattened_properties.toPandas()
    return species_frame[wanted_columns].drop_duplicates()

# Flatten the Spark table into the pandas species frame used for the
# precarious-status lookup and for the JSON export at the end of the notebook.
animaux_precaires_df = clean_animaux_precaires(df)
# display(animaux_precaires_df)

In [0]:
def add_isPrecarious_column(observations_df, precarious_df):
    """Return the observations with a boolean ``isPrecarious`` column appended.

    An observation is flagged when its lowercased ``species_guess`` equals the
    lowercased ``SCOMNAME`` of at least one row of ``precarious_df``.
    Membership is tested against the set of reference names rather than with a
    merge, so a name that appears on several reference rows cannot duplicate
    observation rows, and missing values on either side never match each
    other.

    Args:
        observations_df: Observations frame with a ``species_guess`` column.
            It is not modified.
        precarious_df: Reference frame with a ``SCOMNAME`` column. As in the
            previous implementation, this column is lowercased in place
            (missing values are left missing).

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the same columns as
        ``observations_df`` (``species_guess`` converted to lowercase strings)
        followed by ``isPrecarious``.
    """
    # reset_index returns a new frame, so the caller's observations stay untouched.
    df = observations_df.reset_index(drop=True)
    # Remember which guesses are real values before astype(str) turns missing
    # entries into the literal strings "nan"/"none".
    has_guess = df['species_guess'].notna()
    df['species_guess'] = df['species_guess'].astype(str).str.lower()

    reference_names = precarious_df['SCOMNAME']
    # Lowercase the reference names in place (side effect kept from the
    # original), but keep missing names missing instead of the string "none".
    precarious_df["SCOMNAME"] = (
        reference_names.astype(str).str.lower().where(reference_names.notna())
    )

    precarious_names = set(precarious_df["SCOMNAME"].dropna())
    df["isPrecarious"] = has_guess & df['species_guess'].isin(precarious_names)
    return df

# Flag every combined observation whose species name appears in the precarious
# species list, then show the result.
df_withPrecarious = add_isPrecarious_column(union_df, animaux_precaires_df)
display(df_withPrecarious)

In [0]:
# Export the final observations as a JSON array of records; force_ascii=False
# writes accented characters as-is instead of \u escapes.
df_withPrecarious.to_json("observations.json", orient="records", force_ascii=False)

# Creating JSON Files with Species Information 

In [0]:
# Export the de-duplicated species list as a JSON array of records.
# Note: if the invasive-flag cell has already run, 'Nom_francais' was
# lowercased in place by add_isInvasive_column and is exported in that form.
df_sentinelle.drop_duplicates().to_json("sentinelle_liste_sp.json", orient="records", force_ascii=False)

In [0]:
# Export the de-duplicated precarious species list as a JSON array of records.
# Note: if the precarious-flag cell has already run, 'SCOMNAME' was lowercased
# in place by add_isPrecarious_column and is exported in that form.
animaux_precaires_df.drop_duplicates().to_json("animaux_precaires.json", orient="records", force_ascii=False)