# Create a file containing metadata about stations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

## Extract biol stations from taxons file

In [None]:
# Locate the project root: two directory levels above the notebook's working directory
current_dir = os.getcwd()
parent_dir = os.path.abspath(
    os.path.join(current_dir, os.pardir, os.pardir)
)

# Read the semicolon-separated taxa table from <root>/biol_data/taxons/
tax_file = os.path.join(
    parent_dir,
    "biol_data",
    "taxons",
    "all_taxons1980_2019.csv",
)
tax_df = pd.read_csv(tax_file, sep=";")

In [None]:
# Normalise station codes: cast to string and trim surrounding whitespace
tax_df["code_station_hydrobio"] = (
    tax_df["code_station_hydrobio"].astype(str).str.strip()
)

# One row per station code (the first occurrence is kept), reduced to code + coordinates
sta_df = (
    tax_df
    .drop_duplicates(subset="code_station_hydrobio")
    .reset_index(drop=True)
    .loc[:, ["code_station_hydrobio", "coordonnee_x", "coordonnee_y"]]
)

# Number of unique stations
print(len(sta_df))

In [None]:
# Write the unique stations to a semicolon-separated CSV (row index omitted)
sta_df.to_csv(
    "2join_sta.csv",
    index=False,
    sep=";",
)

The resulting file (`2join_sta.csv`) must be joined in QGIS; the joined output (`joined_sta.csv`) is then used for the rest of this notebook.

## joined file

In [None]:
# Locate the project root: two directory levels above the notebook's working directory
current_dir = os.getcwd()
parent_dir = os.path.abspath(
    os.path.join(current_dir, os.pardir, os.pardir)
)

# Read the semicolon-separated joined stations table from <root>/biol_data/archives/
joined_file = os.path.join(
    parent_dir,
    "biol_data",
    "archives",
    "joined_sta.csv",
)
df = pd.read_csv(joined_file, sep=";")

In [None]:
# Show the column labels available in the joined file
df.columns

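For the phychem stations, select the same columns but with `code_station` as the station identifier:

```python
df = df[["code_station", "coordonnee_x", "coordonnee_y", "joined_ORD_STRA", "joined_HYBAS_L12", "joined_DIST_DN_KM"]]
```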


In [None]:
# Keep the columns of interest and give them their final names.

df = df[["code_station_hydrobio", "coordonnee_x", "coordonnee_y", "joined_ORD_STRA", "joined_HYBAS_L12", "joined_DIST_DN_KM"]]
# Missing values become -1; for the joined columns this marks a station outside the 500 m range.
# NOTE: the fill applies to every selected column, so a missing code or coordinate also becomes -1.
df = df.fillna(-1)
df = df.rename(columns={'code_station_hydrobio': 'station_ID', 'joined_ORD_STRA': 'ORD_STRA', 'joined_HYBAS_L12': 'HYBAS_ID', 'joined_DIST_DN_KM': 'DIST_DN_KM'})
# read_csv may have parsed all-numeric codes as integers, and fillna may have inserted the int -1:
# cast to str first so the string operations work on every row, then trim whitespace and
# drop leading zeros.
df['station_ID'] = df['station_ID'].astype(str).str.strip().str.lstrip('0')

In [None]:
# Build the README sheet: one row per documented column, plus two summary counts.
header_txt = "StationIDs and their corresponding geographical information: coordonates, Strahler Order, Hydroshed L12, distance from outlet."

# Count stations flagged with -1 in ORD_STRA using a boolean mask.
# Unlike value_counts().loc[-1.0], this yields 0 (instead of raising KeyError)
# when no station lies outside the perimeter.
n_outside_perimeter = int((df['ORD_STRA'] == -1).sum())

readme_content = {
    'Header': header_txt,
    'station_ID': "Station code using french referentiel",
    'coordonnee_x': "EPSG:2154 ; RGF93 / Lambert-93",
    'coordonnee_y': "EPSG:2154 ; RGF93 / Lambert-93",
    'ORD_STRA': "Strahler Order. Stations were joined by nearest river using P=500m. If value=-1 then station is outside perimeter.",
    'HYBAS_ID': "Hydroshed Level 12. If value=-1 then station is outside perimeter.",
    'DIST_DN_KM': "Distance from station to basin outlet (km). If value=-1 then station is outside perimeter.",
    "Nbr Tot. Stations":f"{df.shape[0]}",
    "Nbr Stations outside P=500m":f"{n_outside_perimeter}"
}
# Two-column table: the dict key goes in 'Column', its description/value in 'Explanation'
readme_df = pd.DataFrame(list(readme_content.items()), columns=['Column', 'Explanation'])

In [None]:
# Write a two-sheet workbook: the README table first, then the station geodata

sheets = {'README': readme_df, 'geodata': df}
with pd.ExcelWriter('metadata_biolstat.xlsx') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)