# Create a flowchart to show the filters applied to stations

## Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# importing necessary functions from other files

import os
import sys

# to facilitate the use of notebooks
%load_ext autoreload
%autoreload 2

# Get the current directory of the notebook
current_dir = os.getcwd()

# importing necessary functions
# NOTE(review): this import runs before sys.path is extended below, so
# statistics_functions is presumably importable from the working directory
# itself — confirm.
from statistics_functions import process_df

# Folder "only_biol" located in the parent of the working directory
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
functions_dir = os.path.join(parent_dir, "only_biol")
print(functions_dir)

# Add this directory to sys.path (only once, so re-running the cell does not
# append duplicates); required for the EPT_sta import that follows
if functions_dir not in sys.path:
    sys.path.append(functions_dir)

from EPT_sta import only_taxons


## Define functions

In [None]:
def joined_filter(df):
    """Drop the rows whose "ORD_STRA" value is -1.

    Returns a new DataFrame with a fresh 0..n-1 index; the input frame is
    left untouched.
    """
    is_joined = df["ORD_STRA"] != -1
    return df.loc[is_joined].reset_index(drop=True)

In [None]:
def meas2ope(meas_df, spe_type, field_ID, field_date, coord_x, coord_y):
    """Collapse a measurement table into one row per (station, date) operation.

    Parameters
    ----------
    meas_df : DataFrame holding at least the four named columns.
    spe_type : accepted but not used by this function.
    field_ID, field_date : names of the station-ID and date columns; a pair
        of these identifies one operation.
    coord_x, coord_y : names of the coordinate columns to keep.

    Returns
    -------
    DataFrame with exactly the columns [field_ID, field_date, coord_x,
    coord_y], station IDs as strings without leading zeros, rows ordered by
    the string ID as it was *before* the zeros were removed, and a fresh
    0..n-1 index.
    """
    # creating ope instead of measurements: first row of each (ID, date) pair,
    # restricted to the interesting columns
    ope_df = (
        meas_df.drop_duplicates(subset=[field_ID, field_date])
        .loc[:, [field_ID, field_date, coord_x, coord_y]]
        .copy()
    )

    # IDs are compared as text, so sort on the string form
    ope_df[field_ID] = ope_df[field_ID].astype(str)
    ope_df = ope_df.sort_values(by=field_ID)

    # remove leading zeros (IDs without any are left unchanged)
    ope_df[field_ID] = ope_df[field_ID].str.lstrip('0')

    return ope_df.reset_index(drop=True)


In [None]:
def most_freq_filter(df, field_ID, field_date, coord_x, coord_y):
    """Count and select the stations that are sampled often enough.

    The per-station table is built by ``process_df``; this function reads
    its 'MonthIntStd' and 'MonthIntMean' columns.
    NOTE(review): presumably 'MonthIntStd' is NaN when a station has too few
    operations to compute a spread of intervals — confirm in process_df.

    Returns
    -------
    meas_3 : number of distinct stations with a non-null 'MonthIntStd'.
    interv_24 : number of distinct stations that additionally have
        'MonthIntMean' <= 24.
    most_freq_df : the rows behind ``interv_24``, re-indexed from 0.
    """
    grouped_df = process_df(df, field_id_station=field_ID, field_date=field_date, coord_x=coord_x, coord_y=coord_y)

    has_std = grouped_df['MonthIntStd'].notna()
    within_24 = grouped_df['MonthIntMean'] <= 24

    meas_3 = len(grouped_df.loc[has_std, field_ID].unique())

    most_freq_df = grouped_df.loc[within_24 & has_std].reset_index(drop=True)
    interv_24 = len(most_freq_df[field_ID].unique())

    return meas_3, interv_24, most_freq_df

In [None]:
def add_coord(left_df, right_df, left_id, right_id):
    """Add the 'coordonnee_x' / 'coordonnee_y' columns of ``right_df`` to ``left_df``.

    Rows are matched on ``left_df[left_id] == right_df[right_id]`` after both
    key columns are converted to strings. Every row of ``left_df`` is kept
    (left join); rows without a match get NaN coordinates.

    Side effect: the key columns of BOTH input frames are converted to str
    in place, so the caller's frames are modified.

    Parameters
    ----------
    left_df : DataFrame to enrich.
    right_df : DataFrame providing ``right_id``, 'coordonnee_x' and
        'coordonnee_y'.
    left_id, right_id : names of the key columns; they may be identical.

    Returns
    -------
    A new DataFrame: the columns of ``left_df`` followed by the two
    coordinate columns. The key column ``left_id`` is always preserved.
    """
    # converting both to str so its the same type
    left_df[left_id] = left_df[left_id].astype(str)
    right_df[right_id] = right_df[right_id].astype(str)

    # Only the key and the coordinates of stations present on the left
    # take part in the merge.
    coords = right_df.loc[
        right_df[right_id].isin(left_df[left_id]),
        [right_id, 'coordonnee_x', 'coordonnee_y'],
    ]

    # Give the right-hand key the left-hand name before merging. Merging on
    # two differently named keys and dropping `right_id` afterwards removed
    # the join key itself whenever left_id == right_id.
    coords = coords.rename(columns={right_id: left_id})

    return pd.merge(left_df, coords, on=left_id, how='left')

In [None]:
def isin(df1, field1, df2, field2):
    """Keep the rows of ``df1`` whose ``field1`` value occurs in ``df2[field2]``.

    Values are compared as strings. Side effect: ``df1[field1]`` and
    ``df2[field2]`` are converted to str in place, so the caller's frames
    are modified. The returned rows keep their original index labels.
    """
    # converting both to str so its the same type
    for frame, column in ((df1, field1), (df2, field2)):
        frame[column] = frame[column].astype(str)

    is_member = df1[field1].isin(df2[field2])
    return df1[is_member]

In [None]:
def apply_filters(sta_df, meas_df, spe_type, field_ID, field_date, coord_x, coord_y):
    """Run the station filter chain and record the station count after each step.

    Chain:
      1. initial stations of ``sta_df``;
      2. stations kept by ``joined_filter``;
      3. measurements restricted to those stations, then either narrowed to
         the parameter columns (``spe_type == "Physico-chemical"``) or passed
         through ``only_taxons`` (any other ``spe_type``);
      4. stations counted by ``most_freq_filter`` ('>= 3 meas.' and
         '<= 24 months').

    Parameters
    ----------
    sta_df : station table with a "station_ID" column.
    meas_df : measurement table.
    spe_type : "Physico-chemical" selects the parameter branch; every other
        value selects the taxon branch.
    field_ID, field_date, coord_x, coord_y : column names of ``meas_df``.

    Returns
    -------
    steps : list of {'nb_sta': str, 'step_name': str}, one per step.
    final_meas : the measurements (after step 3) of the stations that pass
        the last step.
    """

    def _nb_sta(df, column):
        # counts are stored as strings in the step dictionaries
        return str(df[column].unique().shape[0])

    steps = [
        {'nb_sta': _nb_sta(sta_df, "station_ID"), 'step_name': 'Initial stations'}
    ]

    # joined filter
    sta_df = joined_filter(df=sta_df)
    steps.append({'nb_sta': _nb_sta(sta_df, "station_ID"), 'step_name': 'Joined\np = 500m'})

    # apply the previous filter to the measurements as well
    meas_df = isin(meas_df, field_ID, sta_df, "station_ID")

    if spe_type == "Physico-chemical":
        # the physico-chemical measurements are already restricted to the
        # parameters of interest: keep only the useful columns
        keep_cols = ['code_station', 'date_prelevement', 'code_parametre', 'libelle_parametre']
        # The coordinate columns are selected by meas2ope right below and
        # 'resultat' is selected from the returned frame by the caller;
        # dropping them here made those selections fail with a KeyError.
        for extra in (coord_x, coord_y, 'resultat'):
            if extra in meas_df.columns and extra not in keep_cols:
                keep_cols.append(extra)
        meas_df = meas_df[keep_cols]
        steps.append({'nb_sta': _nb_sta(meas_df, field_ID), 'step_name': '8 param. meas.'})

    else:
        # for biol: EPT filter to get only the taxons of interest
        meas_df = only_taxons(tax_df=meas_df, meta_df=sta_df)
        steps.append({'nb_sta': _nb_sta(meas_df, field_ID), 'step_name': 'EPT meas.'})

    # most frequent filter, split in two parts
    # create ope df
    ope_df = meas2ope(meas_df, spe_type, field_ID, field_date, coord_x, coord_y)

    # most_freq_df is grouped
    meas_3, interv_24, most_freq_df = most_freq_filter(df=ope_df, field_ID=field_ID, field_date=field_date, coord_x=coord_x, coord_y=coord_y)
    steps.append({'nb_sta': str(meas_3), 'step_name': '>= 3 meas.'})
    steps.append({'nb_sta': str(interv_24), 'step_name': '<= 24 months'})

    # final df of the filter chain: measurements of the most frequent stations
    final_meas = isin(meas_df, field_ID, most_freq_df, field_ID)

    return steps, final_meas


In [None]:
def draw_scheme(spe_type, steps):
    """Draw the filter chain as a horizontal flowchart and show it.

    Parameters
    ----------
    spe_type : label inserted in the figure title.
    steps : sequence of dicts with keys 'step_name' (caption written on the
        top row) and 'nb_sta' (text written in a box underneath).

    Each step occupies a slot of ``step_width`` x-units; an arrow is drawn
    after every caption except the last one.
    """
    step_width = 4
    font = "Georgia"

    fig, ax = plt.subplots(figsize=(14, 1.3))  # wide, flat figure
    ax.set_xlim(-step_width / 2, len(steps) * step_width + step_width / 2)
    ax.set_ylim(0, 4)

    plt.title(f"\n{spe_type} stations filtered\n\n", fontsize=16, fontname=font)

    last = len(steps) - 1
    for i, step in enumerate(steps):
        slot_start = i * step_width
        text_x = slot_start + 1.5

        # caption of the step
        ax.annotate(step['step_name'], (text_x, 3.5), textcoords="offset points",
                    xytext=(0, 1), ha='center', fontsize=12, fontname=font)
        # boxed number of stations
        ax.text(text_x, 1.4, step['nb_sta'], ha='center', fontsize=12, fontname=font,
                bbox=dict(facecolor='lavender', edgecolor='lavender'))
        # arrow towards the next caption (none after the last step)
        if i != last:
            ax.arrow(slot_start + 3, 3.8, 0.75, 0, head_width=0.1, head_length=0.1,
                     fc='royalblue', ec='royalblue')

    ax.axis('off')
    plt.show()


## Import needed files

In [None]:
# Data folders are resolved from the directory two levels above the
# notebook's working directory
current_dir = os.getcwd()
two_levels_up = os.path.join(current_dir, os.pardir, os.pardir)
parent_dir = os.path.abspath(two_levels_up)

In [None]:
# Import biological data: station metadata (second sheet of the workbook)
# and the ';'-separated taxon measurements
biol_dir = os.path.join(parent_dir, "biol_data")

biol_file = os.path.join(biol_dir, "stations", "metadata_biolstat.xlsx")
biol_df = pd.read_excel(biol_file, sheet_name=1)

biol_meas_file = os.path.join(biol_dir, "taxons", "all_taxons1980_2019.csv")
biol_meas_df = pd.read_csv(biol_meas_file, sep=";")

In [None]:
# Import phychem data: station metadata (second sheet of the workbook)
# and the ';'-separated parameter measurements
phychem_dir = os.path.join(parent_dir, "phychem_data")

phychem_file = os.path.join(phychem_dir, "stations", "metadata_phychemstat.xlsx")
phychem_df = pd.read_excel(phychem_file, sheet_name=1)

phychem_meas_file = os.path.join(phychem_dir, "phychem_param_80_19", "all_concatenated1980_2019.csv")
phychem_meas_df = pd.read_csv(phychem_meas_file, sep=";")

## Display the chart

In [None]:
spe_types = ["Biological", "Physico-chemical"]
finals_df = []

for spe_type in spe_types:

    # Column names handed to apply_filters; only the station-ID column
    # differs between the two datasets and is filled in below.
    field_names = {
        "field_ID": "",
        "field_date": "date_prelevement",
        "coord_x": "coordonnee_x",
        "coord_y": "coordonnee_y",
    }

    if spe_type != "Biological":
        sta_df = phychem_df
        field_names["field_ID"] = "code_station"
        # coordinates are copied from the station table onto the measurements
        meas_df = add_coord(left_df=phychem_meas_df, right_df=sta_df, left_id="code_station", right_id="station_ID")
    else:
        sta_df = biol_df
        meas_df = biol_meas_df
        field_names["field_ID"] = "code_station_hydrobio"

    # final_df is most frequent of interesting species in both cases
    steps, final_df = apply_filters(sta_df=sta_df, meas_df=meas_df, spe_type=spe_type, **field_names)

    # store final df to compare for bio and phychem, then draw the flowchart
    finals_df.append(final_df)
    draw_scheme(spe_type, steps=steps)

# results are stored in the order of spe_types
biol = finals_df[0]
phychem_cols = ['code_station', 'date_prelevement', 'code_parametre', 'libelle_parametre', 'resultat']
phychem = finals_df[1][phychem_cols]

# common stations:
# Biological measurements whose station code also occurs among the
# physico-chemical station codes, then the number of distinct such stations
biol2phychem = isin(df1=biol, field1="code_station_hydrobio", df2=phychem, field2="code_station")
len(biol2phychem["code_station_hydrobio"].unique())

## Keep end of chain

In [None]:
# keep the list of stations considered most frequent
# (finals_df holds the biological result first, the physico-chemical one second)
MF_biol = finals_df[0]
MF_biol_sta = MF_biol["code_station_hydrobio"].unique().tolist()

phychem_columns = ['code_station', 'date_prelevement', 'code_parametre', 'libelle_parametre', 'resultat']
MF_phychem = finals_df[1][phychem_columns]
MF_phychem_sta = MF_phychem["code_station"].unique().tolist()

In [None]:
# export in text file
file_name = "mostfreq_sta.txt"

# Saving the list to the file, one station code per line
with open(file_name, 'w') as file:
    file.writelines(f"{item}\n" for item in MF_biol_sta)

In [None]:
# export in csv file (';'-separated, without the index column)
csv_name = "taxons_EPT.csv"
MF_biol.to_csv(csv_name, sep=";", index=False)