# EPT triangle analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import mpltern
import os

The input file is produced by the last step of the station-filtering workflow.

In [None]:
# Locate the directory two levels above the notebook's working directory
current_dir = os.getcwd()
two_levels_up = os.path.join(current_dir, os.pardir, os.pardir)
parent_dir = os.path.abspath(two_levels_up)

# Load the EPT taxon table (semicolon-separated CSV)
EPT_file = os.path.join(parent_dir, "biol_data", "taxons", "taxons_freqEPT.csv")
EPT_df = pd.read_csv(EPT_file, sep=";")

In [None]:
# Different codes can be recorded for the same EPT type, so keep a single row
# per (station, sampling date, EPT type).
dedup_keys = ["code_station_hydrobio", "date_prelevement", "type_EPT"]
EPT_df = EPT_df.drop_duplicates(subset=dedup_keys)

# Parse the sampling dates so that the .dt accessor can be used on them
EPT_df = EPT_df.assign(date_prelevement=pd.to_datetime(EPT_df["date_prelevement"]))

In [None]:
def EPT_meas(start_date, end_date, df):
    """Count E, P and T records per station for a range of years.

    Rows of *df* whose ``date_prelevement`` year lies between *start_date*
    and *end_date* (both inclusive) are grouped by
    ``code_station_hydrobio`` and ``ORD_STRA``. A short summary and the
    resulting table are printed.

    Parameters
    ----------
    start_date, end_date : int
        First and last year of the period (compared with ``.dt.year``).
    df : pandas.DataFrame
        Needs the columns ``date_prelevement`` (datetime),
        ``code_station_hydrobio``, ``ORD_STRA`` and ``type_EPT``.

    Returns
    -------
    pandas.DataFrame
        One row per (station, ORD_STRA) pair with the columns
        ``code_station_hydrobio``, ``ORD_STRA``, ``type_EPT`` (list of the
        group's values) and the counts ``E``, ``P`` and ``T``.
    """
    # Keep only the samples taken within the requested years
    sample_years = df['date_prelevement'].dt.year
    in_period = sample_years.between(start_date, end_date)
    period_df = df.loc[in_period]

    # Gather every EPT type recorded for a station into a single list
    station_keys = ['code_station_hydrobio', 'ORD_STRA']
    per_station = period_df.groupby(station_keys).agg({'type_EPT': list}).reset_index()

    # Number of occurrences of each letter in the station's list
    for taxon in ('E', 'P', 'T'):
        per_station[taxon] = per_station['type_EPT'].apply(
            lambda recorded, taxon=taxon: recorded.count(taxon)
        )

    n_stations = per_station.shape[0]
    print(f"Number of measured taxons related to E, P and T\nfor all stations, {start_date}-{end_date}\n")
    print(
        f'Stations that measured E: {(per_station["E"] != 0).sum()} / {n_stations}\n'
        f'Stations that measured P: {(per_station["P"] != 0).sum()} / {n_stations}\n'
        f'Stations that measured T: {(per_station["T"] != 0).sum()} / {n_stations}\n'
    )

    print(per_station[["code_station_hydrobio", "ORD_STRA", "E", "P", "T"]])

    return per_station

In [None]:
def EPT_frac(df):
    """Add the percentage columns ``E_%``, ``P_%`` and ``T_%`` to *df*.

    Each percentage is the column's value divided by ``E + P + T`` of the
    same row, times 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the numeric columns ``E``, ``P`` and ``T``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the three columns are written
        into it in place.
    """
    # Row-wise total shared by the three ratios
    total = df['E'] + df['P'] + df['T']

    for taxon in ('E', 'P', 'T'):
        df[f'{taxon}_%'] = (df[taxon] / total) * 100

    return df

In [None]:
# First and last year of the analysed period (EPT_meas treats both as inclusive)
start_year, end_year = 2010, 2020

In [None]:
# Per-station E / P / T counts for the selected period
EPT_meas_df = EPT_meas(start_year, end_year, EPT_df)

In [None]:
# Add the E_%, P_% and T_% share columns to the per-station counts
EPT_meas_df = EPT_frac(df=EPT_meas_df)

# Plotting table: station, Strahler order and the three shares.
# The explicit copy makes df independent of EPT_meas_df, so that later column
# assignments on df do not raise SettingWithCopyWarning.
df = EPT_meas_df[["code_station_hydrobio", "ORD_STRA", "E_%", "P_%", "T_%"]].copy()

In [None]:
# Make sure the three share columns are floating point. astype converts whole
# columns at once and returns a new frame, so nothing is written into what may
# be a slice of EPT_meas_df.
df = df.astype({"E_%": float, "P_%": float, "T_%": float})

## Ternary plot

In [None]:
# Ternary coordinates passed to scatter in the order t, l, r: E, P and T shares
t, l, r = (df[share] for share in ('E_%', 'P_%', 'T_%'))

# One fixed color per ORD_STRA value
color_map = {
    1: 'blue',
    2: 'green',
    3: 'red',
    4: 'purple',
    5: 'orange',
    6: 'yellow',
}
colors = df['ORD_STRA'].map(color_map)

# Figure with a single ternary axes (projection provided by mpltern)
fig, ax = plt.subplots(figsize=(8, 6), subplot_kw={'projection': 'ternary'})

# One point per station, colored by its ORD_STRA
pc = ax.scatter(t, l, r, c=colors)

# Legend built by hand: one colored dot per entry of color_map
handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=dot_color, markersize=10)
    for dot_color in color_map.values()
]
labels = [str(order) for order in color_map]
ax.legend(handles, labels, title='ORD_STRA', loc='center left', bbox_to_anchor=(1, 0.5))

# Side labels, positioned manually in axes coordinates:
# (x, y, text, extra keyword arguments)
label_style = dict(ha='center', va='center', fontsize=12, fontweight='bold', transform=ax.transAxes)
side_labels = (
    (0.5, -0.085, "T [%]", {}),
    (0.12, 0.5, "P [%]", {'rotation': 60}),
    (0.87, 0.5, "E [%]", {'rotation': -60}),
)
for x_pos, y_pos, text, extra in side_labels:
    ax.text(x_pos, y_pos, text, **label_style, **extra)

ax.set_title("E, P, and T [%] for each MFS\n", fontsize=15)

plt.show()

# Plot per HS number (Strahler order)

In [None]:
# Total E, P and T counts per Strahler order (ORD_STRA). Only the three count
# columns are summed; the other columns (station codes, taxon lists, ...) are
# not meaningful to add up.
grouped_df = EPT_meas_df.groupby('ORD_STRA')[["E", "P", "T"]].sum()

# Keep the real ORD_STRA values as the index. Renumbering the rows 1..n would
# attach the counts to the wrong order as soon as one order has no station.
# Whole-number float labels (1.0, 2.0, ...) are shown as integers.
if pd.api.types.is_float_dtype(grouped_df.index) and (grouped_df.index % 1 == 0).all():
    grouped_df.index = grouped_df.index.astype(int)

grouped_df


In [None]:
# Grouped bar chart: one group per row of grouped_df, one bar per column (E, P, T)
bar_ax = grouped_df.plot.bar()
bar_ax.set_xlabel('Strahler Order')
bar_ax.set_ylabel('Measurement counts')
plt.xticks(rotation=360)  # keep the tick labels horizontal
bar_ax.set_title(f'Taxons distribution MFS for Strahler Order, {start_year}-{end_year}')
bar_ax.legend(title='Taxons')
plt.show()

In [None]:
# Same bar chart, but every Strahler order is rescaled by its own total, so the
# E, P and T values of one order add up to 1.

# Total (E + P + T) of each row, i.e. of each Strahler order
row_sums = grouped_df.sum(axis="columns")

# Divide every row by its own total
normalized_df = grouped_df.div(row_sums, axis="index")

# Grouped bar chart of the shares
norm_ax = normalized_df.plot.bar()
norm_ax.set_xlabel('Strahler Order')
norm_ax.set_ylabel('Normalized Measurement Counts')
plt.xticks(rotation=360)  # keep the tick labels horizontal
norm_ax.set_title(f'Normalized over Strahler Order - taxons distribution for Strahler Order, {start_year}-{end_year}')
norm_ax.legend(title='Taxons')
plt.show()

In [None]:
# Same bar chart, but every taxon group is rescaled by its own total over all
# Strahler orders, so the values of E (resp. P, T) add up to 1 across orders.

# Total of each column (sum of E, of P, of T). The variable keeps the name
# row_sums although it holds column totals.
row_sums = grouped_df.sum(axis="index")

# Divide every column by its own total
normalized_df = grouped_df.div(row_sums, axis="columns")

# Grouped bar chart of the shares
share_ax = normalized_df.plot.bar()
share_ax.set_xlabel('Strahler Order')
share_ax.set_ylabel('Normalized Measurement Counts')
plt.xticks(rotation=360)  # keep the tick labels horizontal
share_ax.set_title(f'Normalized over category taxons - taxons distribution for Strahler Order, {start_year}-{end_year}')
share_ax.legend(title='Taxons')
plt.show()