In [50]:
import os
# Work from the repository root so the relative input/output paths used by the
# cells below resolve. Set the EMMA_REPO_DIR environment variable to point at a
# different checkout; when it is unset the original location is used unchanged.
os.chdir(os.environ.get("EMMA_REPO_DIR", "/home/millieginty/OneDrive/git-repos/EMMA/"))

import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import os

def plot_bivariates(file_path, site, year, output_dir, solutes=None):
    """
    Save a grid of bivariate scatter plots of min-max normalized tracers.

    Reads the chemistry CSV, keeps the rows for one site whose ``Type`` is
    'Grab', 'Grab/Isco' or 'Baseflow', rescales every tracer column with
    ``MinMaxScaler``, and draws one scatter panel per pair of tracers
    (5 panels per row). The figure is written to
    ``<output_dir>/<site>_<year>_bivariate_plots.jpg``.

    Parameters
    ----------
    file_path : str
        Path to CSV file containing the data. Must provide 'Site' and 'Type'
        columns plus every tracer column that is requested.
    site : str
        Site name to filter for (e.g., 'Hungerford', 'Wade', 'Potash').
    year : str
        RI nomenclature name of data series (e.g., 'RI23'); only used in the
        output file name.
    output_dir : str
        Directory to save the output .jpg plot. Created if it does not exist.
    solutes : sequence of str, optional
        Tracer columns to plot. Defaults to the full tracer list below. Pass
        a shorter list to leave a tracer out for a given site (for example
        without 'Fl_mg_L') instead of editing this function.

    Raises
    ------
    ValueError
        If fewer than two tracers are requested, or if no rows remain after
        filtering by site and sample type.
    """

    # --- Define solutes ---
    if solutes is None:
        solutes = [
            'Fe_mg_L', 'Mn_mg_L', 'Cu_mg_L', 'Zn_mg_L', 'Si_mg_L', 'K_mg_L',
            'P_mg_L', 'Mg_mg_L', 'Na_mg_L', 'Al_mg_L', 'Ca_mg_L', 'Fl_mg_L',
            'Cl_mg_L', 'NO2_mg_L', 'NO3_mg_L', 'PO4_mg_L', 'SO4_mg_L',
            'dD', 'd18O', 'TOC mg_L'
        ]
    else:
        solutes = list(solutes)

    if len(solutes) < 2:
        raise ValueError("At least two solutes are required to draw bivariate plots.")

    # --- Load and filter data ---
    data = pd.read_csv(file_path)
    data = data.loc[
        (data['Site'] == site) &
        (data['Type'].isin(['Grab', 'Grab/Isco', 'Baseflow']))
    ]
    if data.empty:
        # Fail with a readable message rather than an opaque scaler error.
        raise ValueError(
            f"No 'Grab', 'Grab/Isco' or 'Baseflow' samples found for site {site!r} in {file_path}"
        )

    # --- Create clean column names (used as axis labels) ---
    rename_dict = {}
    for col in solutes:
        if col == 'Fl_mg_L':
            rename_dict[col] = 'F (mg/L)'
        elif col == 'TOC mg_L':
            rename_dict[col] = 'DOC (mg/L)'
        elif 'mg_L' in col:
            rename_dict[col] = col.replace('_mg_L', ' (mg/L)').replace('_', ' ')
        elif col == 'dD':
            rename_dict[col] = 'δD (‰)'
        elif col == 'd18O':
            rename_dict[col] = 'δ¹⁸O (‰)'

    # --- Subset and rename ---
    solute_df = data[solutes].rename(columns=rename_dict)

    # --- Normalize (Min-Max) ---
    scaler = MinMaxScaler()
    solute_norm = pd.DataFrame(
        scaler.fit_transform(solute_df),
        columns=solute_df.columns,
        index=solute_df.index
    )

    # --- Plot all bivariate combinations ---
    pairs = list(combinations(solute_norm.columns, 2))
    n_pairs = len(pairs)
    n_cols = 5
    # Ceiling division: exactly enough rows for the pairs, with no extra empty
    # row when n_pairs is a multiple of n_cols (e.g. 190 pairs for 20 tracers).
    n_rows = (n_pairs + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 3), squeeze=False)
    try:
        axes = axes.flatten()

        for ax, (x, y) in zip(axes, pairs):
            ax.scatter(solute_norm[x], solute_norm[y], alpha=0.6, edgecolor='k', s=40)
            ax.set_xlabel(x, fontsize=12)
            ax.set_ylabel(y, fontsize=12)
            ax.tick_params(axis='both', which='major', labelsize=10)

        # Hide the unused panels at the end of the last row
        for ax in axes[n_pairs:]:
            ax.axis('off')

        fig.suptitle(f"Bivariate plots of normalized tracers at {site}", fontsize=16)
        # Leave a small band at the top of the figure for the suptitle
        fig.tight_layout(rect=[0, 0, 1, 0.98])

        # --- Save output ---
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, f"{site}_{year}_bivariate_plots.jpg")
        fig.savefig(output_file, dpi=300)
    finally:
        # Always release the (large) figure, even if plotting or saving failed
        plt.close(fig)

    print(f"✅ Bivariate plots saved to: {output_file}")

def save_correlation_matrix(file_path, site, year, output_dir, solutes=None):
    """
    Save the Pearson correlation matrix of the tracers as a CSV and a heatmap.

    Reads the chemistry CSV, keeps the rows for one site whose ``Type`` is
    'Grab', 'Grab/Isco' or 'Baseflow', rescales every tracer column with
    ``MinMaxScaler``, and computes the pairwise Pearson correlations. Two
    files are written to ``output_dir``:

    * ``<site>_<year>_correlation_matrix.csv`` (values written with 3 decimals)
    * ``<site>_<year>_correlation_heatmap.jpg`` (annotated heatmap)

    Parameters
    ----------
    file_path : str
        Path to CSV file containing the data. Must provide 'Site' and 'Type'
        columns plus every tracer column that is requested.
    site : str
        Site name to filter for (e.g., 'Hungerford', 'Wade', 'Potash').
    year : str
        RI nomenclature name of data series (e.g., 'RI23'); only used in the
        output file names.
    output_dir : str
        Directory to save the CSV and the heatmap. Created if it does not
        exist.
    solutes : sequence of str, optional
        Tracer columns to correlate. Defaults to the full tracer list below.
        Pass a shorter list to leave a tracer out for a given site (for
        example without 'Fl_mg_L') instead of editing this function.

    Raises
    ------
    ValueError
        If no rows remain after filtering by site and sample type.
    """

    # --- Define solutes ---
    if solutes is None:
        solutes = [
            'Fe_mg_L', 'Mn_mg_L', 'Cu_mg_L', 'Zn_mg_L', 'Si_mg_L', 'K_mg_L',
            'P_mg_L', 'Mg_mg_L', 'Na_mg_L', 'Al_mg_L', 'Ca_mg_L', 'Fl_mg_L',
            'Cl_mg_L', 'NO2_mg_L', 'NO3_mg_L', 'PO4_mg_L', 'SO4_mg_L',
            'dD', 'd18O', 'TOC mg_L'
        ]
    else:
        solutes = list(solutes)

    # --- Load and filter data ---
    data = pd.read_csv(file_path)
    data = data.loc[
        (data['Site'] == site) &
        (data['Type'].isin(['Grab', 'Grab/Isco', 'Baseflow']))
    ]
    if data.empty:
        # Fail with a readable message rather than an opaque scaler error.
        raise ValueError(
            f"No 'Grab', 'Grab/Isco' or 'Baseflow' samples found for site {site!r} in {file_path}"
        )

    # --- Create clean column names (used as row/column labels) ---
    rename_dict = {}
    for col in solutes:
        if col == 'Fl_mg_L':
            rename_dict[col] = 'F (mg/L)'
        elif col == 'TOC mg_L':
            rename_dict[col] = 'DOC (mg/L)'
        elif 'mg_L' in col:
            rename_dict[col] = col.replace('_mg_L', ' (mg/L)').replace('_', ' ')
        elif col == 'dD':
            rename_dict[col] = 'δD (‰)'
        elif col == 'd18O':
            rename_dict[col] = 'δ¹⁸O (‰)'

    solute_df = data[solutes].rename(columns=rename_dict)

    # --- Normalize (Min-Max) ---
    # Pearson r is unchanged by a per-column min-max rescaling; the step is
    # kept so the numbers are produced exactly as before.
    scaler = MinMaxScaler()
    solute_norm = pd.DataFrame(
        scaler.fit_transform(solute_df),
        columns=solute_df.columns,
        index=solute_df.index
    )

    # --- Compute Pearson correlation matrix ---
    corr_matrix = solute_norm.corr(method='pearson')

    # --- Save CSV ---
    # Create the target directory here so the function also works when it is
    # called on its own with a directory that does not exist yet.
    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, f"{site}_{year}_correlation_matrix.csv")
    corr_matrix.to_csv(csv_path, float_format="%.3f")

    # --- Plot heatmap ---
    fig_path = os.path.join(output_dir, f"{site}_{year}_correlation_heatmap.jpg")

    # Apply the seaborn look only while this figure is built and saved, so the
    # styling does not leak into figures drawn later in the same kernel.
    with sns.axes_style("white"), sns.plotting_context("notebook", font_scale=1.2):
        fig, ax = plt.subplots(figsize=(15, 15))
        try:
            cmap = sns.diverging_palette(220, 20, as_cmap=True)

            sns.heatmap(
                corr_matrix,
                cmap=cmap,
                vmin=-1, vmax=1,
                center=0,
                annot=True, fmt=".2f",
                square=True,
                linewidths=0.5,
                cbar_kws={"shrink": 0.8, "label": "Pearson r"},
                ax=ax
            )

            ax.set_title(f"Pearson Correlation Matrix — {site}", fontsize=16, pad=20)
            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
            plt.setp(ax.get_yticklabels(), rotation=0)
            fig.tight_layout()

            # --- Save figure ---
            fig.savefig(fig_path, dpi=300)
        finally:
            # Always release the figure, even if plotting or saving failed
            plt.close(fig)

    print(f"✅ Correlation matrix saved to: {csv_path}")
    print(f"✅ Heatmap saved to: {fig_path}")

In [49]:
# Input table and destination folder for the Hungerford RI23 run
# (both paths are relative to the current working directory).
file_path = "isco_metadata/LCBP_RI_sample_index/RI23-IC-ICP-isotope-toc-joined.csv"
output_dir = "output/bivariate_plots"

# Scatter grid first, then the correlation CSV and heatmap.
plot_bivariates(file_path=file_path, site="Hungerford", year="RI23", output_dir=output_dir)
save_correlation_matrix(file_path=file_path, site="Hungerford", year="RI23", output_dir=output_dir)

✅ Bivariate plots saved to: output/bivariate_plots/Hungerford_RI23_bivariate_plots.jpg
✅ Correlation matrix saved to: output/bivariate_plots/Hungerford_RI23_correlation_matrix.csv
✅ Heatmap saved to: output/bivariate_plots/Hungerford_RI23_correlation_heatmap.jpg


In [51]:
# Input table and destination folder for the Wade RI23 run
# (both paths are relative to the current working directory).
file_path = "isco_metadata/LCBP_RI_sample_index/RI23-IC-ICP-isotope-toc-joined.csv"
output_dir = "output/bivariate_plots"

# Scatter grid first, then the correlation CSV and heatmap.
plot_bivariates(file_path=file_path, site="Wade", year="RI23", output_dir=output_dir)
save_correlation_matrix(file_path=file_path, site="Wade", year="RI23", output_dir=output_dir)

✅ Bivariate plots saved to: output/bivariate_plots/Wade_RI23_bivariate_plots.jpg
✅ Correlation matrix saved to: output/bivariate_plots/Wade_RI23_correlation_matrix.csv
✅ Heatmap saved to: output/bivariate_plots/Wade_RI23_correlation_heatmap.jpg
