In [None]:
"""
6-METRIC Precipitation BOXPLOT DASHBOARD  |  pandas ≥ 2.x
-------------------------------------------
• Grid 2 × 3
• Metric title printed **inside** each panel (top-centre)
• Colour-blind-friendly palette (Okabe–Ito colours, plus mid-grey for Obs)
• Large legend centred below panels
• Output → AllDatasets_Boxplots_6metrics-paper.png and .tif (written into metrics_root)
"""

# ── IMPORTS ──────────────────────────────────────────────────────────────
import os, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, numpy as np
from matplotlib.lines import Line2D

# ── PATHS ────────────────────────────────────────────────────────────────
# Root folder holding one sub-folder per dataset, each with <metric>.xlsx files.
metrics_root = (r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles"
                r"\New folder\Ensemble files\Metric for all datasets-Prcp")

# Left-to-right box order (after "Obs") and the metric workbooks to read.
dataset_order = ["CHIRPS", "EMDNA", "ERA5", "MERRA2", "PRISM", "RDRS"]
metrics = ["r95p", "r99p", "rx1day", "rx5day", "wetdays", "cwd"]

# Panel captions, listed in panel order a) … f).
titles = {
    "rx1day":  "a) RX1DAY",
    "rx5day":  "b) RX5DAY",
    "r95p":    "c) R95p",
    "r99p":    "d) R99p",
    "cwd":     "e) CWD",
    "wetdays": "f) WetDays",
}

png_out = os.path.join(metrics_root, "AllDatasets_Boxplots_6metrics-paper.png")
tif_out = os.path.join(metrics_root, "AllDatasets_Boxplots_6metrics-paper.tif")

# ── VERIFY DATASET FOLDERS ───────────────────────────────────────────────
# Fail early, naming every dataset whose folder is absent.
dataset_dirs = {name: os.path.join(metrics_root, name) for name in dataset_order}
missing = [name for name in dataset_order
           if not os.path.isdir(dataset_dirs[name])]
if missing:
    raise FileNotFoundError(f"Missing dataset folders: {missing}")

# ── COLLECT VALUES ───────────────────────────────────────────────────────
# metric_data[metric][label] -> flat list of values ("Obs" plus each dataset).
metric_data = {
    metric: {label: [] for label in ("Obs", *dataset_order)}
    for metric in metrics
}

def first_col(df, prefix):
    """Return the first column label of *df* whose name starts with *prefix*.

    Each label is lower-cased before the comparison, so *prefix* must be
    given in lower case. Labels are passed through ``str`` first, which
    means non-string labels (e.g. integers) are simply compared as text
    instead of raising ``AttributeError``.

    Returns the original label, or ``None`` when no column matches.
    """
    for label in df.columns:
        if str(label).lower().startswith(prefix):
            return label
    return None

# Read every <dataset>/<metric>.xlsx workbook and pool its values per metric.
# NOTE(review): "Obs" is extended once per dataset workbook, so the observed
# sample is the concatenation of the obs column of every dataset's file —
# confirm this pooling is intended (the seasonal cells pick a single file).
for ds, path in dataset_dirs.items():
    for m in metrics:
        f = os.path.join(path, f"{m}.xlsx")
        if not os.path.isfile(f):
            continue  # this dataset has no workbook for the metric
        df = pd.read_excel(f)
        obs = first_col(df, "obs_")
        # Model column = first numeric column that is not an obs_/ratio_/diff_
        # column. str() keeps non-string headers from raising AttributeError.
        # NOTE(review): any other leading numeric column (coordinates, ids…)
        # would be picked up here — verify against the workbook layout.
        mod = next((c for c in df.columns
                    if not str(c).lower().startswith(("obs_", "ratio_", "diff_"))
                    and pd.api.types.is_numeric_dtype(df[c])), None)
        # Values are kept only when both columns were found; compare against
        # None so a falsy label (0, "") is not mistaken for "not found".
        if obs is not None and mod is not None:
            metric_data[m]["Obs"].extend(df[obs].dropna())
            metric_data[m][ds].extend(df[mod].dropna())

# ── COLOUR PALETTE  (colour-blind friendly) ──────────────────────────────
# One fixed colour per box: the Okabe–Ito colours, with a neutral mid-grey
# reserved for the observations.
palette = dict(
    Obs="#9E9E9E",     # mid-grey
    CHIRPS="#E69F00",  # orange
    EMDNA="#56B4E9",   # sky-blue
    ERA5="#009E73",    # bluish-green
    MERRA2="#F0E442",  # yellow
    PRISM="#0072B2",   # blue
    RDRS="#D55E00",    # vermillion
)

# ── FIGURE GRID 2 × 3 ────────────────────────────────────────────────────
fig, axes = plt.subplots(2, 3, figsize=(27.4, 23.4))
axes_flat = axes.flatten()
# Panel order, row by row (matches the a) … f) prefixes in `titles`).
grid_map  = ["rx1day", "rx5day", "r95p",
             "r99p", "cwd", "wetdays"]

# Fixed y-axis per metric: (view limits, tick positions).
# Tick ranges stop before the upper limit, so the top edge carries no label.
# NOTE(review): the r95p ticks start at 2000, below the 2500 lower limit;
# matplotlib widens the view to include explicit ticks, so that panel
# presumably starts at 2000 — confirm which value is intended.
y_windows = {
    "r95p":    ((2500, 8000), np.arange(2000, 8000, 1000)),
    "r99p":    ((500, 3000),  np.arange(500, 3000, 500)),
    "rx1day":  ((30, 210),    np.arange(30, 210, 40)),
    "rx5day":  ((50, 310),    np.arange(50, 310, 60)),
    "wetdays": ((500, 2000),  np.arange(500, 2000, 400)),
    "cwd":     ((3, 24),      np.arange(3, 24, 4)),
}

for ax, m in zip(axes_flat, grid_map):
    # One column per label; columns are NaN-padded to a common length.
    columns = {label: pd.Series(vals) for label, vals in metric_data[m].items()}
    table = pd.DataFrame(columns)
    if table.dropna(how="all").empty:
        ax.set_visible(False)      # nothing to draw for this metric
        continue

    # Long format: one row per value, tagged with its dataset label.
    tidy = table.melt(var_name="Dataset", value_name=" ").dropna()
    tidy = tidy.astype({"Dataset": "category"})
    # Pin the category order so boxes always appear Obs-first.
    tidy["Dataset"] = tidy["Dataset"].cat.set_categories(
        ["Obs"] + dataset_order, ordered=True)

    sns.boxplot(data=tidy, x="Dataset", y=" ", hue="Dataset",
                ax=ax, palette=palette, width=0.45, linewidth=1.1,
                showfliers=False, legend=False,
                boxprops={"edgecolor": "k"},
                medianprops={"color": "k", "linewidth": 1.3})

    # ── metric label INSIDE panel (top-centre) ───────────────────────────
    ax.annotate(titles[m], xy=(0.5, 0.97), xycoords="axes fraction",
                ha="center", va="top",
                fontsize=30, fontweight="bold")

    # ── fixed y-axis window for this metric ──────────────────────────────
    if m in y_windows:
        limits, ticks = y_windows[m]
        ax.set_ylim(*limits)
        ax.set_yticks(ticks)

    # ── axes cosmetics: no x labels/ticks (the legend identifies boxes) ──
    ax.set_xlabel("")
    ax.set_xticklabels([])
    ax.set_xticks([])
    ax.tick_params(axis="y", labelsize=24)           # larger y-labels
    ax.yaxis.grid(True, ls="--", alpha=.35)
    ax.set_axisbelow(True)                           # grid behind the boxes

# ── LEGEND  (centred, below panels) ───────────────────────────────────────
# Square proxy markers, one per box colour, in the same order as the boxes.
handles = [Line2D([], [], marker='s', markersize=20, linestyle='',
                  markerfacecolor=palette[l], markeredgecolor='k', label=l)
           for l in ["Obs"] + dataset_order]

# A single legend row: the column count follows the number of entries.
fig.legend(handles=handles, loc="lower center", ncol=len(handles),
           frameon=False, bbox_to_anchor=(0.5, 0.03),
           prop={"size": 30, "weight": "bold"})

fig.tight_layout(rect=[0, 0.12, 1, 1])   # keep the bottom 12 % free for the legend

# The PNG used to be written at 600 dpi and then immediately overwritten at
# 96 dpi; only the second write survives on disk, so the first (very large)
# render is dropped. Raise dpi here if a print-quality PNG is wanted.
fig.savefig(png_out, dpi=96, bbox_inches="tight", pad_inches=0.50)
fig.savefig(tif_out, dpi=600, bbox_inches="tight", pad_inches=0.50)
plt.close(fig)
print("Saved →", png_out)
print("Saved →", tif_out)


In [None]:
"""
6‑METRIC Temperature BOXPLOT DASHBOARD  |  pandas ≥ 2.x
-------------------------------------------
• Grid 2 × 3
• Metric title printed inside each panel (top‑centre)
• Okabe–Ito colour‑blind‑friendly palette (plus grey for Obs)
• Legend centred below panels
• Output → AllDatasets_Boxplots_6metrics-paper.png and .tif (written into metrics_root)
"""

# ── IMPORTS ──────────────────────────────────────────────────────────────
import os, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, numpy as np
from matplotlib.lines import Line2D

# ── PATHS ────────────────────────────────────────────────────────────────
# Root folder holding one sub-folder per dataset, each with <metric>.xlsx files.
metrics_root = (r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles"
                r"\New folder\Ensemble files\Metric for all datasets-Temp")

# Left-to-right box order (after "Obs") and the metric workbooks to read.
dataset_order = ["EMDNA", "ERA5", "MERRA2", "PRISM", "RDRS"]
metrics = ["TN10p", "TX90p", "TXx", "TNn", "CSDI", "WSDI"]

# Panel captions, listed in panel order a) … f).
titles = {
    "TXx":   "a) TXx",
    "TNn":   "b) TNn",
    "TN10p": "c) TN10p",
    "TX90p": "d) TX90p",
    "CSDI":  "e) CSDI",
    "WSDI":  "f) WSDI",
}

png_out = os.path.join(metrics_root, "AllDatasets_Boxplots_6metrics-paper.png")
tif_out = os.path.join(metrics_root, "AllDatasets_Boxplots_6metrics-paper.tif")

# ── VERIFY DATASET FOLDERS ───────────────────────────────────────────────
# Fail early, naming every dataset whose folder is absent.
dataset_dirs = {name: os.path.join(metrics_root, name) for name in dataset_order}
missing = [name for name in dataset_order
           if not os.path.isdir(dataset_dirs[name])]
if missing:
    raise FileNotFoundError(f"Missing dataset folders: {missing}")

# ── HELPERS ──────────────────────────────────────────────────────────────
def find_obs_col(df):
    """Return the first column label of *df* that marks an observation column.

    A column qualifies when its lower-cased name starts with ``"obs_"`` or
    ends with ``"_obs"``. Labels are passed through ``str`` first, so a
    non-string label is compared as text instead of raising
    ``AttributeError``.

    Returns the original label, or ``None`` when no column qualifies.
    """
    for c in df.columns:
        name = str(c).lower()
        if name.startswith("obs_") or name.endswith("_obs"):
            return c
    return None

def find_mod_col(df, obs_col):
    """Return the first numeric column of *df* that looks like model output.

    A column is accepted when it is not *obs_col*, has a numeric dtype, and
    its lower-cased name contains none of the substrings ``ratio``,
    ``diff``, ``lat``, ``lon`` or ``elev``. These are plain substring tests,
    so any name that merely contains one of them is rejected too.

    Labels are passed through ``str`` before the name test, so non-string
    labels no longer raise ``AttributeError``.

    Returns the original label, or ``None`` when no column is accepted.
    """
    excluded = ("ratio", "diff", "lat", "lon", "elev")
    for c in df.columns:
        if c == obs_col:
            continue
        if not pd.api.types.is_numeric_dtype(df[c]):
            continue
        name = str(c).lower()
        if any(token in name for token in excluded):
            continue
        return c
    return None

# ── COLLECT VALUES ───────────────────────────────────────────────────────
# metric_data[metric][label] -> flat list of values ("Obs" plus each dataset).
metric_data = {
    metric: {label: [] for label in ("Obs", *dataset_order)}
    for metric in metrics
}

# NOTE(review): "Obs" is extended once per dataset workbook, so the observed
# sample is pooled over every dataset's file — confirm this is intended.
for ds, ds_dir in dataset_dirs.items():
    for m in metrics:
        workbook = os.path.join(ds_dir, m + ".xlsx")
        if os.path.isfile(workbook):          # silently skip absent workbooks
            frame = pd.read_excel(workbook)
            obs_col = find_obs_col(frame)
            mod_col = find_mod_col(frame, obs_col)
            # Values are kept only when both columns were found.
            if obs_col and mod_col:
                metric_data[m]["Obs"] += list(frame[obs_col].dropna())
                metric_data[m][ds] += list(frame[mod_col].dropna())

# ── COLOUR PALETTE  (colour-blind friendly) ──────────────────────────────
# One fixed colour per box: Okabe–Ito colours, with grey for observations.
palette = dict(
    Obs="#999999",     # grey
    EMDNA="#56B4E9",   # sky-blue
    ERA5="#009E73",    # bluish-green
    MERRA2="#F0E442",  # yellow
    PRISM="#0072B2",   # blue
    RDRS="#D55E00",    # vermillion
)

# ── FIGURE GRID 2 × 3 ────────────────────────────────────────────────────
fig, axes = plt.subplots(2, 3, figsize=(27.3, 23.4))
axes_flat = axes.flatten()
# Panel order, row by row (matches the a) … f) prefixes in `titles`).
grid_map  = ["TXx", "TNn", "TN10p",
             "TX90p", "CSDI", "WSDI"]

# Fixed y-axis per metric: (view limits, tick positions).
# Tick ranges stop before the upper limit, so the top edge carries no label.
y_windows = {
    "TN10p": ((8, 10.5),  np.arange(8, 10.5, 0.5)),
    "TX90p": ((8, 10.5),  np.arange(8, 10.5, 0.5)),
    "TXx":   ((22, 44),   np.arange(22, 44, 4)),
    "TNn":   ((-42, -6),  np.arange(-42, -9, 6)),
    "CSDI":  ((-2, 8),    np.arange(-2, 8, 2)),
    "WSDI":  ((-2, 10),   np.arange(-2, 10, 2)),
}

for ax, m in zip(axes_flat, grid_map):
    # One column per label; columns are NaN-padded to a common length.
    columns = {label: pd.Series(vals) for label, vals in metric_data[m].items()}
    table = pd.DataFrame(columns)
    if table.dropna(how="all").empty:
        ax.set_visible(False)      # nothing to draw for this metric
        continue

    # Long format: one row per value, tagged with its dataset label.
    tidy = table.melt(var_name="Dataset", value_name=" ").dropna()
    tidy = tidy.astype({"Dataset": "category"})
    # Pin the category order so boxes always appear Obs-first.
    tidy["Dataset"] = tidy["Dataset"].cat.set_categories(
        ["Obs"] + dataset_order, ordered=True)

    sns.boxplot(data=tidy, x="Dataset", y=" ", hue="Dataset",
                ax=ax, palette=palette, width=0.45, linewidth=1.1,
                showfliers=False, legend=False,
                boxprops={"edgecolor": "k"},
                medianprops={"color": "k", "linewidth": 1.3})

    # Metric label inside the panel, top-centre (text taken from `titles`).
    ax.annotate(titles[m], xy=(0.5, 0.97), xycoords="axes fraction",
                ha="center", va="top", fontsize=30, fontweight="bold")

    # Fixed y-axis window for this metric.
    if m in y_windows:
        limits, ticks = y_windows[m]
        ax.set_ylim(*limits)
        ax.set_yticks(ticks)

    # Cosmetics: no x label/ticks (the legend identifies the boxes).
    ax.set_xlabel("")
    ax.set_xticks([])
    ax.tick_params(axis="y", labelsize=24)
    ax.yaxis.grid(True, ls="--", alpha=.35)
    ax.set_axisbelow(True)         # grid behind the boxes

# ── LEGEND ────────────────────────────────────────────────────────────────
# Square proxy markers, one per box colour, in the same order as the boxes.
handles = [Line2D([], [], marker='s', markersize=24, linestyle='',
                  markerfacecolor=palette[l], markeredgecolor='k', label=l)
           for l in ["Obs"] + dataset_order]

# A single legend row: the column count follows the number of entries.
fig.legend(handles=handles, loc="lower center", ncol=len(handles),
           frameon=False, bbox_to_anchor=(0.5, 0.03),
           prop={"size":30, "weight":"bold"})

fig.tight_layout(rect=[0, 0.12, 1, 1])   # keep the bottom 12 % free for the legend

# The PNG used to be written at 600 dpi and then immediately overwritten at
# 96 dpi; only the second write survives on disk, so the first (very large)
# render is dropped. Raise dpi here if a print-quality PNG is wanted.
fig.savefig(png_out, dpi=96, bbox_inches="tight", pad_inches=0.50)
fig.savefig(tif_out, dpi=600, bbox_inches="tight", pad_inches=0.50)
plt.close(fig)
print("Saved →", png_out)
print("Saved →", tif_out)


In [None]:
"""
6-PANEL CDF DASHBOARD – precipitation indices
---------------------------------------------
Metrics   : RX1DAY, RX5DAY, R95pTot, R99pTot, CWD, WetDays
Datasets  : Obs (purple), EMDNA (blue), ERA5 (bluish-green), PRISM (orange),
            RDRS (vermillion), MERRA2 (reddish-purple)

Layout    : 2 × 3 grid — all six cells are used, one metric per panel.
Output    : CDF_ExtremePrcp_6panel-paper.png and .tif   (saved in metrics_root)
"""

# ── IMPORTS ────────────────────────────────────────────────────────────────
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# ── PATHS ──────────────────────────────────────────────────────────────────
# Root folder holding one sub-folder per dataset, each with <metric>.xlsx files.
metrics_root = (r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder"
                r"\Ensemble files\Metric for all datasets-Prcp")

# Curve order in the plots and in the legend (after "Obs").
datasets = ["EMDNA", "ERA5", "PRISM", "RDRS", "MERRA2"]
metrics = ["rx1day", "rx5day", "r95p", "r99p", "cwd", "wetdays"]

# Panel captions, listed in panel order a) … f).
titles = {
    "rx1day":  "a) RX1DAY",
    "rx5day":  "b) RX5DAY",
    "r95p":    "c) R95p",
    "r99p":    "d) R99p",
    "cwd":     "e) CWD",
    "wetdays": "f) WetDays",
}

png_out = os.path.join(metrics_root, "CDF_ExtremePrcp_6panel-paper.png")
tif_out = os.path.join(metrics_root, "CDF_ExtremePrcp_6panel-paper.tif")

# ── VERIFY DATASET FOLDERS ────────────────────────────────────────────────
# Stop at the first dataset whose folder does not exist.
for d in datasets:
    path = os.path.join(metrics_root, d)
    if os.path.isdir(path):
        continue
    raise FileNotFoundError(f"Dataset folder missing: {path}")

# ── COLUMN-PREFIX MAP (edit if your header prefixes differ) ───────────────
# Lower-case prefix that identifies each dataset's value column.
col_key = dict(EMDNA="emd", ERA5="era5", PRISM="prism",
               RDRS="rdrs", MERRA2="merra2")

# ── COLOUR PALETTE ────────────────────────────────────────────────────────
# One line colour per curve.
COL = dict(
    Obs="#7F3C8D",     # purple
    EMDNA="#0072B2",   # blue
    ERA5="#009E73",    # bluish-green
    PRISM="#E69F00",   # orange
    RDRS="#D55E00",    # vermillion
    MERRA2="#CC79A7",  # reddish-purple
)

# ── HELPER FUNCTIONS ──────────────────────────────────────────────────────
def first_col(df, prefix):
    """Return the first column label of *df* whose name starts with *prefix*.

    Each label is lower-cased before the comparison, so *prefix* must be
    given in lower case. Labels are passed through ``str`` first, which
    means non-string labels (e.g. integers) are simply compared as text
    instead of raising ``AttributeError``.

    Returns the original label, or ``None`` when no column matches.
    """
    for label in df.columns:
        if str(label).lower().startswith(prefix):
            return label
    return None

def ecdf(arr):
    """Return ``(x, y)`` for the empirical CDF of *arr*.

    ``x`` holds the values sorted ascending; ``y[i]`` is ``(i + 1) / n``,
    the fraction of samples less than or equal to ``x[i]``. For empty input
    both arrays are empty.
    """
    x = np.sort(arr)
    n = len(x)
    if n == 0:
        return x, np.array([])
    return x, np.arange(1, n + 1) / n

# ── GATHER DATA : data[metric][label] = list of values ────────────────────
data = {m: {ds: [] for ds in ["Obs"] + datasets} for m in metrics}

# NOTE(review): "Obs" is extended once per dataset workbook, so the observed
# sample is pooled over every dataset's file — confirm this is intended.
for ds in datasets:
    ddir = os.path.join(metrics_root, ds)
    for m in metrics:
        fp = os.path.join(ddir, f"{m}.xlsx")
        if not os.path.isfile(fp):
            continue  # this dataset has no workbook for the metric
        df = pd.read_excel(fp)

        obs = first_col(df, "obs_")
        # The model column is the first one starting with the dataset prefix.
        # The former extra "not ratio_/diff_" test could never reject a name
        # that already starts with one of the col_key prefixes, so the shared
        # helper is used directly.
        mod = first_col(df, col_key[ds])

        # Obs and model values are collected independently of each other;
        # compare against None so a falsy label is not treated as missing.
        if obs is not None:
            data[m]["Obs"] += df[obs].dropna().tolist()
        if mod is not None:
            data[m][ds] += df[mod].dropna().tolist()

# ── 2×3 FIGURE (one ECDF panel per metric) ────────────────────────────────
fig, axes = plt.subplots(2, 3, figsize=(27, 22))
axes = axes.ravel()

# Panel order, row by row; a None entry would leave that cell blank.
grid_map = ["rx1day", "rx5day", "r95p",
            "r99p", "cwd", "wetdays"]

curve_order = ["Obs"] + datasets      # Obs is drawn first, then the datasets

for ax, key in zip(axes, grid_map):
    if key is None:
        ax.set_visible(False)
        continue

    # ── ECDF curves ─────────────────────────────────────────────────
    for name in curve_order:
        sample = np.asarray(data[key][name])
        if sample.size:                       # skip labels without data
            xs, ys = ecdf(sample)
            # Obs gets a heavier stroke (4) than the other curves (3).
            ax.step(xs, ys, where="post",
                    color=COL[name],
                    lw=4 if name == "Obs" else 3,
                    label=name)

    # ── cosmetics ────────────────────────────────────────────────────
    ax.set_title(titles[key], fontsize=32, fontweight="bold", pad=6)
    ax.set_ylabel("Probability", fontsize=22)
    ax.tick_params(axis="both", labelsize=22)   # larger tick labels
    ax.grid(ls="--", alpha=.35)                 # light dashed grid…
    ax.set_axisbelow(True)                      # …drawn behind the curves

# ── SINGLE LEGEND  (bold text + bigger swatches) ─────────────────────────
legend_labels = ["Obs"] + datasets
# Data-less proxy lines: they exist only to give the legend thick swatches.
handles = [Line2D([], [], color=COL[l], linewidth=6, label=l)
           for l in legend_labels]

# A single legend row: the column count follows the number of entries.
fig.legend(handles, legend_labels, loc="lower center",
           ncol=len(legend_labels), frameon=True,
           prop={"size":32, "weight":"bold"})

# ── extra breathing‑room between panels ─────────────────────────────
fig.subplots_adjust(wspace=0.15,   # horizontal gap  (0 = none, 1 = huge)
                    hspace=0.15)   # vertical   gap

# ── EXPORT  ──────────────────────────────────────────────────────────────
# The PNG used to be written at 600 dpi and then immediately overwritten at
# 150 dpi; only the second write survives on disk, so the first render is
# dropped. Raise dpi here if a print-quality PNG is wanted.
fig.savefig(png_out, dpi=150, bbox_inches="tight", pad_inches=0.50)
fig.savefig(tif_out, dpi=600, bbox_inches="tight", pad_inches=0.50,
            format="tiff")
plt.close(fig)
print("Saved 6‑panel CDF dashboard →", png_out)
print("Saved 6‑panel CDF dashboard →", tif_out)

In [None]:
"""
6-PANEL CDF DASHBOARD – temperature indices
---------------------------------------------
Metrics   : TXx, TNn, TN10p, TX90p, CSDI, WSDI
Datasets  : Obs (purple), EMDNA (blue), ERA5 (bluish-green), PRISM (orange),
            RDRS (vermillion), MERRA2 (reddish-purple)

Layout    : 2 × 3 grid — all six cells are used, one metric per panel.
Output    : CDF_ExtremeTemp_6panel-paper.png and .tif   (saved in metrics_root)
"""

# ── IMPORTS ────────────────────────────────────────────────────────────────
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# ── PATHS ──────────────────────────────────────────────────────────────────
# Root folder holding one sub-folder per dataset, each with <metric>.xlsx files.
metrics_root = (r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder"
                r"\Ensemble files\Metric for all datasets-Temp")

# Curve order in the plots and in the legend (after "Obs").
datasets = ["EMDNA", "ERA5", "PRISM", "RDRS", "MERRA2"]
metrics = ["TN10p", "TX90p", "TNn", "TXx", "CSDI", "WSDI"]

# Panel captions, listed in panel order a) … f).
titles = {
    "TXx":   "a) TXx",
    "TNn":   "b) TNn",
    "TN10p": "c) TN10p",
    "TX90p": "d) TX90p",
    "CSDI":  "e) CSDI",
    "WSDI":  "f) WSDI",
}

png_out = os.path.join(metrics_root, "CDF_ExtremeTemp_6panel-paper.png")
tif_out = os.path.join(metrics_root, "CDF_ExtremeTemp_6panel-paper.tif")

# ── VERIFY DATASET FOLDERS ────────────────────────────────────────────────
# Stop at the first dataset whose folder does not exist.
for d in datasets:
    path = os.path.join(metrics_root, d)
    if os.path.isdir(path):
        continue
    raise FileNotFoundError(f"Dataset folder missing: {path}")

# ── COLUMN-PREFIX MAP (edit if your header prefixes differ) ───────────────
# NOTE(review): the column lookup in this cell matches on the dataset name
# (see find_model_col) and does not appear to read col_key — confirm whether
# this map is still needed here.
col_key = dict(EMDNA="emd", ERA5="era5", PRISM="prism",
               RDRS="rdrs", MERRA2="merra2")

# ── COLOUR PALETTE ────────────────────────────────────────────────────────
# One line colour per curve.
COL = dict(
    Obs="#7F3C8D",     # purple
    EMDNA="#0072B2",   # blue
    ERA5="#009E73",    # bluish-green
    PRISM="#E69F00",   # orange
    RDRS="#D55E00",    # vermillion
    MERRA2="#CC79A7",  # reddish-purple
)

# ── HELPER FUNCTIONS ──────────────────────────────────────────────────────
def find_obs_col(df):
    """Return the first column label of *df* that marks an observation column.

    A column qualifies when its lower-cased name starts with ``"obs_"`` or
    ends with ``"_obs"``. Labels are passed through ``str`` first, so a
    non-string label is compared as text instead of raising
    ``AttributeError``.

    Returns the original label, or ``None`` when no column qualifies.
    """
    for c in df.columns:
        name = str(c).lower()
        if name.startswith("obs_") or name.endswith("_obs"):
            return c
    return None

def find_model_col(df, metric, ds):
    """Return the column of *df* holding *ds*'s values for *metric*.

    Matching is case-insensitive and done in two passes over the columns:

    1. exact name ``"<metric>_<ds>"`` (e.g. ``CSDI_era5``);
    2. fallback: the first column whose name ends with ``"_<ds>"``.

    Labels are passed through ``str`` before comparison, so non-string
    labels no longer raise ``AttributeError``.

    Returns the original label, or ``None`` when neither pass matches.
    """
    ds_key = ds.lower()
    exact = f"{metric.lower()}_{ds_key}"
    suffix = f"_{ds_key}"
    # Lower-case every label once; both passes reuse the result.
    lowered = [(c, str(c).lower()) for c in df.columns]

    for c, name in lowered:
        if name == exact:
            return c
    for c, name in lowered:
        if name.endswith(suffix):
            return c
    return None

def ecdf(values):
    """Return ``(v, p)`` for the empirical CDF of *values*.

    ``v`` holds the values sorted ascending; ``p[i]`` is ``(i + 1) / n``,
    the fraction of samples less than or equal to ``v[i]``. For empty input
    both arrays are empty.
    """
    v = np.sort(values)
    n = len(v)
    if n == 0:
        return v, np.array([])
    return v, np.arange(1, n + 1) / n

# ── GATHER DATA : data[metric][label] = list of values ────────────────────
data = {m: {label: [] for label in ("Obs", *datasets)} for m in metrics}

# NOTE(review): "Obs" is extended once per dataset workbook, so the observed
# sample is pooled over every dataset's file — confirm this is intended.
for ds in datasets:
    for m in metrics:
        fp = os.path.join(metrics_root, ds, f"{m}.xlsx")
        if os.path.isfile(fp):                # silently skip absent workbooks
            df = pd.read_excel(fp)
            # Obs and model columns are looked up, then stored independently:
            # either may be collected even when the other is not found.
            found = (("Obs", find_obs_col(df)),
                     (ds, find_model_col(df, m, ds)))
            for label, col in found:
                if col:
                    data[m][label] += df[col].dropna().tolist()

# ── 2×3 FIGURE (one ECDF panel per metric) ────────────────────────────────
fig, axes = plt.subplots(2, 3, figsize=(27, 22))
axes = axes.ravel()

# Panel order, row by row; a None entry would leave that cell blank.
grid_map = ["TXx", "TNn", "TN10p",
            "TX90p", "CSDI", "WSDI"]

curve_order = ["Obs"] + datasets      # Obs is drawn first, then the datasets

for ax, key in zip(axes, grid_map):
    if key is None:
        ax.set_visible(False)
        continue

    # ── ECDF curves ─────────────────────────────────────────────────
    for name in curve_order:
        sample = np.asarray(data[key][name])
        if sample.size:                       # skip labels without data
            xs, ys = ecdf(sample)
            # Obs gets a heavier stroke (4) than the other curves (3).
            ax.step(xs, ys, where="post",
                    color=COL[name],
                    lw=4 if name == "Obs" else 3,
                    label=name)

    # ── cosmetics ────────────────────────────────────────────────────
    ax.set_title(titles[key], fontsize=34, fontweight="bold", pad=6)
    ax.set_ylabel("Probability", fontsize=22)
    ax.tick_params(axis="both", labelsize=22)   # larger tick labels
    ax.grid(ls="--", alpha=.35)                 # light dashed grid…
    ax.set_axisbelow(True)                      # …drawn behind the curves

# ── SINGLE LEGEND  (bold text + bigger swatches) ─────────────────────────
legend_labels = ["Obs"] + datasets
# Data-less proxy lines: they exist only to give the legend thick swatches.
handles = [Line2D([], [], color=COL[l], linewidth=6, label=l)
           for l in legend_labels]

# A single legend row: the column count follows the number of entries.
fig.legend(handles, legend_labels, loc="lower center",
           ncol=len(legend_labels), frameon=True,
           prop={"size":32, "weight":"bold"})

# ── extra breathing‑room between panels ─────────────────────────────
fig.subplots_adjust(wspace=0.15,   # horizontal gap  (0 = none, 1 = huge)
                    hspace=0.15)   # vertical   gap

# ── EXPORT  ──────────────────────────────────────────────────────────────
# The PNG used to be written at 600 dpi and then immediately overwritten at
# 150 dpi; only the second write survives on disk, so the first render is
# dropped. Raise dpi here if a print-quality PNG is wanted.
fig.savefig(png_out, dpi=150, bbox_inches="tight", pad_inches=0.50)
fig.savefig(tif_out, dpi=600, bbox_inches="tight", pad_inches=0.50,
            format="tiff")
plt.close(fig)
print("Saved 6‑panel CDF dashboard →", png_out)
print("Saved 6‑panel CDF dashboard →", tif_out)

In [None]:
"""
SEASONAL 4-PANEL BOXPLOTS – precipitation indices
-------------------------------------------------
Metrics   : R95pTot, R99pTot, RX1DAY, RX5DAY
Datasets  : Obs (mid-grey), EMDNA (sky-blue), ERA5 (bluish-green), PRISM (orange)

Input     :  Seasonal/EMDNA.xlsx   (… _obs + _emd columns)
             Seasonal/ERA5.xlsx    (… _obs + _era5 columns)
             Seasonal/PRISM.xlsx   (… _obs + _prism columns)

Output    :  Seasonal_Boxplots_4-4N_<DJF|MAM|JJA|SON>.png and .tif   (one pair per season)
"""

# ───────────────── IMPORTS ────────────────────────────────────────────────
import os, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, numpy as np
from matplotlib.lines import Line2D

# ───────────────── PATHS ─────────────────────────────────────────────────
# Folder with one seasonal workbook per dataset.
root = (r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder"
        r"\Ensemble files\Metric for all datasets-Prcp\Seasonal")

wb_emd   = os.path.join(root, "EMDNA.xlsx")
wb_prism = os.path.join(root, "PRISM.xlsx")
wb_era5  = os.path.join(root, "ERA5.xlsx")

# All three workbooks are required; abort if any one is absent.
if not all(os.path.isfile(wb) for wb in (wb_emd, wb_prism, wb_era5)):
    raise FileNotFoundError("Seasonal workbooks not found in → " + root)

# ───────────────── CONSTANTS ─────────────────────────────────────────────
seasons = ["DJF", "MAM", "JJA", "SON"]
metrics = ["r95amt", "r99amt", "rx1day", "rx5day"]

# Panel captions: upper-cased metric key by default, with explicit
# overrides for the two percentile amounts.
titles = {
    **{m: m.upper() for m in metrics},
    "r95amt": "R95p",
    "r99amt": "R99p",
}

# One fixed colour per box.
palette = {
    "Obs":   "#9E9E9E",   # mid-grey
    "EMDNA": "#56B4E9",   # sky-blue
    "ERA5":  "#009E73",   # bluish-green
    "PRISM": "#E69F00",   # orange
}

# ───────────────── READ WHOLE WORKBOOKS (single sheet) ───────────────────
# read_excel is called with defaults; the workbooks are read in the order
# EMDNA, PRISM, ERA5.
df_emd, df_prism, df_era5 = (
    pd.read_excel(wb) for wb in (wb_emd, wb_prism, wb_era5)
)

# ───────────────── HELPER: fetch columns ─────────────────────────────────
def grab(series_name, metric, suffix):
    """Return column ``"<metric>_<suffix>"`` of *series_name*.

    *series_name* is the table to look in (the callers pass a DataFrame).
    When the column does not exist, an empty float Series is returned
    instead of raising.
    """
    wanted = f"{metric}_{suffix}"
    try:
        return series_name[wanted]
    except KeyError:
        return pd.Series(dtype=float)

# ───────────────── PROCESS EACH SEASON ───────────────────────────────────
# Fixed y-axis window per metric: (lower limit, upper limit, tick step).
# Built once here instead of once per panel. Every season starts from the
# same windows; edit season_ycfg[<season>][<metric>] to override one.
base_ycfg = {
    "r95amt": (0, 2200, 400),
    "r99amt": (0,  800, 100),
    "rx1day": (20, 185,  30),
    "rx5day": (20, 260,  40),
}
season_ycfg = {s: dict(base_ycfg) for s in seasons}

box_order = ["Obs", "EMDNA", "PRISM", "ERA5"]   # left-to-right box & legend order

for seas in seasons:

    # Rows of each workbook that belong to this season.
    sub_emd   = df_emd[df_emd["season"] == seas]
    sub_prism = df_prism[df_prism["season"] == seas]
    sub_era5  = df_era5[df_era5["season"] == seas]

    if sub_emd.empty and sub_prism.empty and sub_era5.empty:
        print(f"[WARN] No rows for season {seas} – skipped")
        continue

    # metric_data[metric][dataset] → list of values
    metric_data = {}
    for m in metrics:
        # Observations: take them from the first workbook that has a
        # non-empty obs column (EMDNA, then ERA5, then PRISM).
        for sub in (sub_emd, sub_era5, sub_prism):
            obs_vals = grab(sub, m, "obs")
            if not obs_vals.empty:
                break

        metric_data[m] = {
            "Obs":   obs_vals.dropna().tolist(),
            "EMDNA": grab(sub_emd,   m, "emd").dropna().tolist(),
            "PRISM": grab(sub_prism, m, "prism").dropna().tolist(),
            "ERA5":  grab(sub_era5,  m, "era5").dropna().tolist(),
        }

    # ──────────── PLOT 2 × 2 GRID (one metric per panel) ────────────────
    fig, axes = plt.subplots(2, 2, figsize=(21, 23))
    axes = axes.ravel()
    # Panel order, row by row; a None entry would leave that cell blank.
    grid_map = ["r95amt", "r99amt",
                "rx1day", "rx5day"]

    for ax, key in zip(axes, grid_map):
        if key is None:
            ax.set_visible(False)
            continue

        # One column per dataset; columns are NaN-padded to a common length.
        wide = pd.DataFrame(
            {k: pd.Series(v) for k, v in metric_data[key].items()}
        )
        if wide.dropna(how="all").empty:
            ax.set_visible(False)      # nothing to draw for this metric
            continue

        # Long format: one row per value, tagged with its dataset label.
        long = (wide
                .melt(var_name="Dataset", value_name=" ")
                .dropna()
                .astype({"Dataset": "category"}))
        long["Dataset"] = long["Dataset"].cat.set_categories(
            box_order, ordered=True
        )

        sns.boxplot(data=long, x="Dataset", y=" ", hue="Dataset",
                    ax=ax, palette=palette, width=0.45, linewidth=1.1,
                    showfliers=False, legend=False,
                    boxprops=dict(edgecolor="k"),
                    medianprops=dict(color="k", linewidth=1.3))

        # ── metric label centred at top of its panel ──────────────────────
        ax.annotate(
            titles[key],
            xy=(0.5, 0.97),       # centred, a bit below the top edge
            xycoords="axes fraction",
            ha="center", va="top",
            fontsize=34, fontweight="bold"
        )

        # ── fixed y-axis window; unlisted pairs keep automatic scaling ────
        window = season_ycfg.get(seas, {}).get(key)
        if window is not None:
            ymin, ymax, step = window
            ax.set_ylim(ymin, ymax)
            # NB: stop *before* ymax so the very top tick/label is omitted
            ax.set_yticks(np.arange(ymin, ymax, step))

        # cosmetics
        ax.set_xlabel("")
        ax.set_xticks([])                    # remove x‑tick marks & labels
        ax.tick_params(axis="y", labelsize=24)
        ax.yaxis.grid(True, ls="--", alpha=.35)
        ax.set_axisbelow(True)               # grid behind the boxes

    # unified legend: square proxy markers in box order
    handles = [Line2D([], [], marker='s', ms=24, linestyle='',
                      markerfacecolor=palette[d], markeredgecolor='k', label=d)
               for d in box_order]
    fig.legend(handles=handles, loc="lower center", ncol=len(handles),
               frameon=False, bbox_to_anchor=(0.5, 0.03),
               prop={"size":32, "weight":"bold"})

    fig.suptitle(f"{seas}", y=1.02, fontsize=42, fontweight="bold")
    fig.tight_layout(rect=[0, 0.12, 1, 1])   # keep the bottom 12 % free for the legend

    out_png = os.path.join(root, f"Seasonal_Boxplots_4-4N_{seas}.png")
    out_tif = os.path.join(root, f"Seasonal_Boxplots_4-4N_{seas}.tif")
    fig.savefig(out_png, dpi=600, bbox_inches="tight", pad_inches=0.70)
    fig.savefig(out_tif, dpi=600, bbox_inches="tight", pad_inches=0.70, format="tiff")
    plt.close(fig)
    print("Saved →", out_png)
    print("Saved →", out_tif)


In [None]:
"""
SEASONAL 4-PANEL BOXPLOTS – Temperature indices
-------------------------------------------------
Metrics   : TN10p, TX90p, TNn, TXx
Datasets  : Obs (mid-grey), EMDNA (sky-blue), ERA5 (bluish-green), PRISM (orange),
            RDRS (vermillion), MERRA2 (yellow)

Input     :  Seasonal/EMDNA.xlsx    (… _obs + _emdna columns)
             Seasonal/ERA5.xlsx     (… _obs + _era5 columns)
             Seasonal/PRISM.xlsx    (… _obs + _prism columns)
             Seasonal/MERRA2.xlsx   (… _obs + _merra2 columns)
             Seasonal/RDRS.xlsx     (… _obs + _rdrs columns)

Output     :  Seasonal_Boxplots_<DJF|MAM|JJA|SON>.png   (4 files)
"""

# ───────────────── IMPORTS ────────────────────────────────────────────────
import os, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from matplotlib.lines import Line2D

# ───────────────── PATHS ─────────────────────────────────────────────────
# Folder with one seasonal workbook per dataset.
root = (r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder"
        r"\Ensemble files\Metric for all datasets-Temp\Seasonal")

wb_emdna  = os.path.join(root, "EMDNA.xlsx")
wb_prism  = os.path.join(root, "PRISM.xlsx")
wb_era5   = os.path.join(root, "ERA5.xlsx")
wb_rdrs   = os.path.join(root, "RDRS.xlsx")
wb_merra2 = os.path.join(root, "MERRA2.xlsx")

# All five workbooks are required; abort if any one is absent.
if not all(os.path.isfile(wb)
           for wb in (wb_emdna, wb_prism, wb_era5, wb_rdrs, wb_merra2)):
    raise FileNotFoundError("Seasonal workbooks not found in → " + root)

# ───────────────── CONSTANTS ─────────────────────────────────────────────
seasons  = ["DJF", "MAM", "JJA", "SON"]
metrics  = ["TN10p", "TX90p", "TNn", "TXx"]

# Panel captions use the index names in their own mixed-case spelling.
# The previous upper-case default plus overrides listed "TNn" twice and
# never "TXx", which left that caption as "TXX".
titles   = {m: m for m in metrics}

# One fixed colour per box.
palette  = {
    "Obs":    "#9E9E9E",   # mid-grey
    "EMDNA":  "#56B4E9",   # sky-blue
    "ERA5":   "#009E73",   # bluish-green
    "PRISM":  "#E69F00",   # orange
    "RDRS":   "#D55E00",   # vermillion
    "MERRA2": "#F0E442",   # yellow
}


# ───────────────── READ WHOLE WORKBOOKS (single sheet) ───────────────────
# read_excel is called with defaults; the workbooks are read in the order
# EMDNA, PRISM, ERA5, RDRS, MERRA2.
df_emdna, df_prism, df_era5, df_rdrs, df_merra2 = (
    pd.read_excel(wb)
    for wb in (wb_emdna, wb_prism, wb_era5, wb_rdrs, wb_merra2)
)

# ───────────────── HELPER: fetch columns ─────────────────────────────────
def grab(series_name, metric, suffix):
    """Return the ``"<metric>_<suffix>"`` column of *series_name*.

    *series_name* is the table to look in (the callers pass a season-filtered
    DataFrame).  When the column does not exist an empty float Series is
    returned, so callers can chain ``.dropna().tolist()`` unconditionally.
    """
    wanted = f"{metric}_{suffix}"
    try:
        return series_name[wanted]
    except KeyError:
        return pd.Series(dtype=float)

# ───────────────── PROCESS EACH SEASON ───────────────────────────────────
# Order of the boxes inside every panel and of the legend entries.
box_order = ["Obs", "EMDNA", "PRISM", "ERA5", "RDRS", "MERRA2"]

# Panel layout of the 2 × 2 grid, read row by row.
grid_map = ["TN10p", "TX90p",
            "TXx", "TNn"]

# Y-axis window per metric → (low, high, major-tick step).  Every season
# starts from the same window; edit season_ycfg[<season>][<metric>] to give a
# single season its own limits.  Built once here instead of once per panel.
default_ycfg = {
    "TN10p": (-5, 95, 20),
    "TX90p": (-5, 75, 15),
    "TXx":   (-5, 45, 10),
    "TNn":   (-45, 20, 10),
}
season_ycfg = {s: dict(default_ycfg) for s in ("DJF", "MAM", "JJA", "SON")}

# Whole-workbook tables keyed by dataset label; the lower-cased label is the
# column suffix used inside the workbook (e.g. "TNn_emdna").
dataset_tables = {
    "EMDNA": df_emdna,
    "PRISM": df_prism,
    "ERA5": df_era5,
    "RDRS": df_rdrs,
    "MERRA2": df_merra2,
}

# Workbooks tried, in this order, for the "<metric>_obs" column; the first
# one that yields a non-empty column supplies the observations.
obs_lookup_order = ["EMDNA", "ERA5", "PRISM", "RDRS", "MERRA2"]

for seas in seasons:

    # rows of the current season, per dataset
    subsets = {name: table[table["season"] == seas]
               for name, table in dataset_tables.items()}

    if all(sub.empty for sub in subsets.values()):
        print(f"[WARN] No rows for season {seas} – skipped")
        continue

    # metric_data[metric][dataset] → list of values with NaNs removed
    metric_data = {m: {name: [] for name in box_order} for m in metrics}

    for m in metrics:
        obs_vals = pd.Series(dtype=float)
        for name in obs_lookup_order:
            obs_vals = grab(subsets[name], m, "obs")
            if not obs_vals.empty:
                break
        metric_data[m]["Obs"] = obs_vals.dropna().tolist()

        for name, sub in subsets.items():
            metric_data[m][name] = grab(sub, m, name.lower()).dropna().tolist()

    # ──────────── PLOT 2 × 2 GRID ────────────────────────────────────────
    fig, axes = plt.subplots(2, 2, figsize=(21, 23))

    for ax, key in zip(axes.ravel(), grid_map):

        # one column per dataset; shorter lists are padded with NaN
        wide = pd.DataFrame(
            {k: pd.Series(v) for k, v in metric_data[key].items()}
        )
        if wide.dropna(how="all").empty:
            ax.set_visible(False)      # nothing to draw for this metric
            continue

        # The value column is deliberately named " " (a single space).
        long = (wide
                .melt(var_name="Dataset", value_name=" ")
                .dropna()
                .astype({"Dataset": "category"}))
        long["Dataset"] = long["Dataset"].cat.set_categories(
            box_order, ordered=True
        )

        sns.boxplot(data=long, x="Dataset", y=" ", hue="Dataset",
                    ax=ax, palette=palette, width=0.45, linewidth=1.1,
                    showfliers=False, legend=False,
                    boxprops=dict(edgecolor="k"),
                    medianprops=dict(color="k", linewidth=1.3))

        # ── metric label centred at top of its panel ──────────────────────
        ax.annotate(
            titles[key],          # pretty name from the titles dict
            xy=(0.5, 0.97),       # centred, a bit below the top edge
            xycoords="axes fraction",
            ha="center", va="top",
            fontsize=34, fontweight="bold"
        )

        # ── season-specific y-axis window ─────────────────────────────────
        if seas in season_ycfg and key in season_ycfg[seas]:
            ymin, ymax, step = season_ycfg[seas][key]
            ax.set_ylim(ymin, ymax)
            # NB: stop *before* ymax so the very top tick/label is omitted
            ax.set_yticks(np.arange(ymin, ymax, step))
        # if the pair isn't listed, axes stay on automatic scaling

        # cosmetics
        ax.set_xlabel("")
        ax.set_xticks([])                    # remove x-tick marks & labels
        ax.tick_params(axis="y", labelsize=24)
        ax.yaxis.grid(True, ls="--", alpha=.35)
        ax.set_axisbelow(True)

    # unified legend below the panels
    handles = [Line2D([], [], marker='s', ms=24, linestyle='',
                      markerfacecolor=palette[d], markeredgecolor='k', label=d)
               for d in box_order]
    fig.legend(handles=handles, loc="lower center", ncol=6,
               frameon=False, bbox_to_anchor=(0.5, 0.03),
               prop={"size":32, "weight":"bold"})

    fig.suptitle(f"{seas}", y=1.02, fontsize=42, fontweight="bold")
    # leave the bottom 12 % of the figure to the legend
    fig.tight_layout(rect=[0, 0.12, 1, 1])

    out_png = os.path.join(root, f"Seasonal_Boxplots_4-4N_{seas}.png")
    out_tif = os.path.join(root, f"Seasonal_Boxplots_4-4N_{seas}.tif")
    fig.savefig(out_png, dpi=600, bbox_inches="tight", pad_inches=0.70)
    fig.savefig(out_tif, dpi=600, bbox_inches="tight", pad_inches=0.70, format="tiff")
    plt.close(fig)
    print("Saved →", out_png)
    print("Saved →", out_tif)


In [None]:
"""
Great Lakes Basin Stations Map 
-----------------------------------------------------------------------

"""

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patheffects import withStroke

import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter

from matplotlib.patheffects import withStroke


# ------------------------------------------------------------------
# USER PATHS
# ------------------------------------------------------------------
# Basin polygon(s), the international boundary line and the station list.
GLB_SHP = r"D:\PhD\GLB\greatlakes_subbasins\New folder\Great_Lakes.shp"
BOUND_SHP = (r"D:\PhD\GLB\greatlakes_subbasins\NA"
             r"\us-canada-boundary-v1-3\US-Canada_Boundary_v1.3.shp")
stations_csv = r"D:\PhD\GLB\Merged USA and CA\Entire GLB\filtered_stations_with_points.csv"

# Optional CSV of chosen grid cells; leave as None to omit that layer.
# Example: r"D:\PhD\GLB\chosen_emdna_cells.csv"
grid_csv = None

# Whether to write the lake names on the map.
LABEL_LAKES = True

# Tick / grid-line spacing in degrees.  Fewer lines = cleaner figure.
LON_STEP = 5.0
LAT_STEP = 2.0

# ------------------------------------------------------------------
# OUTPUT PATH
# ------------------------------------------------------------------
out_dir = r"D:\PhD\GLB\greatlakes_subbasins\Map of stations"
os.makedirs(out_dir, exist_ok=True)
png_out, tif_out = (
    os.path.join(out_dir, f"GLB_Stations_Map2.{ext}") for ext in ("png", "tif")
)

# ------------------------------------------------------------------
# READ DATA
# ------------------------------------------------------------------
# Vector layers: basin polygon(s) and the international boundary.
glb = gpd.read_file(GLB_SHP)
boundary = gpd.read_file(BOUND_SHP)

# Station table → point layer; the CSV coordinates are tagged as EPSG:4326.
stations_df = pd.read_csv(stations_csv)
if {"lat", "lon"} - set(stations_df.columns):
    raise ValueError("stations CSV must contain 'lat' & 'lon' columns.")

stations = gpd.GeoDataFrame(
    stations_df,
    geometry=gpd.points_from_xy(stations_df["lon"], stations_df["lat"]),
    crs="EPSG:4326"
)

# Optional grid-cell layer: stays None unless grid_csv names an existing file.
grid_gdf = None
if grid_csv and os.path.isfile(grid_csv):
    grid_df = pd.read_csv(grid_csv)
    if {"grid_lat", "grid_lon"} - set(grid_df.columns):
        raise ValueError("grid_csv must contain 'grid_lat','grid_lon'")
    grid_gdf = gpd.GeoDataFrame(
        grid_df,
        geometry=gpd.points_from_xy(grid_df["grid_lon"], grid_df["grid_lat"]),
        crs="EPSG:4326"
    )

# Bring both shapefile layers to the CRS used for the point layers.
TARGET_CRS = "EPSG:4326"
for layer in (glb, boundary):
    if layer.crs != TARGET_CRS:
        layer.to_crs(TARGET_CRS, inplace=True)

# Merge multi-feature basins into one geometry so only the outer outline is drawn.
if len(glb) > 1:
    glb = gpd.GeoDataFrame(geometry=[glb.unary_union], crs=TARGET_CRS)

# ------------------------------------------------------------------
# MAP EXTENT (tight pad ~0.3° each side)
# ------------------------------------------------------------------
# Bounding box of the basin, widened by the same pad on every side.
minx, miny, maxx, maxy = glb.total_bounds
pad_x = pad_y = 0.3
extent = (
    minx - pad_x,   # west
    maxx + pad_x,   # east
    miny - pad_y,   # south
    maxy + pad_y,   # north
)

# ------------------------------------------------------------------
# COLOR‑BLIND‑SAFE (Okabe–Ito subset)
# ------------------------------------------------------------------
CB_VERMILLION = "#D55E00"   # station markers
CB_BLUE       = "#0072B2"   # basin outline
CB_BLACKISH   = "#3B3B3B"   # international boundary & north arrow
CB_PURPLE     = "#CC79A7"   # grid cells (only when that layer is loaded)

# Background fills
LAND_FACE  = "#F2F2F2"      # very light neutral grey
LAKE_FACE  = "#BBDFFF"      # pale cyan‑blue, also used for the ocean

# ------------------------------------------------------------------
# HELPER: tick arrays that stay *within* extent
# ------------------------------------------------------------------
def nice_ticks(lo, hi, step):
    """Return the multiples of *step* lying in ``[lo, hi]``.

    The first tick is the smallest multiple of *step* that is >= *lo*; a tiny
    epsilon on the upper bound keeps a tick that falls exactly on *hi*.
    Values are rounded to one decimal place.
    """
    first = np.ceil(lo / step) * step
    return np.arange(first, hi + 1e-9, step).round(1)

# Tick positions for both axes, limited to the padded map window
# (extent is ordered west, east, south, north).
west, east, south, north = extent
xticks = nice_ticks(west, east, LON_STEP)
yticks = nice_ticks(south, north, LAT_STEP)

# ------------------------------------------------------------------
# HELPER: add north arrow
# ------------------------------------------------------------------
def add_north_arrow(ax, x=0.05, y=0.08, size=0.08,
                    color=CB_BLACKISH, text="N", textsize=12):
    """
    Draw a vertical arrow with a letter above it, positioned in axes-fraction
    coordinates (0 = left/bottom, 1 = right/top).

    ax       : axes to draw on
    x, y     : base (tail) of the arrow
    size     : arrow length as a fraction of the axes height
    color    : colour of both the arrow and the letter
    text     : label written just above the arrow head
    textsize : font size of the label
    """
    tip_y = y + size

    # An empty annotation: only its arrow (tail → tip) is drawn.
    shaft = dict(arrowstyle="-|>", color=color,
                 lw=1.8, shrinkA=0, shrinkB=0)
    ax.annotate("", xy=(x, tip_y), xytext=(x, y),
                xycoords="axes fraction", arrowprops=shaft)

    # Bold label with a white halo so it stays readable over map features.
    halo = [withStroke(linewidth=2.5, foreground="white")]
    ax.text(x, tip_y + 0.015, text,
            transform=ax.transAxes,
            ha="center", va="bottom",
            fontsize=textsize, fontweight="bold",
            color=color, path_effects=halo)

# ------------------------------------------------------------------
# PLOT
# ------------------------------------------------------------------
proj = ccrs.PlateCarree()
fig = plt.figure(figsize=(15, 9), constrained_layout=False)
ax = plt.axes(projection=proj)

# map window
ax.set_extent(extent, crs=proj)

# ---- background: land, lakes, ocean (feature, fill colour, z-order) ----
for feature, face, z in (
    (cfeature.LAND, LAND_FACE, 0),
    (cfeature.LAKES, LAKE_FACE, 0.5),
    (cfeature.OCEAN, LAKE_FACE, 0.4),
):
    ax.add_feature(feature.with_scale("50m"),
                   facecolor=face, edgecolor="none", zorder=z)

# ---- basin outline only ------------------------------------------
glb.boundary.plot(ax=ax, transform=proj, color=CB_BLUE,
                  linewidth=2.5, zorder=3)

# ---- U.S.–Canada boundary dashed ---------------------------------
# Best effort: clip the line to the basin buffered by 1.0 (CRS units); if the
# clip fails or comes back empty, fall back to drawing the whole line.
try:
    clipped = gpd.clip(boundary, glb.buffer(1.0))
    boundary_clip = boundary if clipped.empty else clipped
except Exception:
    boundary_clip = boundary
boundary_clip.plot(ax=ax, transform=proj, color=CB_BLACKISH,
                   linewidth=1.5, linestyle="--", zorder=4)

# ---- stations ----------------------------------------------------
stations.plot(ax=ax, transform=proj, markersize=28, marker="o",
              facecolor=CB_VERMILLION, edgecolor="white",
              linewidth=0.4, zorder=5)

# ---- optional grid cells (open squares, drawn beneath the stations) ----
if grid_gdf is not None:
    grid_gdf.plot(ax=ax, transform=proj, markersize=60, marker="s",
                  facecolor="none", edgecolor=CB_PURPLE,
                  linewidth=1.2, zorder=4.5)

# ------------------------------------------------------------------
# TICKS / GRIDLINES
# ------------------------------------------------------------------
ax.set_xticks(xticks, crs=proj)
ax.set_yticks(yticks, crs=proj)

# Degree labels with one decimal place.
ax.xaxis.set_major_formatter(
    LongitudeFormatter(number_format=".1f", degree_symbol="°",
                       dateline_direction_label=False)
)
ax.yaxis.set_major_formatter(
    LatitudeFormatter(number_format=".1f", degree_symbol="°")
)

ax.tick_params(labelsize=10, length=4, width=0.8, pad=2)

# subtle dotted grid (aligned to the ticks set above)
ax.grid(ls=":", color="gray", alpha=0.6, linewidth=0.6, zorder=1)

# thicker black frame on every side
for spine in ax.spines.values():
    spine.set(visible=True, linewidth=1.5, edgecolor="black")

# ------------------------------------------------------------------
# OPTIONAL LAKE LABELS (with white halo for readability)
# ------------------------------------------------------------------
if LABEL_LAKES:
    # label → (lon, lat) anchor of the text centre
    lake_labels = {
        "Lake Superior": (-87.5, 47.8),
        "Lake Michigan": (-86.5, 45.6),
        "Lake Huron":    (-82.5, 45.5),
        "Lake Erie":     (-81.0, 42.3),
        "Lake Ontario":  (-78, 43.7),
    }
    lon_min, lon_max, lat_min, lat_max = extent
    for txt, (lon, lat) in lake_labels.items():
        # skip labels whose anchor lies outside the map window
        if not (lon_min <= lon <= lon_max and lat_min <= lat <= lat_max):
            continue
        ax.text(
            lon, lat, txt,
            transform=proj,
            ha="center", va="center",
            fontsize=8, fontstyle="italic",
            color="#1A1A1A", alpha=0.9,
            zorder=6,
            # white halo keeps the text readable over the lake fill
            path_effects=[withStroke(linewidth=2.5, foreground="white")]
        )

# ------------------------------------------------------------------
# LEGEND
# ------------------------------------------------------------------
# Legend entries in display order: stations, (grid cells), boundary, basin.
handles = [
    Line2D([], [], marker="o", markersize=8, linestyle="",
           markerfacecolor=CB_VERMILLION, markeredgecolor="white",
           label="Stations"),
]
if grid_gdf is not None:
    handles.append(
        Line2D([], [], marker="s", markersize=8, linestyle="",
               markerfacecolor="none", markeredgecolor=CB_PURPLE,
               label="Chosen Grid Cells")
    )
handles += [
    Line2D([], [], color=CB_BLACKISH, linestyle="--", linewidth=2,
           label="USA–Canada Boundary"),
    Line2D([], [], color=CB_BLUE, linestyle="-", linewidth=3,
           label="Great Lakes Basin Border"),
]

ax.legend(handles=handles, loc="upper right",
          frameon=True, framealpha=1, edgecolor="black",
          fontsize=11)

# ------------------------------------------------------------------
# TITLE
# ------------------------------------------------------------------
ax.set_title("Location of the Ground-Based Stations",
             fontsize=18, fontweight="bold", pad=14)

# ------------------------------------------------------------------
# NORTH ARROW  (bottom‑left)
# ------------------------------------------------------------------
add_north_arrow(ax, x=0.05, y=0.08, size=0.09,
                color=CB_BLACKISH, text="N", textsize=13)

# ------------------------------------------------------------------
# SAVE  (600 dpi PNG + LZW-compressed TIFF)
# ------------------------------------------------------------------
save_opts = dict(dpi=600, bbox_inches="tight")
fig.savefig(png_out, **save_opts)
fig.savefig(tif_out, pil_kwargs={"compression": "tiff_lzw"}, **save_opts)
plt.close(fig)

for saved in (png_out, tif_out):
    print("Saved map →", saved)


In [None]:
# 4 Taylor Diagrams in one plot for prcp

from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image

# ======== CONFIG ========
DIR = Path(r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder\Ensemble files\Taylor Diagrams\Prcp")

# If you want a fixed order, list the four files explicitly (uncomment and edit):
# files = [
#     DIR / "a_EMDNA.png",
#     DIR / "b_PRISM.png",
#     DIR / "c_ERA5.png",
#     DIR / "d_RDRS.png",
# ]

# Otherwise the PNGs found in DIR are used in alphabetical order:
files = sorted(DIR.glob("*.png"))

panel_labels = [f"({letter})" for letter in "abcd"]   # optional panel tags
panel_titles = []                              # optional, e.g. ['EMDNA', 'PRISM', 'ERA5', 'RDRS']
out_base = DIR / "Figure_Taylor_4panel"        # output base name (no extension)
figsize = (7.0, 7.0)                           # inches; adjust as needed (e.g., two-column ~7 in)
# ========================

if len(files) != 4:
    raise RuntimeError(f"Expected 4 PNGs in {DIR}, found {len(files)}.")

# 2 × 2 canvas (the export resolution is set by savefig below)
fig, axes = plt.subplots(2, 2, figsize=figsize, constrained_layout=True)

# One source image per panel, filled row by row
for idx, (ax, img_path) in enumerate(zip(axes.flat, files)):
    ax.imshow(Image.open(img_path))
    ax.axis("off")

    # Optional panel tag in the top-left corner on a translucent white patch
    if panel_labels and idx < len(panel_labels):
        tag_box = dict(boxstyle="square,pad=0.1", facecolor="white",
                       edgecolor="none", alpha=0.7)
        ax.text(0.02, 0.98, panel_labels[idx],
                transform=ax.transAxes, va="top", ha="left",
                fontsize=10, weight="bold", bbox=tag_box)

    # Optional per-panel title
    if panel_titles and idx < len(panel_titles):
        ax.set_title(panel_titles[idx], fontsize=10, pad=2)

# ---- Save at 600 dpi in both formats ----
png_path, tif_path = (out_base.with_suffix(ext) for ext in (".png", ".tif"))

# Direct saves (preferred)
for target in (png_path, tif_path):
    fig.savefig(target, dpi=600, bbox_inches="tight")

# If you need LZW compression on the TIFF, uncomment the block below:
# im = Image.open(png_path)
# im.save(tif_path, compression="tiff_lzw", dpi=(600, 600))

print(f"Saved:\n  {png_path}\n  {tif_path}")


In [None]:
# 4 Taylor Diagrams in one plot for temperature


from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.transforms import Bbox
from PIL import Image, ImageDraw
from matplotlib.lines import Line2D

# ========= CONFIG =========
DIR = Path(r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder\Ensemble files\Taylor Diagrams\Temperature")
files = sorted(DIR.glob("*.png"))  # expects exactly 4 PNGs
out_base = DIR / "Figure_Taylor_4panel_clean"

# Labels are paired with `files` by position, i.e. by alphabetical file order.
panel_labels = ["(a) EMDNA", "(b) ERA5", "(c) PRISM", "(d) RDRS"]

# Regions painted white in every source image, as (x0, y0, x1, y1) fractions
# of the image width/height: the per-panel legend (top-right) and the native
# title (upper middle).
REMOVE_PANEL_LEGENDS = True
LEGEND_BOX = (0.62, 0.03, 0.98, 0.16)
REMOVE_TOP_TITLES = True
TITLE_BOX  = (0.36, 0.02, 0.64, 0.08)

# Figure size in inches (extra height only for legend area)
figsize = (6.85, 7.80)
dpi_export = 600

# Bottom crop padding (inches) to keep a little space below legend
crop_pad_in = 0.06
# ==========================

if len(files) != 4:
    raise RuntimeError(f"Expected 4 PNGs in {DIR}, found {len(files)}.")


def _white_out(draw, frac_box, width, height):
    """Paint the fractional box *frac_box* opaque white on a width × height image."""
    fx0, fy0, fx1, fy1 = frac_box
    corners = [int(fx0 * width), int(fy0 * height),
               int(fx1 * width), int(fy1 * height)]
    draw.rectangle(corners, fill=(255, 255, 255, 255))


# Open each image as RGBA and blank out the requested regions
processed = []
for img_path in files:
    im = Image.open(img_path).convert("RGBA")
    draw = ImageDraw.Draw(im)
    w, h = im.size

    if REMOVE_PANEL_LEGENDS:
        _white_out(draw, LEGEND_BOX, w, h)
    if REMOVE_TOP_TITLES:
        _white_out(draw, TITLE_BOX, w, h)

    processed.append(im)

# ---- Compose tightly packed 2×2 with bottom-only padding for legend ----
# ---- 2×2 grid; the bottom margin is reserved for the shared legend ----
fig = plt.figure(figsize=figsize)
gs = fig.add_gridspec(
    2, 2,
    left=0.035, right=0.965,
    top=0.93, bottom=0.16,      # space for legend before we crop
    wspace=0.06, hspace=0.10,
)

# Panels in row-major order: (0,0), (0,1), (1,0), (1,1)
axes = [fig.add_subplot(gs[divmod(k, 2)]) for k in range(4)]
for ax, im, label in zip(axes, processed, panel_labels):
    ax.imshow(im)
    ax.axis("off")
    # bold label centred just above the panel
    ax.text(
        0.5, 1.01, label,
        transform=ax.transAxes,
        ha="center", va="bottom",
        fontsize=12, weight="bold",
        bbox=dict(boxstyle="round,pad=0.15", facecolor="white",
                  edgecolor="none", alpha=0.90),
    )

# Figure-level legend with hollow markers (edge colour, label)
handles = [
    Line2D([0], [0], marker='o', linestyle='None',
           markerfacecolor='none', markeredgecolor=edge, label=text)
    for edge, text in (('0.4', 'Grid/Station points'),
                       ('red', 'Top 90% CC grids'))
]
leg = fig.legend(
    handles=handles,
    loc='lower center',
    ncol=2,
    frameon=True,
    bbox_to_anchor=(0.5, 0.115),
    borderaxespad=0.25,
)

# ---- Crop ONLY the bottom whitespace (keep left/right/top) ----
fig.canvas.draw()  # the legend extent is only known after a draw
renderer = fig.canvas.get_renderer()
fig_w, fig_h = fig.get_size_inches()

# Legend bbox: pixels → inches
leg_bb_px = leg.get_window_extent(renderer=renderer)
leg_bb_in = Bbox.from_extents(*(leg_bb_px.extents / fig.dpi))

# Save window: full width and top, bottom edge just below the legend
bottom_new = max(0, leg_bb_in.y0 - crop_pad_in)  # keep a little pad
crop_bbox_inches = Bbox.from_extents(0, bottom_new, fig_w, fig_h)

# Export both formats at dpi_export with that window
png_path, tif_path = (out_base.with_suffix(ext) for ext in (".png", ".tif"))
for target in (png_path, tif_path):
    fig.savefig(target, dpi=dpi_export, bbox_inches=crop_bbox_inches)
print(f"Saved:\n  {png_path}\n  {tif_path}")


In [None]:
# Analysis plots and tables for the climatic indices within the whole period 1991-2012 for PRISM

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from shapely.geometry import Point
from scipy.stats import pearsonr
import seaborn as sns

###############################################################################
# 1. CONFIGURATION & PATHS
###############################################################################
indices_dir  = r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder\Ensemble files\PRISM_GLB_Precipitation\ClimaticIndices-8Nearest-3"

shapefile_path = r"D:\PhD\GLB\greatlakes_subbasins\New folder\Great_Lakes.shp"
lakes_shp      = r"D:\PhD\GLB\greatlakes_subbasins\GLB_Water_Bodies\Main_Lakes_GLB.shp"

# All tables and figures produced below are written here.
output_plots   = os.path.join(indices_dir, "AnalysisPlots")
os.makedirs(output_plots, exist_ok=True)

# One workbook per index, named "<index>.xlsx" inside indices_dir.
index_files = {
    name: os.path.join(indices_dir, f"{name}.xlsx")
    for name in ("rx1day", "rx5day", "cdd", "cwd",
                 "r95p", "r99p", "wetdays", "drydays")
}

# Column names inside each workbook as (observed, PRISM).  r95p and r99p
# carry two quantities each (amount and percentage), so both members are
# tuples there and are paired position by position.
index_columns = {
    "rx1day":  ("obs_rx1day", "prism_rx1day"),
    "rx5day":  ("obs_rx5day", "prism_rx5day"),
    "cdd":     ("obs_cdd", "prism_cdd"),
    "cwd":     ("obs_cwd", "prism_cwd"),
    "r95p":    (("obs_r95amt", "obs_r95pct"),
                ("prism_r95amt", "prism_r95pct")),
    "r99p":    (("obs_r99amt", "obs_r99pct"),
                ("prism_r99amt", "prism_r99pct")),
    "wetdays": ("obs_wetdays5mm", "prism_wetdays5mm"),
    "drydays": ("obs_drydays", "prism_drydays"),
}

###############################################################################
# 2. MERGE ALL INDICES INTO A MASTER TABLE (A)
###############################################################################
master_table = None
key_cols = ["station_name", "lat", "lon"]   # identify a station; also the merge keys

for idx_name, xls_path in index_files.items():
    print(f"[+] Reading {idx_name} from {xls_path}")
    df_idx = pd.read_excel(xls_path)
    df_idx["station_name"] = df_idx["station_name"].astype(str)

    # Normalise to a list of (obs column, PRISM column) pairs: one pair for
    # most indices, two for the indices whose entries are tuples.
    obs_cols, prism_cols = index_columns[idx_name]
    if isinstance(obs_cols, tuple):
        col_pairs = list(zip(obs_cols, prism_cols))
    else:
        col_pairs = [(obs_cols, prism_cols)]

    # Per-index table: station keys, raw values, PRISM/obs ratio, PRISM−obs difference
    sub = df_idx[key_cols].copy()
    for oc, ec in col_pairs:
        stem = oc.replace('obs_','')
        sub[oc] = df_idx[oc]
        sub[ec] = df_idx[ec]
        sub[f"ratio_{stem}"] = df_idx[ec] / df_idx[oc]
        sub[f"diff_{stem}"]  = df_idx[ec] - df_idx[oc]

    # Prefix every value column with the index name
    # (e.g., "obs_rx1day" becomes "rx1day_obs_rx1day")
    sub.rename(
        columns={c: f"{idx_name}_{c}" for c in sub.columns if c not in key_cols},
        inplace=True,
    )

    # Outer merge keeps stations that appear in only some of the workbooks
    if master_table is None:
        master_table = sub
    else:
        master_table = pd.merge(master_table, sub, on=key_cols, how="outer")

master_xlsx = os.path.join(output_plots, "MasterTable_AllIndices.xlsx")
master_table.to_excel(master_xlsx, index=False)
print(f"\n(A) Master table saved => {master_xlsx}")
print("Columns:", master_table.columns.tolist())

###############################################################################
# 3. SUMMARY TABLE: MBE, RMSE, STD, CC, d
###############################################################################
def index_of_agreement(obs, model):
    """Return the index of agreement d between *obs* and *model*.

    d = 1 - Σ(model - obs)² / Σ(|model - mean(obs)| + |obs - mean(obs)|)²

    Both arguments are expected to support element-wise arithmetic (the
    callers pass NumPy arrays).  NaN is returned when the denominator is zero.
    """
    centre = np.mean(obs)
    squared_error = np.sum((model - obs) ** 2)
    potential_error = np.sum((abs(model - centre) + abs(obs - centre)) ** 2)
    return np.nan if potential_error == 0 else 1 - squared_error / potential_error

def rmse(a, b):
    """Return the root-mean-square of the element-wise difference a - b."""
    squared_diff = (a - b) ** 2
    return np.sqrt(np.mean(squared_diff))

def std_of_residuals(a, b):
    """Return the sample standard deviation (ddof=1) of the residuals a - b."""
    residuals = a - b
    return np.std(residuals, ddof=1)

def mean_bias_error(a, b):
    """Return the mean of b - a (positive when *b* is larger than *a* on average)."""
    bias = b - a
    return np.mean(bias)

# Column order of the summary table.  Passing it to the DataFrame constructor
# (instead of selecting the columns afterwards) also gives an empty table with
# the right header when no index yields a row, where the selection would
# raise KeyError.
summary_cols = ["Index", "Count", "MBE", "RMSE", "STDres", "CC", "d"]

summary_rows = []
for idx_name, xls_path in index_files.items():
    df_idx = pd.read_excel(xls_path)

    # Normalise to (obs column, PRISM column, row label) triples.  Indices
    # whose entries are tuples produce one row per quantity, labelled
    # "<index>_<quantity>"; all other indices produce one row labelled "<index>".
    obs_cols, prism_cols = index_columns[idx_name]
    if isinstance(obs_cols, tuple):
        triples = [(oc, ec, f"{idx_name}_{oc.replace('obs_','')}")
                   for oc, ec in zip(obs_cols, prism_cols)]
    else:
        triples = [(obs_cols, prism_cols, idx_name)]

    for oc, ec, idx_label in triples:
        # keep only stations where both values are present
        valid = df_idx[[oc, ec]].dropna()
        if len(valid) < 2:
            continue        # correlation and sample std need at least two points
        obs_vals = valid[oc].values
        prism_vals = valid[ec].values
        summary_rows.append({
            "Index": idx_label,
            "Count": len(valid),
            "MBE": mean_bias_error(obs_vals, prism_vals),
            "RMSE": rmse(obs_vals, prism_vals),
            "STDres": std_of_residuals(obs_vals, prism_vals),
            "CC": pearsonr(obs_vals, prism_vals)[0],
            "d": index_of_agreement(obs_vals, prism_vals),
        })

summary_df = pd.DataFrame(summary_rows, columns=summary_cols)
summary_xlsx = os.path.join(output_plots, "SummaryTable_Extremes.xlsx")
summary_df.to_excel(summary_xlsx, index=False)
print(f"(B) Summary Table saved => {summary_xlsx}\n{summary_df}")

###############################################################################
# 4. SPATIAL MAPS TRIPTYCH (Observed, prism, Ratio)
###############################################################################
# Basin and lake outlines, both reprojected to EPSG:4326.
gdf_basin = gpd.read_file(shapefile_path).to_crs(epsg=4326)
gdf_lakes = gpd.read_file(lakes_shp).to_crs(epsg=4326)

def add_basin_lakes(ax):
    """Draw coastline, dotted borders, basin outline (blue) and lake outlines (cyan) on *ax*."""
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle=':')
    # basin first, lakes second; every geometry is added as an unfilled outline
    for layer, outline in ((gdf_basin, 'blue'), (gdf_lakes, 'cyan')):
        for geom in layer.geometry:
            ax.add_geometries([geom], ccrs.PlateCarree(),
                              facecolor='none', edgecolor=outline, linewidth=1)

def plot_map_triptych(df, obs_col, prism_col, ratio_col, idx_name, out_png):
    """
    Draw a 1 × 3 row of station maps for one index and save it to *out_png*.

    Panels, left to right: observed values, PRISM values, and the
    PRISM/observed ratio.  Each panel colours the stations (df["lon"],
    df["lat"]) by the panel's column and rings in red the stations at or
    above that column's 90th percentile.
    """
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18,6),
                             subplot_kw={"projection": ccrs.PlateCarree()})

    panels = (
        (obs_col,   f"{idx_name} Observed"),
        (prism_col, f"{idx_name} prism"),
        (ratio_col, f"{idx_name} (prism/OBS)"),
    )

    for ax, (col, title) in zip(axes, panels):
        ax.set_extent([-95.5, -72, 38.5, 52.5])
        add_basin_lakes(ax)

        # stations coloured by value, with their own colour bar
        sc = ax.scatter(df["lon"], df["lat"], c=df[col], cmap="viridis", s=60,
                        transform=ccrs.PlateCarree(), edgecolor="k", zorder=10)
        cb = fig.colorbar(sc, ax=ax, shrink=0.8)
        cb.set_label(col)

        # hotspots: stations in the top 10 % of the non-missing values
        vals = df[col].dropna().values
        if vals.size:
            thr = np.percentile(vals, 90)
            hot = df[df[col] >= thr]
            ax.scatter(hot["lon"], hot["lat"],
                       marker='o', facecolors='none', edgecolors='red', s=80,
                       transform=ccrs.PlateCarree(), zorder=11,
                       label=f"Hotspot >= {thr:.2f}")

        ax.set_title(title, fontsize=12)

        # labelled graticule on the left and bottom edges only
        gl = ax.gridlines(draw_labels=True, linestyle='--', color='gray')
        gl.right_labels = False
        gl.top_labels = False
        ax.legend(loc='upper right')

    fig.tight_layout()
    fig.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print("Saved 3-panel map =>", out_png)

def get_map_cols(idx):
    """
    Return (obs_col, prism_col, diff_col, ratio_col) for the given index.

    The names follow the pattern "<idx>_<kind>_<suffix>".  The suffix is the
    index name itself for rx1day/rx5day/cdd/cwd/drydays, and a fixed value
    for wetdays, r95p and r99p.  An unknown index gives four Nones.
    """
    if idx in ("rx1day", "rx5day", "cdd", "cwd", "drydays"):
        suffix = idx
    elif idx == "wetdays":
        suffix = "wetdays5mm"
    elif idx == "r95p":
        suffix = "r95amt"
    elif idx == "r99p":
        suffix = "r99amt"
    else:
        return None, None, None, None
    return tuple(f"{idx}_{kind}_{suffix}"
                 for kind in ("obs", "prism", "diff", "ratio"))

# One 3-panel map per index.  An index is skipped when it has no column
# mapping or when any required column is absent from master_table.
for idx_name in index_files:
    obs_col, prism_col, diff_col, ratio_col = get_map_cols(idx_name)
    if obs_col is None:
        continue
    needed = ["station_name", "lat", "lon", obs_col, prism_col, ratio_col]
    absent = [c for c in needed if c not in master_table.columns]
    if absent:
        print(f"Skipping map for {idx_name} - missing columns in master_table.")
        continue
    # Only stations with coordinates can be placed on the map.
    subdf = master_table[needed].dropna(subset=["lat", "lon"])
    out_png = os.path.join(output_plots, f"{idx_name}_MAP_3panel.png")
    plot_map_triptych(subdf, obs_col, prism_col, ratio_col, idx_name, out_png)

###############################################################################
# 5. DISTRIBUTION & BOX/CDF/SCATTER TRIPTYCH
###############################################################################
def plot_distribution_triptych(df, obs_col, prism_col, label, out_png):
    """
    Save a 1 x 3 comparison figure for one index to ``out_png``.

    Panels, left to right:
      * boxplot of the Obs and prism columns,
      * empirical CDF of each column,
      * Obs-vs-prism scatter with a dashed 1:1 line; the Pearson
        correlation is shown in the panel title.

    The CDF panel needs at least two non-missing values in each column and
    the scatter panel at least two complete pairs; otherwise the panel only
    carries a "Not enough data" title.
    """
    fig, (ax_box, ax_cdf, ax_sc) = plt.subplots(nrows=1, ncols=3, figsize=(18,6))

    # --- panel 1: boxplot (long-form table: one row per value) -------------
    long_form = pd.DataFrame({"Obs": df[obs_col], "prism": df[prism_col]}).melt(
        var_name="Dataset", value_name=label)
    sns.boxplot(data=long_form, x="Dataset", y=label, ax=ax_box)
    ax_box.set_title(f"Boxplot: {label}")

    # --- panel 2: empirical CDFs --------------------------------------------
    samples = (("Obs", df[obs_col].dropna()), ("prism", df[prism_col].dropna()))
    if all(len(series) >= 2 for _, series in samples):
        for name, series in samples:
            ordered = np.sort(series)
            # i-th smallest value gets cumulative probability i / n
            cum_prob = np.arange(1, len(ordered)+1) / len(ordered)
            ax_cdf.plot(ordered, cum_prob, label=name)
        ax_cdf.set_title(f"CDF of {label}")
        ax_cdf.set_xlabel(label)
        ax_cdf.set_ylabel("Probability")
        ax_cdf.legend()
    else:
        ax_cdf.set_title(f"CDF: Not enough data ({label})")

    # --- panel 3: scatter with 1:1 reference line ---------------------------
    paired = df[[obs_col, prism_col]].dropna()
    if len(paired) < 2:
        ax_sc.set_title(f"Scatter: Not enough data ({label})")
    else:
        obs_v = paired[obs_col]
        prism_v = paired[prism_col]
        corr, _ = pearsonr(obs_v, prism_v)
        ax_sc.scatter(obs_v, prism_v, edgecolors='k', alpha=0.7)
        lo = np.nanmin([obs_v.min(), prism_v.min()])
        hi = np.nanmax([obs_v.max(), prism_v.max()])
        ax_sc.plot([lo, hi], [lo, hi], 'r--')
        ax_sc.set_xlabel(f"Obs {label}")
        ax_sc.set_ylabel(f"prism {label}")
        ax_sc.set_title(f"{label} (Corr={corr:.2f})")

    plt.tight_layout()
    plt.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close()
    print("Saved distribution triptych =>", out_png)

# One distribution triptych per index.  Column names come from
# get_map_cols() (the helper the map section already uses) instead of a
# second hard-coded copy of the same mapping.
for idx_name in index_files:
    obs_col, prism_col, _, _ = get_map_cols(idx_name)
    if obs_col is None:
        # no column mapping for this index
        continue

    if obs_col not in master_table.columns or prism_col not in master_table.columns:
        print(f"Skipping distribution for {idx_name} - missing columns.")
        continue
    # keep only stations that have both an observed and a prism value
    subdf = master_table[[obs_col, prism_col]].dropna()
    if len(subdf) < 2:
        print(f"Skipping distribution for {idx_name} - not enough data.")
        continue
    out_3panel = os.path.join(output_plots, f"{idx_name}_Distribution_3panel.png")
    plot_distribution_triptych(subdf, obs_col, prism_col, idx_name, out_3panel)

###############################################################################
# 6. SCATTER PLOT (SEPARATE, if needed)
###############################################################################
def scatter_index(df, obs_col, prism_col, label, out_png):
    """
    Save a square Obs-vs-prism scatter plot for one index to ``out_png``.

    Rows with a missing value in either column are dropped first.  With
    fewer than two complete pairs nothing is drawn or written.  The plot
    carries a dashed 1:1 line and the Pearson correlation in its title.
    """
    pairs = df[[obs_col, prism_col]].dropna()
    if len(pairs) < 2:
        return

    obs_v, prism_v = pairs[obs_col], pairs[prism_col]
    corr, _ = pearsonr(obs_v, prism_v)

    # common range of both axes, used for the 1:1 reference line
    lo = np.nanmin([obs_v.min(), prism_v.min()])
    hi = np.nanmax([obs_v.max(), prism_v.max()])

    fig, ax = plt.subplots(figsize=(5,5))
    ax.scatter(obs_v, prism_v, edgecolors='k', alpha=0.7)
    ax.plot([lo, hi], [lo, hi], 'r--')
    ax.set_xlabel(f"Obs {label}")
    ax.set_ylabel(f"prism {label}")
    ax.set_title(f"{label} (Corr={corr:.2f})")
    plt.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close()
    print("Saved scatter =>", out_png)

# One stand-alone scatter plot per index.  Column names come from
# get_map_cols() (the helper the map section already uses) instead of a
# further hard-coded copy of the same mapping.
for idx_name in index_files:
    obs_col, prism_col, _, _ = get_map_cols(idx_name)
    if obs_col is None:
        # no column mapping for this index
        continue

    if obs_col not in master_table.columns or prism_col not in master_table.columns:
        continue
    # keep only stations that have both an observed and a prism value
    subdf = master_table[[obs_col, prism_col]].dropna()
    if len(subdf) < 2:
        continue
    out_scat = os.path.join(output_plots, f"{idx_name}_Scatter.png")
    scatter_index(subdf, obs_col, prism_col, idx_name, out_scat)

###############################################################################
# 7. DONE
###############################################################################
print("\nAll steps completed! See outputs in:", output_plots)


In [None]:
# making the spatial maps to show which index is doing badly where across the basin

# =============================================================================
#  PRISM-vs-OBS  |  High-resolution bias maps (Great Lakes Basin)
# =============================================================================
import os, numpy as np, pandas as pd, geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.gridspec as gridspec
import cartopy.crs as ccrs, cartopy.feature as cfeature

# ────────────────────────────────────────────────────────────────────────────
# 1.  CONFIGURATION
# ────────────────────────────────────────────────────────────────────────────
# Input folder with the per-index spreadsheets, and the two outline shapefiles.
indices_dir  = r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder\Ensemble files\PRISM_GLB_Precipitation\ClimaticIndices-8Nearest-3"
shapefile_path = r"D:\PhD\GLB\greatlakes_subbasins\New folder\Great_Lakes.shp"
lakes_shp      = r"D:\PhD\GLB\greatlakes_subbasins\GLB_Water_Bodies\Main_Lakes_GLB.shp"

# Output folder, created next to the inputs if it does not exist yet.
output_plots   = os.path.join(indices_dir, "BiasMaps_highres")
os.makedirs(output_plots, exist_ok=True)

# index name -> "<indices_dir>/<index>.xlsx"
index_files = {
    name: os.path.join(indices_dir, f"{name}.xlsx")
    for name in ("rx1day", "rx5day", "r95p", "r99p")
}

# index name -> (obs column(s), prism column(s)); plain strings for the
# single-column indices, 1-tuples for r95p / r99p.
index_columns = dict(
    rx1day=("obs_rx1day", "prism_rx1day"),
    rx5day=("obs_rx5day", "prism_rx5day"),
    r95p=(("obs_r95amt",), ("prism_r95amt",)),
    r99p=(("obs_r99amt",), ("prism_r99amt",)),
)

HOT_PCTL   = 90
RATIO_CMAP = plt.cm.RdBu_r

# ────────────────────────────────────────────────────────────────────────────
# 2.  MERGE ALL INDICES  ➜ master_table  (adds ratio_* columns)
# ────────────────────────────────────────────────────────────────────────────
# Build master_table: one row per station, one obs / prism / ratio column
# set per index, every metric column prefixed with "<index>_".
master_table = None
for idx, xls in index_files.items():
    df = pd.read_excel(xls)
    df["station_name"] = df["station_name"].astype(str)

    # index_columns holds either plain names or tuples; work with tuples only
    obs_cols, prism_cols = index_columns[idx]
    if not isinstance(obs_cols, tuple):
        obs_cols = (obs_cols,)
        prism_cols = (prism_cols,)

    key_cols = ["station_name", "lat", "lon"]
    sub = df[key_cols].copy()
    for oc, ec in zip(obs_cols, prism_cols):
        sub[oc] = df[oc]
        sub[ec] = df[ec]
        # ratio = prism / obs, named after the obs column without "obs_"
        sub[f"ratio_{oc.replace('obs_','')}"] = df[ec] / df[oc]

    # prefix every metric with the index name (key columns stay untouched)
    sub = sub.rename(columns=lambda c: c if c in key_cols else f"{idx}_{c}")

    if master_table is None:
        master_table = sub
    else:
        master_table = master_table.merge(sub, on=["station_name","lat","lon"], how="outer")

# ────────────────────────────────────────────────────────────────────────────
# 3.  MAP LAYERS
# ────────────────────────────────────────────────────────────────────────────
# Basin and lake outlines, both reprojected to EPSG:4326.
gdf_basin, gdf_lakes = (
    gpd.read_file(path).to_crs(epsg=4326)
    for path in (shapefile_path, lakes_shp)
)

def add_basin_lakes(ax):
    """Add coastlines, borders, basin outlines (blue) and lake outlines (cyan) to *ax*."""
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle=":")
    # basin first, then lakes; each geometry is added as its own artist
    for layer, colour in ((gdf_basin, "blue"), (gdf_lakes, "cyan")):
        for shape in layer.geometry:
            ax.add_geometries([shape], ccrs.PlateCarree(), facecolor="none", edgecolor=colour, linewidth=1)

###############################################################################
# 4.  HIGH-RES BIAS MAPS  –  colour-blind-friendly, concise label (600 dpi)
###############################################################################
import matplotlib.colors as mcolors
import matplotlib.gridspec as gridspec
import numpy as np

# Percentile of |ratio − 1| at or above which bias_map() rings a station in
# red (90 → the worst 10 %).  Re-assigns the value already set in section 1.
HOT_PCTL = 90   # worst 10 % of |ratio−1| are ringed

# ── diverging palette for the ratio maps: blue → near-white → dark red ─────
# Three anchor colours interpolated over 256 levels; bias_map() centres the
# near-white midpoint on ratio = 1 through its norm.
CB_DIV = mcolors.LinearSegmentedColormap.from_list(
    "cb_div", ["#2166ac", "#f7f7f7", "#b2182b"], N=256
)

# ── basin / lake layers ────────────────────────────────────────────────────
# Both shapefiles are reprojected to EPSG:4326.
# NOTE(review): the same two files are already loaded in section 3 above;
# this second read looks redundant — confirm before removing.
gdf_basin = gpd.read_file(shapefile_path).to_crs(epsg=4326)
gdf_lakes = gpd.read_file(lakes_shp).to_crs(epsg=4326)

def add_basin_lakes(ax):
    """
    Add the base layers to *ax*: coastlines, borders, basin outlines
    (charcoal, drawn above) and lake outlines (muted blue-green, drawn below).
    """
    ax.add_feature(cfeature.COASTLINE, linewidth=0.6)
    ax.add_feature(cfeature.BORDERS,   linewidth=0.4, linestyle=":")
    outline_layers = (
        (gdf_basin, dict(edgecolor="#333333", linewidth=1.2, zorder=2)),  # basin
        (gdf_lakes, dict(edgecolor="#5ab4ac", linewidth=1.0, zorder=1)),  # lakes
    )
    # each geometry is added as its own unfilled artist
    for layer, outline_style in outline_layers:
        for shape in layer.geometry:
            ax.add_geometries([shape], ccrs.PlateCarree(),
                              facecolor="none", **outline_style)

# ── high-resolution single-panel bias map ──────────────────────────────────
def bias_map(df, ratio_col, idx_name, out_base, clip_max=3.0):
    """
    Render one single-panel ratio ("bias") map and save it as PNG and TIFF.

    Stations are coloured by ``df[ratio_col]`` on a diverging scale centred
    on 1.  Stations whose |ratio - 1| is at or above the HOT_PCTL-th
    percentile are ringed in red.

    Parameters
    ----------
    df : DataFrame
        Must contain "lon", "lat" and ``ratio_col``.
    ratio_col : str
        Column holding the ratio values.
    idx_name : str
        Index label used in the title and the colour-bar label.
    out_base : str
        Output path without extension; ".png" and ".tif" are appended.
    clip_max : float, default 3.0
        Upper limit for the half-width of the colour range around 1.
    """
    if df.empty:
        # The colour range and the percentile below both need data.
        print(f"Skipping bias map for {idx_name} - no data.")
        return

    fig = plt.figure(figsize=(9, 6), dpi=600)
    gs  = gridspec.GridSpec(1, 2, width_ratios=[20, 1], wspace=0.03)
    ax  = fig.add_subplot(gs[0], projection=ccrs.PlateCarree())
    cax = fig.add_subplot(gs[1])

    # zoom on basin
    xmin, ymin, xmax, ymax = gdf_basin.total_bounds
    ax.set_extent([xmin-0.5, xmax+0.5, ymin-0.5, ymax+0.5])
    add_basin_lakes(ax)

    # symmetric colour range
    deviation = np.abs(df[ratio_col] - 1)
    max_dev = min(deviation.max(), clip_max)
    if not np.isfinite(max_dev) or max_dev <= 0:
        # TwoSlopeNorm requires vmin < vcenter < vmax.  When every ratio is
        # exactly 1 the half-width is 0, so fall back to a small range.
        max_dev = 0.05
    vmin, vmax = 1 - max_dev, 1 + max_dev
    norm = mcolors.TwoSlopeNorm(vmin=vmin, vcenter=1.0, vmax=vmax)

    # scatter points
    sc = ax.scatter(df["lon"], df["lat"], c=df[ratio_col],
                    cmap=CB_DIV, norm=norm, s=50,
                    transform=ccrs.PlateCarree(),
                    edgecolor="k", zorder=10)

    # colour-bar with concise label  (e.g. "r95p_ratio")
    cb = fig.colorbar(sc, cax=cax)
    ticks = np.linspace(vmin, vmax, 5)
    cb.set_ticks(ticks)
    cb.ax.set_yticklabels([f"{t:.2f}" for t in ticks])
    cb.set_label(f"{idx_name}_ratio", fontsize=9)

    # hotspot rings
    tol = np.percentile(deviation, HOT_PCTL)
    hot = deviation >= tol
    ax.scatter(df.loc[hot,"lon"], df.loc[hot,"lat"],
               marker="o", facecolors="none", edgecolors="red",
               s=90, transform=ccrs.PlateCarree(), zorder=11,
               label=f"Large bias (>|{tol:.2f}|)")

    # grid & labels
    gl = ax.gridlines(draw_labels=True, linestyle="--",
                      linewidth=0.5, color="gray", alpha=0.3)
    gl.top_labels = gl.right_labels = False
    gl.xlabel_style = gl.ylabel_style = {"size":8}

    ax.set_title(f"{idx_name} Bias PRISM", fontsize=16, pad=10)
    ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.08),
              fontsize=9, frameon=False)

    # save PNG & TIFF
    for ext in ("png", "tif"):
        fig.savefig(f"{out_base}.{ext}", dpi=600, bbox_inches="tight")
    # Close this figure explicitly instead of whichever one is "current".
    plt.close(fig)

# ── helper: ratio-column per index ─────────────────────────────────────────
def ratio_col_name(idx):
    """
    Return the ratio column name for *idx*, or None for an unknown index.

    rx1day / rx5day use the index name as suffix; r95p and r99p use their
    "amt" columns.
    """
    if idx in ("rx1day", "rx5day"):
        suffix = idx
    elif idx == "r95p":
        suffix = "r95amt"
    elif idx == "r99p":
        suffix = "r99amt"
    else:
        return None
    return f"{idx}_ratio_{suffix}"

# ── generate bias maps ────────────────────────────────────────────────────
# Render every index exactly once.  bias_map() writes a PNG and a TIFF per
# index at 600 dpi, so a second identical pass would only overwrite the
# same files.
for idx in index_files:
    rc = ratio_col_name(idx)
    if rc is None or rc not in master_table.columns:
        # no ratio column mapped / merged for this index
        continue
    # stations need coordinates and a ratio value to be plotted
    df_plot = master_table[["lat", "lon", rc]].dropna()
    if df_plot.empty:
        continue
    bias_map(df_plot, rc, idx,
             os.path.join(output_plots, f"{idx}_BiasPRISM_highres"))

print("Done – colour-blind-friendly bias maps written to:", output_plots)


In [None]:
# Analysis plots and tables for the climatic indices within the whole period 1991-2012 for prcp for EMDNA

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from shapely.geometry import Point
from scipy.stats import pearsonr
import seaborn as sns

###############################################################################
# 1. CONFIGURATION & PATHS
###############################################################################
# Root folder that holds one sub-folder per ensemble member.
root_dir = (r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder"
            r"\Ensemble files\EMDNA_GLB_Precipitation")

# Ensemble sub-folders to analyse: 1, 11, 21, ..., 91 (ten members).
ENSEMBLES = list(range(1, 92, 10))

# Outline shapefiles shared by all ensembles.
shapefile_path = r"D:\PhD\GLB\greatlakes_subbasins\New folder\Great_Lakes.shp"
lakes_shp      = r"D:\PhD\GLB\greatlakes_subbasins\GLB_Water_Bodies\Main_Lakes_GLB.shp"

# index -> (obs column(s), emd column(s)).  Single-column indices use plain
# strings; r95p / r99p carry an amount and a percentage column as tuples.
index_columns = {
    "rx1day": ("obs_rx1day", "emd_rx1day"),
    "rx5day": ("obs_rx5day", "emd_rx5day"),
    "cdd": ("obs_cdd", "emd_cdd"),
    "cwd": ("obs_cwd", "emd_cwd"),
    "r95p": (
        ("obs_r95amt", "obs_r95pct"),
        ("emd_r95amt", "emd_r95pct"),
    ),
    "r99p": (
        ("obs_r99amt", "obs_r99pct"),
        ("emd_r99amt", "emd_r99pct"),
    ),
    "wetdays": ("obs_wetdays5mm", "emd_wetdays5mm"),
    "drydays": ("obs_drydays", "emd_drydays"),
}

# ────────────────────────────────────────────────────────────────────────────
# LOOP over each ensemble — everything below is indented inside this loop
# ────────────────────────────────────────────────────────────────────────────
for ENS in ENSEMBLES:
    banner = "=" * 86
    print("\n" + banner)
    print(f"⧉  Creating analysis plots / tables for ensemble {ENS:03d}  ⧉")
    print(banner)

    # per-ensemble input folder: <root_dir>/<ENS>/ClimaticIndices-25KM
    indices_dir = os.path.join(root_dir, str(ENS), "ClimaticIndices-25KM")
    if not os.path.isdir(indices_dir):
        print(f"   ⚠  Indices directory missing → skip ensemble {ENS:03d}")
        continue

    # plots and tables for this ensemble go into a sub-folder of the inputs
    output_plots = os.path.join(indices_dir, "AnalysisPlots")
    os.makedirs(output_plots, exist_ok=True)

    # index name -> "<indices_dir>/<index>.xlsx" for the eight indices
    index_files = {
        name: os.path.join(indices_dir, f"{name}.xlsx")
        for name in ("rx1day", "rx5day", "cdd", "cwd",
                     "r95p", "r99p", "wetdays", "drydays")
    }

    ###############################################################################
    # 2. MERGE ALL INDICES INTO A MASTER TABLE (A)
    ###############################################################################
    # Build master_table for this ensemble: one row per station, with the
    # obs / emd / ratio / diff columns of every index, each prefixed with
    # "<index>_".  Missing spreadsheets are skipped.
    master_table = None   # reset for this ensemble
    key_cols = ["station_name", "lat", "lon"]

    for idx_name, xls_path in index_files.items():
        if not os.path.isfile(xls_path):
            print(f"   ⚠  Missing {idx_name}.xlsx → skip this index")
            continue

        print(f"[+] Reading {idx_name} from {xls_path}")
        df_idx = pd.read_excel(xls_path)
        df_idx["station_name"] = df_idx["station_name"].astype(str)

        # index_columns holds plain names (one column) or tuples (several);
        # normalise to tuples so one loop handles both cases
        obs_cols, emd_cols = index_columns[idx_name]
        if not isinstance(obs_cols, tuple):
            obs_cols, emd_cols = (obs_cols,), (emd_cols,)

        # Start sub-table with coordinates
        sub = df_idx[key_cols].copy()
        for oc, ec in zip(obs_cols, emd_cols):
            stem = oc.replace("obs_", "")
            sub[oc] = df_idx[oc]
            sub[ec] = df_idx[ec]
            sub[f"ratio_{stem}"] = df_idx[ec] / df_idx[oc]
            sub[f"diff_{stem}"] = df_idx[ec] - df_idx[oc]

        # Prefix every metric column with the index name
        sub.rename(columns={c: f"{idx_name}_{c}"
                            for c in sub.columns
                            if c not in key_cols},
                   inplace=True)

        # Append to / merge into the running master_table
        if master_table is None:
            master_table = sub
        else:
            master_table = pd.merge(master_table, sub,
                                    on=key_cols,
                                    how="outer")

    if master_table is None:
        # No spreadsheet was readable.  Use an empty table with the key
        # columns so the save below does not fail on None.
        print(f"   ⚠  No index spreadsheets found for ensemble {ENS:03d} → master table is empty")
        master_table = pd.DataFrame(columns=key_cols)

    # save master-table for this ensemble
    master_xlsx = os.path.join(output_plots, "MasterTable_AllIndices.xlsx")
    master_table.to_excel(master_xlsx, index=False)
    print(f"\n(A) Master table saved ⇒ {master_xlsx}")
    print("Columns:", master_table.columns.tolist())

    
    ###############################################################################
    # 3. SUMMARY TABLE: MBE, RMSE, STD, CC, d
    ###############################################################################
    def index_of_agreement(obs, model):
        """
        Return 1 - sum((model - obs)^2) / sum((|model - mean(obs)| + |obs - mean(obs)|)^2).

        Gives NaN when the denominator is zero.
        """
        centre = np.mean(obs)
        squared_error = np.sum((model - obs)**2)
        potential_error = np.sum((np.abs(model - centre) + np.abs(obs - centre))**2)
        return np.nan if potential_error == 0 else 1 - squared_error/potential_error
    
    def rmse(a, b):
        """Return the root-mean-square of the element-wise difference a - b."""
        residuals = a - b
        return np.sqrt(np.mean(residuals**2))
    
    def std_of_residuals(a, b):
        """Return the sample standard deviation (ddof=1) of the residuals a - b."""
        residuals = a - b
        return np.std(residuals, ddof=1)
    
    def mean_bias_error(a, b):
        """Return the mean of b - a (positive when b is larger than a on average)."""
        bias = b - a
        return np.mean(bias)
    
    # One summary row per obs/emd column pair: count, mean bias error, RMSE,
    # std of residuals, Pearson correlation and index of agreement.
    summary_cols = ["Index", "Count", "MBE", "RMSE", "STDres", "CC", "d"]
    summary_rows = []
    for idx_name, xls_path in index_files.items():
        if not os.path.isfile(xls_path):
            continue
        df_idx = pd.read_excel(xls_path)

        # index_columns holds plain names (one pair) or tuples (several);
        # normalise to tuples so one loop handles both cases
        obs_cols, emd_cols = index_columns[idx_name]
        multi_column = isinstance(obs_cols, tuple)
        if not multi_column:
            obs_cols, emd_cols = (obs_cols,), (emd_cols,)

        for oc, ec in zip(obs_cols, emd_cols):
            valid = df_idx[[oc, ec]].dropna()
            if len(valid) < 2:
                # the statistics below need at least two complete pairs
                continue
            obs_vals = valid[oc].values
            emd_vals = valid[ec].values
            # multi-column indices are labelled "<index>_<column stem>"
            idx_label = f"{idx_name}_{oc.replace('obs_','')}" if multi_column else idx_name
            summary_rows.append({
                "Index": idx_label,
                "Count": len(valid),
                "MBE": mean_bias_error(obs_vals, emd_vals),
                "RMSE": rmse(obs_vals, emd_vals),
                "STDres": std_of_residuals(obs_vals, emd_vals),
                "CC": pearsonr(obs_vals, emd_vals)[0],
                "d": index_of_agreement(obs_vals, emd_vals),
            })

    # Passing the column list keeps the table well-formed with zero rows;
    # selecting columns from an empty frame would raise KeyError.
    summary_df = pd.DataFrame(summary_rows, columns=summary_cols)
    summary_xlsx = os.path.join(output_plots, "SummaryTable_Extremes.xlsx")
    summary_df.to_excel(summary_xlsx, index=False)
    print(f"(B) Summary Table saved => {summary_xlsx}\n{summary_df}")
    
    ###############################################################################
    # 4. SPATIAL MAPS TRIPTYCH (Observed, EMDNA, Ratio)
    ###############################################################################
    # Basin and lake outlines, both reprojected to EPSG:4326.
    gdf_basin, gdf_lakes = (
        gpd.read_file(path).to_crs(epsg=4326)
        for path in (shapefile_path, lakes_shp)
    )

    def add_basin_lakes(ax):
        """Add coastlines, borders, basin outlines (blue) and lake outlines (cyan) to *ax*."""
        ax.add_feature(cfeature.COASTLINE)
        ax.add_feature(cfeature.BORDERS, linestyle=':')
        # basin first, then lakes; each geometry is added as its own artist
        for layer, colour in ((gdf_basin, 'blue'), (gdf_lakes, 'cyan')):
            for shape in layer.geometry:
                ax.add_geometries([shape], ccrs.PlateCarree(), facecolor='none', edgecolor=colour, linewidth=1)
    
    def plot_map_triptych(df, obs_col, emd_col, ratio_col, idx_name, out_png):
        """
        Creates one figure with three map subplots and saves it to ``out_png``:
          Left: Observed values,
          Center: EMDNA values,
          Right: Ratio (EMDNA/OBS)

        In each panel the stations whose value is at or above that panel's
        90th percentile are ringed in red and described in the legend.

        Parameters
        ----------
        df : DataFrame
            Must contain "lon", "lat" and the three value columns.
        obs_col, emd_col, ratio_col : str
            Columns coloured in the left, centre and right panels.
        idx_name : str
            Index label used in the panel titles.
        out_png : str
            Output image path (saved at 300 dpi).
        """
        fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18,6),
                                 subplot_kw={"projection": ccrs.PlateCarree()})

        def single_map(ax, col, title):
            """Draw one panel: base layers, coloured stations, hotspot rings."""
            ax.set_extent([-95.5, -72, 38.5, 52.5])
            add_basin_lakes(ax)
            sc = ax.scatter(df["lon"], df["lat"], c=df[col], cmap="viridis", s=60,
                            transform=ccrs.PlateCarree(), edgecolor="k", zorder=10)
            cb = fig.colorbar(sc, ax=ax, shrink=0.8)
            cb.set_label(col)
            # Highlight hotspots (top 10%)
            vals = df[col].dropna().values
            has_hotspots = len(vals) > 0
            if has_hotspots:
                thr = np.percentile(vals, 90)
                hot_mask = df[col] >= thr
                ax.scatter(df.loc[hot_mask,"lon"], df.loc[hot_mask,"lat"],
                           marker='o', facecolors='none', edgecolors='red', s=80,
                           transform=ccrs.PlateCarree(), zorder=11,
                           label=f"Hotspot >= {thr:.2f}")
            ax.set_title(title, fontsize=12)
            gl = ax.gridlines(draw_labels=True, linestyle='--', color='gray')
            gl.right_labels = False
            gl.top_labels = False
            # The hotspot rings are the only labelled artist; without them
            # ax.legend() has nothing to show and only emits a warning.
            if has_hotspots:
                ax.legend(loc='upper right')

        single_map(axes[0], obs_col,  f"{idx_name} Observed")
        single_map(axes[1], emd_col,  f"{idx_name} EMDNA")
        single_map(axes[2], ratio_col, f"{idx_name} (EMD/OBS)")

        fig.tight_layout()
        fig.savefig(out_png, dpi=300, bbox_inches="tight")
        # Close this figure explicitly instead of whichever one is "current".
        plt.close(fig)
        print("Saved 3-panel map =>", out_png)
    
    def get_map_cols(idx):
        """
        Return (obs_col, emd_col, diff_col, ratio_col) for the given index.

        The names follow the pattern "<idx>_<kind>_<suffix>".  The suffix is
        the index name itself for rx1day/rx5day/cdd/cwd/drydays, and a fixed
        value for wetdays, r95p and r99p.  An unknown index gives four Nones.
        """
        if idx in ("rx1day", "rx5day", "cdd", "cwd", "drydays"):
            suffix = idx
        elif idx == "wetdays":
            suffix = "wetdays5mm"
        elif idx == "r95p":
            suffix = "r95amt"
        elif idx == "r99p":
            suffix = "r99amt"
        else:
            return None, None, None, None
        return tuple(f"{idx}_{kind}_{suffix}"
                     for kind in ("obs", "emd", "diff", "ratio"))
    
    # One 3-panel map per index.  An index is skipped when it has no column
    # mapping or when any required column is absent from master_table.
    for idx_name in index_files:
        obs_col, emd_col, diff_col, ratio_col = get_map_cols(idx_name)
        if obs_col is None:
            continue
        needed = ["station_name", "lat", "lon", obs_col, emd_col, ratio_col]
        absent = [c for c in needed if c not in master_table.columns]
        if absent:
            print(f"Skipping map for {idx_name} - missing columns in master_table.")
            continue
        # Only stations with coordinates can be placed on the map.
        subdf = master_table[needed].dropna(subset=["lat", "lon"])
        out_png = os.path.join(output_plots, f"{idx_name}_MAP_3panel.png")
        plot_map_triptych(subdf, obs_col, emd_col, ratio_col, idx_name, out_png)
    
    ###############################################################################
    # 5. DISTRIBUTION & BOX/CDF/SCATTER TRIPTYCH
    ###############################################################################
    def plot_distribution_triptych(df, obs_col, emd_col, label, out_png):
        """
        Save a 1 x 3 comparison figure for one index to ``out_png``.

        Panels, left to right:
          * boxplot of the Obs and EMD columns,
          * empirical CDF of each column,
          * Obs-vs-EMD scatter with a dashed 1:1 line; the Pearson
            correlation is shown in the panel title.

        The CDF panel needs at least two non-missing values in each column
        and the scatter panel at least two complete pairs; otherwise the
        panel only carries a "Not enough data" title.
        """
        fig, (ax_box, ax_cdf, ax_sc) = plt.subplots(nrows=1, ncols=3, figsize=(18,6))

        # --- panel 1: boxplot (long-form table: one row per value) ---------
        long_form = pd.DataFrame({"Obs": df[obs_col], "EMD": df[emd_col]}).melt(
            var_name="Dataset", value_name=label)
        sns.boxplot(data=long_form, x="Dataset", y=label, ax=ax_box)
        ax_box.set_title(f"Boxplot: {label}")

        # --- panel 2: empirical CDFs ----------------------------------------
        samples = (("Obs", df[obs_col].dropna()), ("EMD", df[emd_col].dropna()))
        if all(len(series) >= 2 for _, series in samples):
            for name, series in samples:
                ordered = np.sort(series)
                # i-th smallest value gets cumulative probability i / n
                cum_prob = np.arange(1, len(ordered)+1) / len(ordered)
                ax_cdf.plot(ordered, cum_prob, label=name)
            ax_cdf.set_title(f"CDF of {label}")
            ax_cdf.set_xlabel(label)
            ax_cdf.set_ylabel("Probability")
            ax_cdf.legend()
        else:
            ax_cdf.set_title(f"CDF: Not enough data ({label})")

        # --- panel 3: scatter with 1:1 reference line -----------------------
        paired = df[[obs_col, emd_col]].dropna()
        if len(paired) < 2:
            ax_sc.set_title(f"Scatter: Not enough data ({label})")
        else:
            obs_v = paired[obs_col]
            emd_v = paired[emd_col]
            corr, _ = pearsonr(obs_v, emd_v)
            ax_sc.scatter(obs_v, emd_v, edgecolors='k', alpha=0.7)
            lo = np.nanmin([obs_v.min(), emd_v.min()])
            hi = np.nanmax([obs_v.max(), emd_v.max()])
            ax_sc.plot([lo, hi], [lo, hi], 'r--')
            ax_sc.set_xlabel(f"Obs {label}")
            ax_sc.set_ylabel(f"EMD {label}")
            ax_sc.set_title(f"{label} (Corr={corr:.2f})")

        plt.tight_layout()
        plt.savefig(out_png, dpi=300, bbox_inches="tight")
        plt.close()
        print("Saved distribution triptych =>", out_png)
    
    # One distribution triptych per index.  Column names come from
    # get_map_cols() (the helper the map section already uses) instead of a
    # second hard-coded copy of the same mapping.
    for idx_name in index_files:
        obs_col, emd_col, _, _ = get_map_cols(idx_name)
        if obs_col is None:
            # no column mapping for this index
            continue

        if obs_col not in master_table.columns or emd_col not in master_table.columns:
            print(f"Skipping distribution for {idx_name} - missing columns.")
            continue
        # keep only stations that have both an observed and an EMD value
        subdf = master_table[[obs_col, emd_col]].dropna()
        if len(subdf) < 2:
            print(f"Skipping distribution for {idx_name} - not enough data.")
            continue
        out_3panel = os.path.join(output_plots, f"{idx_name}_Distribution_3panel.png")
        plot_distribution_triptych(subdf, obs_col, emd_col, idx_name, out_3panel)
    
    ###############################################################################
    # 6. SCATTER PLOT (SEPARATE, if needed)
    ###############################################################################
    def scatter_index(df, obs_col, emd_col, label, out_png):
        """
        Save a square Obs-vs-EMD scatter plot for one index to ``out_png``.

        Rows with a missing value in either column are dropped first.  With
        fewer than two complete pairs nothing is drawn or written.  The plot
        carries a dashed 1:1 line and the Pearson correlation in its title.
        """
        pairs = df[[obs_col, emd_col]].dropna()
        if len(pairs) < 2:
            return

        obs_v, emd_v = pairs[obs_col], pairs[emd_col]
        corr, _ = pearsonr(obs_v, emd_v)

        # common range of both axes, used for the 1:1 reference line
        lo = np.nanmin([obs_v.min(), emd_v.min()])
        hi = np.nanmax([obs_v.max(), emd_v.max()])

        fig, ax = plt.subplots(figsize=(5,5))
        ax.scatter(obs_v, emd_v, edgecolors='k', alpha=0.7)
        ax.plot([lo, hi], [lo, hi], 'r--')
        ax.set_xlabel(f"Obs {label}")
        ax.set_ylabel(f"EMD {label}")
        ax.set_title(f"{label} (Corr={corr:.2f})")
        plt.savefig(out_png, dpi=300, bbox_inches="tight")
        plt.close()
        print("Saved scatter =>", out_png)
    
    # One stand-alone scatter plot per index.  Column names come from
    # get_map_cols() (the helper the map section already uses) instead of a
    # further hard-coded copy of the same mapping.
    for idx_name in index_files:
        obs_col, emd_col, _, _ = get_map_cols(idx_name)
        if obs_col is None:
            # no column mapping for this index
            continue

        if obs_col not in master_table.columns or emd_col not in master_table.columns:
            continue
        # keep only stations that have both an observed and an EMD value
        subdf = master_table[[obs_col, emd_col]].dropna()
        if len(subdf) < 2:
            continue
        out_scat = os.path.join(output_plots, f"{idx_name}_Scatter.png")
        scatter_index(subdf, obs_col, emd_col, idx_name, out_scat)

    ###############################################################################
    # 7. DONE
    ###############################################################################
    print(f"\n✅  Finished ensemble {ENS:03d}.  Outputs ⇒ {output_plots}")


In [None]:
# making the spatial maps to show which index is doing badly where across the basin

# =============================================================================
#  EMDNA-vs-OBS  |  High-resolution bias maps (Great Lakes Basin)
# =============================================================================
import os, numpy as np, pandas as pd, geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.gridspec as gridspec
import cartopy.crs as ccrs, cartopy.feature as cfeature

# ────────────────────────────────────────────────────────────────────────────
# 1.  CONFIGURATION
# ────────────────────────────────────────────────────────────────────────────
# Input locations: folder of per-index Excel workbooks, basin outline, lake polygons.
indices_dir = r"D:\PhD\GLB\EMDNA(Historical data)\Ensembles\New folder\Ensemble files\EMDNA_GLB_Precipitation\1\ClimaticIndices-25KM"
shapefile_path = r"D:\PhD\GLB\greatlakes_subbasins\New folder\Great_Lakes.shp"
lakes_shp = r"D:\PhD\GLB\greatlakes_subbasins\GLB_Water_Bodies\Main_Lakes_GLB.shp"

# All figures go into a sub-folder next to the index workbooks.
output_plots = os.path.join(indices_dir, "BiasMaps_highres")
os.makedirs(output_plots, exist_ok=True)

# index name -> "<indices_dir>/<index name>.xlsx"
index_files = {
    name: os.path.join(indices_dir, f"{name}.xlsx")
    for name in ("rx1day", "rx5day", "r95p", "r99p")
}

# index name -> (observed column(s), EMDNA column(s)); an entry is either a
# pair of plain column names or a pair of tuples of column names.
index_columns = {
    "rx1day": ("obs_rx1day", "emd_rx1day"),
    "rx5day": ("obs_rx5day", "emd_rx5day"),
    "r95p": (("obs_r95amt",), ("emd_r95amt",)),
    "r99p": (("obs_r99amt",), ("emd_r99amt",)),
}

HOT_PCTL = 90
RATIO_CMAP = plt.cm.RdBu_r

# ────────────────────────────────────────────────────────────────────────────
# 2.  MERGE ALL INDICES  ➜ master_table  (adds ratio_* columns)
# ────────────────────────────────────────────────────────────────────────────
# Columns identifying a station; every per-index table is joined on these.
merge_keys = ["station_name", "lat", "lon"]
master_table = None

for idx, xls in index_files.items():
    df = pd.read_excel(xls)
    df["station_name"] = df["station_name"].astype(str)

    obs_cols, emd_cols = index_columns[idx]
    if not isinstance(obs_cols, tuple):  # single column names -> 1-tuples
        obs_cols, emd_cols = (obs_cols,), (emd_cols,)

    # Collect obs, EMDNA and EMDNA/obs ratio series, already prefixed with the
    # index name (e.g. "r95p_obs_r95amt", "r95p_emd_r95amt", "r95p_ratio_r95amt").
    idx_cols = {}
    for oc, ec in zip(obs_cols, emd_cols):
        idx_cols[f"{idx}_{oc}"] = df[oc]
        idx_cols[f"{idx}_{ec}"] = df[ec]
        idx_cols[f"{idx}_ratio_{oc.replace('obs_', '')}"] = df[ec] / df[oc]
    sub = df[merge_keys].assign(**idx_cols)

    # Outer join so a station missing from one workbook is still kept.
    if master_table is None:
        master_table = sub
    else:
        master_table = master_table.merge(sub, on=merge_keys, how="outer")

# ────────────────────────────────────────────────────────────────────────────
# 3.  MAP LAYERS
# ────────────────────────────────────────────────────────────────────────────
# Basin outline and lake polygons, both reprojected to EPSG:4326.
gdf_basin, gdf_lakes = (
    gpd.read_file(path).to_crs(epsg=4326)
    for path in (shapefile_path, lakes_shp)
)

def add_basin_lakes(ax):
    """Draw coastline, borders, basin outlines (blue) and lake outlines (cyan) on *ax*.

    NOTE(review): a later definition of ``add_basin_lakes`` in this cell
    replaces this one before ``bias_map`` ever calls it.
    """
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle=":")

    # Basin first, then lakes; each geometry is added as its own outline.
    for layer, colour in ((gdf_basin, "blue"), (gdf_lakes, "cyan")):
        for geom in layer.geometry:
            ax.add_geometries([geom], ccrs.PlateCarree(),
                              facecolor="none", edgecolor=colour, linewidth=1)

###############################################################################
# 4.  HIGH-RES BIAS MAPS  –  colour-blind-friendly, concise label (600 dpi)
###############################################################################
import matplotlib.colors as mcolors
import matplotlib.gridspec as gridspec
import numpy as np

# Percentile of |ratio − 1| at or above which a station is ringed (worst 10 %).
HOT_PCTL = 90

# ── diverging palette: blue (#2166ac) → near-white (#f7f7f7) → red (#b2182b) ──
_cb_div_stops = ["#2166ac", "#f7f7f7", "#b2182b"]
CB_DIV = mcolors.LinearSegmentedColormap.from_list("cb_div", _cb_div_stops, N=256)

# ── basin / lake layers ────────────────────────────────────────────────────
# NOTE(review): the same two shapefiles are already loaded into these names
# earlier in this cell (section 3); this re-reads them.
gdf_basin, gdf_lakes = (
    gpd.read_file(path).to_crs(epsg=4326)
    for path in (shapefile_path, lakes_shp)
)

def add_basin_lakes(ax):
    """Overlay coastline, borders, basin outlines and lake outlines on *ax*.

    Basin polygons are stroked in charcoal (#333333) and lake polygons in a
    muted blue-green (#5ab4ac); all polygons are unfilled.
    """
    ax.add_feature(cfeature.COASTLINE, linewidth=0.6)
    ax.add_feature(cfeature.BORDERS, linewidth=0.4, linestyle=":")

    # (layer, edge colour, line width, z-order); basin is added before lakes.
    outline_specs = (
        (gdf_basin, "#333333", 1.2, 2),
        (gdf_lakes, "#5ab4ac", 1.0, 1),
    )
    for layer, edge, width, z in outline_specs:
        for geom in layer.geometry:
            ax.add_geometries([geom], ccrs.PlateCarree(),
                              facecolor="none", edgecolor=edge,
                              linewidth=width, zorder=z)

# ── high-resolution single-panel bias map ──────────────────────────────────
def bias_map(df, ratio_col, idx_name, out_base, clip_max=3.0):
    """Render a station bias map for one index and save it as PNG and TIFF.

    Every row of *df* is drawn at (lon, lat) and coloured by ``df[ratio_col]``
    on a diverging scale centred on 1.0. Rows whose ``|value - 1|`` is at or
    above the ``HOT_PCTL``-th percentile are additionally ringed in red.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide "lat", "lon" and ``ratio_col`` columns.
    ratio_col : str
        Name of the column whose values colour the points.
    idx_name : str
        Index label used in the title and the colour-bar label.
    out_base : str
        Output path without extension; ".png" and ".tif" are appended.
    clip_max : float, optional
        Upper bound on the half-width of the colour range around 1.0.
    """
    fig = plt.figure(figsize=(9, 6), dpi=600)
    gs  = gridspec.GridSpec(1, 2, width_ratios=[20, 1], wspace=0.03)
    ax  = fig.add_subplot(gs[0], projection=ccrs.PlateCarree())
    cax = fig.add_subplot(gs[1])

    # zoom on basin (half a degree of padding on every side)
    xmin, ymin, xmax, ymax = gdf_basin.total_bounds
    ax.set_extent([xmin-0.5, xmax+0.5, ymin-0.5, ymax+0.5])
    add_basin_lakes(ax)

    # distance of every value from the "no bias" value 1.0 — computed once and
    # reused for both the colour range and the hotspot threshold
    deviation = np.abs(df[ratio_col] - 1)

    # symmetric colour range around 1.0, capped at ±clip_max
    max_dev = min(deviation.max(), clip_max)
    if not max_dev > 0:
        # TwoSlopeNorm needs vmin < vcenter < vmax; when every value is exactly
        # 1.0 (or the maximum is NaN) fall back to the full ±clip_max range.
        max_dev = clip_max
    vmin, vmax = 1 - max_dev, 1 + max_dev
    norm = mcolors.TwoSlopeNorm(vmin=vmin, vcenter=1.0, vmax=vmax)

    # scatter points
    sc = ax.scatter(df["lon"], df["lat"], c=df[ratio_col],
                    cmap=CB_DIV, norm=norm, s=50,
                    transform=ccrs.PlateCarree(),
                    edgecolor="k", zorder=10)

    # colour-bar with concise label  (e.g. “r95p_ratio”)
    cb = fig.colorbar(sc, cax=cax)
    ticks = np.linspace(vmin, vmax, 5)
    cb.set_ticks(ticks)
    cb.ax.set_yticklabels([f"{t:.2f}" if abs(t-1) > 1e-6 else "1.00" for t in ticks])
    cb.set_label(f"{idx_name}_ratio", fontsize=9)

    # hotspot rings: rows at or above the HOT_PCTL-th percentile of deviation
    tol = np.percentile(deviation, HOT_PCTL)
    hot = deviation >= tol
    ax.scatter(df.loc[hot,"lon"], df.loc[hot,"lat"],
               marker="o", facecolors="none", edgecolors="red",
               s=90, transform=ccrs.PlateCarree(), zorder=11,
               label=f"Large bias (>|{tol:.2f}|)")

    # grid & labels (bottom and left labels only)
    gl = ax.gridlines(draw_labels=True, linestyle="--",
                      linewidth=0.5, color="gray", alpha=0.3)
    gl.top_labels = gl.right_labels = False
    gl.xlabel_style = gl.ylabel_style = {"size":8}

    ax.set_title(f"{idx_name} Bias EMDNA", fontsize=16, pad=10)
    ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.08),
              fontsize=9, frameon=False)

    # save PNG & TIFF
    for ext in ("png", "tif"):
        fig.savefig(f"{out_base}.{ext}", dpi=600, bbox_inches="tight")
    # close this specific figure rather than whichever one is "current"
    plt.close(fig)

# ── helper: ratio-column per index ─────────────────────────────────────────
def ratio_col_name(idx):
    """Return the master_table ratio column for index *idx*, or None if unknown.

    Known indices map to "<index>_ratio_<suffix>", where the suffix is the
    index name itself for rx1day/rx5day and "r95amt"/"r99amt" for r95p/r99p.
    """
    suffix_by_index = (
        ("rx1day", "rx1day"),
        ("rx5day", "rx5day"),
        ("r95p", "r95amt"),
        ("r99p", "r99amt"),
    )
    for name, suffix in suffix_by_index:
        if idx == name:
            return f"{name}_ratio_{suffix}"
    return None

# ── generate bias maps ────────────────────────────────────────────────────
# Single pass over the configured indices. bias_map() itself writes both
# "<out_base>.png" and "<out_base>.tif", so each index needs rendering only
# once; a second identical loop would merely re-render and overwrite the
# same files.
for idx in index_files:
    rc = ratio_col_name(idx)
    if rc is None or rc not in master_table.columns:
        continue  # no ratio column available for this index
    # keep only stations that have coordinates and a ratio value
    df_plot = master_table[["lat", "lon", rc]].dropna()
    if df_plot.empty:
        continue
    bias_map(df_plot, rc, idx,
             os.path.join(output_plots, f"{idx}_BiasEMDNA_highres"))

print("Done – colour-blind-friendly bias maps written to:", output_plots)
