In [None]:
# Mask the EMDNA files to a bounding box, extract prcp, and organize the yearly outputs (1989-2018) by ensemble member

import os
import glob
import xarray as xr
import numpy as np

# --- USER SETTINGS ---

# Longitude/latitude limits of the extended bounding box, one bound per line
min_lon = -93
max_lon = -74.5
max_lat = 50.5
min_lat = 40.0

# Root input directory; it holds one subfolder per year (e.g., 1989, 1990, ..., 2018)
main_input_folder = r"Z:\DATA\EMDNAdata"

# Root output directory; one subfolder per ensemble (e.g., "001", "002", ...) is created in it
output_folder = r"D:\PhD\GLB\EMDNA full"

# --- SCRIPT BEGINS ---

# Echo the bounding box in a single call so the run log records what was extracted
print(
    "\n".join(
        [
            "Extended bounding box:",
            f"  lon: {min_lon} to {max_lon}",
            f"  lat: {max_lat} to {min_lat}\n",
        ]
    )
)

# Gather year subfolders (1989, 1990, 1991, ..., 2018).
# Sorted because os.listdir returns entries in arbitrary order; sorting keeps
# the processing order and the log reproducible between runs.
year_folders = sorted(
    os.path.join(main_input_folder, d)
    for d in os.listdir(main_input_folder)
    if os.path.isdir(os.path.join(main_input_folder, d))
)

for year_folder in year_folders:
    print(f"Processing year folder: {year_folder}")

    # Collect all .nc4 files in this year folder (sorted for a stable order)
    nc_files = sorted(glob.glob(os.path.join(year_folder, "*.nc4")))
    if not nc_files:
        print(f"  No .nc4 files found in {year_folder}")
        continue

    for file_path in nc_files:
        base_name = os.path.basename(file_path)

        # Skip any mean or spread files; only individual ensemble members are wanted
        if "mean.nc4" in base_name.lower() or "spread.nc4" in base_name.lower():
            print(f"  Skipping file (mean/spread): {base_name}")
            continue

        # Parse the ensemble number from the filename.
        # Example filename: EMDNA_1989.001.nc4 -> ensemble_str = "001"
        # This assumes the pattern: EMDNA_YYYY.XXX.nc4
        parts = base_name.rsplit('.', 2)
        if len(parts) != 3:
            print(f"  WARNING: Unexpected filename format; skipping {base_name}")
            continue
        ensemble_str = parts[1]  # "001", "002", etc.

        print(f"  Processing file: {base_name} (Ensemble {ensemble_str})")

        # The `with` statement owns the file handle. rename()/assign_coords()/sel()
        # return new Dataset objects, and calling close() on those is not a
        # reliable way to release the handle opened here; the context manager
        # also releases it if any step below raises.
        with xr.open_dataset(file_path) as ds_source:
            ds = ds_source

            # Rename dimensions: "x" -> "lon" and "y" -> "lat"
            if "x" in ds.dims and "y" in ds.dims:
                ds = ds.rename({"x": "lon", "y": "lat"})

            # Use the "longitude"/"latitude" variables as the "lon"/"lat"
            # coordinate values, then drop the now-redundant variables
            if "longitude" in ds.variables and "latitude" in ds.variables:
                ds = ds.assign_coords(lon=ds["longitude"], lat=ds["latitude"])
                ds = ds.drop_vars(["longitude", "latitude"])

            # Subset to the bounding box.
            # NOTE(review): slice(max_lat, min_lat) presumably relies on latitude
            # being stored north-to-south -- confirm against the input files.
            ds_masked = ds.sel(lon=slice(min_lon, max_lon),
                               lat=slice(max_lat, min_lat))

            # Extract only the prcp variable (if present)
            if "prcp" not in ds_masked.data_vars:
                print(f"    No 'prcp' variable found in {base_name}, skipping.")
                continue
            ds_prcp = ds_masked[["prcp"]]

            # Create the output subfolder for this ensemble number
            # e.g., D:\PhD\GLB\EMDNA full\001
            ensemble_folder = os.path.join(output_folder, ensemble_str)
            os.makedirs(ensemble_folder, exist_ok=True)

            # Construct output file name: EMDNA_1989.001_masked_prcp.nc4
            output_name = base_name.replace(".nc4", "_masked_prcp.nc4")
            output_file = os.path.join(ensemble_folder, output_name)

            # Write while the source file is still open so the data can be read
            ds_prcp.to_netcdf(output_file)

        print(f"    Saved prcp file to: {output_file}")

print("\nAll folders processed successfully.")


In [None]:
import os
import glob
import xarray as xr

# --------------------- USER SETTINGS ---------------------
# Root folder scanned by the merge loop in this cell; the merged output files
# are written back into its ensemble subfolders.
base_folder = r"D:\PhD\GLB\EMDNA full"  # Path containing subfolders named 001..100

def parse_ensemble_folder(folder_name):
    """
    Interpret a folder name as an ensemble number.

    '001' becomes 1, '010' becomes 10, and so on. A name that int() rejects
    with ValueError yields None instead.
    """
    try:
        ensemble_number = int(folder_name)
    except ValueError:
        ensemble_number = None
    return ensemble_number

# ---------------------------------------------------------
# For each ensemble subfolder (001..100): open all of its .nc4 files,
# concatenate them along the 'time' dimension, and save the result
# as a single NetCDF file inside that same subfolder.
# ---------------------------------------------------------
for folder_name in sorted(os.listdir(base_folder)):
    folder_path = os.path.join(base_folder, folder_name)
    if not os.path.isdir(folder_path):
        continue

    # Convert folder name like '001' -> 1; anything non-numeric is ignored
    ens_num = parse_ensemble_folder(folder_name)
    if ens_num is None:
        continue

    # Gather the yearly .nc4 files in this folder; sorted so that the
    # concatenation follows filename order
    nc_files = sorted(glob.glob(os.path.join(folder_path, "*.nc4")))
    if not nc_files:
        print(f"No .nc4 files found in folder: {folder_name}. Skipping.")
        continue

    # Open each year's NetCDF and concatenate along 'time'. The finally
    # clause closes every file that was opened, even if a later open or the
    # concatenation itself fails.
    ds_list = []
    try:
        for f in nc_files:
            ds_list.append(xr.open_dataset(f))
        ds_merged = xr.concat(ds_list, dim="time")
    finally:
        # NOTE(review): as in the original workflow, the sources are closed
        # before the merged dataset is written -- this presumably relies on
        # the concatenated data already being in memory; confirm.
        for ds in ds_list:
            ds.close()

    # Construct output file name, e.g. "merged_001_1989-2018_prcp.nc".
    # The ".nc" extension keeps the result out of the "*.nc4" glob above.
    output_file = os.path.join(
        folder_path,
        f"merged_{ens_num:03d}_1989-2018_prcp.nc"
    )

    # Save the merged dataset, releasing it even if the write fails
    try:
        ds_merged.to_netcdf(output_file)
    finally:
        ds_merged.close()

    print(f"Merged {len(nc_files)} files into {output_file}")


In [None]:
# Test whether the chosen 10-ensemble subset matches the full 100-ensemble set in mean, variance, and distribution, followed by a bootstrap check


import os
import glob
import xarray as xr
import numpy as np
import pandas as pd
from scipy import stats
import random

# --------------------- 1) USER SETTINGS ---------------------
# Top-level folder containing the ensemble subfolders named 001..100
base_folder = r"D:\PhD\GLB\EMDNA full"

# Full ensemble range: 1, 2, ..., 100
all_ensembles = [*range(1, 101)]

# Chosen subset of 10 ensembles: every tenth member starting at 1
# (1, 11, 21, 31, 41, 51, 61, 71, 81, 91)
subset_ensembles = [*range(1, 92, 10)]


def parse_ensemble_folder(folder_name: str):
    """
    Interpret a folder name as an ensemble number.

    '001' becomes 1, '010' becomes 10, and so on. A name that int() rejects
    with ValueError yields None instead.
    """
    try:
        parsed = int(folder_name)
    except ValueError:
        parsed = None
    return parsed


def get_annual_means_for_ensemble(ensemble_folder: str, ens_str: str):
    """
    Return the annual domain-mean precipitation for one ensemble member.

    Steps:
      1) Look for "merged_<ens_str>_1989-2018_prcp.nc" inside `ensemble_folder`.
      2) Open it with xarray.
      3) If 'time' is not datetime64, replace it with a daily DatetimeIndex
         starting 1989-01-01 with as many entries as the 'time' dimension has.
      4) Average 'prcp' over the 'lon' and 'lat' dimensions.
      5) Resample to annual means using 'YS' (year start).

    Parameters:
      ensemble_folder: folder expected to contain the merged file.
      ens_str: ensemble label as used in the file name, e.g. "001".

    Returns:
      The values of the annual means as a NumPy array (one entry per yearly
      resampling bin), or None when the merged file does not exist.
    """
    # Example file name: merged_001_1989-2018_prcp.nc
    merged_file = os.path.join(ensemble_folder, f"merged_{ens_str}_1989-2018_prcp.nc")

    if not os.path.isfile(merged_file):
        print(f"  File not found: {merged_file}. Skipping ensemble {ens_str}.")
        return None

    # The context manager owns the file handle, so it is released even when a
    # step below raises (e.g. 'prcp' is missing). assign_coords() returns a new
    # object, so the handle is kept separate from the working dataset.
    with xr.open_dataset(merged_file) as ds_source:
        ds = ds_source

        # Ensure 'time' is a daily DatetimeIndex, if not already
        if not np.issubdtype(ds.time.dtype, np.datetime64):
            n_time = ds.sizes['time']
            new_time = pd.date_range(start="1989-01-01", periods=n_time, freq='D')
            ds = ds.assign_coords(time=new_time)

        # Domain-average precipitation
        ds_domain_mean = ds['prcp'].mean(dim=['lon', 'lat'])

        # Resample annually with 'YS' (year start) to avoid 'AS' deprecation warning
        annual_means = ds_domain_mean.resample(time='YS').mean()

        # Materialize the values while the file is still open
        annual_means_values = annual_means.values

    return annual_means_values


# --------------------- 2) GATHER DATA FOR EACH ENSEMBLE ---------------------
# Maps ensemble number (1..100) -> array of annual means for that ensemble
ensemble_annual_means = {}

for folder_name in sorted(os.listdir(base_folder)):
    folder_path = os.path.join(base_folder, folder_name)
    # Only directories can be ensemble folders
    if not os.path.isdir(folder_path):
        continue

    # Ignore folders that are not numeric or fall outside the full ensemble range
    ens_num = parse_ensemble_folder(folder_name)
    if ens_num is None or ens_num not in all_ensembles:
        continue

    ens_str = f"{ens_num:03d}"
    ann_means = get_annual_means_for_ensemble(folder_path, ens_str)
    # None means the merged file was missing; leave that ensemble out
    if ann_means is None:
        continue
    ensemble_annual_means[ens_num] = ann_means

print(f"\nFound annual means for {len(ensemble_annual_means)} ensembles (out of 100).")

# --------------------- 3) BUILD FULL & SUBSET ARRAYS (ALL ANNUAL MEANS) ---------------------
# Pool every annual mean of every available ensemble into one flat array
full_data = np.array([
    annual_value
    for member in sorted(all_ensembles)
    if member in ensemble_annual_means
    for annual_value in ensemble_annual_means[member]
])

# Same pooling, restricted to the chosen subset (kept in the order listed)
subset_data = np.array([
    annual_value
    for member in subset_ensembles
    if member in ensemble_annual_means
    for annual_value in ensemble_annual_means[member]
])

print("Full set array shape (annual means):", full_data.shape)
print("Subset array shape (annual means):", subset_data.shape)

# --------------------- 4) T-TEST, LEVENE'S TEST, K–S TEST ---------------------
# Each test compares the pooled subset values against the pooled full-set
# values; the verdict line is chosen at the 0.05 level.

# Welch's t-test (unequal variances) on the means
t_stat, p_value_ttest = stats.ttest_ind(subset_data, full_data, equal_var=False)
print("\n== Two-sample t-test (compare means) ==")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value:     {p_value_ttest:.6g}")
print(
    "  => Means are significantly different (p < 0.05)."
    if p_value_ttest < 0.05
    else "  => No significant difference in means (p >= 0.05)."
)

# Levene's test, centered on the mean, on the variances
W_stat, p_value_levene = stats.levene(subset_data, full_data, center='mean')
print("\n== Levene’s test (compare variances) ==")
print(f"  W-statistic: {W_stat:.4f}")
print(f"  p-value:     {p_value_levene:.6g}")
print(
    "  => Variances differ significantly (p < 0.05)."
    if p_value_levene < 0.05
    else "  => No significant difference in variances (p >= 0.05)."
)

# Two-sample Kolmogorov-Smirnov test on the distributions
ks_stat, ks_p = stats.ks_2samp(subset_data, full_data)
print("\n== Two-sample K–S test (compare distributions) ==")
print(f"  KS statistic: {ks_stat:.4f}")
print(f"  p-value:      {ks_p:.6g}")
print(
    "  => Distributions differ significantly (p < 0.05)."
    if ks_p < 0.05
    else "  => No significant difference in distributions (p >= 0.05)."
)


# --------------------- 5) BOOTSTRAPPING ---------------------
# (A) Collapse each ensemble's annual means to a single value per ensemble
ensemble_mean = {
    member: annual_values.mean()
    for member, annual_values in ensemble_annual_means.items()
}

# Ensemble-level means of the full set, in ascending ensemble order (length <= 100)
all_ensemble_means = np.array(
    [ensemble_mean[member] for member in sorted(all_ensembles) if member in ensemble_mean]
)

# Ensemble-level means of the chosen subset, in the order listed (length <= 10)
subset_ensemble_means = np.array(
    [ensemble_mean[member] for member in subset_ensembles if member in ensemble_mean]
)

# Observed statistic: the average of the subset's ensemble-level means
real_subset_mean = subset_ensemble_means.mean()

def bootstrap_test_ensemble_means(all_means, real_subset_mean, n_ensembles=10, n_boot=10000, seed=None):
    """
    Perform a bootstrap to see how typical 'real_subset_mean' is
    compared to random draws of 'n_ensembles' values from 'all_means'.

    all_means: 1-D array-like of shape (N,), e.g. 100 ensemble-level means
    real_subset_mean: float, the chosen subset's mean
    n_ensembles: number of values in each draw (10)
    n_boot: number of bootstrap iterations (e.g., 10000)
    seed: optional integer; when given, the draws come from a private
          generator seeded with it, so the result is reproducible. When
          None (default), NumPy's global random state is used.

    Returns: (boot_means, p_value_two_sided)
      - boot_means: array of shape (n_boot,) with the average for each draw
      - p_value_two_sided: twice the smaller of the two tail fractions
                           (draws <= and draws >= 'real_subset_mean'),
                           capped at 1.

    Raises: ValueError if 'all_means' is empty.
    """
    all_means = np.asarray(all_means, dtype=float)
    N = len(all_means)
    if N == 0:
        raise ValueError("all_means must contain at least one value to resample from.")

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Draw every resample at once: one row per bootstrap iteration, each row
    # holding 'n_ensembles' indices picked WITH replacement
    sample_indices = rng.randint(0, N, size=(n_boot, n_ensembles))
    boot_means = all_means[sample_indices].mean(axis=1)

    # Fraction of bootstrap means <= real_subset_mean
    fraction_le = np.mean(boot_means <= real_subset_mean)
    # Fraction of bootstrap means >= real_subset_mean
    fraction_ge = np.mean(boot_means >= real_subset_mean)

    # Two-sided test. Draws that tie with the observed value are counted in
    # both tails, so the doubled fraction can exceed 1; cap it so the result
    # is a valid p-value.
    p_two_sided = np.minimum(1.0, 2 * min(fraction_le, fraction_ge))
    return boot_means, p_two_sided


# Run the bootstrap: 10000 random 10-ensemble draws from the full set's means
boot_means, p_val_boot = bootstrap_test_ensemble_means(
    all_ensemble_means, real_subset_mean, n_ensembles=10, n_boot=10000
)

# Report the observed subset mean, its bootstrap p-value, and a verdict at the 0.05 level
print("\n== BOOTSTRAP RESULTS (ensemble-level means) ==")
print(f"  Subset of 10 average: {real_subset_mean:.4f}")
print(f"  Two-sided p-value from bootstrap: {p_val_boot:.4f}")
print(
    "  => The chosen 10-ensemble subset’s mean is unusual (outside ~95% range)."
    if p_val_boot < 0.05
    else "  => The chosen 10-ensemble subset’s mean is typical of a random 10-ensemble draw."
)

print("\nAll comparisons (including bootstrap) complete.")

