In [62]:
import xarray as xr
import netCDF4 as nc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon

# Define parameters for analysis
start_year = 2015
end_year = 2016
num_samples = 100
longhurst_region_code = 36
region_name = "Tasman Sea"
cat_value = 4


# Section 1: Load and Preprocess Data
# Load the netCDF file
dataset = xr.open_dataset('/Users/sayooj/Downloads/GlobalAtlas_MHW_ESACCISST_1deg_1982-2021.nc', decode_times=False)

# Define the start and end indices for slicing
start_idx = (start_year - 1982) * 365
end_idx = start_idx + (end_year - start_year + 1) * 365 - 1

# Create a new dataset with data only for the specified time range
new_dataset = dataset.isel(time=slice(start_idx, end_idx + 1))

# Convert data variables to float32
new_dataset['cat'] = new_dataset['cat'].astype('float32')
new_dataset['mhw'] = new_dataset['mhw'].astype('float32')

# Save the new dataset to a new netCDF file
new_dataset.to_netcdf(f'/Users/sayooj/Downloads/{region_name}_{start_year}_{end_year}.nc')

# Section 2: Create Monthly Masks
# Load the netCDF file
nc_file = nc.Dataset(f'/Users/sayooj/Downloads/{region_name}_{start_year}_{end_year}.nc', 'r')

# Retrieve latitude, longitude, and time variables
lat = nc_file.variables['lat'][:]
lon = nc_file.variables['lon'][:]
time = nc_file.variables['time'][:]
cat_daily = nc_file.variables['cat'][:]

# Calculate the number of months
num_months = int(len(time) / 30)

# Create an empty array to store monthly masks
monthly_masks = np.zeros((num_months, len(lat), len(lon)))

# Iterate over each month
for month in range(num_months):
    # Calculate the start and end indices for the current month
    start_idx = month * 30
    end_idx = (month + 1) * 30

    # Extract the daily cat values for the current month
    month_data = cat_daily[start_idx:end_idx]

    # Find the maximum category occurrence for each lat-lon point in the current month
    max_values = np.max(month_data, axis=0)

    # Set areas impacted by the highest category occurrence
    monthly_mask = np.where(max_values > 0, max_values, 0)

    # Save the monthly mask
    monthly_masks[month] = monthly_mask

# Create a new netCDF file to save the monthly masks
output_file = nc.Dataset(f'/Users/sayooj/Downloads/monthly_masks_{region_name}_{start_year}_{end_year}.nc', 'w')

# Define dimensions in the new netCDF file
output_file.createDimension('lat', len(lat))
output_file.createDimension('lon', len(lon))
output_file.createDimension('time', num_months)

# Create variables in the new netCDF file
lat_var = output_file.createVariable('lat', np.float32, ('lat',))
lon_var = output_file.createVariable('lon', np.float32, ('lon',))
time_var = output_file.createVariable('time', np.float64, ('time',))
mask_var = output_file.createVariable('monthly_masks', np.int32, ('time', 'lat', 'lon'))

# Assign values to variables in the new netCDF file
lat_var[:] = lat
lon_var[:] = lon
time_var[:] = np.arange(1, num_months + 1)
mask_var[:, :, :] = monthly_masks

# Add attributes to variables
lat_var.units = 'degrees_north'
lon_var.units = 'degrees_east'
time_var.units = 'months since {}-01-01'.format(start_year)
mask_var.units = '1'

# Add global attributes
output_file.description = 'Monthly masks for marine heatwaves in {}'.format(region_name)

# Close the new netCDF file
output_file.close()

# Close the netCDF file
nc_file.close()

# Section 3: Mask Based on Longhurst Regions
# Open the Longhurst region file
longhurst_file = '/Users/sayooj/Downloads/Longhurst_1_deg.nc'
longhurst_dataset = nc.Dataset(longhurst_file)

# Read the Longhurst variable
longhurst = longhurst_dataset.variables['longhurst'][:]

# Open the monthly masks file
monthly_mask_file = f'/Users/sayooj/Downloads/monthly_masks_{region_name}_{start_year}_{end_year}.nc'
monthly_mask_dataset = nc.Dataset(monthly_mask_file)

# Read the monthly masks variable
monthly_masks = monthly_mask_dataset.variables['monthly_masks'][:]

# Create a mask based on Longhurst regions and transpose it
mask = np.isin(longhurst, [longhurst_region_code]).T

# Apply the mask to each time step individually
masked_monthly_masks = np.where(mask, monthly_masks, np.nan)

# Get dimensions from the original dataset
lat = monthly_mask_dataset.variables['lat'][:]
lon = monthly_mask_dataset.variables['lon'][:]
time = monthly_mask_dataset.variables['time'][:]

# Close the netCDF files
longhurst_dataset.close()
monthly_mask_dataset.close()

# Save the masked data to a new netCDF file
masked_file_path = f'/Users/sayooj/Downloads/monthly_masks_masked_{region_name}_{start_year}_{end_year}.nc'
with nc.Dataset(masked_file_path, 'w') as masked_dataset:
    # Create dimensions
    masked_dataset.createDimension('lat', len(lat))
    masked_dataset.createDimension('lon', len(lon))
    masked_dataset.createDimension('time', len(time))

    # Create variables
    masked_lat = masked_dataset.createVariable('lat', 'float', ('lat',))
    masked_lat.units = 'degrees_north'
    masked_lon = masked_dataset.createVariable('lon', 'float', ('lon',))
    masked_lon.units = 'degrees_east'
    masked_time = masked_dataset.createVariable('time', 'double', ('time',))

    masked_monthly_masks_var = masked_dataset.createVariable('monthly_masks', 'float', ('time', 'lat', 'lon'),
                                                           fill_value=np.nan)  # Use an appropriate fill value

    # Assign values to variables
    masked_lat[:] = lat
    masked_lon[:] = lon
    masked_time[:] = time
    masked_monthly_masks_var[:] = np.where(masked_monthly_masks != -2147483648.0, masked_monthly_masks, np.nan)

    masked_monthly_masks_var.min = 1
    masked_monthly_masks_var.max = cat_value

print("Masking complete. The masked data has been saved to:", masked_file_path)

# Section 4: Statistical Analysis
# Open the dataset file
dataset_file = nc.Dataset('/Users/sayooj/Downloads/OceanSODA-ETHZ_GRaCER_v2021a_1982-2020.nc')

# Get the variable data
fgco2 = dataset_file.variables['fgco2'][:]
ph = dataset_file.variables['ph_total'][:]
aragonite_saturation = dataset_file.variables['omega_ar'][:]
sst = dataset_file.variables['temperature'][:]

# Create the time axis for the specified years
dates = pd.date_range(start='{}-01-01'.format(start_year), end='{}-12-31'.format(end_year), freq='M')

# Find the indices corresponding to the time period
indices_year = np.where(dates.year.isin([start_year, end_year]))[0]

# Slice the data for the specified years
fgco2_year = fgco2[indices_year]
sst_year = sst[indices_year]
ph_year = ph[indices_year]
aragonite_saturation_year = aragonite_saturation[indices_year]

# Open the mask file for the specified region
mask_file = nc.Dataset(f'/Users/sayooj/Downloads/monthly_masks_masked_{region_name}_{start_year}_{end_year}.nc')

# Get the mask variable for the specified region
mask_region = mask_file.variables['monthly_masks'][:]

# Apply the mask to the sliced data
fgco2_masked_year = np.ma.masked_array(fgco2_year, np.logical_not(mask_region))
sst_masked_year = np.ma.masked_array(sst_year, np.logical_not(mask_region))
ph_masked_year = np.ma.masked_array(ph_year, np.logical_not(mask_region))
aragonite_saturation_masked_year = np.ma.masked_array(aragonite_saturation_year, np.logical_not(mask_region))

# Calculate median values with the mask
fgco2_median_region_year = np.ma.median(fgco2_masked_year, axis=(1, 2))
sst_median_region_year = np.ma.median(sst_masked_year, axis=(1, 2))
ph_median_region_year = np.ma.median(ph_masked_year, axis=(1, 2))
aragonite_saturation_median_region_year = np.ma.median(aragonite_saturation_masked_year, axis=(1, 2))

# Get indices where the mask values are equal to the specified category value (e.g., heatwave period)
indices_heatwave_region_year = np.where(mask_region == cat_value)[0]

# Get indices where the mask values are not equal to the specified category value (e.g., non-heatwave period)
indices_non_heatwave_region_year = np.where(mask_region != cat_value)[0]

# Perform the Wilcoxon signed-rank tests for each variable
p_values_region_year = []
median_diff_region_year = []
std_dev_region_year = []

# Perform the Wilcoxon signed-rank tests for fgco2
for _ in range(num_samples):
    # Randomly select indices for heatwave and non-heatwave periods
    sample_indices_region_heatwave_year = np.random.choice(indices_heatwave_region_year, len(indices_heatwave_region_year), replace=True)
    sample_indices_region_non_heatwave_year = np.random.choice(indices_non_heatwave_region_year, len(indices_heatwave_region_year), replace=True)

    # Filter the data based on the sampled indices
    sample_fgco2_median_region_heatwave_year = fgco2_median_region_year[sample_indices_region_heatwave_year]
    sample_fgco2_median_region_non_heatwave_year = fgco2_median_region_year[sample_indices_region_non_heatwave_year]

    # Perform the Wilcoxon signed-rank tests
    _, p_value_region_year = wilcoxon(sample_fgco2_median_region_heatwave_year, sample_fgco2_median_region_non_heatwave_year)

    # Calculate the median difference and standard deviation
    median_diff_region_year.append(np.median(sample_fgco2_median_region_heatwave_year - sample_fgco2_median_region_non_heatwave_year))
    std_dev_region_year.append(np.std(sample_fgco2_median_region_heatwave_year - sample_fgco2_median_region_non_heatwave_year))

    # Append the p-value to the respective list
    p_values_region_year.append(p_value_region_year)

# Calculate the mean p-value, median difference, and standard deviation for fgco2
mean_p_value_fgco2_region_year = np.mean(p_values_region_year)
mean_median_diff_fgco2_region_year = np.mean(median_diff_region_year)
mean_std_dev_fgco2_region_year = np.mean(std_dev_region_year)

# Perform the Wilcoxon signed-rank tests for sst
p_values_region_year = []
median_diff_region_year = []
std_dev_region_year = []

for _ in range(num_samples):
    # Randomly select indices for heatwave and non-heatwave periods
    sample_indices_region_heatwave_year = np.random.choice(indices_heatwave_region_year, len(indices_heatwave_region_year), replace=True)
    sample_indices_region_non_heatwave_year = np.random.choice(indices_non_heatwave_region_year, len(indices_heatwave_region_year), replace=True)

    # Filter the data based on the sampled indices
    sample_sst_median_region_heatwave_year = sst_median_region_year[sample_indices_region_heatwave_year]
    sample_sst_median_region_non_heatwave_year = sst_median_region_year[sample_indices_region_non_heatwave_year]

    # Perform the Wilcoxon signed-rank tests
    _, p_value_region_year = wilcoxon(sample_sst_median_region_heatwave_year, sample_sst_median_region_non_heatwave_year)

    # Calculate the median difference and standard deviation
    median_diff_region_year.append(np.median(sample_sst_median_region_heatwave_year - sample_sst_median_region_non_heatwave_year))
    std_dev_region_year.append(np.std(sample_sst_median_region_heatwave_year - sample_sst_median_region_non_heatwave_year))

    # Append the p-value to the respective list
    p_values_region_year.append(p_value_region_year)

# Calculate the mean p-value, median difference, and standard deviation for sst
mean_p_value_sst_region_year = np.mean(p_values_region_year)
mean_median_diff_sst_region_year = np.mean(median_diff_region_year)
mean_std_dev_sst_region_year = np.mean(std_dev_region_year)

# Perform the Wilcoxon signed-rank tests for ph
p_values_region_year = []
median_diff_region_year = []
std_dev_region_year = []

for _ in range(num_samples):
    # Randomly select indices for heatwave and non-heatwave periods
    sample_indices_region_heatwave_year = np.random.choice(indices_heatwave_region_year, len(indices_heatwave_region_year), replace=True)
    sample_indices_region_non_heatwave_year = np.random.choice(indices_non_heatwave_region_year, len(indices_heatwave_region_year), replace=True)

    # Filter the data based on the sampled indices
    sample_ph_median_region_heatwave_year = ph_median_region_year[sample_indices_region_heatwave_year]
    sample_ph_median_region_non_heatwave_year = ph_median_region_year[sample_indices_region_non_heatwave_year]

    # Perform the Wilcoxon signed-rank tests
    _, p_value_region_year = wilcoxon(sample_ph_median_region_heatwave_year, sample_ph_median_region_non_heatwave_year)

    # Calculate the median difference and standard deviation
    median_diff_region_year.append(np.median(sample_ph_median_region_heatwave_year - sample_ph_median_region_non_heatwave_year))
    std_dev_region_year.append(np.std(sample_ph_median_region_heatwave_year - sample_ph_median_region_non_heatwave_year))

    # Append the p-value to the respective list
    p_values_region_year.append(p_value_region_year)

# Calculate the mean p-value, median difference, and standard deviation for ph
mean_p_value_ph_region_year = np.mean(p_values_region_year)
mean_median_diff_ph_region_year = np.mean(median_diff_region_year)
mean_std_dev_ph_region_year = np.mean(std_dev_region_year)

# Perform the Wilcoxon signed-rank tests for aragonite saturation
p_values_region_year = []
median_diff_region_year = []
std_dev_region_year = []

for _ in range(num_samples):
    # Randomly select indices for heatwave and non-heatwave periods
    sample_indices_region_heatwave_year = np.random.choice(indices_heatwave_region_year, len(indices_heatwave_region_year), replace=True)
    sample_indices_region_non_heatwave_year = np.random.choice(indices_non_heatwave_region_year, len(indices_heatwave_region_year), replace=True)

    # Filter the data based on the sampled indices
    sample_aragonite_saturation_median_region_heatwave_year = aragonite_saturation_median_region_year[sample_indices_region_heatwave_year]
    sample_aragonite_saturation_median_region_non_heatwave_year = aragonite_saturation_median_region_year[sample_indices_region_non_heatwave_year]

    # Perform the Wilcoxon signed-rank tests
    _, p_value_region_year = wilcoxon(sample_aragonite_saturation_median_region_heatwave_year, sample_aragonite_saturation_median_region_non_heatwave_year)

    # Calculate the median difference and standard deviation
    median_diff_region_year.append(np.median(sample_aragonite_saturation_median_region_heatwave_year - sample_aragonite_saturation_median_region_non_heatwave_year))
    std_dev_region_year.append(np.std(sample_aragonite_saturation_median_region_heatwave_year - sample_aragonite_saturation_median_region_non_heatwave_year))

    # Append the p-value to the respective list
    p_values_region_year.append(p_value_region_year)

# Calculate the mean p-value, median difference, and standard deviation for aragonite saturation
mean_p_value_aragonite_saturation_region_year = np.mean(p_values_region_year)
mean_median_diff_aragonite_saturation_region_year = np.mean(median_diff_region_year)
mean_std_dev_aragonite_saturation_region_year = np.mean(std_dev_region_year)

# Print the results
print(f"Results for fgco2 in {region_name} in {start_year}/{end_year}:")
print("Mean p-value:", mean_p_value_fgco2_region_year)
print("Mean median difference:", mean_median_diff_fgco2_region_year)
print("Mean standard deviation:", mean_std_dev_fgco2_region_year)

print(f"\nResults for sst in {region_name} in {start_year}/{end_year}:")
print("Mean p-value:", mean_p_value_sst_region_year)
print("Mean median difference:", mean_median_diff_sst_region_year)
print("Mean standard deviation:", mean_std_dev_sst_region_year)

print(f"\nResults for ph in {region_name} in {start_year}/{end_year}:")
print("Mean p-value:", mean_p_value_ph_region_year)
print("Mean median difference:", mean_median_diff_ph_region_year)
print("Mean standard deviation:", mean_std_dev_ph_region_year)

print(f"\nResults for aragonite saturation in {region_name} in {start_year}/{end_year}:")
print("Mean p-value:", mean_p_value_aragonite_saturation_region_year)
print("Mean median difference:", mean_median_diff_aragonite_saturation_region_year)
print("Mean standard deviation:", mean_std_dev_aragonite_saturation_region_year)

# Close the netCDF files
dataset_file.close()
mask_file.close()


Masking complete. The masked data has been saved to: /Users/sayooj/Downloads/monthly_masks_masked_Tasman Sea_2015_2016.nc




Results for fgco2 in Tasman Sea in 2015/2016:
Mean p-value: 0.4758150233419608
Mean median difference: -0.029110721834003924
Mean standard deviation: 0.05785404956520173

Results for sst in Tasman Sea in 2015/2016:
Mean p-value: 0.4406264349675293
Mean median difference: -0.16939254283905028
Mean standard deviation: 0.28117283229059586

Results for ph in Tasman Sea in 2015/2016:
Mean p-value: 0.5857235574184234
Mean median difference: 0.0005371665954589844
Mean standard deviation: 0.0021317238840245583

Results for aragonite saturation in Tasman Sea in 2015/2016:
Mean p-value: 0.5818548200151398
Mean median difference: 0.001022564172744751
Mean standard deviation: 0.03514444547459073
