In [36]:
import xarray as xr
import netCDF4 as nc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon

# Define parameters for analysis
start_year = 2011
end_year = 2011
num_samples = 100
variable_selection = 'fgco2'  # Change this to analyze different variables ('fgco2', 'ph', 'omega_ar', 'temperature','spco2')
longhurst_region_code = 29  # Change this to analyze different Longhurst regions
region_name = "western australia"
cat_value = 4 # Change this to select a different cat value for category selection

# Load the netCDF file
dataset = xr.open_dataset('/Users/sayooj/Downloads/GlobalAtlas_MHW_ESACCISST_1deg_1982-2021.nc', decode_times=False)

# Define the start and end indices for slicing
start_idx = (start_year - 1982) * 365  # Index corresponding to start date
end_idx = start_idx + (end_year - start_year + 1) * 365 - 1  # Index corresponding to end date

# Create a new dataset with data only for the specified time range
new_dataset = dataset.isel(time=slice(start_idx, end_idx + 1))

# Convert data variables to float32
new_dataset['cat'] = new_dataset['cat'].astype('float32')
new_dataset['mhw'] = new_dataset['mhw'].astype('float32')

# Save the new dataset to a new netCDF file
new_dataset.to_netcdf(f'/Users/sayooj/Downloads/{region_name}_{start_year}_{end_year}.nc')

# Load the netCDF file
nc_file = nc.Dataset(f'/Users/sayooj/Downloads/{region_name}_{start_year}_{end_year}.nc', 'r')

# Retrieve latitude, longitude, and time variables
lat = nc_file.variables['lat'][:]
lon = nc_file.variables['lon'][:]
time = nc_file.variables['time'][:]

# Extract cat variable
cat_daily = nc_file.variables['cat'][:]

# Calculate the number of months
num_months = int(len(time) / 30)

# Create an empty array to store monthly masks
monthly_masks = np.zeros((num_months, len(lat), len(lon)))

# Iterate over each month
for month in range(num_months):
    # Calculate the start and end indices for the current month
    start_idx = month * 30
    end_idx = (month + 1) * 30

    # Extract the daily cat values for the current month
    month_data = cat_daily[start_idx:end_idx]

    # Find the maximum category occurrence for each lat-lon point in the current month
    max_values = np.max(month_data, axis=0)

    # Set areas impacted by the highest category occurrence
    monthly_mask = np.where(max_values > 0, max_values, 0)

    # Save the monthly mask
    monthly_masks[month] = monthly_mask

# Create a new netCDF file to save the monthly masks
output_file = nc.Dataset(f'/Users/sayooj/Downloads/monthly_masks_{region_name}_{start_year}_{end_year}.nc', 'w')

# Define dimensions in the new netCDF file
output_file.createDimension('lat', len(lat))
output_file.createDimension('lon', len(lon))
output_file.createDimension('time', num_months)

# Create variables in the new netCDF file
lat_var = output_file.createVariable('lat', np.float32, ('lat',))
lon_var = output_file.createVariable('lon', np.float32, ('lon',))
time_var = output_file.createVariable('time', np.float64, ('time',))
mask_var = output_file.createVariable('monthly_masks', np.int32, ('time', 'lat', 'lon'))

# Assign values to variables in the new netCDF file
lat_var[:] = lat
lon_var[:] = lon
time_var[:] = np.arange(1, num_months + 1)
mask_var[:, :, :] = monthly_masks

# Add attributes to variables
lat_var.units = 'degrees_north'
lon_var.units = 'degrees_east'
time_var.units = 'months since 2015-01-01'
mask_var.units = '1'

# Add global attributes
output_file.description = 'Monthly masks for marine heatwaves'

# Close the new netCDF file
output_file.close()

# Close the netCDF file
nc_file.close()

# Open the Longhurst_1_deg.nc file
longhurst_file = '/Users/sayooj/Downloads/Longhurst_1_deg.nc'
longhurst_dataset = nc.Dataset(longhurst_file)

# Read the longhurst variable
longhurst = longhurst_dataset.variables['longhurst'][:]

# Open the monthly_masks file
monthly_mask_file = f'/Users/sayooj/Downloads/monthly_masks_{region_name}_{start_year}_{end_year}.nc'
monthly_mask_dataset = nc.Dataset(monthly_mask_file)

# Read the monthly_masks variable
monthly_masks = monthly_mask_dataset.variables['monthly_masks'][:]

# Create a mask based on Longhurst regions and transpose it
mask = np.isin(longhurst, [longhurst_region_code]).T

# Apply the mask to each time step individually
masked_monthly_masks = np.where(mask, monthly_masks, np.nan)

# Get dimensions from the original dataset
lat = monthly_mask_dataset.variables['lat'][:]
lon = monthly_mask_dataset.variables['lon'][:]
time = monthly_mask_dataset.variables['time'][:]

# Close the netCDF files
longhurst_dataset.close()
monthly_mask_dataset.close()

# Save the masked data to a new netCDF file
masked_file_path = f'/Users/sayooj/Downloads/monthly_masks_masked_{region_name}_{start_year}_{end_year}.nc'
with nc.Dataset(masked_file_path, 'w') as masked_dataset:
    # Create dimensions
    masked_dataset.createDimension('lat', len(lat))
    masked_dataset.createDimension('lon', len(lon))
    masked_dataset.createDimension('time', len(time))

    # Create variables
    masked_lat = masked_dataset.createVariable('new_lat', 'float', ('lat',))
    masked_lat.units = 'degrees_north'
    masked_lon = masked_dataset.createVariable('lon', 'float', ('lon',))
    masked_lon.units = 'degrees_east'
    masked_time = masked_dataset.createVariable('time', 'double', ('time',))

    masked_monthly_masks_var = masked_dataset.createVariable('monthly_masks', 'float', ('time', 'lat', 'lon'),
                                                              fill_value=np.nan)  # Use an appropriate fill value

    # Assign values to variables
    masked_lat[:] = lat
    masked_lon[:] = lon
    masked_time[:] = time
    masked_monthly_masks_var[:] = np.where(masked_monthly_masks != -2147483648.0, masked_monthly_masks, np.nan)

    masked_monthly_masks_var.min = 1
    masked_monthly_masks_var.max = 4

print("Masking complete. The masked data has been saved to:", masked_file_path)

# Open the dataset file
dataset_file = nc.Dataset('/Users/sayooj/Downloads/OceanSODA-ETHZ_GRaCER_v2021a_1982-2020.nc')

# Get the variable data
fgco2 = dataset_file.variables['fgco2'][:]
ph = dataset_file.variables['ph_total'][:]
aragonite_saturation = dataset_file.variables['omega_ar'][:]
sst = dataset_file.variables['temperature'][:]

# Create the time axis for the specified years
dates = pd.date_range(start=f'{start_year}-01-01', end=f'{end_year}-12-31', freq='M')

# Find the indices corresponding to the time period
indices_years = np.where(dates.year.isin(list(range(start_year, end_year + 1))))[0]

# Slice the data for the specified years
fgco2_years = fgco2[indices_years]
sst_years = sst[indices_years]
ph_years = ph[indices_years]
aragonite_saturation_years = aragonite_saturation[indices_years]

# Open the mask file for the specified Longhurst region and time period
mask_file = nc.Dataset(f'/Users/sayooj/Downloads/monthly_masks_masked_{region_name}_{start_year}_{end_year}.nc')

# Get the mask variable for the specified Longhurst region and time period
mask = mask_file.variables['monthly_masks'][:]

# Apply the mask to the sliced data for the specified Longhurst region and time period
fgco2_masked_years = np.ma.masked_array(fgco2_years, np.logical_not(mask))
sst_masked_years = np.ma.masked_array(sst_years, np.logical_not(mask))
ph_masked_years = np.ma.masked_array(ph_years, np.logical_not(mask))
aragonite_saturation_masked_years = np.ma.masked_array(aragonite_saturation_years, np.logical_not(mask))

# Calculate median values with the mask for the specified Longhurst region and time period
fgco2_median_years = np.ma.median(fgco2_masked_years, axis=(1, 2))
sst_median_years = np.ma.median(sst_masked_years, axis=(1, 2))
ph_median_years = np.ma.median(ph_masked_years, axis=(1, 2))
aragonite_saturation_median_years = np.ma.median(aragonite_saturation_masked_years, axis=(1, 2))

# Get indices where the mask values are equal to the specified mask value (e.g., 4 for heatwave period)
indices_heatwave_years = np.where(mask == cat_value)[0]

# Get indices where the mask values are not equal to the specified mask value (e.g., not 4 for non-heatwave period)
indices_non_heatwave_years = np.where(mask != cat_value)[0]

# Perform the Wilcoxon signed-rank tests for each variable in the specified Longhurst region and time period
p_values_years = []
median_diff_years = []
std_dev_years = []

# Perform the Wilcoxon signed-rank tests for the specified variable
if variable_selection == 'fgco2':
    sample_var_years = fgco2_median_years
elif variable_selection == 'ph':
    sample_var_years = ph_median_years
elif variable_selection == 'omega_ar':
    sample_var_years = aragonite_saturation_median_years
elif variable_selection == 'temperature':
    sample_var_years = sst_median_years
else:
    raise ValueError("Invalid variable selection. Please choose from 'fgco2', 'ph', 'omega_ar', or 'temperature'.")

for _ in range(num_samples):
    # Randomly select indices for heatwave and non-heatwave periods
    sample_indices_heatwave_years = np.random.choice(indices_heatwave_years, len(indices_heatwave_years), replace=True)
    sample_indices_non_heatwave_years = np.random.choice(indices_non_heatwave_years, len(indices_heatwave_years), replace=True)

    # Filter the data based on the sampled indices
    sample_var_heatwave_years = sample_var_years[sample_indices_heatwave_years]
    sample_var_non_heatwave_years = sample_var_years[sample_indices_non_heatwave_years]

    # Perform the Wilcoxon signed-rank tests
    _, p_value_years = wilcoxon(sample_var_heatwave_years, sample_var_non_heatwave_years)

    # Calculate the median difference and standard deviation
    median_diff_years.append(np.median(sample_var_heatwave_years - sample_var_non_heatwave_years))
    std_dev_years.append(np.std(sample_var_heatwave_years - sample_var_non_heatwave_years))

    # Append the p-value to the respective list
    p_values_years.append(p_value_years)

# Calculate the mean p-value, median difference, and standard deviation for the specified variable in the specified Longhurst region and time period
mean_p_value_var_years = np.mean(p_values_years)
mean_median_diff_var_years = np.mean(median_diff_years)
mean_std_dev_var_years = np.mean(std_dev_years)

# Print the results
print(f"\nResults for {variable_selection} in {region_name} region with Longhurst code {longhurst_region_code} during {start_year}/{end_year}:")
print("Mean p-value:", mean_p_value_var_years)
print("Mean median difference:", mean_median_diff_var_years)
print("Mean standard deviation:", mean_std_dev_var_years)


Masking complete. The masked data has been saved to: /Users/sayooj/Downloads/monthly_masks_masked_western australia_2011_2011.nc

Results for fgco2 in western australia region with Longhurst code 29 during 2011/2011:
Mean p-value: 0.08300014538318451
Mean median difference: -0.017665683925151825
Mean standard deviation: 0.07073843862196406


In [42]:
import xarray as xr
import netCDF4 as nc
import numpy as np
import pandas as pd
import matplotlib as mpl
from scipy import __version__ as scipy_version
from scipy.stats import wilcoxon

if __name__ == "__main__":
    print("xarray version:", xr.__version__)
    print("netCDF4 version:", nc.__version__)
    print("numpy version:", np.__version__)
    print("pandas version:", pd.__version__)
    print("matplotlib version:", mpl.__version__)
    print("scipy version:", scipy_version)


xarray version: 0.20.1
netCDF4 version: 1.5.7
numpy version: 1.21.6
pandas version: 1.3.5
matplotlib version: 3.5.3
scipy version: 1.7.3


In [41]:
import pkg_resources

def get_dependencies_in_code():
    imported_modules = set()
    for module_name, module_obj in globals().items():
        if isinstance(module_obj, type(pkg_resources)):  # Check if the object is a module
            imported_modules.add(module_name)
    return imported_modules

if __name__ == "__main__":
    code_dependencies = get_dependencies_in_code()
    print("Libraries used in the code:")
    for dependency in code_dependencies:
        print(dependency)


Libraries used in the code:
nc
__builtin__
np
pkg_resources
pd
plt
__builtins__
xr
