# In [None]:
import xarray as xr
import numpy as np
from scipy.stats import wilcoxon
import netCDF4 as nc

# Analysis configuration shared by every section below.
# Time window (inclusive calendar years) used for slicing and for file names.
start_year, end_year = 2015, 2016

# Number of bootstrap resampling iterations in the statistical comparison.
num_samples = 100

# Region selection: short label used in file names and printed output, and the
# code that is matched against the Longhurst grid to pick the region's cells.
region_name = "TASM"
longhurst_region_code = 36

# Category values that are treated as heatwave periods.
cat_values = [3, 4]

# Section 1: Load and Preprocess Data
# Cut the requested years out of the marine-heatwave atlas and store them in a
# smaller file that the following sections work from.
#
# The source file is opened in a `with` block so that its handle is released
# once the subset has been written (previously it was never closed).
with xr.open_dataset('/Users/sayooj/Downloads/GlobalAtlas_MHW_ESACCISST_1deg_1982-2021.nc', decode_times=False) as dataset:
    # Positional start/end along the time axis, counted from 1982.
    # NOTE(review): this assumes exactly 365 time steps per year. If the file
    # holds real calendar days (leap days included) the window drifts by one
    # step per leap year since 1982 -- confirm against the file's calendar.
    start_idx = (start_year - 1982) * 365
    end_idx = start_idx + (end_year - start_year + 1) * 365 - 1

    # Keep only the time steps inside the requested window (end is inclusive)
    new_dataset = dataset.isel(time=slice(start_idx, end_idx + 1))

    # Store the category and heatwave variables as float32
    new_dataset['cat'] = new_dataset['cat'].astype('float32')
    new_dataset['mhw'] = new_dataset['mhw'].astype('float32')

    # Write the subset while the source file is still open
    new_dataset.to_netcdf(f'/Users/sayooj/Downloads/{region_name}_{start_year}_{end_year}.nc')

# Section 2: Create Monthly Masks
# Re-open the subset written in Section 1 and reduce the daily category field
# to one mask per fixed 30-step block.
nc_file = xr.open_dataset(f'/Users/sayooj/Downloads/{region_name}_{start_year}_{end_year}.nc', decode_times=False)

# Coordinate axes and the daily category values, loaded as plain arrays
lat, lon, time = (nc_file[axis_name].values for axis_name in ('lat', 'lon', 'time'))
cat_daily = nc_file['cat'].values

# A "month" is a block of 30 consecutive time steps; any remainder is dropped
steps_per_month = 30
num_months = len(time) // steps_per_month

# Fold the leading axis into (month, step-within-month) so that a single
# reduction yields the highest category each cell reached in every block
daily_by_month = cat_daily[:num_months * steps_per_month].reshape(
    (num_months, steps_per_month) + cat_daily.shape[1:]
)
peak_category = daily_by_month.max(axis=1)

# Positive peaks are kept as they are; everything else is stored as 0
monthly_masks = np.zeros((num_months, len(lat), len(lon)))
monthly_masks[...] = np.where(peak_category > 0, peak_category, 0)

# Assemble the output dataset; the month axis is numbered 1..num_months
output_file = xr.Dataset(
    data_vars=dict(
        lat=('lat', lat),
        lon=('lon', lon),
        time=('time', np.arange(1, num_months + 1)),
        monthly_masks=(['time', 'lat', 'lon'], monthly_masks),
    )
)

# Units for every variable, plus a dataset-level description
variable_units = (
    ('lat', 'degrees_north'),
    ('lon', 'degrees_east'),
    ('time', f'months since {start_year}-01-01'),
    ('monthly_masks', '1'),
)
for variable_name, units in variable_units:
    output_file[variable_name].attrs['units'] = units
output_file.attrs['description'] = f'Monthly masks for marine heatwaves in {region_name}'

# Write the monthly masks, then release the input file
output_file.to_netcdf(f'/Users/sayooj/Downloads/monthly_masks_{region_name}_{start_year}_{end_year}.nc')
nc_file.close()

# Section 3: Mask Based on Longhurst Regions
# Blank out every grid cell that does not carry the selected Longhurst code
# and write the result to its own netCDF file.
longhurst_file = '/Users/sayooj/Downloads/Longhurst_1_deg.nc'
longhurst_dataset = xr.open_dataset(longhurst_file)
monthly_mask_file = xr.open_dataset(f'/Users/sayooj/Downloads/monthly_masks_{region_name}_{start_year}_{end_year}.nc', decode_times=False)

# Load everything that is needed into memory so both inputs can be closed
longhurst = longhurst_dataset['longhurst'].values
monthly_masks = monthly_mask_file['monthly_masks'].values
lat = monthly_mask_file['lat'].values
lon = monthly_mask_file['lon'].values
time = monthly_mask_file['time'].values

longhurst_dataset.close()
monthly_mask_file.close()

# Boolean region selector, transposed before it is broadcast against the
# (time, lat, lon) monthly masks
mask = np.isin(longhurst, [longhurst_region_code]).T

# Cells outside the region become NaN in every time step
masked_monthly_masks = np.where(mask, monthly_masks, np.nan)

# Write the masked field with the netCDF4 API
masked_file_path = f'/Users/sayooj/Downloads/monthly_masks_masked_{region_name}_{start_year}_{end_year}.nc'
with nc.Dataset(masked_file_path, 'w') as masked_dataset:
    # One dimension per axis, sized from the arrays read above
    for axis_name, axis_values in (('lat', lat), ('lon', lon), ('time', time)):
        masked_dataset.createDimension(axis_name, len(axis_values))

    # Coordinate variables
    masked_lat = masked_dataset.createVariable('lat', 'float', ('lat',))
    masked_lat.units = 'degrees_north'
    masked_lon = masked_dataset.createVariable('lon', 'float', ('lon',))
    masked_lon.units = 'degrees_east'
    masked_time = masked_dataset.createVariable('time', 'double', ('time',))

    # Data variable; NaN is declared as the fill value
    masked_monthly_masks_var = masked_dataset.createVariable(
        'monthly_masks', 'float', ('time', 'lat', 'lon'), fill_value=np.nan
    )

    # Fill the coordinate variables
    for target_variable, source_values in ((masked_lat, lat), (masked_lon, lon), (masked_time, time)):
        target_variable[:] = source_values

    # Any cell still holding the integer sentinel is written as NaN as well
    sentinel_value = -2147483648.0
    masked_monthly_masks_var[:] = np.where(masked_monthly_masks != sentinel_value, masked_monthly_masks, np.nan)

    # Record the category range as 'min'/'max' attributes on the variable
    masked_monthly_masks_var.setncattr('min', 1)
    masked_monthly_masks_var.setncattr('max', max(cat_values))

print("Masking complete. The masked data has been saved to:", masked_file_path)

# Chlorophyll: cut the analysis window out of the full record and save it.
file_path_chlorophyll = '/Users/Sayooj/Downloads/Sayooj_OC-CCI_chl-a_CCMP_wind_13092023/OC-CCI_chlor_a_1997_2022.nc'
output_file_path_chlorophyll = f'sliced_OC-CCI_chlor_a_{region_name}_{start_year}_{end_year}.nc'

# First and last day of the window, as label-based bounds for the time axis
start_date_chlorophyll = f'{start_year}-01-01'
end_date_chlorophyll = f'{end_year}-12-31'
chlorophyll_window = slice(start_date_chlorophyll, end_date_chlorophyll)

# Select the window and write it to the output path (relative to the
# current working directory)
ds_chlorophyll = xr.open_dataset(file_path_chlorophyll)
ds_chlorophyll_sliced = ds_chlorophyll.sel(time=chlorophyll_window)
ds_chlorophyll_sliced.to_netcdf(output_file_path_chlorophyll)

# Release the source dataset first, then the selection derived from it
for chlorophyll_handle in (ds_chlorophyll, ds_chlorophyll_sliced):
    chlorophyll_handle.close()

print(f'Sliced chlorophyll dataset saved to {output_file_path_chlorophyll}')

# Wind speed/direction: cut the analysis window out of the full record and save it.
file_path_wind = '/Users/Sayooj/Downloads/Sayooj_OC-CCI_chl-a_CCMP_wind_13092023/CCMP_v3.0_wind_1993_2019.nc'
output_file_path_wind = f'sliced_OC-CCI_CCMP_v3.0_wind_{region_name}_{start_year}_{end_year}.nc'

# First and last day of the window, derived from start_year and end_year
start_date_wind = f'{start_year}-01-01'
end_date_wind = f'{end_year}-12-31'
wind_window = slice(start_date_wind, end_date_wind)

# Select the window and write it to the output path (relative to the
# current working directory)
ds_wind = xr.open_dataset(file_path_wind)
ds_wind_sliced = ds_wind.sel(time=wind_window)
ds_wind_sliced.to_netcdf(output_file_path_wind)

# Release the source dataset first, then the selection derived from it
for wind_handle in (ds_wind, ds_wind_sliced):
    wind_handle.close()

print(f'Sliced wind dataset saved to {output_file_path_wind}')

# Re-open the three intermediate files with the netCDF4 API. The handles stay
# open until the end of the script, where they are closed together.
chlorophyll_file = nc.Dataset(f'sliced_OC-CCI_chlor_a_{region_name}_{start_year}_{end_year}.nc')
wind_file = nc.Dataset(f'sliced_OC-CCI_CCMP_v3.0_wind_{region_name}_{start_year}_{end_year}.nc')
mask_file = nc.Dataset(f'/Users/sayooj/Downloads/monthly_masks_masked_{region_name}_{start_year}_{end_year}.nc')

# Read each variable in full
chlorophyll = chlorophyll_file['OC-CCI_chlor_a'][:]

wind_speed = wind_file['CCMP_w'][:]
wind_direction = wind_file['CCMP_wind_dir'][:]

# Monthly category mask restricted to the selected region
mask_region = mask_file['monthly_masks'][:]

# Restrict the fields to affected cells of the region, reduce each time step to
# a spatial median, and split the time steps into heatwave / non-heatwave sets.
#
# A cell is "affected" when it lies inside the region (its mask value is
# neither masked nor NaN) AND its monthly category is greater than zero.
# The selector is built explicitly because np.logical_not(mask_region) is
# False for NaN/masked entries, which left every cell OUTSIDE the region
# unmasked and let it leak into the "regional" medians.
mask_values = np.ma.getdata(mask_region)
with np.errstate(invalid='ignore'):  # comparisons against NaN are expected here
    affected_cells = ~np.ma.getmaskarray(mask_region) & (mask_values > 0)
excluded_cells = ~affected_cells

# Apply the same exclusion mask to all three fields; any mask the inputs
# already carry is kept and combined with it
chlorophyll_masked_year = np.ma.masked_array(chlorophyll, mask=excluded_cells)
wind_speed_masked_year = np.ma.masked_array(wind_speed, mask=excluded_cells)
wind_direction_masked_year = np.ma.masked_array(wind_direction, mask=excluded_cells)

# Spatial median per time step; a step without any usable cell stays masked
chlorophyll_median_region_year = np.ma.median(chlorophyll_masked_year, axis=(1, 2))
wind_speed_median_region_year = np.ma.median(wind_speed_masked_year, axis=(1, 2))
wind_direction_median_region_year = np.ma.median(wind_direction_masked_year, axis=(1, 2))

# Classify each TIME STEP exactly once: it is a heatwave step when at least one
# affected cell carries one of cat_values. Taking np.where(...)[0] on the 3-D
# mask instead returned one index per grid cell, which weighted steps by cell
# count and put every step (heatwave ones included) into the non-heatwave set.
severe_cells = affected_cells & np.isin(mask_values, cat_values)
is_heatwave_step = severe_cells.any(axis=(1, 2))

# Only steps for which all three medians exist can be compared later on
has_all_medians = ~(
    np.ma.getmaskarray(chlorophyll_median_region_year)
    | np.ma.getmaskarray(wind_speed_median_region_year)
    | np.ma.getmaskarray(wind_direction_median_region_year)
)

# Disjoint sets of time indices. Either set can be empty (no step, or every
# step, reaches cat_values); resampling from an empty set fails downstream.
indices_heatwave_region_year = np.flatnonzero(is_heatwave_step & has_all_medians)
indices_non_heatwave_region_year = np.flatnonzero(~is_heatwave_step & has_all_medians)

# Bootstrap comparison of heatwave versus non-heatwave time steps.
# For every variable three lists are filled, one entry per iteration:
# the Wilcoxon p-value, the median of the differences and their standard deviation.
# NOTE(review): wilcoxon() is the paired signed-rank test, while the two samples
# are drawn independently of each other -- confirm that this pairing is intended.
p_values_chlorophyll_region_year = []
median_diff_chlorophyll_region_year = []
std_dev_chlorophyll_region_year = []

p_values_wind_speed_region_year = []
median_diff_wind_speed_region_year = []
std_dev_wind_speed_region_year = []

p_values_wind_direction_region_year = []
median_diff_wind_direction_region_year = []
std_dev_wind_direction_region_year = []

for _ in range(num_samples):
    # Both groups are resampled with replacement to the size of the heatwave set
    draw_size = len(indices_heatwave_region_year)
    sample_indices_region_heatwave_year = np.random.choice(indices_heatwave_region_year, draw_size, replace=True)
    sample_indices_region_non_heatwave_year = np.random.choice(indices_non_heatwave_region_year, draw_size, replace=True)

    # Per-time-step medians at the drawn indices, for each variable and group
    sample_chlorophyll_median_region_heatwave_year = chlorophyll_median_region_year[sample_indices_region_heatwave_year]
    sample_chlorophyll_median_region_non_heatwave_year = chlorophyll_median_region_year[sample_indices_region_non_heatwave_year]

    sample_wind_speed_median_region_heatwave_year = wind_speed_median_region_year[sample_indices_region_heatwave_year]
    sample_wind_speed_median_region_non_heatwave_year = wind_speed_median_region_year[sample_indices_region_non_heatwave_year]

    sample_wind_direction_median_region_heatwave_year = wind_direction_median_region_year[sample_indices_region_heatwave_year]
    sample_wind_direction_median_region_non_heatwave_year = wind_direction_median_region_year[sample_indices_region_non_heatwave_year]

    # (heatwave sample, non-heatwave sample, p-value list, median-diff list, std-dev list)
    per_variable = (
        (
            sample_chlorophyll_median_region_heatwave_year,
            sample_chlorophyll_median_region_non_heatwave_year,
            p_values_chlorophyll_region_year,
            median_diff_chlorophyll_region_year,
            std_dev_chlorophyll_region_year,
        ),
        (
            sample_wind_speed_median_region_heatwave_year,
            sample_wind_speed_median_region_non_heatwave_year,
            p_values_wind_speed_region_year,
            median_diff_wind_speed_region_year,
            std_dev_wind_speed_region_year,
        ),
        (
            sample_wind_direction_median_region_heatwave_year,
            sample_wind_direction_median_region_non_heatwave_year,
            p_values_wind_direction_region_year,
            median_diff_wind_direction_region_year,
            std_dev_wind_direction_region_year,
        ),
    )

    for heatwave_sample, baseline_sample, p_store, diff_store, std_store in per_variable:
        # Element-wise difference between the two draws
        paired_difference = heatwave_sample - baseline_sample

        # wilcoxon() returns (statistic, p-value); only the p-value is kept
        p_store.append(wilcoxon(heatwave_sample, baseline_sample)[1])
        diff_store.append(np.median(paired_difference))
        std_store.append(np.std(paired_difference))

# Collapse the bootstrap results: for each variable take the median over all
# iterations of the p-values, the median differences and the standard deviations.
median_p_value_chlorophyll_region_year = np.median(p_values_chlorophyll_region_year)
median_median_diff_chlorophyll_region_year = np.median(median_diff_chlorophyll_region_year)
median_std_dev_chlorophyll_region_year = np.median(std_dev_chlorophyll_region_year)

median_p_value_wind_speed_region_year = np.median(p_values_wind_speed_region_year)
median_median_diff_wind_speed_region_year = np.median(median_diff_wind_speed_region_year)
median_std_dev_wind_speed_region_year = np.median(std_dev_wind_speed_region_year)

median_p_value_wind_direction_region_year = np.median(p_values_wind_direction_region_year)
median_median_diff_wind_direction_region_year = np.median(median_diff_wind_direction_region_year)
median_std_dev_wind_direction_region_year = np.median(std_dev_wind_direction_region_year)

# Report the three summary numbers per variable, in a fixed order
summary_rows = (
    (
        "chlorophyll",
        median_p_value_chlorophyll_region_year,
        median_median_diff_chlorophyll_region_year,
        median_std_dev_chlorophyll_region_year,
    ),
    (
        "wind speed",
        median_p_value_wind_speed_region_year,
        median_median_diff_wind_speed_region_year,
        median_std_dev_wind_speed_region_year,
    ),
    (
        "wind direction",
        median_p_value_wind_direction_region_year,
        median_median_diff_wind_direction_region_year,
        median_std_dev_wind_direction_region_year,
    ),
)
for variable_label, summary_p, summary_diff, summary_std in summary_rows:
    print(f"Results for {variable_label} in {region_name} in {start_year}/{end_year}:")
    print("Median p-value:", summary_p)
    print("Median median difference:", summary_diff)
    print("Median standard deviation:", summary_std)

# Typical parameter values in non-heatwave ("pre") and heatwave ("post") time steps.
#
# These are computed from ALL time steps of each group. They used to be taken
# from the sample_* variables left over from the final bootstrap iteration,
# i.e. from a single random resample, so the printed values changed from run
# to run. np.unique() counts every time step once even if the index arrays
# contain repeats, and np.ma.median() ignores masked per-step medians.
non_heatwave_steps = np.unique(indices_non_heatwave_region_year)
heatwave_steps = np.unique(indices_heatwave_region_year)

median_chlorophyll_pre_heatwave = np.ma.median(chlorophyll_median_region_year[non_heatwave_steps])
median_chlorophyll_post_heatwave = np.ma.median(chlorophyll_median_region_year[heatwave_steps])

median_wind_speed_pre_heatwave = np.ma.median(wind_speed_median_region_year[non_heatwave_steps])
median_wind_speed_post_heatwave = np.ma.median(wind_speed_median_region_year[heatwave_steps])

# NOTE(review): a plain median treats wind direction as a linear quantity; if
# it is an angle that wraps around, a circular statistic may be needed -- confirm.
median_wind_direction_pre_heatwave = np.ma.median(wind_direction_median_region_year[non_heatwave_steps])
median_wind_direction_post_heatwave = np.ma.median(wind_direction_median_region_year[heatwave_steps])

# Unit labels appended to the printed values
# NOTE(review): placeholders from the original script -- verify against the data files.
chlorophyll_units = 'mg Chl-a m^-3'  # Replace with the actual units for chlorophyll
wind_speed_units = 'ms^-1'    # Replace with the actual units for wind speed
wind_direction_units = '0°'  # Replace with the actual units for wind direction

# Print the non-heatwave -> heatwave change for each parameter
print(f"Changes in parameter values for {region_name} in {start_year}/{end_year} (Pre-Heatwave to Post-Heatwave):")
print(f"Chlorophyll: {median_chlorophyll_pre_heatwave:.2f} {chlorophyll_units} to {median_chlorophyll_post_heatwave:.2f} {chlorophyll_units}")
print(f"Wind Speed: {median_wind_speed_pre_heatwave:.2f} {wind_speed_units} to {median_wind_speed_post_heatwave:.2f} {wind_speed_units}")
print(f"Wind Direction: {median_wind_direction_pre_heatwave:.2f} {wind_direction_units} to {median_wind_direction_post_heatwave:.2f} {wind_direction_units}")

# Release the three netCDF handles opened for the statistical analysis,
# in the order chlorophyll, wind, mask.
for open_handle in (chlorophyll_file, wind_file, mask_file):
    open_handle.close()
