In [None]:
import xarray as xr
import numpy as np
from scipy.stats import wilcoxon
import netCDF4 as nc
import pandas as pd
#from docx import Document
import os
import csv
   

# Read parameter combinations from CSV file
csv_file_path = "C:/Users/df391/OneDrive - University of Exeter/Post_Doc_Ocean_Health/HeatwaveAnalysis/PARAMETERS.csv"  # Replace with the actual path
heatwave_file = "D:/OceanHealth/GlobalAtlas_MHW_ESACCISST_1deg_1982-2021.nc"
longhurst_file = 'D:/OceanHealth/Longhurst_1_deg.nc'
oceansoda_file = 'D:/Data/_DataSets/OCEANSODA_CO2/OceanSODA_ETHZ-v2023.OCADS.01_1982-2022.nc'
#oceansoda_climatology_file = 'D:/OceanHealth/all_variables_monthly_avg_overall.nc'
output_loc = 'D:/OceanHealth/output/'
parameters_df = pd.read_csv(csv_file_path)

# Remove extra spaces from column names
parameters_df.columns = parameters_df.columns.str.strip()

for index, row in parameters_df.iterrows():
    start_year = row['start_year']
    oceansoda_climatology_file = 'D:/OceanHealth/all_variables_monthly_avg_'+str(start_year)+'.nc'
    end_year = row['end_year']
    longhurst_region_code = row['longhurst_region_code']
    region_name = row['region_name']
    combined_cat_values = [3, 4]
    num_samples = row['num_samples']
    consecutive_months_threshold = row['consecutive_months_threshold']
    months_after = row['months_after']  

    # Section 1: Load and Preprocess Data
    # Load the netCDF file containing variables other than chlorophyll
    dataset = xr.open_dataset(heatwave_file, decode_times=False,autoclose=True)

    # Define the start and end indices for slicing
    start_idx = (start_year - 1982) * 365
    end_idx = start_idx + (end_year - start_year + 1) * 365 - 1

    # Create a new dataset with data only for the specified time range
    new_dataset = dataset.isel(time=slice(start_idx, end_idx + 1))

    # Convert data variables to float32 if needed
    new_dataset['cat'] = new_dataset['cat'].astype('float32')
    new_dataset['mhw'] = new_dataset['mhw'].astype('float32')

    # Save the new dataset to a new netCDF file
    new_dataset.to_netcdf(output_loc+f'{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc')

    # Section 2: Mask Based on Longhurst Regions
    # Open the Longhurst region file
    longhurst_dataset = xr.open_dataset(longhurst_file,autoclose=True)

    # Read the Longhurst variable
    longhurst = longhurst_dataset['longhurst'].values

    # Create a mask based on Longhurst regions and transpose it
    mask = np.isin(longhurst, [longhurst_region_code]).T

    # Apply the mask to the entire time range
    masked_dataset = new_dataset.where(mask)

    # Save the masked data to a new netCDF file
    masked_file_path = output_loc+f'masked_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc'
    masked_dataset.to_netcdf(masked_file_path)

    # Section 3: Create Monthly Masks with Values Only Inside Longhurst Region
    # Load the netCDF file containing the masked data
    masked_nc_file = xr.open_dataset(masked_file_path, decode_times=False, autoclose=True)

    # Extract the masked cat variable and apply the Longhurst mask
    masked_cat = masked_nc_file['cat'].where(mask)

    # Calculate the number of months
    num_months = int(len(masked_nc_file['time']) / 30)

    # Create an empty array to store monthly masks
    monthly_masks = np.zeros((num_months, len(masked_nc_file['lat']), len(masked_nc_file['lon']))) * np.nan

    # Iterate over each month
    for month in range(num_months):
        # Calculate the start and end indices for the current month
        start_idx = month * 30
        end_idx = (month + 1) * 30

        # Extract the masked daily cat values for the current month
        month_data = masked_cat[start_idx:end_idx]

        # Find the maximum category occurrence for each lat-lon point in the current month
        max_values = np.max(month_data, axis=0)

        # Set areas impacted by the highest category occurrence within the Longhurst region
        monthly_mask = np.where(mask, max_values, np.nan)

        # Save the monthly mask
        monthly_masks[month] = monthly_mask

    # Create a new netCDF file to save the monthly masks
    output_file = xr.Dataset(
        data_vars={
            'lat': ('lat', masked_nc_file['lat'].values),
            'lon': ('lon', masked_nc_file['lon'].values),
            'time': ('time', np.arange(1, num_months + 1)),
            'monthly_masks': (['time', 'lat', 'lon'], monthly_masks)
        }
    )

    # Add attributes
    output_file['lat'].attrs['units'] = 'degrees_north'
    output_file['lon'].attrs['units'] = 'degrees_east'
    output_file['time'].attrs['units'] = f'months since {start_year}-01-01'
    output_file['monthly_masks'].attrs['units'] = '1'
    output_file.attrs['description'] = f'Monthly masks for marine heatwaves in {region_name}'

    # Save the monthly masks to a new netCDF file
    output_file.to_netcdf(output_loc+f'monthly_masks_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc')

    # Close the netCDF files
    masked_nc_file.close()

    # Section 4: Create Consecutive Monthly Mask for Values 3 or 4
    # Load the netCDF file containing the monthly masks
    monthly_masks_file = xr.open_dataset(output_loc+f'monthly_masks_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc', decode_times=False, autoclose=True)

    # Extract the monthly masks variable
    monthly_masks_data = monthly_masks_file['monthly_masks'].values

    # Initialize the consecutive monthly mask array
    consecutive_monthly_mask = np.zeros_like(monthly_masks) * np.nan

    # Iterate over each lat-lon point
    for lat_idx in range(monthly_masks_data.shape[1]):
        for lon_idx in range(monthly_masks_data.shape[2]):
            # Extract the monthly mask values for the current lat-lon point
            values = monthly_masks_data[:, lat_idx, lon_idx]

            consecutive_count = 0
            consecutive_mask = np.zeros_like(values)

            for i in range(len(values)):
                if (values[i] == 3) or (values[i] == 4):
                    consecutive_count += 1
                    consecutive_mask[i] = values[i]
                else:
                    consecutive_count = 0
                    consecutive_mask[i] = 0

                if consecutive_count >= consecutive_months_threshold:
                    break

            # Set the consecutive monthly mask values for the current lat-lon point
            consecutive_monthly_mask[:len(consecutive_mask), lat_idx, lon_idx] = consecutive_mask

    # Apply the Longhurst mask to set values inside the region to NaN
    consecutive_monthly_mask = np.where(mask, consecutive_monthly_mask, np.nan)



    # Create a new netCDF file to save the consecutive monthly mask
    consecutive_monthly_mask_file = xr.Dataset(
        data_vars={
            'lat': ('lat', monthly_masks_file['lat'].values),
            'lon': ('lon', monthly_masks_file['lon'].values),
            'time': ('time', monthly_masks_file['time'].values),
            'consecutive_monthly_mask': (['time', 'lat', 'lon'], consecutive_monthly_mask)
        }
    )

    # Add attributes
    consecutive_monthly_mask_file['lat'].attrs['units'] = 'degrees_north'
    consecutive_monthly_mask_file['lon'].attrs['units'] = 'degrees_east'
    consecutive_monthly_mask_file['time'].attrs['units'] = f'months since {start_year}-01-01'
    consecutive_monthly_mask_file['consecutive_monthly_mask'].attrs['units'] = '1'
    consecutive_monthly_mask_file.attrs['description'] = f'Consecutive monthly mask for values 3 or 4 in {region_name}'

    # Save the consecutive monthly mask to a new netCDF file
    consecutive_monthly_mask_file.to_netcdf(output_loc+f'consecutive_monthly_mask_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc')

    # Close the netCDF files
    monthly_masks_file.close()
    consecutive_monthly_mask_file.close()

    # Initialize a list to store information about consecutive heatwaves
    consecutive_heatwave_info = []

    # Iterate over each lat-lon point
    for lat_idx in range(consecutive_monthly_mask.shape[1]):
        for lon_idx in range(consecutive_monthly_mask.shape[2]):
            # Extract the consecutive monthly mask values for the current lat-lon point
            values = consecutive_monthly_mask[:, lat_idx, lon_idx]

            # Find indices where consecutive heatwaves occurred (values 3 or 4)
            heatwave_indices = np.where(np.isin(values, [3, 4]))[0]

            # If consecutive heatwaves occurred at this lat-lon point
            if len(heatwave_indices) >= consecutive_months_threshold:
                # Get the corresponding dates for the identified indices
                heatwave_dates = monthly_masks_file['time'].values[heatwave_indices]

                # Convert months to dates based on the start_year
                start_date = pd.to_datetime(f'{start_year}-01-01')
                exact_dates = [(start_date + pd.DateOffset(months=int(month))).strftime('%Y-%m-%d') for month in heatwave_dates]

                # Append the lat, lon, months, and exact_dates to the list
                consecutive_heatwave_info.append({
                    'lat': monthly_masks_file['lat'].values[lat_idx],
                    'lon': monthly_masks_file['lon'].values[lon_idx],
                    'months': heatwave_dates.tolist(),
                    'exact_dates': exact_dates
                })

    # Create a DataFrame with the extracted information
    consecutive_heatwave_info_df = pd.DataFrame(consecutive_heatwave_info)

    # Save the DataFrame to a CSV file with region name, start year, and end year in the filename
    csv_file_path = output_loc+f'consecutive_heatwave_info_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.csv'
    consecutive_heatwave_info_df.to_csv(csv_file_path, index=False)


    # Section 4: Create Consecutive Monthly Mask for Values 3 or 4
    # Load the netCDF file containing the monthly masks
    monthly_masks_file = xr.open_dataset(output_loc+f'monthly_masks_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc', decode_times=False, autoclose=True)

    # Extract the monthly masks variable
    monthly_masks_data = monthly_masks_file['monthly_masks'].values

    # Initialize the consecutive monthly mask array
    consecutive_monthly_mask = np.zeros_like(monthly_masks) * np.nan

    # Iterate over each lat-lon point
    for lat_idx in range(monthly_masks_data.shape[1]):
        for lon_idx in range(monthly_masks_data.shape[2]):
            # Extract the monthly mask values for the current lat-lon point
            values = monthly_masks_data[:, lat_idx, lon_idx]

            consecutive_count = 0
            consecutive_mask = np.zeros_like(values)

            for i in range(len(values)):
                if (values[i] == 3) or (values[i] == 4):
                    consecutive_count += 1
                    consecutive_mask[i] = values[i]
                else:
                    consecutive_count = 0
                    consecutive_mask[i] = 0

                if consecutive_count >= consecutive_months_threshold:
                    break

            # Set the consecutive monthly mask values for the current lat-lon point
            consecutive_monthly_mask[:len(consecutive_mask), lat_idx, lon_idx] = consecutive_mask

    # Apply the Longhurst mask to set values inside the region to NaN
    consecutive_monthly_mask = np.where(mask, consecutive_monthly_mask, np.nan)



    # Create a new netCDF file to save the consecutive monthly mask
    consecutive_monthly_mask_file = xr.Dataset(
        data_vars={
            'lat': ('lat', monthly_masks_file['lat'].values),
            'lon': ('lon', monthly_masks_file['lon'].values),
            'time': ('time', monthly_masks_file['time'].values),
            'consecutive_monthly_mask': (['time', 'lat', 'lon'], consecutive_monthly_mask)
        }
    )

    # Add attributes
    consecutive_monthly_mask_file['lat'].attrs['units'] = 'degrees_north'
    consecutive_monthly_mask_file['lon'].attrs['units'] = 'degrees_east'
    consecutive_monthly_mask_file['time'].attrs['units'] = f'months since {start_year}-01-01'
    consecutive_monthly_mask_file['consecutive_monthly_mask'].attrs['units'] = '1'
    consecutive_monthly_mask_file.attrs['description'] = f'Consecutive monthly mask for values 3 or 4 in {region_name}'

    # Save the consecutive monthly mask to a new netCDF file
    consecutive_monthly_mask_file.to_netcdf(output_loc+f'consecutive_monthly_mask_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc')

    # Close the netCDF files
    monthly_masks_file.close()
    consecutive_monthly_mask_file.close()

    # Initialize a list to store information about consecutive heatwaves
    consecutive_heatwave_info = []

    # Iterate over each lat-lon point
    for lat_idx in range(consecutive_monthly_mask.shape[1]):
        for lon_idx in range(consecutive_monthly_mask.shape[2]):
            # Extract the consecutive monthly mask values for the current lat-lon point
            values = consecutive_monthly_mask[:, lat_idx, lon_idx]

            # Find indices where consecutive heatwaves occurred (values 3 or 4)
            heatwave_indices = np.where(np.isin(values, [3, 4]))[0]

            # If consecutive heatwaves occurred at this lat-lon point
            if len(heatwave_indices) >= consecutive_months_threshold:
                # Get the corresponding dates for the identified indices
                heatwave_dates = monthly_masks_file['time'].values[heatwave_indices]

                # Convert months to dates based on the start_year
                start_date = pd.to_datetime(f'{start_year}-01-01')
                exact_dates = [(start_date + pd.DateOffset(months=int(month))).strftime('%Y-%m-%d') for month in heatwave_dates]

                # Append the lat, lon, months, and exact_dates to the list
                consecutive_heatwave_info.append({
                    'lat': monthly_masks_file['lat'].values[lat_idx],
                    'lon': monthly_masks_file['lon'].values[lon_idx],
                    'months': heatwave_dates.tolist(),
                    'exact_dates': exact_dates
                })

    # Create a DataFrame with the extracted information
    consecutive_heatwave_info_df = pd.DataFrame(consecutive_heatwave_info)

    # Save the DataFrame to a CSV file with region name, start year, and end year in the filename
    csv_file_path = output_loc+f'consecutive_heatwave_info_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.csv'
    consecutive_heatwave_info_df.to_csv(csv_file_path, index=False)
   

    from datetime import datetime, timedelta
    import pandas as pd
    import xarray as xr
    import numpy as np

    # Function to calculate date after 'n' months with day set to 1
    def calculate_date_after_n_months(start_date, n):
        # Calculate the date after 'n' months
        result_date = start_date + timedelta(days=30 * n)
        # Set the day to 1
        result_date = result_date.replace(day=1)
        # Format the date as 'YYYY-MM-DD'
        return result_date.strftime('%Y-%m-%d')

    # Load the consecutive heatwave information CSV file
    csv_file_path = output_loc+f'consecutive_heatwave_info_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.csv'
    consecutive_heatwave_info_df = pd.read_csv(csv_file_path)

    # Convert 'exact_dates' column to list of lists of datetime objects
    consecutive_heatwave_info_df['exact_dates'] = consecutive_heatwave_info_df['exact_dates'].apply(eval).apply(lambda x: [datetime.strptime(date_str, '%Y-%m-%d') for date_str in x])

    # Function to calculate date after 'n' months for each heatwave event
    def calculate_date_after_n_months_list(dates_list, n):
        return [calculate_date_after_n_months(date, n) for date in dates_list]

    # Apply the function to calculate the date after 'n' months for each heatwave event
    consecutive_heatwave_info_df[f'date_after_{months_after}_months'] = consecutive_heatwave_info_df['exact_dates'].apply(lambda x: calculate_date_after_n_months_list(x, months_after))

    # Define the end date for masking ('n' months after start date)
    end_date = datetime.strptime('2022-12-31', '%Y-%m-%d')

    # Create a mask for dates after 'n' months from start date
    mask = consecutive_heatwave_info_df[f'date_after_{months_after}_months'].apply(lambda x: all(datetime.strptime(date, '%Y-%m-%d') <= end_date for date in x))

    # Apply the mask
    masked_heatwaves = consecutive_heatwave_info_df[mask]

    # Convert the strings representing lists to actual lists of integers in the 'months' column
    masked_heatwaves['months'] = masked_heatwaves['months'].apply(eval)

    # Add 'n' to each value in the 'months' column to get the new months for 'date_after_n_months'
    masked_heatwaves[f'months_date_after_{months_after}_months'] = masked_heatwaves['months'].apply(lambda x: [month + months_after for month in x])

    # Define the output file name with the current date
    output_file_name = output_loc+f"masked_consecutive_heatwave_info_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.csv"

    # Save the masked DataFrame to a CSV file
    masked_heatwaves.to_csv(output_file_name, index=False)

    # Load the netCDF file containing the monthly masks
    monthly_masks_file = xr.open_dataset(output_loc+f'monthly_masks_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc', decode_times=False)

    # Extract the third month values from the 'months_date_after_n_months' column
    third_month_values = masked_heatwaves[f'months_date_after_{months_after}_months'].apply(lambda x: int(x[2]))

    # Extract latitudes and longitudes
    lats = masked_heatwaves['lat'].values
    lons = masked_heatwaves['lon'].values

    # Initialize a mask array
    mask_array = np.zeros_like(monthly_masks_file['monthly_masks'].values)

    for i, (lat, lon, month_value) in enumerate(zip(lats, lons, third_month_values)):
        # Find the indices corresponding to the given latitude and longitude
        lat_index = np.where(monthly_masks_file['lat'].values == lat)[0]
        lon_index = np.where(monthly_masks_file['lon'].values == lon)[0]

        # Check if the index is within bounds
        if month_value - 1 < mask_array.shape[0] and lat_index.size > 0 and lon_index.size > 0:
            # Set the value of the third month at the corresponding lat-lon position
            mask_array[month_value - 1, lat_index[0], lon_index[0]] = monthly_masks_file['monthly_masks'].values[month_value - 1, lat_index[0], lon_index[0]]

    # Set NaN values back to NaN
    mask_array[np.isnan(monthly_masks_file['monthly_masks'].values)] = np.nan
    
    # Create a new xarray dataset with the extracted mask
    mask_dataset = xr.Dataset(
        data_vars={
            'lat': ('lat', monthly_masks_file['lat'].values),
            'lon': ('lon', monthly_masks_file['lon'].values),
            'time': ('time', monthly_masks_file['time'].values),
            'masked_values': (['time', 'lat', 'lon'], mask_array)
        }
    )

    # Add attributes
    mask_dataset['lat'].attrs['units'] = 'degrees_north'
    mask_dataset['lon'].attrs['units'] = 'degrees_east'
    mask_dataset['time'].attrs['units'] = 'months since 2014-01-01'
    mask_dataset['masked_values'].attrs['units'] = '1'
    mask_dataset.attrs['description'] = 'Masked values for specific latitudes, longitudes, and months'

    # Save the masked values to a new netCDF file
    mask_dataset.to_netcdf(output_loc+f'masked_values_consecutive_heatwave_info_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc')

    # Close the netCDF file
    monthly_masks_file.close()
    

    # Section 4: Statistical Analysis
    # Open the heatwave dataset file
    heatwave_dataset_file = nc.Dataset(oceansoda_file)

    # Open the non-heatwave dataset file
    non_heatwave_dataset_file = nc.Dataset(oceansoda_climatology_file)

    # Get the variable data for heatwave dataset
    variables_heatwave = ['fgco2', 'ph_total', 'omega_ar', 'temperature', 'spco2', 'talk']
    heatwave_data = {var: heatwave_dataset_file.variables[var][:] for var in variables_heatwave}

    # Get the variable data for non-heatwave dataset
    variables_non_heatwave = ['fgco2', 'ph_total', 'omega_ar', 'temperature', 'spco2', 'talk']
    non_heatwave_data = {var: non_heatwave_dataset_file.variables[var][:] for var in variables_non_heatwave}

    # Create the time axis for the specified years
    dates = pd.date_range(start='1982-01-01', end='2022-12-31', freq='M')  # Assuming 1982-01-01 is the start of your dataset

    # Find the indices corresponding to the time period
    start_index = (start_year - 1982) * 12
    end_index = start_index + (end_year - start_year + 1) * 12

    # Slice the data for the specified years in heatwave dataset
    heatwave_data_year = {var: data[start_index:end_index] for var, data in heatwave_data.items()}

    # Slice the data for the non-heatwave dataset for the corresponding months
    non_heatwave_data_year = {var: data[:12] for var, data in non_heatwave_data.items()}

    # Open the mask file for the specified region
    mask_file = nc.Dataset(output_loc+f'masked_values_consecutive_heatwave_info_{region_name}{consecutive_months_threshold}_{months_after}_{start_year}_{end_year}.nc')

    # Get the mask variable for the specified region
    mask_region = mask_file.variables['masked_values'][:]

    # Get indices where the mask values are equal to any of the specified cat values (e.g., heatwave period)
    indices_heatwave_region_year = np.where(np.isin(mask_region, combined_cat_values))[0]

    # Get indices where the mask values are not equal to any of the specified cat values (e.g., non-heatwave period)
    indices_non_heatwave_region_year = np.where(~np.isin(mask_region, combined_cat_values))[0]

    # Iterate over each variable for analysis
    for variable in variables_heatwave:
        # Initialize a list to store results for each variable
        results = []

        # Get the data for the variable
        variable_heatwave = heatwave_data_year[variable]
        variable_non_heatwave = non_heatwave_data_year[variable]

        # Apply the mask to the sliced data in heatwave dataset
        variable_masked_year = np.ma.masked_array(variable_heatwave, np.logical_not(mask_region))

        # Calculate the cyclic index for mask_region to match the time dimension of non-heatwave data
        cyclic_indices = np.arange(12) % 12  # Generate cyclic indices from 0 to 11

        # Synchronize dimensions of mask_region with non-heatwave data using cyclic indices
        mask_region_synced = mask_region[cyclic_indices]

        # Mask the non-heatwave dataset variables with the synchronized mask
        variable_non_heatwave_masked_year = np.ma.masked_array(variable_non_heatwave, np.logical_not(mask_region_synced))

        # Calculate median values with the mask for the variable
        variable_median_region_year = np.ma.median(variable_masked_year, axis=(1, 2))
        variable_non_heatwave_median_region_year = np.ma.median(variable_non_heatwave_masked_year, axis=(1, 2))

        # Perform the Wilcoxon signed-rank tests
        p_values_region_year = []
        median_diff_region_year = []
        std_dev_region_year = []

        for _ in range(num_samples):
            # Randomly select indices for heatwave and non-heatwave periods
            sample_indices_region_heatwave_year = np.random.choice(indices_heatwave_region_year, len(indices_heatwave_region_year), replace=True)
            # Randomly select indices for non-heatwave periods, ensuring they are within the valid range (0-11)
            sample_indices_region_non_heatwave_year = np.random.choice(range(12), len(indices_heatwave_region_year), replace=True)

            # Adjust indices for non-heatwave year to ensure they are within the valid range (0-11)
            sample_indices_region_non_heatwave_year = sample_indices_region_non_heatwave_year % 12

            # Filter the data based on the sampled indices
            sample_variable_median_region_heatwave_year = variable_median_region_year[sample_indices_region_heatwave_year]
            sample_variable_median_region_non_heatwave_year = variable_non_heatwave_median_region_year[sample_indices_region_non_heatwave_year]

            # Perform the Wilcoxon signed-rank tests
            _, p_value_region_year = wilcoxon(sample_variable_median_region_heatwave_year, sample_variable_median_region_non_heatwave_year)

            # Calculate the median difference and standard deviation
            median_diff_region_year.append(np.median(sample_variable_median_region_heatwave_year - sample_variable_median_region_non_heatwave_year))
            std_dev_region_year.append(np.std(sample_variable_median_region_heatwave_year - sample_variable_median_region_non_heatwave_year))

            # Append the p-value to the respective list
            p_values_region_year.append(p_value_region_year)

        # Calculate median p-value, median difference, and median standard deviation
        median_p_value = np.median(p_values_region_year)
        median_median_diff = np.median(median_diff_region_year)
        median_std_dev = np.median(std_dev_region_year)

        # Define the function to determine significance
        def determine_significance(p_value):
            if p_value < 0.01:
                return "Significant"
            else:
                return "Not Significant"

        # Determine significance
        significance = determine_significance(median_p_value)

        # Append results to the list
        results.append([region_name, start_year, end_year,months_after, consecutive_months_threshold,  combined_cat_values, variable, median_p_value, median_median_diff, median_std_dev, significance])
       
        # Extract combined category values from the first row of the 'combined_cat_values' column
        combined_cat_values = parameters_df['combined_cat_values'].iloc[0]

        # Extract the numbers from the combined_cat_values string
        numbers = ''.join(filter(str.isdigit, combined_cat_values))

        # Insert an underscore between the numbers
        formatted_numbers = '_'.join(numbers)

        # Construct the filename
        csv_filename = output_loc+f"results_cat_{formatted_numbers}.csv"

        # Check if the file exists, if not, create it and write the header
        if not os.path.exists(csv_filename):
            with open(csv_filename, 'w', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                # Write header
                csv_writer.writerow(["Region Name", "Start Year", "End Year","months_after",  "Consecutive Months Threshold", "combined_cat_values", "Parameter", "Median p-value", "Median median difference", "Median standard deviation", "Significance", "HW Start Date", "HW End Date"])

        # Append data to CSV
        with open(csv_filename, 'a', newline='') as csvfile:
            csv_writer = csv.writer(csvfile)
            # Write data rows
            csv_writer.writerows(results)

        print("Results for", variable, "in", region_name, "appended to", csv_filename)

  dates = pd.date_range(start='1982-01-01', end='2022-12-31', freq='M')  # Assuming 1982-01-01 is the start of your dataset
  a.partition(kth, axis=axis, kind=kind, order=order)


Results for fgco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for ph_total in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for omega_ar in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for temperature in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for spco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for talk in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv


  dates = pd.date_range(start='1982-01-01', end='2022-12-31', freq='M')  # Assuming 1982-01-01 is the start of your dataset
  a.partition(kth, axis=axis, kind=kind, order=order)


Results for fgco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for ph_total in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for omega_ar in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for temperature in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for spco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for talk in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv


  dates = pd.date_range(start='1982-01-01', end='2022-12-31', freq='M')  # Assuming 1982-01-01 is the start of your dataset
  a.partition(kth, axis=axis, kind=kind, order=order)


Results for fgco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for ph_total in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for omega_ar in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for temperature in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for spco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for talk in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv


  dates = pd.date_range(start='1982-01-01', end='2022-12-31', freq='M')  # Assuming 1982-01-01 is the start of your dataset
  a.partition(kth, axis=axis, kind=kind, order=order)


Results for fgco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for ph_total in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for omega_ar in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for temperature in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for spco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for talk in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv


  dates = pd.date_range(start='1982-01-01', end='2022-12-31', freq='M')  # Assuming 1982-01-01 is the start of your dataset
  a.partition(kth, axis=axis, kind=kind, order=order)


Results for fgco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for ph_total in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for omega_ar in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for temperature in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for spco2 in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv
Results for talk in ANTA appended to D:/OceanHealth/output/results_cat_3_4.csv


In [4]:
import matplotlib.pyplot as plt
import pandas as pd

# Extract combined category values from the first row of the 'combined_cat_values' column
csv_file_path = "C:/Users/df391/OneDrive - University of Exeter/Post_Doc_Ocean_Health/HeatwaveAnalysis/PARAMETERS.csv"  # Replace with the actual path
output_loc = 'D:/OceanHealth/output/'
parameters_df = pd.read_csv(csv_file_path)
print(parameters_df)
combined_cat_values = parameters_df['combined_cat_values '].iloc[0]

# Extract the numbers from the combined_cat_values string
numbers = ''.join(filter(str.isdigit, combined_cat_values))

# Insert an underscore between the numbers
formatted_numbers = '_'.join(numbers)

data = pd.read_csv(output_loc+ f"results_cat_{formatted_numbers}.csv")

# Separate data by variable
variables = data['Parameter'].unique()

# Iterate over regions and create separate plots for each
for region in data['Region Name'].unique():
    region_data = data[data['Region Name'] == region]
    start_year = region_data['Start Year'].min()
    end_year = region_data['End Year'].max()
    combined_cat_values = region_data['combined_cat_values']

    fig, axs = plt.subplots(len(variables), 1, figsize=(8, 2 * len(variables)), sharex=False)
    fig.suptitle(f'Region: {region} ({start_year}-{end_year})', y=1.02, fontsize=16, ha='center')  # Place region name and year on top of the plots with increased font size

    for i, variable in enumerate(variables):
        # Filter data for the specific variable and region
        variable_data = region_data[region_data['Parameter'] == variable]

        # Sort the data by Months After for a better visualization
        variable_data.sort_values(by='months_after', inplace=True)

        # Create a step digital signal plot with bolder lines
        x_values = variable_data['months_after']
        y_values = (variable_data['Significance'] == "Significant").astype(int)

        axs[i].step(x_values, y_values, where='post', color='darkblue', linewidth=2.5)  # Make the lines bolder

        axs[i].set_title(f'Change in significance for {variable}', fontsize=14)  # Adjust title fontsize
        axs[i].set_yticks([0, 1])
        axs[i].set_ylim([-0.1, 1.1])  # Ensure the y-axis range is fixed

        # Make ticks and tick labels bold
        axs[i].tick_params(axis='both', which='major', labelsize=10, width=1.5, length=6)  # Adjust tick parameters

    # Write region name on top of the entire figure
    fig.text(0.5, 0.98, f'Region: {region}', fontsize=12, ha='center')

    # Set common labels for y axis
    fig.text(-0.04, 0.5, 'Significance (1: Significant, 0: Non-Significant)', va='center', rotation='vertical', fontsize=12)

    # Set custom x labels for all subplots
    for ax in axs:
        ax.set_xticks(data['months_after'].unique())
        ax.set_xticklabels(data['months_after'].unique().astype(str))

    # Place the plots for "spco2" and "alkalinity" at the bottom
    axs[-1].set_xlabel('Months', fontsize=12)  # Adjust xlabel fontsize

    plt.tight_layout()
    
    # Save the plot as an image file with region name and year
    filename = output_loc+ f'plots/{region}_{start_year}-{end_year}_cat_{formatted_numbers}_plots.png'
    fig.savefig(filename)
    
    # Close the figure to release memory
    plt.close(fig)

    start_year   end_year   longhurst_region_code  region_name   \
0          2016       2017                      53         ANTA   
1          2016       2017                      53         ANTA   
2          2016       2017                      53         ANTA   
3          2016       2017                      53         ANTA   
4          2016       2017                      53         ANTA   
5          2016       2017                      53         ANTA   
6          2016       2017                      53         ANTA   
7          2016       2017                      53         ANTA   
8          2016       2017                      53         ANTA   
9          2016       2017                      53         ANTA   
10         2016       2017                      53         ANTA   
11         2016       2017                      53         ANTA   

   combined_cat_values   consecutive_months_threshold   num_samples  \
0                [3, 4]                              3   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variable_data.sort_values(by='months_after', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variable_data.sort_values(by='months_after', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variable_data.sort_values(by='months_after', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-