# Notebook for performing Mann-Whitney U-tests for statistical significance

### Import stats libraries, namely the `mannwhitneyu` function from `scipy.stats` and `multipletests` from `statsmodels`

In [1]:
from statsmodels.sandbox.stats.multicomp import multipletests
from scipy.stats import mannwhitneyu
import xarray as xr
import netCDF4 as nc
from netCDF4 import Dataset
import numpy as np

## Import WRF daily max files (resampled to convective daily max) for each climate epoch

In [2]:
# Open one daily-max hail dataset per epoch/scenario. Per the file names these are the
# historical period plus mid- and end-of-century runs under the "4p5" and "8p5" scenarios.
# NOTE(review): paths are absolute and user-specific; the notebook will not run elsewhere
# without editing them.
wrf_hist = xr.open_dataset('/home/scratch/jgoodin/convective_daily_max_resamples/hist_daily_max_hail_inches.nc')
wrf_mid4p5 = xr.open_dataset('/home/scratch/jgoodin/convective_daily_max_resamples/mid_century_4p5_daily_max_hail_inches.nc')
wrf_mid8p5 = xr.open_dataset('/home/scratch/jgoodin/convective_daily_max_resamples/mid_century_8p5_daily_max_hail_inches.nc')
wrf_end4p5 = xr.open_dataset('/home/scratch/jgoodin/convective_daily_max_resamples/end_century_4p5_daily_max_hail_inches.nc')
wrf_end8p5 = xr.open_dataset('/home/scratch/jgoodin/convective_daily_max_resamples/end_century_8p5_daily_max_hail_inches.nc')

In [3]:
# Display the historical dataset's repr to inspect its dimensions and variables.
wrf_hist

## Apply the severe hail day threshold

In [4]:
# Turn each dataset into a 0/1 indicator: the comparison yields a boolean mask and the
# multiplication by 1 converts it to integers (1 where the value is >= 0.0254, else 0).
# NOTE(review): 0.0254 m equals 1 inch, but the input file names end in "_hail_inches".
# Confirm the units of the data -- if the values are already in inches the threshold
# for 1-inch hail would be 1.0, not 0.0254.
hist_sev = 1 * (wrf_hist >= 0.0254)
mid4p5_sev = 1 * (wrf_mid4p5 >= 0.0254)
#mid8p5_sev = 1 * (wrf_mid8p5 >= 0.0254) #Can change threshold to larger hail as well
#end4p5_sev = 1 * (wrf_end4p5 >= 0.0254)
#end8p5_sev = 1 * (wrf_end8p5 >= 0.0254)

In [5]:
# Check the shape of the indicator array; the recorded output shows three dimensions.
hist_sev.HAIL_MAX2D.shape

(5479, 899, 1399)

### Group by year and sum along the 'Time' dimension, which gives the number of severe hail days per year

In [6]:
# Group the 0/1 indicators by calendar year of the 'Time' coordinate and sum over 'Time',
# so each grid cell ends up with one count per year.
hist_annual = hist_sev.groupby('Time.year').sum(dim = 'Time')
mid4p5_annual = mid4p5_sev.groupby('Time.year').sum(dim = 'Time')
#mid8p5_annual = mid8p5_sev.groupby('Time.year').sum(dim = 'Time') #Groupby year and sum sev days across time dim to provide annual sev hail day count
#end4p5_annual = end4p5_sev.groupby('Time.year').sum(dim = 'Time') #Can change to seasonal as well
#end8p5_annual = end8p5_sev.groupby('Time.year').sum(dim = 'Time')

In [7]:
# Display the annual-count dataset to confirm the grouped result before testing.
hist_annual

### Select array containing HAIL_MAX2D values to send into Mann-Whitney

In [None]:
"""
hist_annual = hist_annual.HAIL_MAX2D
mid4p5_annual = mid4p5_annual.HAIL_MAX2D
mid8p5_annual = mid8p5_annual.HAIL_MAX2D #Select values (2D array) of HAIL_MAX2D to pass to Mann-Whitney U-test
end4p5_annual = end4p5_annual.HAIL_MAX2D
end8p5_annual = end8p5_annual.HAIL_MAX2D
"""

In [None]:
#hist_annual.shape #Make sure shape is 2D

## Import netcdf file containing lat/lon coords for WRF files

In [8]:
# NOTE(review): relative path -- resolved against the kernel's working directory,
# unlike the absolute paths used for the hail datasets.
coords = xr.open_dataset('lat_lon.nc') #Open .nc file containing WRF lat/lon coords

### Assign lat/lon coords from file to variables

In [9]:
# Take index 0 along the leading axis of CLAT/CLONG to obtain 2-D latitude and
# longitude arrays (presumably the leading axis is time -- TODO confirm).
lats = coords.CLAT.values[0, :, :]
lons = coords.CLONG.values[0, :, :]

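## Perform Mann-Whitney U-test at each grid cell (a rank-based test of whether one epoch tends to have larger values than the other; it is not strictly a test of means or medians), incorporating false-discovery-rate correction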

In [1]:
def field_significance(hist_annual, mid4p5_annual):
    """Run a two-sided Mann-Whitney U-test at every grid cell.

    Parameters
    ----------
    hist_annual, mid4p5_annual : objects exposing a ``HAIL_MAX2D`` attribute
        ``HAIL_MAX2D`` must be a 3-D array-like indexed as ``[sample, i, j]``.
        The leading (sample) axis may differ in length between the two inputs,
        but the two trailing grid axes must match.

    Returns
    -------
    numpy.ndarray
        2-D float array with the shape of the trailing grid axes, holding the
        two-sided p-value for each cell. A cell is NaN when ``mannwhitneyu``
        raises ``ValueError`` for it (some SciPy versions do so when every
        value in both samples is identical).

    Raises
    ------
    ValueError
        If the grid axes of the two inputs do not match.
    """
    # Convert once to plain NumPy arrays: indexing the original container
    # element-by-element inside the double loop is needlessly slow.
    hist_vals = np.asarray(hist_annual.HAIL_MAX2D)
    future_vals = np.asarray(mid4p5_annual.HAIL_MAX2D)

    if hist_vals.shape[1:] != future_vals.shape[1:]:
        raise ValueError(
            "Grid shapes differ: %s vs %s" % (hist_vals.shape[1:], future_vals.shape[1:])
        )

    n_rows, n_cols = hist_vals.shape[1], hist_vals.shape[2]

    #sets up p-value array (default 1 - not significant)
    results = np.ones(shape=(n_rows, n_cols), dtype=float)

    #loop through the y and x dimensions
    for i in range(n_rows):
        for j in range(n_cols):
            try:
                # Be explicit about the alternative: the default has changed
                # between SciPy releases (older ones returned a one-sided p-value).
                _, p_value = mannwhitneyu(
                    hist_vals[:, i, j], future_vals[:, i, j], alternative='two-sided'
                )
            except ValueError:
                # Only the test's own input complaints are tolerated; any other
                # exception (e.g. a missing import) indicates a real bug and
                # must not be hidden as an all-NaN field.
                results[i, j] = np.nan
            else:
                results[i, j] = p_value

    return results

#Send your arrays into 'field_significance' function, sample below
res = field_significance(hist_annual, mid4p5_annual)
#clip to your region if needed
#res_mask = np.ma.masked_where(clip==False, res).filled(np.nan)
#flat_max = mask_fin[0].values.flatten()

#Perform the false discovery rate test
# Only finite p-values take part in the correction. Cells where the U-test could not
# be computed are NaN; passing them through would count them as tests and make the
# Benjamini-Hochberg threshold stricter than intended. Those cells stay False.
flat_p = res.flatten()
valid = np.isfinite(flat_p)
p = np.zeros(flat_p.shape, dtype=bool)
if valid.any():
    # Element [0] of the multipletests result is the boolean "reject H0" array.
    p[valid] = multipletests(flat_p[valid], alpha=0.1, method='fdr_bh')[0]

#Save output
# Reshape with the p-value grid's own shape so the saved mask always matches `res`.
np.save('/home/scratch/jgoodin/fdr_delta_hist_mid4p5_annual_sev_hail_days.npy',p.reshape(res.shape))

"\ndef field_significance(hist_annual, mid4p5_annual):\n    \n    #sets up p-value array (default 1 - not significant)\n    results = np.ones(shape=(hist_annual.HAIL_MAX2D.shape[1], hist_annual.HAIL_MAX2D.shape[2]), dtype=float)\n    \n    #loop through the y and x dimensions\n    for i in range(hist_annual.HAIL_MAX2D.shape[1]):\n        for j in range(hist_annual.HAIL_MAX2D.shape[2]):\n            \n            dist1 = hist_annual.HAIL_MAX2D[:, i, j]\n            dist2 = mid4p5_annual.HAIL_MAX2D[:, i, j]\n            \n            try:\n                s, p = mannwhitneyu(dist1, dist2)\n                results[i, j] = p\n            except Exception as e:\n                results[i, j] = np.nan\n            \n    return results\n\n#Send your arrays into 'field_significance' function, sample below\nres = field_significance(hist_annual, mid4p5_annual)\n#clip to your region if needed\n#res_mask = np.ma.masked_where(clip==False, res).filled(np.nan)\n#flat_max = mask_fin[0].values.flatten(