In [1]:
#!/usr/bin/env python
# coding: utf-8

import numpy as np
import os
import pandas as pd
import xarray as xr
from scipy.stats import pearsonr

from paths_bra import *

from dask.diagnostics import ProgressBar
ProgressBar().register()

In [2]:
# MERRA-2 and ERA5 only unique interpolated locations
print('prepare turbine location data')
# open turbine files
wt_mer = pd.read_csv(bra_path + '/turbine_data_mer.csv', index_col=0)
wt_era = pd.read_csv(bra_path + '/turbine_data_era.csv', index_col=0)

# open wind files
wind_mer = xr.open_mfdataset(mer_path + "/eff_ws/merra2_wind_BRA_*.nc", chunks = {'time': 38})
alpha_mer = xr.open_mfdataset(mer_path + "/eff_ws/merra2_alpha_BRA_*.nc", chunks = {'time': 38})
wind_era = xr.open_mfdataset(era_path + "/eff_ws/era5_wind_BRA_*.nc", chunks = {'time': 38})
alpha_era = xr.open_mfdataset(era_path + "/eff_ws/era5_alpha_BRA_*.nc", chunks = {'time': 38})

prepare turbine location data


In [3]:
# Create dataframe with sequence the size of MERRA-2 grid to find out which turbines interpolate to the same point
in_seq_mer = xr.Dataset({'x':(['lat','lon'],
                              np.array(range(wind_mer.wh50.isel(time=0).values.size)).reshape(wind_mer.wh50.isel(time=0).values.shape))},
                         coords = {'lat':wind_mer.lat.values,
                                   'lon':wind_mer.lon.values})
in_seq_era = xr.Dataset({'x':(['lat','lon'],
                              np.array(range(wind_era.wh100.isel(time=0).values.size)).reshape(wind_era.wh100.isel(time=0).values.shape))},
                         coords = {'lat':wind_era.latitude.values,
                                   'lon':wind_era.longitude.values})

# interpolate to reanalysis grid points
ip_mer = in_seq_mer.interp(coords={"lon":xr.DataArray(wt_mer.lon,dims='location'),
                                   "lat":xr.DataArray(wt_mer.lat,dims='location')},method="nearest").to_dataframe()
ip_era = in_seq_era.interp(coords={"lon":xr.DataArray(wt_era.lon,dims='location'),
                                   "lat":xr.DataArray(wt_era.lat,dims='location')},method="nearest").to_dataframe()

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s


In [4]:
# find unique locations
uniques_mer = ip_mer.groupby(ip_mer.x).min()
uniques_era = ip_era.groupby(ip_era.x).min()

# add ids to unique correlation locations
uniques_era['cor_id'] = range(len(uniques_era.index))
uniques_mer['cor_id'] = range(len(uniques_mer.index))

# add correlation ids to wind turbine data
wt_mer['cor_id'] = ip_mer.x.map(uniques_mer.cor_id)
wt_era['cor_id'] = ip_era.x.map(uniques_era.cor_id)

# interpolate wind to unique locations and extrapolate to 90 m hubheight, which is mean hubheight in BRA
windi_mer = wind_mer.interp(coords={"lon":xr.DataArray(uniques_mer.lon,dims='location'),
                                    "lat":xr.DataArray(uniques_mer.lat,dims='location')},method="nearest")
windi_era = wind_era.interp(coords={"longitude":xr.DataArray(uniques_era.lon,dims='location'),
                                    "latitude":xr.DataArray(uniques_era.lat,dims='location')},method="nearest")
                                    
alphai_mer = alpha_mer.interp(coords={"lon":xr.DataArray(uniques_mer.lon,dims='location'),
                                      "lat":xr.DataArray(uniques_mer.lat,dims='location')},method="nearest")
alphai_era = alpha_era.interp(coords={"longitude":xr.DataArray(uniques_era.lon,dims='location'),
                                      "latitude":xr.DataArray(uniques_era.lat,dims='location')},method="nearest")

In [5]:
# calculate wind speeds at 90 m height
windhh_mer = (windi_mer.wh50 * (90/50)**alphai_mer.alpha)
windhh_era = (windi_era.wh100 * (90/100)**alphai_era.alpha)

In [6]:
# calculate cross-correlations
print('calculate cross-correlations')
if not os.path.isfile(results_path + '/corr_mer.nc'):
    c_mer = np.corrcoef(windhh_mer,rowvar=False)
    xr.DataArray(c_mer,
                 dims = ['id1','id2'],
                 coords = {'id1':range(c_mer.shape[0]),
                           'id2':range(c_mer.shape[0])}).to_netcdf(results_path + '/corr_mer.nc')
    del(c_mer)
if not os.path.isfile(results_path + '/corr_era.nc'):
    c_era = np.corrcoef(windhh_era,rowvar=False)
    xr.DataArray(c_era,
                 dims = ['id1','id2'],
                 coords = {'id1':range(c_era.shape[0]),
                           'id2':range(c_era.shape[0])}).to_netcdf(results_path + '/corr_era.nc')
    del(c_era)

calculate cross-correlations


In [7]:
# load correlations
c_mer = xr.open_dataarray(results_path + '/corr_mer.nc').values
c_era = xr.open_dataarray(results_path + '/corr_era.nc').values

In [8]:
ANL = pd.read_csv(bra_path + '/turbine_data.csv', index_col = 0)
lbl = pd.read_csv(bra_path+ '/labels_turbine_data_gwa3.csv',index_col=0)

# usinas

In [9]:
# some locations have more than one park, get shares of parks
sharesMER = ANL.cap.groupby([lbl.lbl_mer.values,ANL.name.values]).sum()/ANL.cap.groupby([lbl.lbl_mer.values,ANL.name.values]).sum().index.get_level_values(0).map(ANL.cap.groupby(lbl.lbl_mer.values).sum())
sharesERA = ANL.cap.groupby([lbl.lbl_era.values,ANL.name.values]).sum()/ANL.cap.groupby([lbl.lbl_era.values,ANL.name.values]).sum().index.get_level_values(0).map(ANL.cap.groupby(lbl.lbl_era.values).sum())
sharesMERg = ANL.cap.groupby([lbl.lbl_mer_gwa.values,ANL.name.values]).sum()/ANL.cap.groupby([lbl.lbl_mer_gwa.values,ANL.name.values]).sum().index.get_level_values(0).map(ANL.cap.groupby(lbl.lbl_mer_gwa.values).sum())
sharesERAg = ANL.cap.groupby([lbl.lbl_era_gwa.values,ANL.name.values]).sum()/ANL.cap.groupby([lbl.lbl_era_gwa.values,ANL.name.values]).sum().index.get_level_values(0).map(ANL.cap.groupby(lbl.lbl_era_gwa.values).sum())


In [10]:
# add correlation ids to shares
sharesMER = pd.DataFrame({'share':sharesMER,
                          'cor_id':sharesMER.index.get_level_values(0).map(pd.Series(wt_mer.cor_id.values,index=lbl.lbl_mer.unique()))})
sharesERA = pd.DataFrame({'share':sharesERA,
                          'cor_id':sharesERA.index.get_level_values(0).map(pd.Series(wt_era.cor_id.values,index=lbl.lbl_era.unique()))})

In [11]:
# group ids by park
cidMER = sharesMER.groupby(sharesMER.index.get_level_values(1)).cor_id.unique()
cidERA = sharesERA.groupby(sharesERA.index.get_level_values(1)).cor_id.unique()

In [12]:
def getmeancorr(ids,cors):
    idu = np.unique(ids)
    if(len(idu))>1:
        mc = np.array([list(cors[idu[i],
                       np.delete(idu,i,0)]) for i in range(len(idu))]).mean()
        #return(cors[np.unique(ids)[0],np.unique(ids)[1:]].mean())
        return(mc)
    else:
        return(1)

In [13]:
def gmc_mer(ids):
    return(getmeancorr(ids,c_mer))
def gmc_era(ids):
    return(getmeancorr(ids,c_era))

In [14]:
# get mean correlations per park
cm_USI_mer = cidMER.apply(gmc_mer)
cm_USI_era = cidERA.apply(gmc_era)

# state

In [15]:
# load matching parks
mpM = pd.read_pickle(bra_path + '/matches2.pkl')
mpH = pd.read_pickle(bra_path + '/matches2H.pkl')

In [16]:
# get correlation ids per park found in hourly and monthly data
cid_usi_merM = pd.Series(mpM.ANL_name[mpM.score==100].map(cidMER).values,index = mpM.ANL_name[mpM.score==100])
cid_usi_eraM = pd.Series(mpM.ANL_name[mpM.score==100].map(cidERA).values,index = mpM.ANL_name[mpM.score==100])
cid_usi_merH = pd.Series(mpH.ANL_name[mpH.score==100].map(cidMER).values,index = mpH.ANL_name[mpH.score==100])
cid_usi_eraH = pd.Series(mpH.ANL_name[mpH.score==100].map(cidERA).values,index = mpH.ANL_name[mpH.score==100])

In [17]:
usiest = ANL.groupby('name').state.first()

In [18]:
cm_EST_merM = cid_usi_merM.groupby(cid_usi_merM.index.map(usiest)).apply(np.concatenate).apply(np.unique).apply(gmc_mer)
cm_EST_eraM = cid_usi_eraM.groupby(cid_usi_eraM.index.map(usiest)).apply(np.concatenate).apply(np.unique).apply(gmc_era)
cm_EST_merH = cid_usi_merH.groupby(cid_usi_merH.index.map(usiest)).apply(np.concatenate).apply(np.unique).apply(gmc_mer)
cm_EST_eraH = cid_usi_eraH.groupby(cid_usi_eraH.index.map(usiest)).apply(np.concatenate).apply(np.unique).apply(gmc_era)

# subsystem

In [19]:
subH = pd.Series(['NE']*3,index=['BA','CE','RN'])
subM = pd.Series(['NE']*8+['S']*3,index=['BA','CE','MA','PB','PE','PI','RN','SE','PR','RS','SC'])

In [20]:
cm_SUB_merM = cid_usi_merM.groupby(cid_usi_merM.index.map(usiest).map(subM)).apply(np.concatenate).apply(np.unique).apply(gmc_mer)
cm_SUB_eraM = cid_usi_eraM.groupby(cid_usi_eraM.index.map(usiest).map(subM)).apply(np.concatenate).apply(np.unique).apply(gmc_era)
cm_SUB_merH = cid_usi_merH.groupby(cid_usi_merH.index.map(usiest).map(subH)).apply(np.concatenate).apply(np.unique).apply(gmc_mer)
cm_SUB_eraH = cid_usi_eraH.groupby(cid_usi_eraH.index.map(usiest).map(subH)).apply(np.concatenate).apply(np.unique).apply(gmc_era)

# Brazil

In [21]:
np.repeat(['MERRA2','ERA5'],len(cm_EST_merM))

array(['MERRA2', 'MERRA2', 'MERRA2', 'MERRA2', 'MERRA2', 'MERRA2',
       'MERRA2', 'MERRA2', 'MERRA2', 'MERRA2', 'MERRA2', 'MERRA2', 'ERA5',
       'ERA5', 'ERA5', 'ERA5', 'ERA5', 'ERA5', 'ERA5', 'ERA5', 'ERA5',
       'ERA5', 'ERA5', 'ERA5'], dtype='<U6')

In [23]:
cm_BRA_mer = gmc_mer(range(len(uniques_mer)))
cm_BRA_era = gmc_era(range(len(uniques_era)))

In [26]:
# merge correlations
print('merge all correlations')
mean_cors = pd.DataFrame({'scale': ['country']*6 + 
                                   ['state']*2*(2*len(cm_EST_merH)+len(cm_EST_merM)) + 
                                    ['subsystem']*2*(2*len(cm_SUB_merH)+len(cm_SUB_merM)) + 
                                    ['park']*6*len(cm_USI_mer),
                          'region': ['BRA',]*6 + 
                                    cm_EST_merH.index.to_list()*4 + cm_EST_merM.index.to_list()*2 +
                                    ['NE']*4 + ['NE','S']*2 + 
                                    cm_USI_mer.index.to_list()*6,
                          'dataset': np.repeat(['MERRA2','ERA5'],3).tolist() + 
                                     np.repeat(['MERRA2','ERA5'],len(cm_EST_merH)).tolist()*2 + np.repeat(['MERRA2','ERA5'],len(cm_EST_merM)).tolist()+ 
                                     np.repeat(['MERRA2','ERA5'],len(cm_SUB_merH)).tolist()*2 + np.repeat(['MERRA2','ERA5'],len(cm_SUB_merM)).tolist() +
                                     np.repeat(['MERRA2','ERA5'],len(cm_USI_mer)).tolist()*3,
                          'temp':['m','d','h']*2+
                                 np.repeat(['h','d'],2*len(cm_EST_merH)).tolist()+['m']*2*len(cm_EST_merM)+
                                 np.repeat(['h','d'],2*len(cm_SUB_merH)).tolist()+['m']*2*len(cm_SUB_merM)+
                                  np.repeat(['m','d','h'],2*len(cm_USI_mer)).tolist(),
                          'cor':np.repeat([cm_BRA_mer,cm_BRA_era],3).tolist() +
                                (cm_EST_merH.values.tolist() + cm_EST_eraH.values.tolist())*2 + cm_EST_merM.values.tolist() + cm_EST_eraM.values.tolist() +
                                 (cm_SUB_merH.values.tolist() + cm_SUB_eraH.values.tolist())*2 + cm_SUB_merM.values.tolist() + cm_SUB_eraM.values.tolist() +
                                 (cm_USI_mer.values.tolist() + cm_USI_era.values.tolist())*3})

merge all correlations


In [27]:
# save wind speed correlations
print('save wind speed correlations')
mean_cors.to_csv(results_path + '/correlations_wind.csv')

save wind speed correlations
