In [1]:
import datetime
from dateutil.relativedelta import *
from fuzzywuzzy import fuzz
import glob
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp
import sys
import xarray as xr


from paths_bra import *

sys.path.append('./..')
from refuelplot import *
setup()

from utils import *

In [2]:
# directory holding the generation data; built from bra_path, which is not defined in the
# visible part of this notebook (presumably provided by the paths_bra star import)
gen_path = bra_path + '/generation'

In [3]:
## load generation data
# load usinas hourly
# The parsed table is cached as a pickle; the CSV is only parsed when the cache is missing.
if not glob.glob(gen_path + '/hourly/usinas.pkl'):
    USIh = pd.read_csv(gen_path + '/hourly/Comparativo_Geração_de_Energia_Semana_data_usinas.csv',
                       sep = ';', index_col = 0, parse_dates = True, dayfirst = True).iloc[1:,[6,8]].sort_index()
    # remove missing values
    USIh = USIh.loc[USIh.index.notnull()].dropna()
    USIh.columns = ['usina','prod_GWh']

    # in RIO DO FOGO there is one duplicate hour after one missing hour -> change timestamps of those hours
    # work on a copy of the index values so the assignment below never writes into the
    # (possibly read-only) buffer backing the original index
    idxUSIh = USIh.index.to_numpy(copy=True)
    midxUSIh = USIh.reset_index().set_index(['usina','Data Escala de Tempo 1 GE Comp 3']).index
    # every occurrence but the last of a duplicated (usina, timestamp) pair is moved one hour
    # before the timestamp of the occurrences flagged by keep='first'
    idxUSIh[midxUSIh.duplicated(keep='last')]  = idxUSIh[midxUSIh.duplicated(keep='first')] - np.timedelta64(1,'h')
    USIh.index = pd.DatetimeIndex(idxUSIh)

    # wide table: one row per timestamp, one column per usina
    USIhs = USIh.reset_index().set_index(['usina','index']).unstack(level=0).prod_GWh
    USIhs.to_csv(gen_path + '/hourly/usinas.csv')
    USIhs.to_pickle(gen_path + '/hourly/usinas.pkl')
# Always load from the cache so both code paths yield the same objects.
wpUSIhs = pd.read_pickle(gen_path + '/hourly/usinas.pkl')
# Later cells work on USIhs; previously it only existed when the CSV branch above had run,
# so a fresh kernel with an existing cache failed with a NameError.
USIhs = wpUSIhs.copy()

In [5]:
# Blank out the zeros before the first and after the last non-zero value of every column:
# while the running sum (NaN counted as 0) is still 0, the entry is set to NaN;
# the same is done on the reversed frame for the trailing end.
USIhs[USIhs.fillna(0).cumsum(axis=0)==0] = np.nan # remove leading 0s
USIhs[USIhs[::-1].fillna(0).cumsum(axis=0)[::-1]==0] = np.nan # remove trailing 0s

# NOTE(review): USIms is not defined in the visible part of this notebook
# (presumably the monthly counterpart of USIhs, created in a cell that is not shown)
USIms[USIms.fillna(0).cumsum(axis=0)==0] = np.nan # remove leading 0s
USIms[USIms[::-1].fillna(0).cumsum(axis=0)[::-1]==0] = np.nan # remove trailing 0s

In [6]:
def get_cap_df(cap,comdate):
    """Build an hourly series (2006-2019) of cumulative installed capacity.

    Parameters
    ----------
    cap : array-like
        Capacity of each unit.
    comdate : array-like
        Commissioning date of each unit, aligned with ``cap``; either
        timestamps or integer years.

    Returns
    -------
    pandas.Series
        Cumulative capacity on an hourly index from 2006-01-01 00:00 to
        2019-12-31 23:00. With integer years the capacity is interpolated
        linearly between year ends, otherwise it is stepped (forward filled)
        from each commissioning timestamp. Hours before the first known
        capacity stay NaN.
    """
    com = pd.DataFrame({'capacity': cap}).groupby(comdate).sum()
    cap_cum = com.capacity.cumsum()
    # if only years given for commissioning dates -> gradual capacity increase over year, full capacity at end of year
    if isinstance(cap_cum.index.values[0], np.integer):
        cum_by_year = cap_cum
        cap_cum = pd.Series(cum_by_year.values,
                            index = [np.datetime64(str(int(year))+"-12-31 23:00:00") for year in cum_by_year.index.values])
        # create yearly dates at yearends
        drcc = pd.date_range(np.datetime64('2005-12-31 23:00:00'),
                             np.datetime64('2019-12-31 23:00:00'),freq= 'y')
        cap_cum = pd.Series(drcc.map(cap_cum),index = drcc)
        # first year end: capacity installed in earlier years, or 0 if nothing before
        # NOTE(review): the threshold is the year 2000 although the series starts at the end
        # of 2005 - capacity commissioned 2000-2005 is ignored here; confirm this is intended
        early = com.index < 2000
        if early.sum() > 0:
            # scalar from the capacity column (the former DataFrame.max() returned a Series)
            cap_cum.iloc[0] = cum_by_year[early].max()
        else:
            cap_cum.iloc[0] = 0
        # if missing years -> put capacity of year before
        cap_cum = cap_cum.ffill()
    dr = pd.date_range('1/1/2006','31/12/2019 23:00:00',freq = 'h')
    # look up each hour in cap_cum; hours without an entry become NaN and are filled below
    cap_ts = pd.Series(dr.map(cap_cum),index = dr)
    # starting value: capacity already installed at the beginning of 2006
    # (.iloc makes the positional assignment explicit on the DatetimeIndex)
    cap_ts.iloc[0] = cap_cum[cap_cum.index<=pd.Timestamp('2006-01-01')].max()
    if isinstance(comdate[0], np.integer):
        return cap_ts.interpolate(method='linear')
    return cap_ts.ffill()

In [7]:
# load and match aneel and ons windparks
def matchWords(word, statements):
    """Score ``word`` against every entry of ``statements``.

    Returns a list with one ``fuzz.ratio`` value (0-100) per statement,
    in the order of ``statements``.
    """
    return [fuzz.ratio(word, candidate) for candidate in statements]

def match_string(string, array):
    """Find the entry of ``array`` that is most similar to ``string``.

    Both sides are stripped and casefolded before scoring.

    Returns a tuple ``(string, best_entry, best_score)`` where ``best_entry``
    is the first unique value of ``array`` reaching the highest score.
    """
    needle = string.strip().casefold()
    haystack = [entry.casefold() for entry in array.str.strip().unique()]
    scores = matchWords(needle, haystack)
    mscore = max(scores)
    # position of the first candidate that reaches the best score
    first_best = np.flatnonzero(np.array(scores) == mscore)[0]
    candidates = array.unique()
    return (string, candidates[first_best], mscore)

def match_anl(string):
    """Match one ONS wind park name against the names in ``ANL2.name``.

    Returns the ``(string, best_entry, best_score)`` tuple of ``match_string``.
    """
    anl_names = ANL2.name
    return match_string(string, anl_names)

# read the ONS and ANEEL wind park tables
ONS = pd.read_csv(bra_path + '/ONS_windparks.csv', index_col = 0)
# drop the CONJUNTO EOLICO entries: they appear twice and their capacities do not agree with the ANEEL data
ONS = ONS.loc[~ONS.usina.str.contains('CONJUNTO EOLICO')]
# drop a few further duplicated wind parks
ONS = ONS.loc[~ONS.usina.isin([' CANOA QUEBRADA (E-RV-ACEP)',
                               ' PV DO NORDESTE',
                               ' SM (SANTA MARIA)',
                               ' SÃO BENTO NORTE II'])]
ANL = pd.read_csv(bra_path + '/turbine_data.csv', index_col = 0)

# characters and strings to replace for better matching
# All four dictionaries are applied in insertion order as regular-expression patterns.
# letters: strip accents; the mis-encoded 'Ãµ' must come before the plain 'Ã'
letters = {'Ãµ':'õ',
           'ó':'o',
           'ã':'a',
           'á':'a',
           'â':'a',
           'é':'e',
           'Ã':'A',
           'Á':'A',
           'Â':'A',
           'Ó':'O',
           'É':'E',
           'ú':'u',
           'ô':'o',
           'Ô':'O',
           'Ú':'U',
           'ç':'c',
           'Ç':'C',
           'í':'i',
           'Í':'I',
           'Ê':'E'}
# remove: fragments deleted from the ONS names (raw strings keep the regex escapes valid Python)
remove = {' 2LER':'',
          ' 2LFA':'',
          ' LFA':'',
          'EOL ':'',
          ' 3LER':'',
          'Usina Eolica ':'',
          'Eólica ':'',
          ' ENERGIAS RENOVAVEIS':'',
#          ' CONJUNTO EOLICO':'',
          r'\(E-BV-ACEP\)':'',
          r'\(E-RV-ACEP\)':'',
          r'\(BELA BISTA\)':'',
          r'\(ENERGEN\)':'',
          r'\(Antiga Ventos Maranhenses 05\)':'',
          'PARQUE EOLICO ':'',
          ' - N HORIZ':'',
          'ENERGETICA S/A':'',
          r'\(ILHEUS\)':'',
          ' EOLOS':'',
          r'S\.A\.':''}
# replace: harmonise spellings between the two sources
# NOTE(review): 'SANTO ANTONIO (BTG PACTUAL)' is not escaped like the other parenthesised
# patterns, so as a regular expression it will not match the literal parentheses - confirm
replace = {'LAG DO':'LAGOA DO',
           'VENTOS S VICENTE':'VENTOS DE SAO VICENTE',
           'SERRA BABILONIA':'SERRA DA BABILONIA',
           'CORREDOR SENANDES':'CORREDOR DO SENANDES',
           'SAO BENTO NORTE':'SAO BENTO DO NORTE',
           'GAMELEIRAS':'GAMELERIAS',
           'Lagoinha':'Lagoinh',
           'PAPAGAIOS':'PAPAGAIO',
           'VENTOS DE SAO ABRAAO':'VENTOS DO SANTO ABRAAO',
           'VENTOS DO SAO MARIO':'VENTOS DE SAO MARIO',
           'DAGUA':'D AGUA',
           'B VEN':'BONS VENTOS',
           'NOVA BURITI':'BURITI',
           'NOVA CAJUCOCO':'CAJUCOCO',
           'PALMAS':'DE PALMAS',
           'DE PALMARES':'PALMARES',
           'PV DO NORDESTE':'VENTOS DO NORDESTE',
           'Aura Lagoa do Barro':'Lagoa do Barro',
           'AURA LAGOA DO BARRO':'LAGOA DO BARRO',
           'LAGOA BARRO':'LAGOA DO BARRO',
           'GRAVATA':'GRAVATA FRUITRADE',
           'FAZENDA DO ROSARIO':'FAZENDA ROSARIO',
           'Parque Eolico do Horizonte':'Ventos de Horizonte',
           'S BENTO':'SAO BENTO',
           'SANTO ANTONIO (BTG PACTUAL)':'SANTO ANTONIO DE PADUA',
           r'SM \(SANTA MARIA\)':'SANTA MARIA',
           'SAO JORGE CE':'SAO JORGE',
           'VENT DA ST ESPERANCA':'VENTOS DA SANTA ESPERANCA',
           'VENTOS DA STA DULCE':'VENTOS DA SANTA DULCE',
           'ESPERANCA NORDESTE':'ESPERANCA DO NORDESTE',
           'Eolica Delta':'Delta',
           'Eolica Serra das Vacas':'Serra das Vacas',
           'Ventos de Santo Augusto':'Santo Augusto',
           'Ventos do Sao Gabriel':'Sao Gabriel',
           'GE Maria Helena':'Maria Helena'}
# numbers: arabic -> roman numerals; two-digit keys come first so that e.g. '10' is
# converted before the single digit '1' can touch it
numbers = {'10':'X',
           '11':'XI',
           '12':'XII',
           '13':'XIII',
           '14':'XIV',
           '15':'XV',
           '17':'XVII',
           '19':'XIX',
           '21':'XXI',
           '23':'XXIII',
           '24':'XXIV',
           '25':'XXV',
           '26':'XXVI',
           '27':'XXVII',
           '28':'XXVIII',
           '29':'XXIX',
           '31':'XXXI',
           '34':'XXXIV',
           '35':'XXXV',
           '36':'XXXVI',
           '01':'I',
           '02':'II',
           '03':'III',
           '04':'IV',
           '05':'V',
           '06':'VI',
           '07':'VII',
           '08':'VIII',
           '09':'IX',
           '1':'I',
           '2':'II',
           '3':'III',
           '4':'IV',
           '5':'V',
           '6':'VI',
           '7':'VII',
           '8':'VIII',
           '9':'IX'}

# replace characters
# The originals stay untouched; all substitutions go into the copies ONS2 / ANL2.
ONS2 = ONS.copy(deep=True)
ANL2 = ANL.copy(deep=True)
# regex=True is passed explicitly: several patterns are written as regular expressions
# (e.g. '\(ILHEUS\)'), and the default of Series.str.replace is no longer regex matching
# in recent pandas versions.
for pattern, substitute in letters.items():
    ONS2['usina'] = ONS2['usina'].str.replace(pattern, substitute, regex=True)
    ANL2['name'] = ANL2['name'].str.replace(pattern, substitute, regex=True)
for pattern, substitute in replace.items():
    ONS2['usina'] = ONS2['usina'].str.replace(pattern, substitute, regex=True)
    ANL2['name'] = ANL2['name'].str.replace(pattern, substitute, regex=True)
# the removals are applied to the ONS names only
for pattern, substitute in remove.items():
    ONS2['usina'] = ONS2['usina'].str.replace(pattern, substitute, regex=True)
for pattern, substitute in numbers.items():
    ONS2['usina'] = ONS2['usina'].str.replace(pattern, substitute, regex=True)
    ANL2['name'] = ANL2['name'].str.replace(pattern, substitute, regex=True)

# match windparks
# match_anl returns a (name, best match, score) tuple per ONS park;
# apply(pd.Series) spreads each tuple over three columns
matches = ONS2.usina.apply(match_anl).apply(pd.Series)
matches.columns = ['ONS_name','ANL_name','score']
# number of ONS parks without a perfect (score 100) match
len(matches[matches.score<100])

53

In [8]:
# lookup tables from the simplified names (index) back to the original names (values)
ONSd = pd.Series(ONS.usina.values,index=ONS2.usina.values)
ANLd = pd.Series(ANL.name.values,index=ANL2.name.values)
# (the former `.columns = [...]` assignments were dropped: a Series has no columns,
# so they only attached an unused attribute)

### get only matching power generation timeseries

In [9]:
# translate the simplified names in `matches` back to the original ANEEL / ONS names
matches2 = matches.assign(
    ANL_name=matches.ANL_name.map(ANLd.drop_duplicates()),
    ONS_name=matches.ONS_name.map(ONSd),
)[['ANL_name', 'ONS_name', 'score']]

In [184]:
# keep only the matches whose ONS park has a column in the hourly table USIhs
matches2H = matches2.loc[
    [usi in USIhs.columns.values for usi in matches2.ONS_name]
].copy(deep=True)

In [922]:
# store both match tables as pickles below bra_path
matches2.to_pickle(bra_path + '/matches2.pkl')
matches2H.to_pickle(bra_path + '/matches2H.pkl')

In [10]:
# columns of USIms for the parks with a perfect match (score 100)
# NOTE(review): USIms is not defined in the visible part of this notebook
wpUSIm = USIms[matches2.ONS_name[matches2.score==100].values]

In [173]:
# columns of the hourly table USIhs for the parks with a perfect match (score 100)
wpUSIh = USIhs[matches2H[matches2H.score==100].ONS_name.values]

In [None]:
# keep only the ANEEL rows that belong to a perfectly matched park
def _anl_rows_named(names):
    """Return the rows of ANL whose name occurs in the Series ``names``."""
    wanted = names.values
    return ANL[[name in wanted for name in ANL.name]]

ANLmatch = _anl_rows_named(matches2.ANL_name[matches2.score==100])
ANLmatchH = _anl_rows_named(matches2H.ANL_name[matches2H.score==100])

### load simulated data

In [13]:
# open the wind power result files stored in results_path
wpERAxr = xr.open_dataset(results_path + '/windpower_stat_ERA5.nc')
wpMERxr = xr.open_dataset(results_path + '/windpower_stat_MERRA2.nc')
# the GWA variants are spread over several files (two-character wildcard) and opened as one dataset each
wpERAgxr = xr.open_mfdataset(results_path +'/windpower_??_ERA5_GWA.nc')
wpMERgxr = xr.open_mfdataset(results_path +'/windpower_??_MERRA2_GWA.nc')

In [14]:
# turbine tables for the four simulation variants (MERRA-2 / ERA5, each with and without GWA)
turb_mer = pd.read_csv(bra_path + '/turbine_data_mer.csv',index_col=0)
turb_era = pd.read_csv(bra_path + '/turbine_data_era.csv',index_col=0)
turb_merg = pd.read_csv(bra_path + '/turbine_data_mer_gwa3.csv',index_col=0)
turb_erag = pd.read_csv(bra_path + '/turbine_data_era_gwa3.csv',index_col=0)

In [15]:
# location labels; the columns lbl_mer, lbl_era, lbl_mer_gwa and lbl_era_gwa are used
# further down, positionally aligned with the rows of ANL
lbl = pd.read_csv(bra_path+ '/labels_turbine_data_gwa3.csv',index_col=0)

In [16]:
# prepare simulated data as dataframe
wpMERdf = wpMERxr.to_dataframe().unstack().wp
wpERAdf = wpERAxr.to_dataframe().unstack().wp
# the GWA datasets get a running integer as location coordinate
wpMERgdf = wpMERgxr.assign_coords(location=range(len(wpMERgxr.location.values))).to_dataframe().unstack().wp
wpERAgdf = wpERAgxr.assign_coords(location=range(len(wpERAgxr.location.values))).to_dataframe().unstack().wp


def _park_shares(labels):
    """Share of every park in the capacity of its location.

    labels: location label per row of ANL. Returns a Series indexed by
    (location label, park name).
    """
    cap_per_park = ANL.cap.groupby([labels, ANL.name.values]).sum()
    cap_per_location = ANL.cap.groupby(labels).sum()
    return cap_per_park / cap_per_park.index.get_level_values(0).map(cap_per_location)


def _generation_per_park(wp_df, shares):
    """Split the generation per location among its parks; returns a time x park frame."""
    # integer codes of the location level, used as row labels of wp_df;
    # np.asarray works whether pandas hands out plain arrays or wrapper objects
    # NOTE(review): codes equal the labels only if the labels are 0..n-1 without gaps - confirm
    location_codes = np.asarray(shares.index.codes[0])
    park_names = shares.index.get_level_values(1).values
    return wp_df.loc[location_codes].mul(shares.values,axis=0).groupby(park_names).sum().transpose()


def _shift_late_index(index):
    """Move all timestamps from 2018-12 onwards back by half an hour."""
    early = index[index<'2018-12']
    late = index[index>='2018-12'] - np.timedelta64(30,'m')
    return early.append(late)


# some locations have more than one park, get shares of parks
sharesMER = _park_shares(lbl.lbl_mer.values)
sharesERA = _park_shares(lbl.lbl_era.values)
sharesMERg = _park_shares(lbl.lbl_mer_gwa.values)
sharesERAg = _park_shares(lbl.lbl_era_gwa.values)
# get generation per park
wpMER = _generation_per_park(wpMERdf, sharesMER)
wpERA = _generation_per_park(wpERAdf, sharesERA)
wpMERg = _generation_per_park(wpMERgdf, sharesMERg)
wpERAg = _generation_per_park(wpERAgdf, sharesERAg)
# adapt index of the late MERRA data (subtract half an hour)
wpMER.index = _shift_late_index(wpMER.index)
# each frame is shifted using its own index (wpMERg used to be rebuilt from wpMER's index)
wpMERg.index = _shift_late_index(wpMERg.index)
# set time zones
# NOTE(review): in the tz database 'Etc/GMT-3' denotes UTC+3 (the sign is inverted);
# if UTC-3 is meant, the zone would be 'Etc/GMT+3' - confirm against the observed data
wpMER = wpMER.tz_localize('UTC').tz_convert('Etc/GMT-3')
wpERA = wpERA.tz_localize('UTC').tz_convert('Etc/GMT-3')
wpMERg = wpMERg.tz_localize('UTC').tz_convert('Etc/GMT-3')
wpERAg = wpERAg.tz_localize('UTC').tz_convert('Etc/GMT-3')
# the observed frames are overwritten in place: running this cell twice fails because
# they are already time zone aware
wpUSIh = wpUSIh.tz_localize('Etc/GMT-3')
wpUSIm = wpUSIm.tz_localize('Etc/GMT-3')

# Usinas

In [269]:
def analyseUSIh(parks):
    """Compare simulated and observed hourly capacity factors of one wind park.

    parks: row of a match table providing ``ANL_name`` and ``ONS_name``.

    Returns a one-row DataFrame (row label: ANEEL name) with the values of
    ``stats`` ('cor', 'rmse', 'mbe', 'avg') for every simulated dataset plus
    the mean observed capacity factor, under (param, dataset) columns.
    """
    compUSIh= pd.DataFrame({'MERRA2':wpMER[parks.ANL_name],
                            'ERA5':wpERA[parks.ANL_name],
                            'MERRA2_GWA':wpMERg[parks.ANL_name],
                            'ERA5_GWA':wpERAg[parks.ANL_name],
                            # observations are scaled by 1e6 (column was named prod_GWh on
                            # loading; presumably GWh -> kWh to match the simulation)
                            'wp_obs':wpUSIh[parks.ONS_name]*10**6})
    # get capacities
    park_units = ANL[ANL.name==parks.ANL_name]
    # cast with an explicit unit: the unit-less np.datetime64 is rejected by recent pandas
    capUSIh = get_cap_df(park_units.cap.values,
                         park_units.commissioning.astype('datetime64[ns]').values).tz_localize('UTC').tz_convert('Etc/GMT-3')
    # calculate capacity factors
    cf_USIh = compUSIh.div(capUSIh,axis=0).dropna()
    stat_h = pd.DataFrame({'ERA5':stats(cf_USIh.ERA5,cf_USIh.wp_obs,False),
                           'ERA5_GWA':stats(cf_USIh.ERA5_GWA,cf_USIh.wp_obs,False),
                           'MERRA2':stats(cf_USIh.MERRA2,cf_USIh.wp_obs,False),
                           'MERRA2_GWA':stats(cf_USIh.MERRA2_GWA,cf_USIh.wp_obs,False),
                           # for the observations only the average is defined
                           'obs':[np.nan,np.nan,np.nan,cf_USIh.wp_obs.mean()]},
                          index = ['cor','rmse','mbe','avg']).reset_index().melt(id_vars=['index']).dropna()
    stat_h.columns = ['param','dataset',parks.ANL_name]
    return(stat_h.set_index(['param','dataset']).transpose())

In [270]:
def analyseUSId(parks):
    """Compare simulated and observed daily capacity factors of one wind park.

    parks: row of a match table providing ``ANL_name`` and ``ONS_name``.

    Hourly generation and capacity are restricted to hours where all series
    are available, summed per day and divided. Returns a one-row DataFrame
    (row label: ANEEL name) with (param, dataset) columns.
    """
    compUSIh= pd.DataFrame({'MERRA2':wpMER[parks.ANL_name],
                            'ERA5':wpERA[parks.ANL_name],
                            'MERRA2_GWA':wpMERg[parks.ANL_name],
                            'ERA5_GWA':wpERAg[parks.ANL_name],
                            'wp_obs':wpUSIh[parks.ONS_name]*10**6})
    # get capacities
    park_units = ANL[ANL.name==parks.ANL_name]
    # cast with an explicit unit: the unit-less np.datetime64 is rejected by recent pandas
    capUSIh = get_cap_df(park_units.cap.values,
                         park_units.commissioning.astype('datetime64[ns]').values).tz_localize('UTC').tz_convert('Etc/GMT-3')
    # generation and capacity side by side, only complete hours
    ccUSIh = pd.concat([compUSIh,capUSIh],axis=1).dropna()
    ccUSIh.columns = compUSIh.columns.tolist() + ['cap']
    # aggregate daily
    ccUSId = ccUSIh.resample('D').sum()
    # NOTE(review): days without any complete hour have cap 0 and yield NaN here;
    # unlike analyseESTd no dropna() follows - confirm that stats() copes with NaN
    cf_USId = ccUSId.drop('cap',axis=1).div(ccUSId.cap,axis=0)
    stat_d = pd.DataFrame({'ERA5':stats(cf_USId.ERA5,cf_USId.wp_obs,False),
                           'ERA5_GWA':stats(cf_USId.ERA5_GWA,cf_USId.wp_obs,False),
                           'MERRA2':stats(cf_USId.MERRA2,cf_USId.wp_obs,False),
                           'MERRA2_GWA':stats(cf_USId.MERRA2_GWA,cf_USId.wp_obs,False),
                           'obs':[np.nan,np.nan,np.nan,cf_USId.wp_obs.mean()]},
                          index = ['cor','rmse','mbe','avg']).reset_index().melt(id_vars=['index']).dropna()
    stat_d.columns = ['param','dataset',parks.ANL_name]
    return(stat_d.set_index(['param','dataset']).transpose())

In [470]:
def analyseUSIm(parks):
    """Compare simulated and observed monthly capacity factors of one wind park.

    parks: row of a match table providing ``ANL_name`` and ``ONS_name``.

    Returns a one-row DataFrame (row label: ANEEL name) with (param, dataset)
    columns.
    """
    # NOTE(review): rows are dropped wherever any series is missing *before* the monthly
    # sum; if wpUSIm carries one value per month, the simulated columns are summed over
    # those timestamps only, while the capacity below is summed over all hours - confirm
    compUSIh= pd.DataFrame({'MERRA2':wpMER[parks.ANL_name],
                            'ERA5':wpERA[parks.ANL_name],
                            'MERRA2_GWA':wpMERg[parks.ANL_name],
                            'ERA5_GWA':wpERAg[parks.ANL_name],
                            'wp_obs':wpUSIm[parks.ONS_name]*10**6}).dropna()
    compUSIm = compUSIh.resample('M').sum()
    # get capacities
    park_units = ANL[ANL.name==parks.ANL_name]
    # cast with an explicit unit: the unit-less np.datetime64 is rejected by recent pandas
    capUSIh = get_cap_df(park_units.cap.values,
                         park_units.commissioning.astype('datetime64[ns]').values).tz_localize('UTC').tz_convert('Etc/GMT-3')
    capUSIm = capUSIh.resample('M').sum()
    # calculate capacity factors
    cf_USIm = compUSIm.div(capUSIm,axis=0).dropna()
    stat_m = pd.DataFrame({'ERA5':stats(cf_USIm.ERA5,cf_USIm.wp_obs,False),
                           'ERA5_GWA':stats(cf_USIm.ERA5_GWA,cf_USIm.wp_obs,False),
                           'MERRA2':stats(cf_USIm.MERRA2,cf_USIm.wp_obs,False),
                           'MERRA2_GWA':stats(cf_USIm.MERRA2_GWA,cf_USIm.wp_obs,False),
                           'obs':[np.nan,np.nan,np.nan,cf_USIm.wp_obs.mean()]},
                          index = ['cor','rmse','mbe','avg']).reset_index().melt(id_vars=['index']).dropna()
    stat_m.columns = ['param','dataset',parks.ANL_name]
    return(stat_m.set_index(['param','dataset']).transpose())

## calculate stats usinas

In [460]:
# hourly statistics for every perfectly matched park: one result row per park is stacked,
# then transposed so that parks become columns
stats_USIh = pd.concat(matches2H[matches2H.score==100].apply(analyseUSIh,axis=1).tolist(),axis=0).transpose()

  avg = a.mean(axis)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [461]:
# write the hourly park statistics to results_path
stats_USIh.to_csv(results_path + '/stats_USIh.csv')

In [462]:
# daily statistics for every perfectly matched park (parks as columns after the transpose)
stats_USId = pd.concat(matches2H[matches2H.score==100].apply(analyseUSId,axis=1).tolist(),axis=0).transpose()

In [463]:
# write the daily park statistics to results_path
stats_USId.to_csv(results_path + '/stats_USId.csv')

In [473]:
# monthly statistics; uses matches2 (not matches2H) as the source of parks
stats_USIm = pd.concat(matches2[matches2.score==100].apply(analyseUSIm,axis=1).tolist(),axis=0).transpose()

In [474]:
# write the monthly park statistics to results_path
stats_USIm.to_csv(results_path + '/stats_USIm.csv')

# States

In [196]:
# insert states in matches dataframes
# (state of a park = state of its first row in ANL; both match tables are extended in place)
matches2['state'] = matches2.ANL_name.map(ANL.groupby('name').state.first()).values
matches2H['state'] = matches2H.ANL_name.map(ANL.groupby('name').state.first()).values
# observed generation
# sum the park columns per state; grouping the transposed frame replaces the deprecated
# groupby(..., axis=1) and gives the same time x state table
wpESTh = wpUSIh.T.groupby(matches2H[matches2H.score==100].state.values).sum().T

In [868]:
# observed monthly generation per state; grouping the transposed frame replaces the
# deprecated groupby(..., axis=1) and gives the same time x state table
wpESTm = wpUSIm.T.groupby(matches2[matches2.score==100].state.values).sum().T

In [355]:
def getusicapdf(usi):
    """Hourly capacity of one wind park, NaN where no production data exist.

    usi: ANEEL name of park

    Returns the series from ``get_cap_df`` for the park's rows in
    ``ANLmatchH``, blanked at every timestamp for which the matching column
    of ``wpUSIh`` has no value.
    """
    park_units = ANLmatchH[ANLmatchH.name==usi]
    # cast with an explicit unit: the unit-less np.datetime64 is rejected by recent pandas
    c = get_cap_df(park_units.cap.values,
                   park_units.commissioning.astype('datetime64[ns]').values).tz_localize('UTC').tz_convert('Etc/GMT-3')
    mH2 = matches2H[matches2H.score==100]
    # observed production of the park (first column if the name selects several)
    p = wpUSIh[(mH2.ONS_name[mH2.ANL_name==usi]).values].iloc[:,0]
    # timestamps that are missing or NaN in p map to NaN
    c[c.index.map(p).isna()] = np.nan
    return c

In [399]:
# OLD
def analyseESTh(state):
    """Hourly comparison of simulated and observed capacity factors for one state.

    NOTE: superseded - a later cell of this notebook defines ``analyseESTh``
    again (using ``.name.unique()`` instead of ``.name.values``), and that
    later definition shadows this one.
    """
    # remove leading and trailing 0s in observed data
    wpobs = wpESTh[state].copy(deep=True)
    wpobs[wpobs.cumsum(axis=0)==0] = np.nan
    wpobs[wpobs[::-1].cumsum(axis=0)[::-1]==0] = np.nan
    # get capacities
    capusish = pd.Series(ANLmatchH[ANLmatchH.state==state].name.values).apply(getusicapdf).transpose()
    capusish.columns = ANLmatchH[ANLmatchH.state==state].name.values
    capESTh = capusish.sum(axis=1)
    # mask for masking simulated data (to only use timespans where also observed data are available)
    mask = capusish.notna()
    mask.columns = ANLmatchH[ANLmatchH.state==state].name.values
    # mask and aggregate simulated data
    wpMER_ESTh = (wpMER[ANLmatchH[ANLmatchH.state==state].name.values]*mask).sum(axis=1)
    wpERA_ESTh = (wpERA[ANLmatchH[ANLmatchH.state==state].name.values]*mask).sum(axis=1)
    wpMERg_ESTh = (wpMERg[ANLmatchH[ANLmatchH.state==state].name.values]*mask).sum(axis=1)
    wpERAg_ESTh = (wpERAg[ANLmatchH[ANLmatchH.state==state].name.values]*mask).sum(axis=1)
    compESTh= pd.DataFrame({'MERRA2':wpMER_ESTh,
                            'ERA5':wpERA_ESTh,
                            'MERRA2_GWA':wpMERg_ESTh,
                            'ERA5_GWA':wpERAg_ESTh,
                            'wp_obs':wpobs*10**6})
    # calculate capacity factors
    cf_ESTh = compESTh.div(capESTh,axis=0).dropna()
    stat_h = pd.DataFrame({'ERA5':stats(cf_ESTh.ERA5,cf_ESTh.wp_obs,False),
                           'ERA5_GWA':stats(cf_ESTh.ERA5_GWA,cf_ESTh.wp_obs,False),
                           'MERRA2':stats(cf_ESTh.MERRA2,cf_ESTh.wp_obs,False),
                           'MERRA2_GWA':stats(cf_ESTh.MERRA2_GWA,cf_ESTh.wp_obs,False),
                           'obs':[np.nan,np.nan,np.nan,cf_ESTh.wp_obs.mean()]},
                          index = ['cor','rmse','mbe','avg']).reset_index().melt(id_vars=['index']).dropna()
    stat_h.columns = ['param','dataset',state]
    return(stat_h.set_index(['param','dataset']).transpose())

In [889]:
def analyseESTh(state):
    """Hourly comparison of simulated and observed capacity factors for one state.

    state: key of the state column in ``wpESTh`` / value of ``ANLmatchH.state``.

    Returns a one-row DataFrame (row label: state) holding the values of
    ``stats`` for every simulated dataset and the mean observed capacity
    factor, under (param, dataset) columns.
    """
    # observed generation without its leading and trailing zeros
    observed = wpESTh[state].copy(deep=True)
    leading = observed.cumsum(axis=0) == 0
    observed[leading] = np.nan
    trailing = observed[::-1].cumsum(axis=0)[::-1] == 0
    observed[trailing] = np.nan

    # capacity per park (NaN where the park has no observations) and state total
    park_names = ANLmatchH[ANLmatchH.state==state].name.unique()
    park_caps = pd.Series(park_names).apply(getusicapdf).transpose()
    park_caps.columns = park_names
    state_cap = park_caps.sum(axis=1)

    # simulated data only count where observations of the park exist
    available = park_caps.notna()
    available.columns = park_names
    sources = (('MERRA2', wpMER), ('ERA5', wpERA), ('MERRA2_GWA', wpMERg), ('ERA5_GWA', wpERAg))
    columns = {label: (frame[park_names]*available).sum(axis=1) for label, frame in sources}
    columns['wp_obs'] = observed*10**6
    comparison = pd.DataFrame(columns)

    # capacity factors and their statistics
    cf = comparison.div(state_cap,axis=0).dropna()
    table = {dataset: stats(cf[dataset],cf.wp_obs,False)
             for dataset in ('ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA')}
    table['obs'] = [np.nan,np.nan,np.nan,cf.wp_obs.mean()]
    stat_h = pd.DataFrame(table, index = ['cor','rmse','mbe','avg'])
    stat_h = stat_h.reset_index().melt(id_vars=['index']).dropna()
    stat_h.columns = ['param','dataset',state]
    return stat_h.set_index(['param','dataset']).transpose()

In [404]:
#OLD
def analyseESTd(state):
    """Daily comparison of simulated and observed capacity factors for one state.

    NOTE: superseded - a later cell of this notebook defines ``analyseESTd``
    again (using ``.name.unique()`` instead of ``.name.values``), and that
    later definition shadows this one.
    """
    # remove leading and trailing 0s in observed data
    wpobs = wpESTh[state].copy(deep=True)
    wpobs[wpobs.cumsum(axis=0)==0] = np.nan
    wpobs[wpobs[::-1].cumsum(axis=0)[::-1]==0] = np.nan
    
    # get capacities
    capusish = pd.Series(ANLmatchH[ANLmatchH.state==state].name.values).apply(getusicapdf).transpose()
    capusish.columns = ANLmatchH[ANLmatchH.state==state].name.values
    capESTh = capusish.sum(axis=1)
    # mask for masking simulated data (to only use timespans where also observed data are available)
    mask = capusish.notna()
    mask.columns = ANLmatchH[ANLmatchH.state==state].name.values
    # mask and aggregate simulated data
    wpMER_ESTh = (wpMER[ANLmatchH[ANLmatchH.state==state].name.values]*mask).sum(axis=1)
    wpERA_ESTh = (wpERA[ANLmatchH[ANLmatchH.state==state].name.values]*mask).sum(axis=1)
    wpMERg_ESTh = (wpMERg[ANLmatchH[ANLmatchH.state==state].name.values]*mask).sum(axis=1)
    wpERAg_ESTh = (wpERAg[ANLmatchH[ANLmatchH.state==state].name.values]*mask).sum(axis=1)
    compESTh= pd.DataFrame({'MERRA2':wpMER_ESTh,
                            'ERA5':wpERA_ESTh,
                            'MERRA2_GWA':wpMERg_ESTh,
                            'ERA5_GWA':wpERAg_ESTh,
                            'wp_obs':wpobs*10**6})
    # generation and capacity side by side, only hours where all values are present
    ccESTh = pd.concat([compESTh,capESTh],axis=1).dropna()
    ccESTh.columns = compESTh.columns.tolist() + ['cap']
    
    # aggregate daily
    ccESTd = ccESTh.resample('D').sum()
    cf_ESTd = ccESTd.drop('cap',axis=1).div(ccESTd.cap,axis=0).dropna()
    stat_d = pd.DataFrame({'ERA5':stats(cf_ESTd.ERA5,cf_ESTd.wp_obs,False),
                           'ERA5_GWA':stats(cf_ESTd.ERA5_GWA,cf_ESTd.wp_obs,False),
                           'MERRA2':stats(cf_ESTd.MERRA2,cf_ESTd.wp_obs,False),
                           'MERRA2_GWA':stats(cf_ESTd.MERRA2_GWA,cf_ESTd.wp_obs,False),
                           'obs':[np.nan,np.nan,np.nan,cf_ESTd.wp_obs.mean()]},
                          index = ['cor','rmse','mbe','avg']).reset_index().melt(id_vars=['index']).dropna()
    stat_d.columns = ['param','dataset',state]
    return(stat_d.set_index(['param','dataset']).transpose())

In [892]:
def analyseESTd(state):
    """Daily comparison of simulated and observed capacity factors for one state.

    state: key of the state column in ``wpESTh`` / value of ``ANLmatchH.state``.

    Hourly generation and capacity are restricted to complete hours, summed
    per day and divided. Returns a one-row DataFrame (row label: state) with
    (param, dataset) columns.
    """
    # observed generation without its leading and trailing zeros
    observed = wpESTh[state].copy(deep=True)
    leading = observed.cumsum(axis=0) == 0
    observed[leading] = np.nan
    trailing = observed[::-1].cumsum(axis=0)[::-1] == 0
    observed[trailing] = np.nan

    # capacity per park (NaN where the park has no observations) and state total
    park_names = ANLmatchH[ANLmatchH.state==state].name.unique()
    park_caps = pd.Series(park_names).apply(getusicapdf).transpose()
    park_caps.columns = park_names
    state_cap = park_caps.sum(axis=1)

    # simulated data only count where observations of the park exist
    available = park_caps.notna()
    available.columns = park_names
    sources = (('MERRA2', wpMER), ('ERA5', wpERA), ('MERRA2_GWA', wpMERg), ('ERA5_GWA', wpERAg))
    columns = {label: (frame[park_names]*available).sum(axis=1) for label, frame in sources}
    columns['wp_obs'] = observed*10**6
    comparison = pd.DataFrame(columns)

    # generation and capacity side by side, complete hours only
    hourly = pd.concat([comparison,state_cap],axis=1).dropna()
    hourly.columns = comparison.columns.tolist() + ['cap']

    # daily sums, then capacity factors
    daily = hourly.resample('D').sum()
    cf = daily.drop('cap',axis=1).div(daily.cap,axis=0).dropna()
    table = {dataset: stats(cf[dataset],cf.wp_obs,False)
             for dataset in ('ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA')}
    table['obs'] = [np.nan,np.nan,np.nan,cf.wp_obs.mean()]
    stat_d = pd.DataFrame(table, index = ['cor','rmse','mbe','avg'])
    stat_d = stat_d.reset_index().melt(id_vars=['index']).dropna()
    stat_d.columns = ['param','dataset',state]
    return stat_d.set_index(['param','dataset']).transpose()

In [837]:
def getusicapdfM(usi):
    """Monthly summed capacity of one wind park, NaN where no production data exist.

    usi: ANEEL name of park

    The hourly capacity from ``get_cap_df`` (rows of ``ANLmatch``) is summed
    per month; months for which the matching column of ``wpUSIm`` has no
    value are set to NaN.
    """
    park_units = ANLmatch[ANLmatch.name==usi]
    # cast with an explicit unit: the unit-less np.datetime64 is rejected by recent pandas
    c = get_cap_df(park_units.cap.values,
                   park_units.commissioning.astype('datetime64[ns]').values).tz_localize('UTC').tz_convert('Etc/GMT-3')
    m2 = matches2[matches2.score==100]
    # observed production of the park (first column if the name selects several)
    p = wpUSIm[(m2.ONS_name[m2.ANL_name==usi]).values].iloc[:,0]
    cm = c.resample('M').sum()
    # capacity and production are matched on a year*100+month key, because their
    # timestamps within a month need not coincide
    cym = cm.index.year*100+cm.index.month
    pym = pd.Series(p.values,index=p.index.year*100+p.index.month)
    cm[cym.map(pym).isna()] = np.nan
    return cm

In [701]:
# OLD
def analyseESTm(state):
    """Monthly comparison of simulated and observed capacity factors for one state.

    NOTE: superseded - a later cell of this notebook defines ``analyseESTm``
    again (resampling the simulated data to months before masking), and that
    later definition shadows this one.
    """
    print(state)
    # remove leading and trailing 0s in observed data
    wpobs = wpESTm[state].copy(deep=True)
    wpobs[wpobs.cumsum(axis=0)==0] = np.nan
    wpobs[wpobs[::-1].cumsum(axis=0)[::-1]==0] = np.nan
    # get capacities
    capusism = pd.Series(ANLmatch[ANLmatch.state==state].name.values).apply(getusicapdfM).transpose()
    capusism.columns = ANLmatch[ANLmatch.state==state].name.values
    capESTm = capusism.sum(axis=1)
    # mask for masking simulated data (to only use timespans where also observed data are available)
    mask = capusism.notna()
    mask.columns = ANLmatch[ANLmatch.state==state].name.values
    # mask and aggregate simulated data
    # NOTE(review): mask is built from the monthly capacities while the simulated frames are
    # not resampled here, so the product aligns on differing indices - the later definition
    # resamples first
    wpMER_ESTh = (wpMER[ANLmatch[ANLmatch.state==state].name.values]*mask).sum(axis=1)
    wpERA_ESTh = (wpERA[ANLmatch[ANLmatch.state==state].name.values]*mask).sum(axis=1)
    wpMERg_ESTh = (wpMERg[ANLmatch[ANLmatch.state==state].name.values]*mask).sum(axis=1)
    wpERAg_ESTh = (wpERAg[ANLmatch[ANLmatch.state==state].name.values]*mask).sum(axis=1)
    compESTh= pd.DataFrame({'MERRA2':wpMER_ESTh,
                            'ERA5':wpERA_ESTh,
                            'MERRA2_GWA':wpMERg_ESTh,
                            'ERA5_GWA':wpERAg_ESTh,
                            'wp_obs':wpobs*10**6})
    # sum up per month
    compESTm = compESTh.resample('M').sum()
    # calculate capacity factors
    cf_ESTm = compESTm.div(capESTm,axis=0).dropna()
    stat_m = pd.DataFrame({'ERA5':stats(cf_ESTm.ERA5,cf_ESTm.wp_obs,False),
                           'ERA5_GWA':stats(cf_ESTm.ERA5_GWA,cf_ESTm.wp_obs,False),
                           'MERRA2':stats(cf_ESTm.MERRA2,cf_ESTm.wp_obs,False),
                           'MERRA2_GWA':stats(cf_ESTm.MERRA2_GWA,cf_ESTm.wp_obs,False),
                           'obs':[np.nan,np.nan,np.nan,cf_ESTm.wp_obs.mean()]},
                          index = ['cor','rmse','mbe','avg']).reset_index().melt(id_vars=['index']).dropna()
    stat_m.columns = ['param','dataset',state]
    return(stat_m.set_index(['param','dataset']).transpose())

In [896]:
def analyseESTm(state):
    """Monthly statistics of simulated vs. observed wind power for one state.

    Monthly energy is divided by the summed capacity that ``getusicapdfM``
    returns for the state's wind parks, and each simulated dataset is compared
    with the observations through ``stats``.

    Parameters
    ----------
    state : str
        Column of ``wpESTm`` to analyse and value of ``ANLmatch.state`` used
        to select the wind parks.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``state``. The columns are a ``(param, dataset)``
        MultiIndex with the params 'cor', 'rmse', 'mbe' and 'avg' (the four
        values returned by ``stats``); the dataset 'obs' only has an 'avg'
        entry, the mean observed capacity factor.
    """
    print(state)
    # remove leading and trailing 0s in observed data
    wpobs = wpESTm[state].copy(deep=True)
    wpobs[wpobs.cumsum(axis=0)==0] = np.nan
    wpobs[wpobs[::-1].cumsum(axis=0)[::-1]==0] = np.nan
    # names of the wind parks of this state (looked up once and reused below)
    usinas = ANLmatch[ANLmatch.state==state].name.unique()
    # get capacities (one column per wind park)
    capusism = pd.Series(usinas).apply(getusicapdfM).transpose()
    capusism.columns = usinas
    capESTm = capusism.sum(axis=1)
    # mask for masking simulated data (only months with a capacity value count)
    mask = capusism.notna()

    def monthly_sum(wp):
        # monthly total over the state's wind parks, masked months contribute 0
        return (wp[usinas].resample('M').sum()*mask).sum(axis=1)

    # NOTE(review): the factor 10**6 presumably converts GWh to kWh - confirm
    compESTm = pd.DataFrame({'MERRA2': monthly_sum(wpMER),
                             'ERA5': monthly_sum(wpERA),
                             'MERRA2_GWA': monthly_sum(wpMERg),
                             'ERA5_GWA': monthly_sum(wpERAg),
                             # min_count=1 keeps months without any valid observation
                             # as NaN; a plain sum() would turn the NaNs set above back
                             # into 0 and the removed zeros would re-enter the statistics
                             'wp_obs': wpobs.resample('M').sum(min_count=1)*10**6})
    # calculate capacity factors, rows with any missing value are dropped
    cf_ESTm = compESTm.div(capESTm,axis=0).dropna()
    stat_cols = {name: stats(cf_ESTm[name],cf_ESTm.wp_obs,False)
                 for name in ['ERA5','ERA5_GWA','MERRA2','MERRA2_GWA']}
    stat_cols['obs'] = [np.nan,np.nan,np.nan,cf_ESTm.wp_obs.mean()]
    stat_m = pd.DataFrame(stat_cols,
                          index = ['cor','rmse','mbe','avg']).reset_index().melt(id_vars=['index']).dropna()
    stat_m.columns = ['param','dataset',state]
    return stat_m.set_index(['param','dataset']).transpose()

In [890]:
# run analyseESTh for every state that has a match with score 100 and
# collect the one-row results into a table with one column per state
stats_ESTh = pd.concat([analyseESTh(state)
                        for state in matches2H[matches2H.score==100].state.unique()],
                       axis=0).transpose()

In [893]:
# run analyseESTd for every state that has a match with score 100 and
# collect the one-row results into a table with one column per state
stats_ESTd = pd.concat([analyseESTd(state)
                        for state in matches2H[matches2H.score==100].state.unique()],
                       axis=0).transpose()

In [895]:
# write stats_ESTh and stats_ESTd as CSV files into results_path
stats_ESTh.to_csv(results_path + '/stats_ESTh.csv')
stats_ESTd.to_csv(results_path + '/stats_ESTd.csv')

In [897]:
# run analyseESTm for every state that has a match with score 100 and
# collect the one-row results into a table with one column per state
stats_ESTm = pd.concat([analyseESTm(state)
                        for state in matches2[matches2.score==100].state.unique()],
                       axis=0).transpose()

PB
RN
BA
SC
RS
PI
SE
CE
MA
PE
RJ
PR


In [898]:
# write stats_ESTm as CSV file into results_path
stats_ESTm.to_csv(results_path + '/stats_ESTm.csv')

# Subsystems

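**Note:** in the hourly data the South subsystem is the same as the state RS, so only the Northeast (NE) is analysed separately at hourly and daily resolution.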

In [236]:
# sum up northeast: row-wise total of the BA, CE and RN columns of wpESTh
# (the `subs` mapping defined in a later cell lists more NE states than are summed here)
wpNEh = wpESTh[['BA','CE','RN']].sum(axis=1)

In [869]:
# states belonging to each subsystem
subs = {
    'NE': ['BA','CE','MA','PB','PE','PI','RN','SE'],
    'S': ['PR','RS','SC'],
    'SE': ['RJ'],
}
# observed generation per subsystem: row-wise sum over the member states
# (only NE and S are aggregated, the SE entry of `subs` is not used here)
wpSUBm = pd.DataFrame({sub: wpESTm[subs.get(sub)].sum(axis=1) for sub in ['NE','S']})

In [422]:
# rename the ONS entry ' EOL ANDORINHAS'; Series.replace with plain strings only
# changes values that are exactly equal to the first argument (no substring matching)
matches2H.ONS_name = matches2H.ONS_name.replace(' EOL ANDORINHAS',' TAÍBA ANDORINHA')

In [244]:
# OLD
def analyseNEh():
    """Earlier hourly NE analysis that selects wind parks through ``name.values``.

    Kept for reference: ``analyseNEh`` is defined again right below and in a
    later cell, and those definitions replace this one when run in order.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'NE' with ``(param, dataset)`` MultiIndex columns.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpNEh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan

    # capacity per wind park (every entry whose state is not RS) and their total
    parks = ANLmatchH[ANLmatchH.state != 'RS'].name.values
    park_caps = pd.Series(parks).apply(getusicapdf).transpose()
    park_caps.columns = parks
    total_cap = park_caps.sum(axis=1)

    # simulated generation only counts where a capacity value exists
    has_cap = park_caps.notna()
    sources = [('MERRA2', wpMER), ('ERA5', wpERA),
               ('MERRA2_GWA', wpMERg), ('ERA5_GWA', wpERAg)]
    series = {label: (wp[parks] * has_cap).sum(axis=1) for label, wp in sources}
    series['wp_obs'] = obs * 10**6
    comp = pd.DataFrame(series)

    # capacity factors, keeping only complete rows
    cf = comp.div(total_cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_h = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_h = stat_h.reset_index().melt(id_vars=['index']).dropna()
    stat_h.columns = ['param', 'dataset', 'NE']
    return stat_h.set_index(['param', 'dataset']).transpose()
# OLDER
def analyseNEh():
    """Oldest hourly NE analysis, built on pre-aggregated simulation series.

    Reads the module-level ``wpMER_NEh``, ``wpERA_NEh``, ``wpMERg_NEh`` and
    ``wpERAg_NEh`` and derives the capacity with ``get_cap_df`` from the BA,
    CE and RN entries of ``ANLmatchH``. A later cell defines ``analyseNEh``
    again, which replaces this version when the cells are run in order.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'NE' with ``(param, dataset)`` MultiIndex columns.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpNEh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan
    comp = pd.DataFrame({'MERRA2': wpMER_NEh,
                         'ERA5': wpERA_NEh,
                         'MERRA2_GWA': wpMERg_NEh,
                         'ERA5_GWA': wpERAg_NEh,
                         'wp_obs': obs * 10**6})

    # capacity of the BA, CE and RN wind parks, moved from UTC to Etc/GMT-3
    in_ne = [state in ['BA','CE','RN'] for state in ANLmatchH.state]
    ne_parks = ANLmatchH[in_ne]
    cap = get_cap_df(ne_parks.cap.values,
                     ne_parks.commissioning.astype(np.datetime64).values)
    cap = cap.tz_localize('UTC').tz_convert('Etc/GMT-3')

    # capacity factors, keeping only complete rows
    cf = comp.div(cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_h = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_h = stat_h.reset_index().melt(id_vars=['index']).dropna()
    stat_h.columns = ['param', 'dataset', 'NE']
    return stat_h.set_index(['param', 'dataset']).transpose()

In [883]:
def analyseNEh():
    """Hourly statistics of simulated vs. observed wind power for the NE aggregate.

    Uses every wind park of ``ANLmatchH`` whose state is not 'RS' together
    with the observed series ``wpNEh``.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'NE'. The columns are a ``(param, dataset)``
        MultiIndex with the params 'cor', 'rmse', 'mbe' and 'avg'; the dataset
        'obs' only has an 'avg' entry.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpNEh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan

    # capacity per wind park (one column each) and their total
    parks = ANLmatchH[ANLmatchH.state != 'RS'].name.unique()
    park_caps = pd.Series(parks).apply(getusicapdf).transpose()
    park_caps.columns = parks
    total_cap = park_caps.sum(axis=1)

    # simulated generation only counts where a capacity value exists
    has_cap = park_caps.notna()
    sources = [('MERRA2', wpMER), ('ERA5', wpERA),
               ('MERRA2_GWA', wpMERg), ('ERA5_GWA', wpERAg)]
    series = {label: (wp[parks] * has_cap).sum(axis=1) for label, wp in sources}
    series['wp_obs'] = obs * 10**6
    comp = pd.DataFrame(series)

    # capacity factors, keeping only complete rows
    cf = comp.div(total_cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_h = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_h = stat_h.reset_index().melt(id_vars=['index']).dropna()
    stat_h.columns = ['param', 'dataset', 'NE']
    return stat_h.set_index(['param', 'dataset']).transpose()

In [303]:
# OLD
def analyseNEd():
    """Earlier daily NE analysis that selects wind parks through ``name.values``.

    Kept for reference: ``analyseNEd`` is defined again right below and in a
    later cell, and those definitions replace this one when run in order.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'NE' with ``(param, dataset)`` MultiIndex columns.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpNEh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan

    # capacity per wind park (every entry whose state is not RS) and their total
    parks = ANLmatchH[ANLmatchH.state != 'RS'].name.values
    park_caps = pd.Series(parks).apply(getusicapdf).transpose()
    park_caps.columns = parks
    total_cap = park_caps.sum(axis=1)

    # simulated generation only counts where a capacity value exists
    has_cap = park_caps.notna()
    sources = [('MERRA2', wpMER), ('ERA5', wpERA),
               ('MERRA2_GWA', wpMERg), ('ERA5_GWA', wpERAg)]
    series = {label: (wp[parks] * has_cap).sum(axis=1) for label, wp in sources}
    series['wp_obs'] = obs * 10**6
    comp_h = pd.DataFrame(series)

    # hourly rows that are complete in every series and in the capacity
    joint_h = pd.concat([comp_h, total_cap], axis=1).dropna()
    joint_h.columns = comp_h.columns.tolist() + ['cap']

    # aggregate daily, then divide summed energy by summed capacity
    joint_d = joint_h.resample('D').sum()
    cf = joint_d.drop('cap', axis=1).div(joint_d.cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_d = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_d = stat_d.reset_index().melt(id_vars=['index']).dropna()
    stat_d.columns = ['param', 'dataset', 'NE']
    return stat_d.set_index(['param', 'dataset']).transpose()
# OLDER
def analyseNEd():
    """Oldest daily NE analysis, built on pre-aggregated simulation series.

    Reads the module-level ``wpMER_NEh``, ``wpERA_NEh``, ``wpMERg_NEh`` and
    ``wpERAg_NEh`` and derives the capacity with ``get_cap_df`` from the BA,
    CE and RN entries of ``ANLmatchH``. A later cell defines ``analyseNEd``
    again, which replaces this version when the cells are run in order.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'NE' with ``(param, dataset)`` MultiIndex columns.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpNEh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan
    comp_h = pd.DataFrame({'MERRA2': wpMER_NEh,
                           'ERA5': wpERA_NEh,
                           'MERRA2_GWA': wpMERg_NEh,
                           'ERA5_GWA': wpERAg_NEh,
                           'wp_obs': obs * 10**6})

    # capacity of the BA, CE and RN wind parks, moved from UTC to Etc/GMT-3
    in_ne = [state in ['BA','CE','RN'] for state in ANLmatchH.state]
    ne_parks = ANLmatchH[in_ne]
    cap = get_cap_df(ne_parks.cap.values,
                     ne_parks.commissioning.astype(np.datetime64).values)
    cap = cap.tz_localize('UTC').tz_convert('Etc/GMT-3')

    # hourly rows that are complete in every series and in the capacity
    joint_h = pd.concat([comp_h, cap], axis=1).dropna()
    joint_h.columns = comp_h.columns.tolist() + ['cap']

    # aggregate daily, then divide summed energy by summed capacity
    joint_d = joint_h.resample('D').sum()
    cf = joint_d.drop('cap', axis=1).div(joint_d.cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_d = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_d = stat_d.reset_index().melt(id_vars=['index']).dropna()
    stat_d.columns = ['param', 'dataset', 'NE']
    return stat_d.set_index(['param', 'dataset']).transpose()

In [879]:
def analyseNEd():
    """Daily statistics of simulated vs. observed wind power for the NE aggregate.

    Works on the hourly inputs (``wpNEh`` and every wind park of
    ``ANLmatchH`` whose state is not 'RS'), drops incomplete hours, sums per
    day and divides the daily energy by the daily sum of capacity.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'NE'. The columns are a ``(param, dataset)``
        MultiIndex with the params 'cor', 'rmse', 'mbe' and 'avg'; the dataset
        'obs' only has an 'avg' entry.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpNEh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan

    # capacity per wind park (one column each) and their total
    parks = ANLmatchH[ANLmatchH.state != 'RS'].name.unique()
    park_caps = pd.Series(parks).apply(getusicapdf).transpose()
    park_caps.columns = parks
    total_cap = park_caps.sum(axis=1)

    # simulated generation only counts where a capacity value exists
    has_cap = park_caps.notna()
    sources = [('MERRA2', wpMER), ('ERA5', wpERA),
               ('MERRA2_GWA', wpMERg), ('ERA5_GWA', wpERAg)]
    series = {label: (wp[parks] * has_cap).sum(axis=1) for label, wp in sources}
    series['wp_obs'] = obs * 10**6
    comp_h = pd.DataFrame(series)

    # hourly rows that are complete in every series and in the capacity
    joint_h = pd.concat([comp_h, total_cap], axis=1).dropna()
    joint_h.columns = comp_h.columns.tolist() + ['cap']

    # aggregate daily, then divide summed energy by summed capacity
    joint_d = joint_h.resample('D').sum()
    cf = joint_d.drop('cap', axis=1).div(joint_d.cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_d = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_d = stat_d.reset_index().melt(id_vars=['index']).dropna()
    stat_d.columns = ['param', 'dataset', 'NE']
    return stat_d.set_index(['param', 'dataset']).transpose()

In [865]:
def analyseSUBm(SUB):
    """Monthly statistics of simulated vs. observed wind power for one subsystem.

    Monthly energy is divided by the summed capacity that ``getusicapdfM``
    returns for the subsystem's wind parks, and each simulated dataset is
    compared with the observations through ``stats``.

    Parameters
    ----------
    SUB : str
        Key of ``subs`` (gives the member states) and column of ``wpSUBm``.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``SUB``. The columns are a ``(param, dataset)``
        MultiIndex with the params 'cor', 'rmse', 'mbe' and 'avg' (the four
        values returned by ``stats``); the dataset 'obs' only has an 'avg'
        entry, the mean observed capacity factor.
    """
    print(SUB)
    # remove leading and trailing 0s in observed data
    wpobs = wpSUBm[SUB].copy(deep=True)
    wpobs[wpobs.cumsum(axis=0)==0] = np.nan
    wpobs[wpobs[::-1].cumsum(axis=0)[::-1]==0] = np.nan
    # get ANEEL data for subsystem
    ANLmatchSUB = ANLmatch[ANLmatch.state.isin(subs.get(SUB))]
    # names of the wind parks of this subsystem (looked up once and reused below)
    usinas = ANLmatchSUB.name.unique()
    # get capacities (one column per wind park)
    capusism = pd.Series(usinas).apply(getusicapdfM).transpose()
    capusism.columns = usinas
    capSUBm = capusism.sum(axis=1)
    # mask for masking simulated data (only months with a capacity value count)
    mask = capusism.notna()

    def monthly_sum(wp):
        # monthly total over the subsystem's wind parks, masked months contribute 0
        return (wp[usinas].resample('M').sum()*mask).sum(axis=1)

    # NOTE(review): the factor 10**6 presumably converts GWh to kWh - confirm
    compSUBm = pd.DataFrame({'MERRA2': monthly_sum(wpMER),
                             'ERA5': monthly_sum(wpERA),
                             'MERRA2_GWA': monthly_sum(wpMERg),
                             'ERA5_GWA': monthly_sum(wpERAg),
                             # min_count=1 keeps months without any valid observation
                             # as NaN; a plain sum() would turn the NaNs set above back
                             # into 0 and the removed zeros would re-enter the statistics
                             'wp_obs': wpobs.resample('M').sum(min_count=1)*10**6})
    # calculate capacity factors, rows with any missing value are dropped
    cf_SUBm = compSUBm.div(capSUBm,axis=0).dropna()
    stat_cols = {name: stats(cf_SUBm[name],cf_SUBm.wp_obs,False)
                 for name in ['ERA5','ERA5_GWA','MERRA2','MERRA2_GWA']}
    stat_cols['obs'] = [np.nan,np.nan,np.nan,cf_SUBm.wp_obs.mean()]
    stat_m = pd.DataFrame(stat_cols,
                          index = ['cor','rmse','mbe','avg']).reset_index().melt(id_vars=['index']).dropna()
    stat_m.columns = ['param','dataset',SUB]
    return stat_m.set_index(['param','dataset']).transpose()

In [884]:
# statistics for the NE aggregate from analyseNEh (takes no arguments, reads module-level data)
stats_NEh = analyseNEh()

In [886]:
# write the NE statistics to the subsystem file stats_SUBh.csv in results_path
stats_NEh.to_csv(results_path + '/stats_SUBh.csv')

In [880]:
# statistics for the NE aggregate from analyseNEd (takes no arguments, reads module-level data)
stats_NEd = analyseNEd()

In [882]:
# write the NE statistics to the subsystem file stats_SUBd.csv in results_path
stats_NEd.to_csv(results_path + '/stats_SUBd.csv')

In [870]:
# run analyseSUBm for the subsystems NE and S and collect the one-row
# results into a table with one column per subsystem
stats_SUBm = pd.concat([analyseSUBm(sub) for sub in ['NE','S']],
                       axis=0).transpose()

NE
S


In [872]:
# write stats_SUBm as CSV file into results_path
stats_SUBm.to_csv(results_path + '/stats_SUBm.csv')

# Brazil

In [248]:
# sum up brazil: row-wise totals over all columns of the per-state frames
# NOTE(review): the current analyseBRAh/analyseBRAd recompute the simulated
# *_BRAh totals as local variables; only the cells marked OLD read the
# module-level versions assigned here. wpBRAh is read by all of them.
wpMER_BRAh = wpMER_ESTh.sum(axis=1)
wpERA_BRAh = wpERA_ESTh.sum(axis=1)
wpMERg_BRAh = wpMERg_ESTh.sum(axis=1)
wpERAg_BRAh = wpERAg_ESTh.sum(axis=1)
wpBRAh = wpESTh.sum(axis=1)

In [901]:
# observed total for Brazil: row-wise sum over all columns of wpESTm
wpBRAm = wpESTm.sum(axis=1)

In [916]:
def analyseBRAh():
    """Hourly statistics of simulated vs. observed wind power for all of Brazil.

    Uses every wind park named in ``ANLmatchH`` together with the observed
    series ``wpBRAh``.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'BRA'. The columns are a ``(param, dataset)``
        MultiIndex with the params 'cor', 'rmse', 'mbe' and 'avg'; the dataset
        'obs' only has an 'avg' entry.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpBRAh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan

    # capacity per wind park (one column each) and their total
    parks = ANLmatchH.name.unique()
    park_caps = pd.Series(parks).apply(getusicapdf).transpose()
    park_caps.columns = parks
    total_cap = park_caps.sum(axis=1)

    # simulated generation only counts where a capacity value exists; the
    # simulation frames are not subset here, the product aligns them on column labels
    has_cap = park_caps.notna()
    sources = [('MERRA2', wpMER), ('ERA5', wpERA),
               ('MERRA2_GWA', wpMERg), ('ERA5_GWA', wpERAg)]
    series = {label: (wp * has_cap).sum(axis=1) for label, wp in sources}
    series['wp_obs'] = obs * 10**6
    comp = pd.DataFrame(series)

    # capacity factors, keeping only complete rows
    cf = comp.div(total_cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_h = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_h = stat_h.reset_index().melt(id_vars=['index']).dropna()
    stat_h.columns = ['param', 'dataset', 'BRA']
    return stat_h.set_index(['param', 'dataset']).transpose()

In [912]:
def analyseBRAd():
    """Daily statistics of simulated vs. observed wind power for all of Brazil.

    Works on the hourly inputs (``wpBRAh`` and every wind park named in
    ``ANLmatchH``), drops incomplete hours, sums per day and divides the daily
    energy by the daily sum of capacity.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'BRA'. The columns are a ``(param, dataset)``
        MultiIndex with the params 'cor', 'rmse', 'mbe' and 'avg'; the dataset
        'obs' only has an 'avg' entry.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpBRAh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan

    # capacity per wind park (one column each) and their total
    parks = ANLmatchH.name.unique()
    park_caps = pd.Series(parks).apply(getusicapdf).transpose()
    park_caps.columns = parks
    total_cap = park_caps.sum(axis=1)

    # simulated generation only counts where a capacity value exists; the
    # simulation frames are not subset here, the product aligns them on column labels
    has_cap = park_caps.notna()
    sources = [('MERRA2', wpMER), ('ERA5', wpERA),
               ('MERRA2_GWA', wpMERg), ('ERA5_GWA', wpERAg)]
    series = {label: (wp * has_cap).sum(axis=1) for label, wp in sources}
    series['wp_obs'] = obs * 10**6
    comp_h = pd.DataFrame(series)

    # hourly rows that are complete in every series and in the capacity
    joint_h = pd.concat([comp_h, total_cap], axis=1).dropna()
    joint_h.columns = comp_h.columns.tolist() + ['cap']

    # aggregate daily, then divide summed energy by summed capacity
    joint_d = joint_h.resample('D').sum()
    cf = joint_d.drop('cap', axis=1).div(joint_d.cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_d = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_d = stat_d.reset_index().melt(id_vars=['index']).dropna()
    stat_d.columns = ['param', 'dataset', 'BRA']
    return stat_d.set_index(['param', 'dataset']).transpose()

In [250]:
# OLD

def analyseBRAh():
    """Earlier hourly Brazil analysis, built on pre-aggregated simulation series.

    Reads the module-level ``wpMER_BRAh``, ``wpERA_BRAh``, ``wpMERg_BRAh`` and
    ``wpERAg_BRAh`` and derives the capacity with ``get_cap_df`` from all
    entries of ``ANLmatchH``. Another cell of this notebook also defines
    ``analyseBRAh``; whichever cell is run last determines the active version.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'BRA' with ``(param, dataset)`` MultiIndex columns.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpBRAh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan
    comp = pd.DataFrame({'MERRA2': wpMER_BRAh,
                         'ERA5': wpERA_BRAh,
                         'MERRA2_GWA': wpMERg_BRAh,
                         'ERA5_GWA': wpERAg_BRAh,
                         'wp_obs': obs * 10**6})

    # capacity of all wind parks, moved from UTC to Etc/GMT-3
    commissioned = ANLmatchH.commissioning.astype(np.datetime64).values
    cap = get_cap_df(ANLmatchH.cap.values, commissioned)
    cap = cap.tz_localize('UTC').tz_convert('Etc/GMT-3')

    # capacity factors, keeping only complete rows
    cf = comp.div(cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_h = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_h = stat_h.reset_index().melt(id_vars=['index']).dropna()
    stat_h.columns = ['param', 'dataset', 'BRA']
    return stat_h.set_index(['param', 'dataset']).transpose()

In [297]:
# OLD

def analyseBRAd():
    """Earlier daily Brazil analysis, built on pre-aggregated simulation series.

    Reads the module-level ``wpMER_BRAh``, ``wpERA_BRAh``, ``wpMERg_BRAh`` and
    ``wpERAg_BRAh`` and derives the capacity with ``get_cap_df`` from all
    entries of ``ANLmatchH``. Another cell of this notebook also defines
    ``analyseBRAd``; whichever cell is run last determines the active version.

    Returns
    -------
    pandas.DataFrame
        One row labelled 'BRA' with ``(param, dataset)`` MultiIndex columns.
    """
    # observed series with leading and trailing zeros blanked out
    obs = wpBRAh.copy(deep=True)
    obs[obs.cumsum(axis=0) == 0] = np.nan
    obs[obs[::-1].cumsum(axis=0)[::-1] == 0] = np.nan
    comp_h = pd.DataFrame({'MERRA2': wpMER_BRAh,
                           'ERA5': wpERA_BRAh,
                           'MERRA2_GWA': wpMERg_BRAh,
                           'ERA5_GWA': wpERAg_BRAh,
                           'wp_obs': obs * 10**6})

    # capacity of all wind parks, moved from UTC to Etc/GMT-3
    commissioned = ANLmatchH.commissioning.astype(np.datetime64).values
    cap = get_cap_df(ANLmatchH.cap.values, commissioned)
    cap = cap.tz_localize('UTC').tz_convert('Etc/GMT-3')

    # hourly rows that are complete in every series and in the capacity
    joint_h = pd.concat([comp_h, cap], axis=1).dropna()
    joint_h.columns = comp_h.columns.tolist() + ['cap']

    # aggregate daily, then divide summed energy by summed capacity
    joint_d = joint_h.resample('D').sum()
    cf = joint_d.drop('cap', axis=1).div(joint_d.cap, axis=0).dropna()
    table = {name: stats(cf[name], cf.wp_obs, False)
             for name in ['ERA5', 'ERA5_GWA', 'MERRA2', 'MERRA2_GWA']}
    table['obs'] = [np.nan, np.nan, np.nan, cf.wp_obs.mean()]
    stat_d = pd.DataFrame(table, index=['cor', 'rmse', 'mbe', 'avg'])
    stat_d = stat_d.reset_index().melt(id_vars=['index']).dropna()
    stat_d.columns = ['param', 'dataset', 'BRA']
    return stat_d.set_index(['param', 'dataset']).transpose()

In [906]:
def analyseBRAm():
    """Monthly statistics of simulated vs. observed wind power for all of Brazil.

    Monthly energy is divided by the summed capacity that ``getusicapdfM``
    returns for every wind park named in ``ANLmatch``, and each simulated
    dataset is compared with the observed series ``wpBRAm`` through ``stats``.

    Returns
    -------
    pandas.DataFrame
        A single row labelled 'BRA'. The columns are a ``(param, dataset)``
        MultiIndex with the params 'cor', 'rmse', 'mbe' and 'avg' (the four
        values returned by ``stats``); the dataset 'obs' only has an 'avg'
        entry, the mean observed capacity factor.
    """
    # remove leading and trailing 0s in observed data
    wpobs = wpBRAm.copy(deep=True)
    wpobs[wpobs.cumsum(axis=0)==0] = np.nan
    wpobs[wpobs[::-1].cumsum(axis=0)[::-1]==0] = np.nan
    # names of all wind parks (looked up once and reused below)
    usinas = ANLmatch.name.unique()
    # get capacities (one column per wind park)
    capusism = pd.Series(usinas).apply(getusicapdfM).transpose()
    capusism.columns = usinas
    capBRAm = capusism.sum(axis=1)
    # mask for masking simulated data (only months with a capacity value count)
    mask = capusism.notna()

    def monthly_sum(wp):
        # monthly total over all wind parks, masked months contribute 0
        return (wp[usinas].resample('M').sum()*mask).sum(axis=1)

    # NOTE(review): the factor 10**6 presumably converts GWh to kWh - confirm
    compBRAm = pd.DataFrame({'MERRA2': monthly_sum(wpMER),
                             'ERA5': monthly_sum(wpERA),
                             'MERRA2_GWA': monthly_sum(wpMERg),
                             'ERA5_GWA': monthly_sum(wpERAg),
                             # min_count=1 keeps months without any valid observation
                             # as NaN; a plain sum() would turn the NaNs set above back
                             # into 0 and the removed zeros would re-enter the statistics
                             'wp_obs': wpobs.resample('M').sum(min_count=1)*10**6})
    # calculate capacity factors, rows with any missing value are dropped
    cf_BRAm = compBRAm.div(capBRAm,axis=0).dropna()
    stat_cols = {name: stats(cf_BRAm[name],cf_BRAm.wp_obs,False)
                 for name in ['ERA5','ERA5_GWA','MERRA2','MERRA2_GWA']}
    stat_cols['obs'] = [np.nan,np.nan,np.nan,cf_BRAm.wp_obs.mean()]
    stat_m = pd.DataFrame(stat_cols,
                          index = ['cor','rmse','mbe','avg']).reset_index().melt(id_vars=['index']).dropna()
    stat_m.columns = ['param','dataset','BRA']
    return stat_m.set_index(['param','dataset']).transpose()

In [917]:
# statistics for Brazil from analyseBRAh (takes no arguments, reads module-level data)
stats_BRAh = analyseBRAh()

In [919]:
# write stats_BRAh as CSV file into results_path
stats_BRAh.to_csv(results_path + '/stats_BRAh.csv')

In [913]:
# statistics for Brazil from analyseBRAd (takes no arguments, reads module-level data)
stats_BRAd = analyseBRAd()

In [915]:
# write stats_BRAd as CSV file into results_path
stats_BRAd.to_csv(results_path + '/stats_BRAd.csv')

In [907]:
# statistics for Brazil from analyseBRAm (takes no arguments, reads module-level data)
stats_BRAm = analyseBRAm()

In [909]:
# write stats_BRAm as CSV file into results_path
stats_BRAm.to_csv(results_path + '/stats_BRAm.csv')