In [1]:
import warnings

# Silence every warning category for the rest of the session. This runs before
# the remaining imports, so warnings emitted while importing are hidden too.
warnings.filterwarnings("ignore")

from common_utils.utils.config import Config
from common_utils.io.data_access.data_access_factory import DataAccessFactory

# from axpo_trading.forecast.forecast_preprocess_iberia import preproces_ufis
from common_utils.utils import utils, utils_io, utils_date
from axpo_trading.forecast import forecast_sql_preprocess_iberia
from axpo_trading.forecast import forecast_preprocess_iberia
import pandas as pd
import numpy as np
import os
import datetime
from numpy import array
import matplotlib.pyplot as plt

# Random seeds: NumPy's global generator, the Keras/TensorFlow helper and
# Python's `random` module are each seeded explicitly below.
from numpy.random import seed

seed(42)
from tensorflow.keras.utils import set_random_seed

set_random_seed(42)
import random as rn

# NOTE(review): `random` is seeded with 1254 while the other seeds use 42 —
# confirm the different value is intentional.
rn.seed(1254)
from keras.metrics import RootMeanSquaredError, MeanAbsoluteError

# Run the notebook from the project directory.
# Alternative (disabled): derive it relative to the current working directory
# wind_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
wind_path = "/home/jovyan/projects/AdvancedAnalytics-UseCase-Wind"
os.chdir(wind_path)

# Environment variables set for this session, applied in one call.
os.environ.update(
    {
        # Configuration / auth directories
        "CONFIG_DIR": "config_files",
        "AUTH_CONFIG_DIR": "auth",
        # Raw storage containers
        "AZURE_STORAGE_ACCOUNT_RAW_CONTAINER_NAME_WIND_RAW": "raw",
        "AZURE_STORAGE_ACCOUNT_RAW_CONTAINER_NAME_WIND_STAGING": "staging",
        # SQL server (secondary). Disabled alternative:
        # 'axso-prod-appl-aa-prod-shared-sql.database.windows.net'
        "AZURE_SQL_SHARED_RAW_SERVER": "axso-prod-appl-aa-prod-shared-sql-secondary.database.windows.net",
        "AZURE_SQL_SHARED_RAW_DATABASE": "axso-prod-appl-aa-prod-shared-raw-sqldb",
        "N_THREADS_SQL": "1",
        # DEV
        "ENV": "azure_iberia_k8s_dev",
        # BLOB DEV
        "AZURE_STORAGE_ACCOUNT_DATA_NAME": "axsonpaadevdslabdls",
        "AZURE_STORAGE_ACCOUNT_RAW_NAME": "axsoprodaaprodshareddls-secondary",
        "AZURE_STORAGE_ACCOUNT_DATA_CONTAINER_NAME_WIND_REFINED": "wind-refined",
        "AZURE_STORAGE_ACCOUNT_DATA_CONTAINER_NAME_WIND_RESULTS": "wind-results",
    }
)

In [6]:
# --- Date windows (strings) -------------------------------------------------
date_from_train = "2021-05-01"
date_to_train = "2022-03-01"
# Alternative training window (disabled):
# date_from_train = "2021-01-01"
# date_to_train = "2021-05-20"

date_from_validate = "2022-03-02"
date_to_validate = "2022-05-01"

date_from_test = "2022-05-01"
date_to_test = "2022-06-01"

# --- Window sizes (presumably input / output sequence lengths — confirm) -----
n_steps_in = 12
n_steps_out = 3

portfolio_level = True

# --- Column groups ------------------------------------------------------------
main_premaster_columns = [
    "datetime_market",
    "datetime",
    "hours_fwd",
    "ufi",
    "telemetry",
    "forecast",
    "metering",
]
info_columns = [
    "telemetry",
    "forecast",
    "metering",
    "forecast_error_metering",
    "forecast_error_telemetry",
]
groupping_columns = ["ufi", "hours_fwd"]

# --- Targets --------------------------------------------------------------------
target_hours_fwd = [1, 2, 3]
# Smaller ufi subsets (disabled):
# target_ufis = ["ZAPATER","PEARBO","ROMERA"]
# target_ufis = ["ABELLA","PAXAMON","SPADRON","PELALIN","TIGUEIR","PEIRIXO","MONTCEO","MONTOUT"]
target_ufis = [
    "ABELLA", "CERROS", "LAMESA", "LACAYA", "VILACHA", "TIGUEIR",
    "ESQUILE", "BRULLES", "PELALIN", "PESLOB", "VISOS", "DEFERII",
    "PECORTI", "LASORDA", "ESCANDO", "BAYO", "HINOJII",
    "PEOCHAO", "CALERA", "CPELAOS", "ELGALLO", "SPADRON", "PAXAMON",
    "TRAPERA", "SABUCED", "PEZARZU", "PESLOA", "ASNEVES", "CAMPANA",
    "PECOUTE", "HINOJAI", "PESLOD", "AXIABRE", "FEIXOS", "OTERO",
    "POTRA", "ZARZUEL", "CERCEDA", "GRAIADE", "PEOUROL", "RODERA",
    "MONTOUT", "ARTEIXO", "ELLLAN", "MONTCEO", "LALOMBA", "CARRACE",
    "PEIRIXO", "ATALAYA", "FRAILA", "DEHESII", "MONTERO", "MONDONE",
    "ROMERA", "ESE", "BANDELE", "SANJOSE", "SERRETA", "DEHEII",
    "AEROGEN", "ZAPATER", "LARUYA", "PESLOC", "PEARBO", "PELALOM",
    "MUDEFER",
]

## Functions

### Load

In [2]:
def pivot_master_by_levels(multiple_line_df):
    """Pivot the long premaster table into one row per ``datetime_market``.

    The columns of the result are three-level tuples
    ``(feature, ufi, hours_fwd)``. Features that do not depend on a level
    carry an empty string ``""`` at that level so every column has the same
    depth:

    * market-level features (``hour_market`` plus the derived ``day_market``
      and ``month_market``) use ``""`` for both ``ufi`` and ``hours_fwd``;
    * ufi-level features (power limits, telemetry statistics, coordinates)
      use ``""`` for ``hours_fwd``;
    * horizon-level features (``forecast``, ``metering``) fill all levels.

    :param multiple_line_df: long-format frame holding ``datetime_market``
        (datetime dtype, since ``.dt`` is used), ``hour_market``, ``ufi``,
        ``hours_fwd`` and the feature columns listed in the body.
    :return: wide frame indexed by ``datetime_market``. The input frame is
        not modified.
    :raises KeyError: if an expected column is missing.
    :raises ValueError: from ``DataFrame.pivot`` if a
        (datetime_market, ufi, hours_fwd) combination appears in more than
        one distinct row.
    """
    # TODO Parametrize levels
    level_names = ["feature", "ufi", "hours_fwd"]

    # Level 1: market dependent columns.
    market_level_columns = ["hour_market"]
    calendar_columns = ["day_market", "month_market"]
    market_time = multiple_line_df["datetime_market"]
    # `assign` builds a new frame, so nothing is written into a slice of the
    # caller's data (the original chained assignment did that).
    reduced_df_lv_1 = (
        multiple_line_df[market_level_columns + ["datetime_market"]]
        .assign(day_market=market_time.dt.day, month_market=market_time.dt.month)
        .drop_duplicates()
        .set_index(["datetime_market"], drop=True)
    )
    # Pad the two missing levels with "" to even the final shapes.
    reduced_df_lv_1.columns = pd.MultiIndex.from_product(
        [market_level_columns + calendar_columns, [""], [""]],
        names=level_names,
    )

    # Level 2: ufi dependent columns.
    ufi_level_columns = [
        "p_max", "p_min", "telemetry", "telemetry_pct_good", "telemetry_open",
        "telemetry_close", "telemetry_min", "telemetry_max", "telemetry_std",
        "telemetry_value_count", "telemetry_slope", "lat", "lon",
    ]  # "codCliente", "up",
    reduced_df_lv_2 = (
        multiple_line_df[ufi_level_columns + ["datetime_market", "ufi"]]
        # Keep one row per (market hour, ufi); the horizon does not matter here.
        .drop_duplicates(subset=["datetime_market", "ufi"])
        .pivot(index=["datetime_market"], columns=["ufi"], values=ufi_level_columns)
    )
    # Pad the missing horizon level with "".
    reduced_df_lv_2.columns = pd.MultiIndex.from_tuples(
        [(feature, ufi, "") for feature, ufi in reduced_df_lv_2.columns],
        names=level_names,
    )

    # Level 3: horizon dependent columns.
    horizon_level_columns = ["forecast", "metering"]  # ,"forecast_error_metering" ,"forecast_error_telemetry"]
    reduced_df_lv_3 = (
        multiple_line_df[horizon_level_columns + ["datetime_market", "ufi", "hours_fwd"]]
        .drop_duplicates()
        .pivot(index=["datetime_market"], columns=["ufi", "hours_fwd"], values=horizon_level_columns)
    )

    pivotted_df = pd.concat(
        [reduced_df_lv_1, pd.concat([reduced_df_lv_2, reduced_df_lv_3], axis=1)],
        axis=1,
    )

    return pivotted_df



def add_forecast_error_pivot(pivot_df, error_reference="telemetry"):
    """Prepend per-ufi ``forecast_error_<error_reference>`` columns to ``pivot_df``.

    For every non-empty ufi found in the ``ufi`` column level the 1-hour-ahead
    forecast (``("forecast", ufi, 1)``) is compared with a reference series:

    * ``error_reference == "telemetry"``: telemetry is shifted by -1 and the
      forecast by +1 before subtracting, and the difference is then shifted by
      +1. Net effect at row ``t``: ``forecast[t - 2] - telemetry[t]``; the
      first two rows are NaN.
    * any other value (e.g. ``"metering"``): ``forecast[t] - reference[t]``
      using ``(error_reference, ufi, 1)``, with no shift.
      NOTE(review): the original comment on this branch announced a 1-hour
      lag, but no shift was ever applied. Behaviour is kept as-is — confirm
      whether the lag is required.

    :param pivot_df: wide frame with column levels named
        ``feature``/``ufi``/``hours_fwd`` (the ``ufi`` level is looked up by
        name).
    :param error_reference: feature name used as ground truth.
    :return: new frame with the error columns (``""`` at the ``hours_fwd``
        level) first, followed by the columns of ``pivot_df``. When the frame
        contains no ufi, a copy of ``pivot_df`` is returned.
    """
    error_col = f"forecast_error_{error_reference}"

    # "" is the filler ufi used by market-level columns; it is not a real ufi.
    ufis_in_df = [ufi for ufi in pivot_df.columns.get_level_values("ufi").unique() if ufi]
    if not ufis_in_df:
        return pivot_df.copy()

    # One independent frame per ufi, concatenated once after the loop.
    per_ufi_errors = []
    for ufi in ufis_in_df:
        forecast_1h = pivot_df["forecast", ufi, 1]

        if error_reference == "telemetry":
            # Telemetry aligned with index hour (it comes with 1 hour lag)
            telemetry_market_t = pivot_df[error_reference, ufi].shift(-1)
            # Forecasted production aligned with the index hour (we take the t+1 forecast)
            forecast_market_t = forecast_1h.shift(1)
            # Lag the forecast error 1 hour so it is available at prediction time
            error = (forecast_market_t - telemetry_market_t).shift(1)
        else:
            # Error with respect to the reference at horizon 1, which is already aligned
            error = forecast_1h - pivot_df[error_reference, ufi, 1]

        per_ufi_errors.append(pd.DataFrame({error_col: error, "ufi": ufi}))

    fcst_error_df = pd.concat(per_ufi_errors).pivot(columns=["ufi"], values=[error_col])

    # Add the missing horizon level so the columns match pivot_df's depth.
    fcst_error_df.columns = pd.MultiIndex.from_tuples(
        [(feature, ufi, "") for feature, ufi in fcst_error_df.columns],
        names=["feature", "ufi", "hours_fwd"],
    )

    return pd.concat([fcst_error_df, pivot_df], axis=1)


def get_master(date_from, date_to, cols_to_keep, horizons, ufis, values_to_pivot, do_pivot=True):
    """Load the eolic premaster and return it filtered and, optionally, pivoted.

    :param date_from: start of the period, forwarded to ``utils_io.load_monthly``.
    :param date_to: end of the period, forwarded to ``utils_io.load_monthly``.
    :param cols_to_keep: the string ``"all"`` to keep every loaded column, or a
        list-like of column names. It must include ``hours_fwd``, ``ufi`` and
        ``metering``, which are used for filtering.
    :param horizons: ``hours_fwd`` values to keep.
    :param ufis: ``ufi`` values to keep.
    :param values_to_pivot: currently unused; kept so existing calls keep working.
    :param do_pivot: when False the filtered long-format frame is returned;
        when True (default) it is pivoted by ``pivot_master_by_levels``, the
        telemetry and metering forecast errors are added and the columns are
        sorted.
    :return: the long frame (``do_pivot=False``) or the wide frame with
        three-level columns sorted on every level.
    """
    # Load premaster data
    config_dict = Config.get_config()
    factory = DataAccessFactory()
    data_config = config_dict["data_access_factory"]
    source = factory.get(data_config["master_overcost"]["source"])

    master = utils_io.load_monthly(
        path="forecast/research/premaster_eolic",
        date_col="date",
        date_from=date_from,
        date_to=date_to,
        data_access=source,
    )

    # Only compare against "all" when a string was given: `==` on an Index or
    # ndarray is element-wise and cannot be used as an `if` condition.
    if isinstance(cols_to_keep, str) and cols_to_keep == "all":
        cols_to_keep = master.columns
    reduced_df = master[cols_to_keep]

    keep_rows = (
        # Only the requested horizons
        reduced_df["hours_fwd"].isin(horizons)
        # Only records for target ufis
        & reduced_df["ufi"].isin(ufis)
        # Drop rows with empty meterings
        & reduced_df["metering"].notna()
    )
    # The forecast errors are not computed here: telemetry is not aligned with
    # the forecast in the long format, so they are derived after pivoting.
    reduced_df = reduced_df[keep_rows].drop_duplicates()

    if not do_pivot:
        return reduced_df

    pivot_df = pivot_master_by_levels(reduced_df)

    # Now we can align the forecasts and telemetry at market time to get the recent forecast error
    pivot_df = add_forecast_error_pivot(pivot_df, error_reference="telemetry")
    pivot_df = add_forecast_error_pivot(pivot_df, error_reference="metering")

    # It's really important to determine the order of the columns since we will
    # be working with their array representation, not the dataframe
    pivot_df = pivot_df.sort_index(axis="columns", level=[0, 1, 2])

    return pivot_df



### Get ufi coords

In [3]:
def get_ufis_location(df_with_locations):
    '''
    Summarise the coordinates of every ufi in a small table.

    :param df_with_locations: wide master table whose columns are a MultiIndex
        with the ufi on level 1 and with ``lat`` / ``lon`` features on level 0

    For each ufi the first non-NaN latitude and longitude are taken.
    When several ufis share exactly the same (lat, lon), their latitudes are
    shifted by 1e-5, 2e-5, ... so they no longer share a location. The
    offsets are handed out in alphabetical ufi order so the result is the
    same on every run.

    :return ufi_coord_df: one row per ufi, sorted by name, with columns
        ``id`` (0..n-1 following that order), ``ufi``, ``lat`` and ``lon``
    :raises IndexError: if a ufi has no non-NaN latitude or longitude
    '''
    # Level 1 holds the ufi; "" is the filler used by columns without a ufi.
    ufi_list = list(filter(None, df_with_locations.columns.get_level_values(1).unique()))

    # Collect plain records and build the frame once
    # (DataFrame.append no longer exists in pandas >= 2.0).
    records = []
    for ufi in ufi_list:
        # Early rows may be NaN, so take the first valid value.
        lat_ufi = df_with_locations["lat"][ufi].dropna().iloc[0]
        lon_ufi = df_with_locations["lon"][ufi].dropna().iloc[0]
        records.append({"ufi": ufi, "lat": lat_ufi, "lon": lon_ufi})
    ufi_coord_df = pd.DataFrame(records, columns=["ufi", "lat", "lon"])

    # Find the groups of ufis sharing a location before modifying the frame.
    shared_locations = [
        (lat, sorted(group["ufi"]))
        for (lat, _lon), group in ufi_coord_df.groupby(["lat", "lon"])
        if len(group) > 1
    ]
    for lat, ufis in shared_locations:
        offset = 1e-5
        for ufi in ufis:
            print(f"Offsetting {ufi}...")
            ufi_coord_df.loc[ufi_coord_df["ufi"] == ufi, "lat"] = lat + offset
            offset = offset + 1e-5

    # The positional index after sorting by name becomes the numeric id.
    ufi_coord_df = ufi_coord_df.sort_values("ufi").reset_index(drop=True).reset_index()
    ufi_coord_df = ufi_coord_df.rename(columns={"index": "id"})

    return ufi_coord_df

### Get grid

In [4]:
def embed_ufis_to_grid(df_coord_summary):
    '''
    Place every ufi on a compact 2D grid built from the coordinates in use.

    :param df_coord_summary: table with the columns id, ufi, lat, lon
                             (in that order), one row per ufi

    Rows of the grid correspond to the sorted distinct latitudes and columns
    to the sorted distinct longitudes, so only coordinates that actually
    occur get a row/column. This reduces the sparsity of the initial map.

    :return grid: 2D float matrix of shape
                  (distinct latitudes, distinct longitudes). Each cell holds
                  the id of the ufi located there, or -1 when there is no
                  ufi in that cell.
    '''
    lat_axis = np.sort(df_coord_summary["lat"].unique())
    lon_axis = np.sort(df_coord_summary["lon"].unique())

    # Start with every cell marked as empty (-1).
    grid = np.full((len(lat_axis), len(lon_axis)), -1.0)

    for ufi_id, _ufi_name, lat, lon in df_coord_summary.to_numpy():
        row = np.flatnonzero(lat_axis == lat)[0]
        col = np.flatnonzero(lon_axis == lon)[0]
        grid[row, col] = ufi_id

    return grid


## Generate grid

In [None]:
# Build the wide master for the full period, then derive the ufi coordinate
# table and the grid from it.
master = get_master(
    "2019-01-01",
    "2022-11-01",
    cols_to_keep="all",
    horizons=target_hours_fwd,
    ufis=target_ufis,
    values_to_pivot=info_columns,
)
ufi_location_summary = get_ufis_location(master)
ufi_grid = embed_ufis_to_grid(ufi_location_summary)

In [16]:
# Number of distinct labels on column level 1 (the ufi level) of the master.
# This count includes the "" placeholder carried by market-level columns.
len(master.columns.get_level_values(1).unique())

67

In [25]:
# Grid dimensions: (distinct latitudes, distinct longitudes), as built by
# embed_ufis_to_grid.
ufi_grid.shape

(66, 47)

## Save grid and location summary

In [28]:
# Persist the per-ufi id/coordinate table as CSV, without the DataFrame index.
ufi_location_summary.to_csv("data/tfm/scenes/ufi_location_summary.csv",index=False)

In [29]:
# Store the grid in NumPy's binary .npy format; np.save opens the path for
# binary writing itself.
np.save('data/tfm/scenes/ufi_grid.npy', ufi_grid)