In [1]:
import warnings
from math import floor
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from extract_features import *      
from extract_sample_features import *    

In [2]:
def read_data(filename, targetcol = 'VALUE', datecol = 'DATE', sep = ';', 
              decimal = ',', date_format = "%d/%m/%Y %H:%M:%S.%f", freq = '1T'):
    
    if filename[-3:] == 'csv' or filename[-3:] == 'txt':
        file_ = pd.read_table(filename, sep = sep, decimal = decimal, parse_dates = False)
        file_parsed = parsedates(file_, date_format = date_format, freq = freq, datecol = datecol)
        
    elif filename[-3:] == 'xls' or filename[-3:] == 'lsx':
        file_ = pd.read_excel(filename, parse_dates = False)
        file_parsed = parsedates(file_, date_format = date_format, freq = freq,datecol = datecol)
        
    else:
        raise NotImplementedError("This file type is not supported" )
    
    return(file_parsed[targetcol])

def parsedates(file_, date_format, freq, datecol):
    freq = pd.to_timedelta(freq)
    datetime_i = pd.to_datetime(file_[datecol], format = date_format)
    print(type(datetime_i.iloc[-1]))
    if freq < (datetime_i.iloc[1] - datetime_i.iloc[0]):
        raise ValueError('Expected frequency is smaller than dataset information')
        
    file_.index = datetime_i
    desired_index = pd.date_range(start = datetime_i.iloc[0], end = datetime_i.iloc[-1], freq = freq)
    
    file_int = file_.reindex(file_.index.union(desired_index)).interpolate(method = 'time').reindex(desired_index)
    
    return(file_int)

In [3]:
def ts_split(data, test_size=50):
    ts_train, ts_test = train_test_split(data, shuffle=False)
    print('Test size: ', len(ts_test), '  Input size: ', len(data))
    return(ts_train, ts_test)


In [4]:
def lag_creation(X, lags=3, step=1, dropna = False):
    X = X.iloc[:,0]
    print(type(X))
    if type(X) is pd.DataFrame:
        new_dict = {}
        for col_name in X:                                                          
            for l in range(1,self.lag+1, self.step):                                
                new_dict['%s_lag%d' %(col_name,l)]=X[col_name].shift(l)            
        res=pd.DataFrame(new_dict,index=X.index)  

    elif type(X) is pd.Series:                                                      ## si es serie
        the_range=range(0,lags+1, step)                                    ## de 0 a numero de lags
        res=pd.concat([X.shift(i) for i in the_range],axis=1)                       ## concatenamos los lags a cada fila del nuevo dataframe 
        res.columns=['lag_%d' %i for i in the_range]
    if dropna:                                                                 ## si hay que quitar na
        res = res.dropna()                                                          ## quitamos na del principio del df
        res = res[res.columns[::-1]]                                                ## res = todo menos la ultima columna 
        return res
    else:
        res = res[res.columns[::-1]]                                                ## res = todo menos la ultima columna
        return res

------

In [5]:
points = 100                        ## tamaño del test para entrenar el regresor
window_length = 20                  ## tamaño de ventana para la prediccion de ventana
rolling_window = [5,10,20]          ## Cuantas caracteristicas debe tener la rolling window para el calculo de features rollo moving average
horizon = 1                         ## horizonte a predecir en cada recursion
step = 1                            ## paso que se mueve la ventana para cada recursión.
freq = '5s'                         ## Frecuencia de la serie de entrada o de prediccion

In [6]:
freq = '20T'
filename = 'Seat.csv'

df = read_data(filename='Seat.csv',
                 freq='20T',
                 targetcol='INSTALACIONES [kWh]',
                 datecol='MSJO_DATUM',
                 sep=',',
                 decimal='.',
                 date_format="%d/%m/%Y %H:%M")
df.head()
df.tail()

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


2018-11-13 22:30:00    276.036780
2018-11-13 22:50:00    275.003310
2018-11-13 23:10:00    261.639657
2018-11-13 23:30:00    245.255448
2018-11-13 23:50:00    242.053699
Freq: 20T, Name: INSTALACIONES [kWh], dtype: float64

In [7]:
def create_sample_features(X, window_length = 5, features = [], rolling_window = [2]):      ## Creacion de features para los datos de entrada (se llama solo una vez)
    lags_X = lags_sample(X, window_length=window_length, step=1)                            ## Creacion de los lags (min t-1, -2 ....)
    df = lags_X.copy()                                                                      ## Copia a otro dataframe for some reason
    for f in features:                                                                      ## Features contiene la lista de features que queremos añadir al dataframe con lags.b
        if rolling_window:                                                                  ## si se ha especificado una rolling window
            for rol in rolling_window:   
                aux = switch_sample_features(f, X, lags_X, rol)                             ## Variable auxiliar para crear la feature (se borra a cada iteracion) con los lags (la fila vaya)
                df = pd.concat([df, aux], axis = 1).dropna()                                ## concatenamos el valor al dataframe
        else:
            aux = switch_sample_features(f, X, lags_X, [])                                  ## opcion sin rolling windows
            df = pd.concat([df, aux], axis = 1).dropna()                                    ## añadir a dataframe
    return df

In [8]:
def switch_sample_features(value, X, lags_X, rolling_window):                               ## Switch con el que elegir las caracteristicas del sample.
    return { 
        'mean': lambda : mean_sample(lags_X, rolling_window),
        'std': lambda : std_sample(lags_X, rolling_window),
        'max': lambda : max_sample(lags_X, rolling_window),
        'min': lambda : min_sample(lags_X, rolling_window),
        'quantile': lambda : quantile_sample(lags_X, rolling_window),
        'iqr': lambda : iqr_sample(lags_X, rolling_window),
        'entropy': lambda : entropy_sample(lags_X, rolling_window),
        'trimmean': lambda : trimmean_sample(lags_X, rolling_window),
        'variation': lambda : variation_sample(lags_X, rolling_window),
        'hmean': lambda : hmean_sample(lags_X, rolling_window),
        'gmean': lambda : gmean_sample(lags_X, rolling_window),
        'mad': lambda : mad_sample(lags_X, rolling_window),
#         'gstd': lambda : gstd_sample(lags_X, rolling_window),
        'tvar': lambda : tvar_sample(lags_X, rolling_window),
        'kurtosis': lambda : kurtosis_sample(lags_X, rolling_window),
        'sem': lambda : sem_sample(lags_X, rolling_window),
        'wav': lambda : wav_sample(lags_X, rolling_window),
        #-----------------------------------------
        'minute': lambda : minute_sample(X),
        'hour': lambda : hour_sample(X),
        'dayofweek': lambda : dayofweek_sample(X),
        'day': lambda : day_sample(X),
        'month': lambda : month_sample(X),
        'quarter': lambda : quarter_sample(X),
        'weekofyear': lambda : weekofyear_sample(X),
        'weekend': lambda : weekend_sample(X)
        
    }.get(value)()

In [9]:
def create_horizon(df, horizon):
    y = df.copy()
    ht = HorizonTransformer(horizon = horizon+1)
    y_horizon = ht.fit_transform(y.iloc[:,0]) #Los horizon ultimos valores son nan ya que no se puede crear un horizonte completo para ellos
    y_horizon = pd.DataFrame(y_horizon[:-horizon, :], index = y.index[horizon:])
    name = 't+'+str(horizon)
    y_horizon = pd.DataFrame({name : y_horizon.iloc[:, -1]})
    
    return y_horizon

In [10]:
def feature_selection(X, y, num_features):                                ## Elegimos componente principales entrenando un lgbmr y extrayendo los best_features
    clf = lgb.LGBMRegressor(n_estimators=40).fit(X, y)                    ## Fit modelo con poquitos estimators (40)
    best_indx_col = clf.feature_importances_.argsort()[-num_features:]    ## elegimos el num_features que nos queramos quedar
    best_features = list(X_train.columns[best_indx_col])                  ## Creamos lista para indexar el dataframe
    return order_by_other_list(X_train.columns, best_features)  

In [11]:
def order_by_other_list(list_order, list_tobe_order):                     ## Orden de la lista y lista a ordenar
    d = {k:v for v,k in enumerate(list_order)}                            ## creas un dict key:value for enumerate tal
    list_tobe_order.sort(key=d.get)                                       ## le das un sort con key = d.get (pillamos la key del dict)
    return list_tobe_order

In [12]:
def switch_features(value, lags_data, date, rolling_window, num):         ## Switch igual que antes, se añade el parametro num, que significa?
    return {
        'lag': lambda : lag_feature(lags_data, num),
        #--------------------------------------------------------------
        'mean': lambda : mean_feature(lags_data, rolling_window, num),
        'std': lambda : std_feature(lags_data, rolling_window, num),
        'max': lambda : max_feature(lags_data, rolling_window, num),
        'min': lambda : min_feature(lags_data, rolling_window, num),
        'quantile': lambda : quantile_feature(lags_data, rolling_window, num),
        'iqr': lambda : iqr_feature(lags_data, rolling_window, num),
        'entropy': lambda : entropy_feature(lags_data, rolling_window, num),
        'trimmean': lambda : trimmean_feature(lags_data, rolling_window, num),
        'variation': lambda : variation_feature(lags_data, rolling_window, num),
        'hmean': lambda : hmean_feature(lags_data, rolling_window, num),
        'gmean': lambda : gmean_feature(lags_data, rolling_window, num),
        'mad': lambda : mad_feature(lags_data, rolling_window, num),
#         'gstd': lambda : gstd_feature(lags_data, rolling_window, num),
        'tvar': lambda : tvar_feature(lags_data, rolling_window, num),
        'kurtosis': lambda : kurtosis_feature(lags_data, rolling_window, num),
        'sem': lambda : sem_feature(lags_data, rolling_window, num),
        'wav': lambda : wav_feature(lags_data, rolling_window, num),
        #--------------------------------------------------------------
        'minute': lambda : minute_feature(date, num),
        'hour': lambda : hour_feature(date, num),
        'dayofweek': lambda : dayofweek_feature(date, num),
        'day': lambda : day_feature(date, num),
        'month': lambda : month_feature(date, num),
        'quarter': lambda : quarter_feature(date, num),
        'weekofyear': lambda : weekofyear_feature(date, num),
        'weekend': lambda : weekend_feature(date, num)
    }.get(value)()

In [13]:
def create_features(feature_names, lags_data, date):                       ## Creacion de features para las ventanas segun avanzan (se llama varias veces)
    features = []                                                          ## Lista vacía de features para inicializar
    for f in feature_names:                                                ## Por cada feature que nos pidan añadir
        ftype, info = f.split('_',1)                                       ## Separamos el str a la altura de la barrabaja para separar en tipo de feature (p. ej minute) y numero (p ej 60)
        if '_' in info:                                                    ## Si nos queda otra barrabaja es que es dato calculado con una rolling window
            rolling_window, num = info.split('_')                          ## separamos en rolling window y numero
            num = int(num)                                                 ## str a int
            rolling_window = int(rolling_window)                           ## str a int
        else:
            num = int(info)                                                ## tambien puede ser que no se de el caso de incluir el ftype en el nombre, asi que seguimos adelante simplemente sin rolling window
            rolling_window = None
        features.append(switch_features(ftype, lags_data, date, rolling_window, num))
                                
    return features

In [14]:
def switch_optimization(value, regressor):                                  ## Switch para elegir optimizaciones del regresor
    return {
        'tpe': lambda : tpe_optimization(),                                 ## TPE Tree-structured Parzen optimization (Secuencial)
        'pso': lambda : pso_optimization(),                                 ## PSO Particle Swarm Optimization (Enjambre)
    }.get(value)()

In [15]:
features = ['mean', 'std', 'max', 'min', 'quantile', 'iqr', 'entropy', 'trimmean', 'variation', 'hmean', 'gmean', 'mad', 'tvar',
            'kurtosis', 'sem', 'minute', 'hour', 'dayofweek', 'day', 'month', 'quarter', 'weekofyear', 'weekend']

In [16]:
 def recursive_forecast(y, model, window_length, feature_names, rolling_window, n_steps): ## modelo debe estar pre-entrenado
    """
    Parameters
    ----------
    y: pd.Series holding the input time-series to forecast
    model: pre-trained machine learning model
    lags: list of lags used for training the model
    n_steps: number of time periods in the forecasting horizon
    step: forecasting time period
   
    Returns
    -------
    fcast_values: pd.Series with forecasted values
    """
   
    # get the dates to forecast
    last_date = y.index[-1] + datetime.timedelta(minutes=15)                     ## ultima fecha, le añadimos un delay de 15 mins porque?¿ usamos 15 minutos de validacion o test 
    target_range = pd.date_range(last_date, periods=n_steps, freq=freq)          ## creamos el target a partir del ultimo dia. 
    target_value = np.arange(n_steps, dtype = float)                             ## Si no guardamos como dtype float nos guarda como enteros
    max_rol = max(rolling_window, default=1)                                     ## Maximo del array del rolling window
    lags = list(y.iloc[-(window_length+(max_rol-1)):,0].values)                  ## creacion de lags en forma de lista. Indexando el valor de las ys (solo hay ys porque es una serie temporal duh)
    ####
    
    
    for i in range(n_steps):                                                     ## Para cada uno de los steps
        train = create_features(feature_names, lags, target_range[i])            ## 
        new_value = model.predict(pd.DataFrame(train).transpose())               ## Calculo del nuevo valor
        target_value[i] = new_value[0]                                           ## Guardamos el valor en el target
        lags.pop(0)                                                              ## Quitamos el primer valor del lag
        lags.append(new_value[0])                                                ## añadimos la prediccion al final del lag
                                                                                 ## Y volvemos a gestionar con el nuevo lag
           
    return target_value

In [17]:
y_train, y_test = train_test_split(df, test_size=points, shuffle=False)      ## Split en train y test sin shuffle por ser time series. Test_size se da como points
print(y_train.shape[0], y_test.shape[0])  

53131 100


In [18]:
X_train = create_sample_features(y_train, window_length=window_length, features=features, rolling_window=rolling_window)

IndexingError: Too many indexers

In [None]:
selected_feat = 40
%time best_features = feature_selection(X_train, y_horizon.values.ravel(), selected_feat)

X_train_selec = X_train.loc[:, best_features]
X_train_selec.head()