In [2]:
import pandas as pd
import numpy as np

def elongate(df):
    df_long = pd.wide_to_long(df, i = "PRICES", j = "hour", stubnames=["Hour"], sep = " ").reset_index()
    df_long.rename(columns={"Hour": "price", "PRICES": "date"}, inplace = True)
    df_long['datetime'] = pd.to_datetime(df_long['date']) + pd.to_timedelta(df_long['hour'], unit='h')
    df_long.sort_values(['datetime'], ascending=[True], inplace=True)
    df_long['price'] = df_long['price'].astype(float) / 1000 # Convert price per MWh to price per KWh
    return df_long

train = elongate(pd.read_excel('data/train.xlsx'))
val = elongate(pd.read_excel('data/validate.xlsx'))

In [19]:
### FOURIER TRANSFORM ###

def fourier_top_freq(data, segment_size=72):
    '''
    Applies Fourier transform to segments of the 'price' data and extracts the top 3 frequencies.
    
    Parameters:
    data (DataFrame): Input data.
    segment_size (int): Number of data points in each segment for Fourier transform.
    
    Returns:
    DataFrame: The input data with top 3 Fourier frequencies for each segment.
    '''

    # Create new columns for the top 3 fourier frequencies
    for i in range(3):
        data[f'fourier_freq_{i + 1}'] = np.nan

    # For each range of data points, calculate the Fourier transform
    for i in range(segment_size, len(data), 1): # Start at <segment_size>
        # Fourier transform of the last <segment_size> data points
        segment = data['price'][i - segment_size:i]
        fourier_coeffs = np.fft.fft(segment)
        freqs = np.fft.fftfreq(segment_size, d=1)  # Assuming hourly data, hence d=1

        # Get indices of top 3 frequencies based on magnitude of Fourier coefficients
        indices = np.argsort(np.abs(fourier_coeffs))[::-1][1:4] # ::-1 to sort in descending order

        for j in range(3):
            column_name = f'fourier_freq_{j + 1}'
            data.loc[i, column_name] = freqs[indices[j]]

    return data

train = fourier_top_freq(train, segment_size=72)
val = fourier_top_freq(val)


            date  hour    price            datetime  fourier_freq_1  \
16439 2009-12-31    15  0.03300 2009-12-31 15:00:00        0.041667   
17535 2009-12-31    16  0.02982 2009-12-31 16:00:00       -0.041667   
18631 2009-12-31    17  0.03028 2009-12-31 17:00:00       -0.041667   
19727 2009-12-31    18  0.04000 2009-12-31 18:00:00       -0.041667   
20823 2009-12-31    19  0.04048 2009-12-31 19:00:00       -0.041667   
21919 2009-12-31    20  0.03600 2009-12-31 20:00:00        0.041667   
23015 2009-12-31    21  0.02900 2009-12-31 21:00:00       -0.041667   
24111 2009-12-31    22  0.02482 2009-12-31 22:00:00        0.041667   
25207 2009-12-31    23  0.03128 2009-12-31 23:00:00       -0.041667   
26303 2009-12-31    24  0.03100 2010-01-01 00:00:00       -0.041667   

       fourier_freq_2  fourier_freq_3  
16439       -0.041667        0.013889  
17535        0.041667        0.083333  
18631        0.041667        0.083333  
19727        0.041667        0.083333  
20823        0.041

In [21]:
### GRADIENT FEATURES ###
def gradient_features(data):
    data['gradient'] = np.gradient(data['price'])
    return data

def second_gradient_features(data):
    data['second_gradient'] = np.gradient(data['gradient'])
    return data

train = gradient_features(train)
train = second_gradient_features(train)
val = gradient_features(val)
val = second_gradient_features(val)

print(train.head())

           date  hour    price            datetime  fourier_freq_1  \
0    2007-01-01     1  0.02431 2007-01-01 01:00:00             NaN   
1096 2007-01-01     2  0.02431 2007-01-01 02:00:00       -0.041667   
2192 2007-01-01     3  0.02171 2007-01-01 03:00:00        0.083333   
3288 2007-01-01     4  0.00842 2007-01-01 04:00:00        0.041667   
4384 2007-01-01     5  0.00001 2007-01-01 05:00:00        0.041667   

      fourier_freq_2  fourier_freq_3  gradient  second_gradient  
0                NaN             NaN  0.000000        -0.001300  
1096        0.041667        0.083333 -0.001300        -0.003972  
2192       -0.083333        0.013889 -0.007945        -0.004775  
3288       -0.041667        0.013889 -0.010850         0.001870  
4384       -0.041667        0.013889 -0.004205         0.005428  


In [None]:
### HISTORIC FEATURES ###

def moving_averages(data, window_size=72):
    '''
    Calculates the moving average of the 'price' data for each data point.

    '''
    data['moving_average'] = data['price'].rolling(window=window_size, min_periods=1).mean()
    return data

def moving_std(data, window_size=72):
    data['moving_std'] = data['price'].rolling(window=window_size, min_periods=1).std()
    return data

def moving_min(data, window_size=72):
    data['moving_min'] = data['price'].rolling(window=window_size, min_periods=1).min()
    return data

def moving_max(data, window_size=72):
    data['moving_max'] = data['price'].rolling(window=window_size, min_periods=1).max()
    return data


In [None]:
### DATE FEATURES ###
def date_features(data):
    data['day_of_week'] = data['datetime'].dt.dayofweek
    data['day_of_month'] = data['datetime'].dt.day
    data['month'] = data['datetime'].dt.month
    data['year'] = data['datetime'].dt.year
    data['hour'] = data['datetime'].dt.hour
    data['season'] = (data['month'] % 12 + 3) // 3
    return data

def average_features(data):
    data['average_day'] = data.groupby(['day_of_week', 'hour'])['price'].transform('mean')
    data['average_day_of_month'] = data.groupby(['day_of_month', 'hour'])['price'].transform('mean')
    
    data['average_month'] = data.groupby(['month', 'hour'])['price'].transform('mean')
    data['average_season'] = data.groupby(['season', 'hour'])['price'].transform('mean')
    return data

train = date_features(train)