In [69]:
import pandas as pd
import numpy as np

def elongate(df):
    df_long = pd.wide_to_long(df, i = "PRICES", j = "hour", stubnames=["Hour"], sep = " ").reset_index()
    df_long.rename(columns={"Hour": "price", "PRICES": "date"}, inplace = True)
    df_long['datetime'] = pd.to_datetime(df_long['date']) + pd.to_timedelta(df_long['hour'], unit='h')
    df_long.sort_values(['datetime'], ascending=[True], inplace=True)
    df_long['price'] = df_long['price'].astype(float) / 1000 # Convert price per MWh to price per KWh
    return df_long

# delete date:time column
train = elongate(pd.read_excel('data/train.xlsx'))
val = elongate(pd.read_excel('data/validate.xlsx'))


In [70]:
# clean up datetime column
del train['datetime']
del val['datetime']

# We should fix the loc of the train data to 0,1,2 etc
print(train.head())
print(train.loc[1, 'price'])
train.reset_index(inplace=True)
print(train.head())
print(train.loc[1, 'price'])

           date  hour    price
0    2007-01-01     1  0.02431
1096 2007-01-01     2  0.02431
2192 2007-01-01     3  0.02171
3288 2007-01-01     4  0.00842
4384 2007-01-01     5  0.00001
0.01601
   index       date  hour    price
0      0 2007-01-01     1  0.02431
1   1096 2007-01-01     2  0.02431
2   2192 2007-01-01     3  0.02171
3   3288 2007-01-01     4  0.00842
4   4384 2007-01-01     5  0.00001
0.02431


In [73]:
### GRADIENT FEATURES ### Q: Is trend a better name?
def gradient_features(data, num_prev_points=1):
    for i in range(len(data) - 1):
        if i == 0:
            data.loc[i, f'gradient_{num_prev_points}'] = 0
        else:
            gradient_sum = 0
            for j in range(num_prev_points):
                location_point_a = max(i - j, 0)
                location_point_b = max(i - j - 1, 1)
                point_a = data.loc[location_point_a, 'price']
                point_b = data.loc[location_point_b, 'price']
                gradient_sum += point_a - point_b

            data.loc[i, f'gradient_{num_prev_points}'] = gradient_sum 
    
    return data


def second_gradient_features(data, num_prev_points=1):
    # check if gradient_1 column exists
    if 'gradient_1' not in data.columns:
        data = gradient_features(data, num_prev_points=1)
    
    for i in range(len(data) - 1):
        if i == 0:
            data.loc[i, f'second_gradient_{num_prev_points}'] = 0
        else:
            second_gradient_sum = 0

            for j in range(num_prev_points): # for amount of num_prev_points compare to the previous point
                location_point_a = max(i - j, 0)
                location_point_b = max(i - j - 1, 1)
                point_a = data.loc[location_point_a, 'gradient_1']
                point_b = data.loc[location_point_b, 'gradient_1']
                second_gradient_sum += point_a - point_b
            
            data.loc[i, f'second_gradient_{num_prev_points}'] = second_gradient_sum    
    
    return data



# # TEST
# test = [1, 2, 4, 7, 6, 5]

# # make test dataframe
# test = pd.DataFrame(test, columns=['price'])


# test = gradient_features(test, num_prev_points=1)
# test = second_gradient_features(test, num_prev_points=1)

# print(test.head())
train = gradient_features(train, num_prev_points=2)
train = second_gradient_features(train, num_prev_points=2)
print(train.head())

   index       date  hour    price  gradient_2  gradient_1  second_gradient_1  \
0      0 2007-01-01     1  0.02431     0.00000     0.00000            0.00000   
1   1096 2007-01-01     2  0.02431     0.00000     0.00000            0.00000   
2   2192 2007-01-01     3  0.02171    -0.00260    -0.00260           -0.00260   
3   3288 2007-01-01     4  0.00842    -0.01589    -0.01329           -0.01069   
4   4384 2007-01-01     5  0.00001    -0.02170    -0.00841            0.00488   

   second_gradient_2  
0            0.00000  
1            0.00000  
2           -0.00260  
3           -0.01329  
4           -0.00581  


In [20]:
### FOURIER TRANSFORM ###

def fourier_top_freq(data, segment_size=72):
    '''
    Applies Fourier transform to segments of the 'price' data and extracts the top 3 frequencies.
    
    Parameters:
    data (DataFrame): Input data.
    segment_size (int): Number of data points in each segment for Fourier transform.
    
    Returns:
    DataFrame: The input data with top 3 Fourier frequencies for each segment.
    '''

    # Create new columns for the top 3 fourier frequencies
    for i in range(3):
        data[f'fourier_freq_{i + 1}'] = np.nan

    # For each range of data points, calculate the Fourier transform
    for i in range(segment_size, len(data), 1): # Start at <segment_size>
        # Fourier transform of the last <segment_size> data points
        segment = data['price'][i - segment_size:i]
        fourier_coeffs = np.fft.fft(segment)
        freqs = np.fft.fftfreq(segment_size, d=1)  # Assuming hourly data, hence d=1

        # Get indices of top 3 frequencies based on magnitude of Fourier coefficients
        indices = np.argsort(np.abs(fourier_coeffs))[::-1][1:4] # ::-1 to sort in descending order

        for j in range(3):
            column_name = f'fourier_freq_{j + 1}'
            data.loc[i, column_name] = freqs[indices[j]]

    return data

train = fourier_top_freq(train, segment_size=72)
val = fourier_top_freq(val)

print(train.head(5))


           date  hour    price  gradient_2  gradient_1  fourier_freq_1  \
0    2007-01-01     1  0.02431     0.00000     0.00000             NaN   
1096 2007-01-01     2  0.02431     0.00668     0.00668       -0.041667   
2192 2007-01-01     3  0.02171     0.00956     0.00956        0.083333   
3288 2007-01-01     4  0.00842    -0.00179    -0.00179        0.041667   
4384 2007-01-01     5  0.00001    -0.00888    -0.00888        0.041667   

      fourier_freq_2  fourier_freq_3  
0                NaN             NaN  
1096        0.041667        0.083333  
2192       -0.083333        0.013889  
3288       -0.041667        0.013889  
4384       -0.041667        0.013889  


In [None]:
### HISTORIC FEATURES ###

def moving_averages(data, window_size=72):
    '''
    Calculates the moving average of the 'price' data for each data point.

    '''
    data['moving_average'] = data['price'].rolling(window=window_size, min_periods=1).mean()
    return data

def moving_std(data, window_size=72):
    data['moving_std'] = data['price'].rolling(window=window_size, min_periods=1).std()
    return data

def moving_min(data, window_size=72):
    data['moving_min'] = data['price'].rolling(window=window_size, min_periods=1).min()
    return data

def moving_max(data, window_size=72):
    data['moving_max'] = data['price'].rolling(window=window_size, min_periods=1).max()
    return data


In [None]:
### DATE FEATURES ###
def date_features(data):
    data['day_of_week'] = data['datetime'].dt.dayofweek
    data['day_of_month'] = data['datetime'].dt.day
    data['month'] = data['datetime'].dt.month
    data['year'] = data['datetime'].dt.year
    data['hour'] = data['datetime'].dt.hour
    data['season'] = (data['month'] % 12 + 3) // 3
    return data

def average_features(data):
    # Lets reconsider to not use this it feels like it would not translate well to the tests set
    data['average_day'] = data.groupby(['day_of_week', 'hour'])['price'].transform('mean')
    data['average_day_of_month'] = data.groupby(['day_of_month', 'hour'])['price'].transform('mean')
    
    data['average_month'] = data.groupby(['month', 'hour'])['price'].transform('mean')
    data['average_season'] = data.groupby(['season', 'hour'])['price'].transform('mean')
    return data

train = date_features(train)