In [None]:
import pandas as pd
import os
import numpy as np
import pickle

In [None]:
def aggregate_log_return(df, bucket):
    """Compound 1-minute returns into coarser, non-overlapping buckets.

    Despite the name, the arithmetic treats the value columns as *simple*
    returns: each bucket's value is ``prod(1 + r) - 1`` over the rows in it.
    Callers holding log returns must convert them first.

    Buckets are formed purely by row count (``n`` consecutive rows per
    bucket, starting at the first row), not by inspecting timestamps, so the
    input is expected to be one row per minute with no gaps. Trailing rows
    that do not fill a complete bucket are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date' and 'time' columns; every other column is treated
        as a numeric return series.
    bucket : str
        One of the keys of the bucket table below ('1min', '5min', ..., '30d').

    Returns
    -------
    pandas.DataFrame
        Columns 'date', 'time', then the return columns in their original
        order. Each row carries the date/time of the *last* input row in its
        bucket. For '1min' a copy of ``df`` is returned unchanged. If ``df``
        has fewer rows than one bucket, the result has zero rows.

    Raises
    ------
    ValueError
        If ``bucket`` is not a recognised key.
    """
    # Rows (minutes) per bucket. A "day" is 60 * 6.5 = 390 rows, so 'Nd'
    # means N sessions of 390 rows each, not calendar days.
    bucket_dict = {
        '1min': 1,
        '5min': 5,
        '10min': 10,
        '30min': 30,
        '1h': 60,
        '3h': 60*3,
        '5h': 60*5,
        '1d': 60*6.5,
        '2d': 60*6.5*2,
        '7d': 60*6.5*7,
        '14d': 60*6.5*14,
        '21d': 60*6.5*21,
        '30d': 60*6.5*30
    }

    if bucket not in bucket_dict:
        # Build the list from the table so the message can never drift from
        # the buckets that are actually supported.
        valid = ", ".join(f"'{name}'" for name in bucket_dict)
        raise ValueError(f"Invalid bucket. Please choose from {valid}.")

    # Nothing to aggregate; return a copy so callers that edit the result in
    # place do not modify their input frame.
    if bucket == '1min':
        return df.copy()

    n = int(bucket_dict[bucket])

    dates = df['date'].to_numpy()
    times = df['time'].to_numpy()

    returns = df.drop(columns=['date', 'time'])
    value_columns = returns.columns

    # `+ 1` allocates a fresh array of growth factors; this avoids an in-place
    # update of the frame's underlying buffer, which can be a (read-only) view.
    growth = returns.to_numpy() + 1

    new_row_count = len(growth) // n
    num_columns = growth.shape[1]

    # Explicit column count (instead of -1) keeps the reshape valid when
    # new_row_count is 0.
    bucketed = growth[:new_row_count * n].reshape(new_row_count, n, num_columns)
    aggregated = bucketed.prod(axis=1) - 1

    # Position of the last row of every bucket: n-1, 2n-1, ...
    last_row = (np.arange(new_row_count) + 1) * n - 1

    aggregated_df = pd.DataFrame(aggregated, columns=value_columns)
    aggregated_df.insert(0, 'date', dates[last_row])
    aggregated_df.insert(1, 'time', times[last_row])

    return aggregated_df


In [None]:
# Load the 1-minute log returns. Every column from the third onward is
# treated as a return series (the first two are assumed to be 'date' and
# 'time', which later cells access by name).
df = pd.read_csv('../0_data_preprocessing/log_returns_1min_252.csv')

# Convert log returns to simple returns, exp(r) - 1. expm1 computes this
# directly and avoids the cancellation error of exp(r) - 1 when r is close
# to zero.
df.iloc[:, 2:] = np.expm1(df.iloc[:, 2:])

# Run configuration used to build file names and choose the aggregation bucket.
bucket = '1d'
lookback_window = 10
method = 'levy_area'

In [None]:
# File locations derived from the run configuration.
result_dir = './'
file_path = os.path.join(result_dir, f'returns_bucket{bucket}_lookback_window{lookback_window}_{method}.pkl')
leaders_followers_file = os.path.join(result_dir, f'leaders_followers_bucket{bucket}_lookback_window{lookback_window}_{method}.pkl')

# Aggregate to the configured bucket, then index by a combined timestamp.
# Built as a non-mutating chain: the frame returned by aggregate_log_return
# may be `df` itself, and editing it in place would strip 'date'/'time' from
# `df` and make this cell fail on a second run.
aggregated = aggregate_log_return(df, bucket)
data = (
    aggregated
    .assign(datetime=pd.to_datetime(aggregated['date'] + ' ' + aggregated['time']))
    .set_index('datetime')
    .drop(columns=['date', 'time'])
)

In [None]:
# Load the leaders/followers result for the configured bucket, lookback window
# and method. `leaders_followers_file` is built from those settings, so this
# stays in sync when the configuration changes (for the current settings it is
# the same './leaders_followers_bucket1d_lookback_window10_levy_area.pkl').
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
dic = pd.read_pickle(leaders_followers_file)

In [None]:
# One mean per entry of dic['t']: dic['t'][i] picks a row of `data` by
# position, that row is then indexed with dic['leaders'][i], and the selected
# values are averaged.
average_returns = [
    data.iloc[dic['t'][step]][dic['leaders'][step]].mean()
    for step in range(len(dic['t']))
]

In [None]:
# Persist the list of per-step averages. The file name is derived from
# `lookback_window` instead of a hard-coded "window10", so a changed
# configuration does not overwrite results from a different window
# (for lookback_window = 10 the name is unchanged).
output_path = f'leaders_average_returns_over_t_window{lookback_window}.pkl'
with open(output_path, 'wb') as file:
    pickle.dump(average_returns, file)