In [None]:
import pandas as pd
import numpy as np
from scipy import stats

## Load Data from Simulation

In [None]:
# Load the data from the SAI method.
# Every file is read the same way: its 'Date' column becomes the row index.
def _read_by_date(csv_path):
    """Read one simulation output file with 'Date' as the index."""
    return pd.read_csv(csv_path, index_col='Date')


# Return files, one per dataset
ret_bench = _read_by_date('ret_bench.csv')
ret_raw = _read_by_date('raw_returns.csv')
ret_extended = _read_by_date('extended_returns.csv')
ret_featuretools = _read_by_date('featuretools_extended_returns.csv')
ret_tsfresh = _read_by_date('tsfresh_extended_returns.csv')
ret_featurewiz = _read_by_date('featurewiz_extended_returns.csv')
ret_pycaret = _read_by_date('pycaret_extended_returns.csv')
ret_optimal = _read_by_date('ret_opt.csv')

# Size files, in the same order as the return files above
siz_bench = _read_by_date('siz_bench.csv')
siz_raw = _read_by_date('raw_sizes.csv')
siz_extended = _read_by_date('extended_sizes.csv')
siz_featuretools = _read_by_date('featuretools_extended_sizes.csv')
siz_tsfresh = _read_by_date('tsfresh_extended_sizes.csv')
siz_featurewiz = _read_by_date('featurewiz_extended_sizes.csv')
siz_pycaret = _read_by_date('pycaret_extended_sizes.csv')
siz_optimal = _read_by_date('siz_opt.csv')

In [None]:
# Create dictionaries to hold all the return and size DataFrames.
# Each row pairs a label with its return frame and its size frame, so the two
# dictionaries below always share the same keys in the same order.
_labelled_frames = [
    ('Benchmark', ret_bench, siz_bench),
    ('Raw', ret_raw, siz_raw),
    ('Extended', ret_extended, siz_extended),
    ('Featuretools', ret_featuretools, siz_featuretools),
    ('TSFresh', ret_tsfresh, siz_tsfresh),
    ('Featurewiz', ret_featurewiz, siz_featurewiz),
    ('PyCaret', ret_pycaret, siz_pycaret),
    ('Optimal', ret_optimal, siz_optimal),
]

return_datasets = {label: ret_frame for label, ret_frame, _ in _labelled_frames}

size_datasets = {label: siz_frame for label, _, siz_frame in _labelled_frames}

## Calculate Portfolio Metrics

In [None]:
# Calculate portfolio metrics for a given set of returns and sizes
def calculate_metrics(returns, sizes, benchmark_returns, periods_per_year=4):
    """Build a table of performance metrics for each column of ``returns``.

    Parameters
    ----------
    returns : pandas.DataFrame
        One column per portfolio, one row per period. The rows are multiplied
        together to compound them, i.e. they are treated as growth factors.
    sizes : pandas.DataFrame
        Portfolio sizes with the same columns as ``returns``; only the column
        means are used.
    benchmark_returns : array-like
        Benchmark series with one value per row of ``returns``. It is
        subtracted by POSITION, not by index label, so it must already be in
        the same row order as ``returns``.
    periods_per_year : int or float, default 4
        Number of rows that make up one year. The default reproduces the
        quarterly scaling this function has always used.

    Returns
    -------
    pandas.DataFrame
        Metric names as the index and the columns of ``returns`` as columns.

    Raises
    ------
    ValueError
        If ``returns`` has fewer than two rows; neither the annualisation
        horizon nor a standard deviation is defined in that case.

    Notes
    -----
    * The Sharpe ratio is annualised return over annualised volatility; no
      risk-free rate is subtracted.
    * Both p-values are one-sided (upper tail) Student-t probabilities with
      ``n - 1`` degrees of freedom, where ``n`` is the number of rows.
    """
    n_obs = len(returns)
    if n_obs < 2:
        raise ValueError(
            'calculate_metrics needs at least two rows of returns, got '
            f'{n_obs}'
        )

    # Define metrics
    metrics = [
        'Annualised Return',
        'STD of Return',
        'Mean # of Equities',
        'Sharpe Ratio',
        'Sharpe Ratio p-value',
        'Tracking Error',
        'Info Ratio',
        'Info Ratio p-value'
    ]

    # Create a DataFrame to store the results
    results = pd.DataFrame(index=metrics, columns=returns.columns)

    # Volatilities scale with the square root of the number of periods per year
    vol_scale = np.sqrt(periods_per_year)

    # Calculate annualised returns. The horizon is (n - 1) periods, as in the
    # original n/4 - 0.25 for quarterly data.
    # NOTE(review): presumably the first row is a starting value - confirm.
    years = (n_obs - 1) / periods_per_year
    total_ret = returns.prod(axis=0) ** (1 / years) - 1
    results.loc['Annualised Return'] = total_ret

    # Calculate annualised standard deviation of returns
    ret_std = returns.std(axis=0) * vol_scale
    results.loc['STD of Return'] = ret_std

    # Calculate mean portfolio size
    mean_size = sizes.mean(axis=0)
    results.loc['Mean # of Equities'] = mean_size

    # Calculate Sharpe ratios (raw annualised return, no risk-free adjustment)
    sharpe_ratios = total_ret / ret_std
    results.loc['Sharpe Ratio'] = sharpe_ratios

    # Calculate tracking error using the benchmark (positional subtraction)
    relative_ret = returns.sub(np.asarray(benchmark_returns), axis=0)
    tracking_error = relative_ret.std(axis=0) * vol_scale
    results.loc['Tracking Error'] = tracking_error

    # Calculate information ratio
    annualized_rel_ret = relative_ret.mean(axis=0) * periods_per_year
    info_ratio = annualized_rel_ret / tracking_error
    results.loc['Info Ratio'] = info_ratio

    # Perform t-tests for Sharpe Ratio and Information Ratio.
    # An annualised ratio equals sqrt(periods_per_year) * mean / std of the
    # per-period series, and the one-sample t statistic is
    # sqrt(n) * mean / std, so t = ratio * sqrt(n / periods_per_year).
    # The ratio must NOT be divided by the volatility a second time: that
    # makes the statistic depend on the units of the returns.
    # For the Sharpe ratio this is the usual large-sample approximation,
    # because its numerator is a compounded rather than arithmetic return.
    t_scale = np.sqrt(n_obs / periods_per_year)
    dof = n_obs - 1

    sharpe_t_stat = (sharpe_ratios * t_scale).to_numpy(dtype=float)
    sharpe_p_values = stats.t.sf(sharpe_t_stat, df=dof)

    info_t_stat = (info_ratio * t_scale).to_numpy(dtype=float)
    info_p_values = stats.t.sf(info_t_stat, df=dof)

    # Add p-values to the results
    results.loc['Sharpe Ratio p-value'] = sharpe_p_values
    results.loc['Info Ratio p-value'] = info_p_values

    return results

In [None]:
# Compile performance metrics for every dataset into one table
def compile_metrics(datasets, sizes, benchmark_returns):
    """Compute metrics for the 'chi' column of every dataset and join them.

    ``datasets`` and ``sizes`` are mappings from a dataset name to a
    DataFrame that has a 'chi' column; every key of ``datasets`` must also be
    present in ``sizes``. The 'chi' columns are renamed to the dataset name
    and handed to ``calculate_metrics`` together with ``benchmark_returns``.

    Returns the per-dataset metric tables concatenated side by side, with
    each table's columns grouped under its dataset name.
    """
    per_dataset = {
        label: calculate_metrics(
            ret_frame['chi'].to_frame(name=label),
            sizes[label]['chi'].to_frame(name=label),
            benchmark_returns,
        )
        for label, ret_frame in datasets.items()
    }

    # Passing a dict keeps the dataset names as the outer column level
    return pd.concat(per_dataset, axis=1)

In [None]:
# Use the benchmark's 'chi' column as the reference series. It is passed to
# compile_metrics, where it feeds the tracking-error and information-ratio rows.
benchmark_returns = ret_bench['chi']

In [None]:
# Run analysis on all datasets: each entry of return_datasets is paired with
# the entry of size_datasets that has the same key.
final_results = compile_metrics(return_datasets, size_datasets, benchmark_returns)

In [None]:
def format_results_table(results):
    """Return a display-ready copy of a metrics table; the input is not modified.

    Formatting rules:

    * every non-null value is rendered with 4 significant figures;
    * the 'Annualised Return' and 'STD of Return' rows are rendered as
      percentages with two decimals;
    * the benchmark's 'Tracking Error', 'Info Ratio' and 'Info Ratio p-value'
      cells are replaced by the string 'NA'.

    Parameters
    ----------
    results : pandas.DataFrame
        Numeric table containing the rows named above and a 'Benchmark'
        column (or column group).

    Returns
    -------
    pandas.DataFrame
        A new frame of formatted strings with the same index and columns.
    """
    def _four_sig_figs(value):
        return f'{value:.4g}' if pd.notnull(value) else value

    def _as_percent(value):
        return f'{value:.2f}%'

    # DataFrame.map was added in pandas 2.1; older versions call it applymap.
    if hasattr(pd.DataFrame, 'map'):
        formatted = results.map(_four_sig_figs)
    else:
        formatted = results.applymap(_four_sig_figs)

    # Convert returns and standard deviations to percentages. These are built
    # from the ORIGINAL numbers, not from the 4-significant-figure strings
    # above: rounding twice can change the last displayed digit
    # (1.23449 would show as 123.40% instead of 123.45%).
    for row in ('Annualised Return', 'STD of Return'):
        formatted.loc[row] = (results.loc[row].astype(float) * 100).map(_as_percent)

    # Format benchmark column: metrics relative to the benchmark are not
    # meaningful for the benchmark itself
    formatted.loc[['Tracking Error', 'Info Ratio', 'Info Ratio p-value'], 'Benchmark'] = 'NA'

    return formatted

In [None]:
# Turn the numeric metrics into display strings for the exported table
formatted_results = format_results_table(final_results)

In [None]:
# Save the formatted table as a CSV file in the current working directory
formatted_results.to_csv('Final_results.csv')