In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_regression

## Final Changes to Data and Size Reduction

In [None]:
# Load the source universe from CSV, using its 'Date' column as the row index
uni_pure = pd.read_csv('EM_universe.csv', index_col='Date')

In [None]:
# Work on a copy so uni_pure keeps the data exactly as it was loaded
uni = uni_pure.copy()

In [None]:
# Convert the index to datetime and sort the rows by it
uni.index = pd.to_datetime(uni.index)
uni = uni.sort_index()

In [None]:
# Keep only the columns used in this analysis; every other column is dropped.
# 'Excess_Return_adjusted' (last entry) is the column that is renamed to 'Return' in the next cell.
uni = uni[['Name', 'ID', 'Price', 'Total Assets', 'Accounts Payable', 
           'Total Liabilities', 'Net Income', 'EBIT', 'Depreciation', 'Gross Profit', 
           'Inventories', 'Receivables', 'Sales/Turnover', 'Stockholders Equity', 
           'Interest Expense', 'EBITDA', 'Free Cash Flow', 'Financing Cash Flow', 
           'Investing Cash Flow', 'Operating Cash Flow', 'Current Liabilities', 
           'Current Assets', 'Cost of Goods Sold', 'ROA', 'ROE', 'ROI', 'Gross Profit Margin', 
           'Operating Profit Margin', 'Net Profit Margin', 'Current Ratio', 'Quick Ratio', 
           'D/E Ratio', 'Interest Coverage Ratio', 'Asset Turnover Ratio', 
           'Inventory Turnover Ratio', 'Operational Gearing', 'Revenue Growth', 
           'Earnings Growth', 'Asset Growth', 'Equity Growth', 'Accruals Ratio', 
           'Cashflow to Net Income Ratio', 'High Price', 'Low Price', 'Volume', 
           'Shares', 'Equity', 'Market Cap', 'Alpha', 'Mean_Return', 'Volatility', 'Excess_Return_adjusted']]

In [None]:
# Give the target column the short name 'Return' that the rest of the notebook uses
target_rename = {'Excess_Return_adjusted': 'Return'}
uni = uni.rename(columns=target_rename)

In [None]:
# Split into predictors (everything except Name, ID and the target) and the target 'Return'
X = uni.drop(columns=['Name', 'ID','Return'])
y = uni['Return']

In [None]:
# Shrink the dataset by dropping features whose absolute correlation with an
# earlier column exceeds 0.9
correlation_matrix = X.corr().abs()

# Keep only the strictly-upper triangle so every pair is inspected once
# (the diagonal and lower triangle become NaN)
upper_mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
upper_triangle = correlation_matrix.where(upper_mask)

# A column is flagged when any entry above the diagonal in that column is > 0.9
highly_correlated = []
for column in upper_triangle.columns:
    if (upper_triangle[column] > 0.9).any():
        highly_correlated.append(column)

reduced_uni = uni.drop(columns=highly_correlated)

# Report the effect of the reduction (counts are taken on the full frames, so
# they include Name, ID and Return)
print("Original number of features:", uni.shape[1])
print("Reduced number of features:", reduced_uni.shape[1])
print("Dropped features:", highly_correlated)

In [None]:
# Rebuild predictors and target from the correlation-reduced frame
X = reduced_uni.drop(columns=['Name', 'ID','Return'])
y = reduced_uni['Return']

In [None]:
# Select the (up to) 30 most relevant features by mutual information with the target.
# - k is capped at the number of available columns so SelectKBest does not raise
#   when the correlation filter left fewer than 30 predictors.
# - mutual_info_regression is seeded so the selected set is the same on every run.
n_features_to_keep = min(30, X.shape[1])
best_features = SelectKBest(
    score_func=lambda features, target: mutual_info_regression(features, target, random_state=42),
    k=n_features_to_keep,
).fit(X, y)

# Column positions of the selected features, used to slice X
selected_features = best_features.get_support(indices=True)
X_filtered = X.iloc[:, selected_features]

In [None]:
# Put Name/ID back in front of the selected features and the target behind them.
# All three pieces are re-indexed 0..n-1 first so the column-wise concat joins
# them row by row; the original date index is restored afterwards.
date_index = X_filtered.index

metadata, target = (
    reduced_uni[columns].reset_index(drop=True)
    for columns in (['Name', 'ID'], ['Return'])
)
X_filtered = X_filtered.reset_index(drop=True)

# Recombine the dataframes
df_filtered = pd.concat([metadata, X_filtered, target], axis=1).set_axis(date_index, axis=0)

In [None]:
# Keep only companies (by ID) that have at least `min_periods` rows
min_periods = 20
company_counts = df_filtered['ID'].value_counts()
has_enough_rows = (company_counts >= min_periods).to_numpy()
companies_to_keep = company_counts.index[has_enough_rows]
filtered_uni = df_filtered.loc[df_filtered['ID'].isin(companies_to_keep)]

In [None]:
# Move the date index back into a regular column and rebind uni to the filtered data
uni = filtered_uni.reset_index(drop=False)

In [None]:
# Write without the RangeIndex: it carries no information after reset_index, and
# writing it makes an 'Unnamed: 0' row-number column appear when the file is read
# back, which would then be treated as a numeric feature by the frameworks below.
uni.to_csv('Final_FE_Data.csv', index=False)

## Functions to Track Performance

In [None]:
import time
from memory_profiler import memory_usage
import psutil
import threading
import pickle

In [None]:
# Function to sample CPU usage
def get_cpu_usage():
    """Return the value of ``psutil.cpu_percent`` measured with a 1-second interval."""
    sampling_interval_seconds = 1
    return psutil.cpu_percent(interval=sampling_interval_seconds)

In [None]:
# Function to track CPU usage, time and memory
def track_resources(function, *args, **kwargs):
    """Run ``function(*args, **kwargs)`` and measure its wall time, memory change and CPU usage.

    Parameters
    ----------
    function : callable
        The workload to measure.
    *args, **kwargs
        Forwarded unchanged to ``function``.

    Returns
    -------
    tuple
        The elements of ``function``'s result followed by
        ``(time_taken, memory_used, cpu_used)``. Tuple and list results are
        unpacked; any other result (including ``None``) is kept as a single
        leading element.
        ``time_taken`` is in seconds, ``memory_used`` is the difference between
        the ``memory_usage()`` readings after and before the call, and
        ``cpu_used`` is this process's CPU time as a percentage of elapsed time.
    """
    process = psutil.Process()

    # Take the memory/CPU baselines first and start the clock last, so the cost
    # of sampling memory is not charged to the measured function.
    start_mem = memory_usage()[0]
    start_cpu_times = process.cpu_times()
    start_time = time.perf_counter()  # monotonic, unlike time.time()

    result = function(*args, **kwargs)

    elapsed_time = time.perf_counter() - start_time
    end_cpu_times = process.cpu_times()
    end_mem = memory_usage()[0]

    # NOTE(review): only the user/system time of this process is read here;
    # work done in worker processes (e.g. n_jobs > 1) may not be included.
    user_time = end_cpu_times.user - start_cpu_times.user
    system_time = end_cpu_times.system - start_cpu_times.system
    total_cpu_time = user_time + system_time

    # Guard against a zero-length interval for near-instant functions
    avg_cpu_usage = (total_cpu_time / elapsed_time) * 100 if elapsed_time > 0 else 0.0

    print(f"Initial memory usage: {start_mem} MB")
    print(f"Final memory usage: {end_mem} MB")
    print(f"Average CPU usage during execution: {avg_cpu_usage:.2f}%")

    time_taken = elapsed_time
    memory_used = end_mem - start_mem
    cpu_used = avg_cpu_usage

    # Only sequences meant as multiple return values are flattened; a single
    # object (DataFrame, None, ...) must not be iterated element by element.
    if not isinstance(result, (tuple, list)):
        result = (result,)

    return (*result, time_taken, memory_used, cpu_used)

In [None]:
# Function to save outputs
def save_object(obj, filename):
    """Pickle ``obj`` into the file ``filename`` (opened in binary write mode)."""
    with open(filename, 'wb') as handle:
        pickle.Pickler(handle).dump(obj)

In [None]:
# Function to unpickle to csv
def pickle_to_csv(pickle_file_path, csv_file_path):
    """Load the object pickled at ``pickle_file_path`` and write it to ``csv_file_path``.

    The loaded object must provide ``to_csv``; its index is written as well.
    Unpickling can execute arbitrary code, so only use this on files you created.
    """
    with open(pickle_file_path, 'rb') as source:
        frame = pickle.Unpickler(source).load()

    frame.to_csv(csv_file_path, index=True)

In [None]:
# Function to load a pickled object
def load_pickle(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    Unpickling can execute arbitrary code, so only use this on files you created.
    """
    with open(file_path, 'rb') as source:
        return pickle.load(source)

## Functions to Implement Each Framework

Each framework needs to be run in a separate environment (set up in the Anaconda command prompt) to ensure there are no clashes between dependency versions. See the documentation and install information for each package for more details.

In [None]:
# Reload the reduced dataset from 'Final_FE_Data.csv' (written at the end of the previous section)
uni_pure = pd.read_csv('Final_FE_Data.csv')

In [None]:
uni = uni_pure.copy()

In [None]:
# Raw-data view: Date/Name/ID, the target 'Return', and the raw columns listed
# below (no derived metrics or ratios).
# NOTE(review): every column named here must still be present after the
# correlation filter and SelectKBest step; a KeyError means one was dropped.
raw_data_pure = uni[['Date','Name', 'ID', 'Price', 'Total Assets', 'Accounts Payable', 
            'Net Income', 'Depreciation', 'Receivables',
            'Sales/Turnover', 'Free Cash Flow', 'Financing Cash Flow', 
           'Investing Cash Flow', 'Operating Cash Flow', 'Shares', 'Return']]

In [None]:
# Metrics view: Date/Name/ID, the target 'Return', and the metric/ratio columns
# listed below.
# NOTE(review): every column named here must still be present after the
# correlation filter and SelectKBest step; a KeyError means one was dropped.
metrics_data_pure = uni[['Date','Name', 'ID', 'ROA', 'ROE', 'ROI', 
            'Net Profit Margin', 'D/E Ratio', 'Interest Coverage Ratio', 
            'Operational Gearing', 'Revenue Growth', 'Earnings Growth', 
            'Asset Growth', 'Equity Growth', 'Accruals Ratio', 'Cashflow to Net Income Ratio',
            'Volume', 'Market Cap', 'Alpha', 'Mean_Return', 'Volatility', 'Return']]

In [None]:
# Take independent copies so later column assignments do not touch the *_pure frames
raw_data = raw_data_pure.copy()
metrics_data = metrics_data_pure.copy()

### Featuretools

In [None]:
import featuretools as ft
import woodwork as ww

In [None]:
# Parse the 'Date' columns using an explicit format string
date_format = '%Y-%m-%d'
raw_data['Date'] = pd.to_datetime(raw_data['Date'], format=date_format)
metrics_data['Date'] = pd.to_datetime(metrics_data['Date'], format=date_format)

# Build a per-row identifier by joining Name and Date with an underscore
raw_data['unique_id'] = raw_data['Name'] + '_' + raw_data['Date'].astype(str)
metrics_data['unique_id'] = metrics_data['Name'] + '_' + metrics_data['Date'].astype(str)

# Store the current row index as a regular column (done unconditionally)
raw_data['index'] = raw_data.index

# Same for metrics_data; 'metrics_index' is used as its woodwork index below
metrics_data['metrics_index'] = metrics_data.index

# Left-merge against raw_data's ids, then rename the key so metrics_data carries a
# 'raw_unique_id' column (the child side of the relationship added in
# featuretools_engineering)
metrics_data = metrics_data.merge(raw_data[['unique_id']], on='unique_id', how='left')
metrics_data.rename(columns={'unique_id': 'raw_unique_id'}, inplace=True)

In [None]:
# Initialize woodwork schemas on the module-level frames: 'unique_id' /
# 'metrics_index' are the index columns and 'Date' is the time index.
# featuretools_engineering re-initialises schemas on its own copies.
raw_data.ww.init(name='raw_data', index='unique_id', time_index='Date')
metrics_data.ww.init(name='metrics_data', index='metrics_index', time_index='Date')

In [None]:
def featuretools_engineering(raw_data, metrics_data=None):
    """Generate features with Featuretools deep feature synthesis.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain 'unique_id' (used as the woodwork index), 'Date' (time
        index) and 'Return' (the target, excluded from synthesis).
    metrics_data : pandas.DataFrame, optional
        When given, must contain 'metrics_index', 'Date', 'Return' and
        'raw_unique_id'; it is added as a second dataframe related to
        ``raw_data`` through ``raw_unique_id -> unique_id``.

    Returns
    -------
    (feature_matrix, feature_defs)
        The output of ``ft.dfs`` targeting 'raw_data', with the 'Return' column
        re-attached to ``feature_matrix``. The inputs are not modified.
    """
    raw_data_copy = raw_data.copy()
    metrics_data_copy = metrics_data.copy() if metrics_data is not None else None

    # Set the target aside, keyed explicitly by unique_id. The feature matrix is
    # indexed by unique_id, so keying on whatever index the input frame happens
    # to carry would silently produce an all-NaN 'Return' column whenever that
    # index is not already unique_id.
    return_column = pd.Series(
        raw_data_copy['Return'].to_numpy(),
        index=raw_data_copy['unique_id'].to_numpy(),
        name='Return',
    )
    raw_data_copy = raw_data_copy.drop(columns=['Return'])

    if metrics_data_copy is not None:
        metrics_data_copy = metrics_data_copy.drop(columns=['Return'])

    # Initialize woodwork schemas on the copies (copying does not carry them over)
    raw_data_copy.ww.init(name='raw_data', index='unique_id', time_index='Date')
    if metrics_data_copy is not None:
        metrics_data_copy.ww.init(name='metrics_data', index='metrics_index', time_index='Date')

    # Create the entity set
    es = ft.EntitySet(id='company_data')
    es = es.add_dataframe(dataframe_name='raw_data', dataframe=raw_data_copy)

    if metrics_data_copy is not None:
        es = es.add_dataframe(dataframe_name='metrics_data', dataframe=metrics_data_copy)
        # Parent: raw_data.unique_id, child: metrics_data.raw_unique_id
        es = es.add_relationship('raw_data', 'unique_id', 'metrics_data', 'raw_unique_id')

    # Define transformation primitives
    trans_primitives = [
        ft.primitives.Year,
        ft.primitives.Month,
        ft.primitives.Weekday,
        ft.primitives.CumSum,
        ft.primitives.CumMean,
        ft.primitives.CumMin,
        ft.primitives.CumMax,
        ft.primitives.Diff,
        ft.primitives.TimeSince,
    ]

    # Run deep feature synthesis
    feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='raw_data', trans_primitives=trans_primitives)

    # Add back the 'Return' column, looked up by each row's unique_id
    feature_matrix['Return'] = return_column.reindex(feature_matrix.index).to_numpy()

    return feature_matrix, feature_defs

In [None]:
# Run Featuretools on the raw data only. track_resources returns the function's
# two outputs followed by elapsed time, memory change and average CPU usage.
raw_features_ft, raw_feature_defs_ft, raw_ft_time, raw_ft_mem, raw_ft_cpu = track_resources(featuretools_engineering, raw_data)

In [None]:
# Run Featuretools with both dataframes (raw data plus the related metrics data),
# again capturing elapsed time, memory change and average CPU usage.
metrics_features_ft, metrics_feature_defs_ft, metrics_ft_time, metrics_ft_mem, metrics_ft_cpu = track_resources(featuretools_engineering, raw_data, metrics_data)

In [None]:
# Pickle the Featuretools feature matrices and feature definitions
featuretools_outputs = {
    'raw_features_ft.pkl': raw_features_ft,
    'raw_feature_defs_ft.pkl': raw_feature_defs_ft,
    'metrics_features_ft.pkl': metrics_features_ft,
    'metrics_feature_defs_ft.pkl': metrics_feature_defs_ft,
}
for output_filename, output_object in featuretools_outputs.items():
    save_object(output_object, output_filename)

# Pickle the time / memory / CPU measurements of both runs
resource_usage_data = dict(
    raw_ft_time=raw_ft_time,
    raw_ft_mem=raw_ft_mem,
    raw_ft_cpu=raw_ft_cpu,
    metrics_ft_time=metrics_ft_time,
    metrics_ft_mem=metrics_ft_mem,
    metrics_ft_cpu=metrics_ft_cpu,
)
save_object(resource_usage_data, 'resource_usage_data_ft.pkl')

 ### TSFresh

In [None]:
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import EfficientFCParameters

In [None]:
# Function for feature engineering with TSFresh
def tsfresh_engineering(data, n_jobs=8):
    """Extract tsfresh features per (Name, Date) and merge them onto ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date', 'Name' and 'Return'; every column other than
        'Date', 'Name', 'ID' and 'Return' is melted into the value column.
    n_jobs : int, default 8
        Number of worker processes passed to ``extract_features``.

    Returns
    -------
    (final_data, target)
        ``final_data`` is a copy of ``data`` left-joined with the imputed
        extracted features on ['Name', 'Date']; ``target`` is its 'Return'
        column. The input frame is not modified.
    """
    data = data.copy()
    data['Date'] = pd.to_datetime(data['Date'])

    # Every column that is not an identifier, the date or the target is a value variable
    non_value_columns = ('Date', 'Name', 'ID', 'Return')
    value_vars = [column for column in data.columns if column not in non_value_columns]

    # Reshape the data for tsfresh (long format: one row per Date/Name/variable)
    ts_data = data.melt(id_vars=['Date', 'Name'], value_vars=value_vars, var_name='variable', value_name='value')

    # Combine 'Name' and 'Date' to create a unique ID for each company at each date.
    # NOTE(review): each id therefore covers a single date and no column_kind is
    # passed, so the sequence tsfresh sees per id is that row's set of variable
    # values rather than a series over time - confirm this is intended.
    ts_data['id'] = ts_data['Name'] + '_' + ts_data['Date'].astype(str)

    # Extract features using tsfresh
    extracted_features = extract_features(
        ts_data,
        column_id="id",
        column_sort="Date",
        column_value="value",
        default_fc_parameters=EfficientFCParameters(),
        n_jobs=n_jobs
    )

    # Impute missing values in the extracted features (modifies the frame in place)
    impute(extracted_features)

    # Split the id back into Name and Date. Split once from the right: the date
    # suffix never contains '_', whereas a company name may.
    id_parts = extracted_features.index.str.rsplit('_', n=1)
    extracted_features['Name'] = id_parts.str[0]
    extracted_features['Date'] = pd.to_datetime(id_parts.str[1])

    # Merge the extracted features back with the original data
    final_data = pd.merge(data, extracted_features, on=['Name', 'Date'], how='left')

    # Align the target column with the final dataset
    target = final_data['Return']

    return final_data, target

In [None]:
# Run TSFresh on the raw dataset, capturing elapsed time, memory change and average CPU usage
raw_features_tf, raw_feature_target_tf, raw_tf_time, raw_tf_mem, raw_tf_cpu = track_resources(tsfresh_engineering, raw_data)

In [None]:
# Run TSFresh on the extended dataset (the full `uni` frame); results use the `metrics_` prefix
metrics_features_tf, metrics_feature_target_tf, metrics_tf_time, metrics_tf_mem, metrics_tf_cpu = track_resources(tsfresh_engineering, uni)

In [None]:
# Pickle the TSFresh outputs. The second element returned by tsfresh_engineering
# is the target series; it is stored under the '*_feature_defs_tf.pkl' names.
tsfresh_outputs = {
    'raw_features_tf.pkl': raw_features_tf,
    'raw_feature_defs_tf.pkl': raw_feature_target_tf,
    'metrics_features_tf.pkl': metrics_features_tf,
    'metrics_feature_defs_tf.pkl': metrics_feature_target_tf,
}
for output_filename, output_object in tsfresh_outputs.items():
    save_object(output_object, output_filename)

# Pickle the time / memory / CPU measurements of both runs
resource_usage_data = dict(
    raw_tf_time=raw_tf_time,
    raw_tf_mem=raw_tf_mem,
    raw_tf_cpu=raw_tf_cpu,
    metrics_tf_time=metrics_tf_time,
    metrics_tf_mem=metrics_tf_mem,
    metrics_tf_cpu=metrics_tf_cpu,
)
save_object(resource_usage_data, 'resource_usage_data_tf.pkl')

### Featurewiz

In [None]:
from featurewiz import FeatureWiz

In [None]:
def featurewiz_engineering(data):
    """Select/engineer features with FeatureWiz and re-attach the identifier columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date', 'Name', 'ID' and 'Return'. Only float64/int64
        columns (minus 'Return') are passed to FeatureWiz as predictors.

    Returns
    -------
    (X_selected, features)
        ``X_selected`` holds 'Date', 'Name', 'ID', 'Return' followed by the
        columns returned by ``FeatureWiz.fit_transform``, with rows ordered by
        Name then Date; ``features`` is ``fwiz.features``. The input frame is
        not modified.
    """
    # Convert the date on a new frame (the caller's data stays untouched) and
    # sort by company and date to respect the time series order
    data = data.assign(Date=pd.to_datetime(data['Date'])).sort_values(by=['Name', 'Date'])

    # Capture the identifier/target columns AFTER sorting: they are concatenated
    # positionally with the features below, so both must be in the same row order.
    preserved_columns = data[['Date', 'Name', 'ID', 'Return']]

    # Keep only float64/int64 columns as predictors and separate the target
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    X = data[numeric_columns].drop(columns=['Return'])
    y = data['Return']

    # Use FeatureWiz
    fwiz = FeatureWiz(
        feature_engg=['interactions', 'groupby', 'target'],
        corr_limit=0.90,
        verbose=2,
        group_by_columns=['Name'],
        date_columns=['Date'],
        scalers='std'
    )

    # Fit and transform the data using FeatureWiz
    X_selected, y_transformed = fwiz.fit_transform(X, y)

    # Re-attach the preserved columns row by row (both sides re-indexed 0..n-1)
    X_selected = pd.concat([preserved_columns.reset_index(drop=True), X_selected.reset_index(drop=True)], axis=1)

    return X_selected, fwiz.features

In [None]:
# Run Featurewiz on the raw dataset, capturing elapsed time, memory change and average CPU usage
raw_features_fw, raw_feature_names_fw, raw_fw_time, raw_fw_mem, raw_fw_cpu = track_resources(featurewiz_engineering, raw_data)

In [None]:
# Run Featurewiz on the extended dataset (the full `uni` frame); results use the `metrics_` prefix
metrics_features_fw, metrics_feature_names_fw, metrics_fw_time, metrics_fw_mem, metrics_fw_cpu = track_resources(featurewiz_engineering, uni)

In [None]:
# Pickle the Featurewiz feature frames and selected-feature name lists
featurewiz_outputs = {
    'raw_features_fw.pkl': raw_features_fw,
    'raw_feature_names_fw.pkl': raw_feature_names_fw,
    'metrics_features_fw.pkl': metrics_features_fw,
    'metrics_feature_names_fw.pkl': metrics_feature_names_fw,
}
for output_filename, output_object in featurewiz_outputs.items():
    save_object(output_object, output_filename)

# Pickle the time / memory / CPU measurements of both runs
resource_usage_data = dict(
    raw_fw_time=raw_fw_time,
    raw_fw_mem=raw_fw_mem,
    raw_fw_cpu=raw_fw_cpu,
    metrics_fw_time=metrics_fw_time,
    metrics_fw_mem=metrics_fw_mem,
    metrics_fw_cpu=metrics_fw_cpu,
)
save_object(resource_usage_data, 'resource_usage_data_fw.pkl')

### PyCaret

In [None]:
from pycaret.regression import setup, compare_models, pull, get_config
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Function to remove spaces from column names
def clean_column_names(data):
    """Strip every space character from the column names of ``data``.

    The frame is modified in place and the same object is returned.
    """
    stripped_names = list(map(lambda column: column.replace(' ', ''), data.columns))
    data.columns = stripped_names
    return data

In [None]:
# Strip spaces from the column names. clean_column_names edits the frame in place
# and returns it, so raw_data_clean / uni_clean are the same objects as raw_data / uni.
raw_data_clean = clean_column_names(raw_data)
uni_clean =  clean_column_names(uni)

In [None]:
# Feature Engineering with PyCaret ensuring consistent feature naming
def pycaret_engineering(data):
    """Build pairwise interaction features and run them through PyCaret's setup.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date', 'Name', 'ID' and 'Return'; all remaining columns
        are fed to ``PolynomialFeatures`` and therefore need to be numeric.

    Returns
    -------
    (final_dataset, feature_names)
        ``final_dataset`` holds 'Date', 'Name', 'ID', 'Return' followed by the
        feature columns read back from PyCaret, with rows ordered by Date;
        ``feature_names`` is the list of those feature column names.
        The input frame is not modified.
    """
    # Convert the date on a new frame (the caller's data stays untouched) and
    # sort by Date to maintain the time series order
    data = data.assign(Date=pd.to_datetime(data['Date'])).sort_values(by='Date')

    # Columns kept aside from feature generation, re-indexed 0..n-1 so they line
    # up positionally with the generated features
    ignored_columns = data[['Date', 'Name', 'ID', 'Return']].reset_index(drop=True)
    numeric_data = data.drop(columns=['Date', 'Name', 'ID', 'Return'])

    # Generate degree-2 interaction-only polynomial features
    poly = PolynomialFeatures(degree=2, interaction_only=True)
    poly_features = poly.fit_transform(numeric_data)

    # Get feature names for the polynomial features
    poly_feature_names = poly.get_feature_names_out(numeric_data.columns)

    # Convert to DataFrame
    poly_features_df = pd.DataFrame(poly_features, columns=poly_feature_names)

    # Replace any spaces with underscores in feature names after transformation
    poly_features_df.columns = poly_features_df.columns.str.replace(' ', '_')

    # Concatenate polynomial features with the ignored columns
    processed_data = pd.concat([ignored_columns, poly_features_df], axis=1)

    # Set up PyCaret for regression with the new data (the returned experiment
    # object is not needed; results are read back through get_config)
    setup(
        data=processed_data,
        target='Return',
        ignore_features=['Date', 'Name', 'ID'],
        session_id=123,
        feature_selection=True,
        remove_multicollinearity=True,
        group_features={'company_id': 'ID'},
        n_jobs=10,
        train_size=0.99,
        verbose = 2,
        polynomial_features=False
    )

    # Get the training and testing feature frames from PyCaret.
    # NOTE(review): confirm that 'X_train' / 'X_test' are the transformed frames
    # in the installed PyCaret version.
    X_train = get_config('X_train')
    X_test = get_config('X_test')

    # Concatenate the training and testing data and restore the original row order
    processed_features = pd.concat([X_train, X_test], axis=0).sort_index()

    # Reattach the ignored columns
    final_dataset = pd.concat([ignored_columns, processed_features.reset_index(drop=True)], axis=1)

    # Return the feature names as the second element instead of the ambient `_`,
    # which is undefined in plain Python and arbitrary under IPython.
    return final_dataset, processed_features.columns.tolist()

In [None]:
# Run PyCaret on the raw dataset; the second value returned by the function is discarded
raw_features_pc,_, raw_pc_time, raw_pc_mem, raw_pc_cpu = track_resources(pycaret_engineering, raw_data_clean)

In [None]:
# Run PyCaret on the extended dataset; the second value returned by the function is discarded
metrics_features_pc,_, metrics_pc_time, metrics_pc_mem, metrics_pc_cpu = track_resources(pycaret_engineering, uni_clean)

In [None]:
# Pickle the PyCaret feature frames
pycaret_outputs = {
    'raw_features_pc.pkl': raw_features_pc,
    'metrics_features_pc.pkl': metrics_features_pc,
}
for output_filename, output_object in pycaret_outputs.items():
    save_object(output_object, output_filename)

# Pickle the time / memory / CPU measurements of both runs
resource_usage_data = dict(
    raw_pc_time=raw_pc_time,
    raw_pc_mem=raw_pc_mem,
    raw_pc_cpu=raw_pc_cpu,
    metrics_pc_time=metrics_pc_time,
    metrics_pc_mem=metrics_pc_mem,
    metrics_pc_cpu=metrics_pc_cpu,
)
save_object(resource_usage_data, 'resource_usage_data_pc.pkl')

### Performance Comparison

In [None]:
# Load the resource-usage dictionaries pickled by each framework's section
# (ft = Featuretools, tf = TSFresh, fw = Featurewiz, pc = PyCaret)
resource_usage_ft = load_pickle('resource_usage_data_ft.pkl')
resource_usage_tf = load_pickle('resource_usage_data_tf.pkl')
resource_usage_fw = load_pickle('resource_usage_data_fw.pkl')
resource_usage_pc = load_pickle('resource_usage_data_pc.pkl')

In [None]:
# Build the comparison table: one row per framework and dataset variant, in the
# order Featuretools, TSFresh, Featurewiz, PyCaret with Raw before Metrics
usage_by_framework = [
    ('Featuretools', 'ft', resource_usage_ft),
    ('TSFresh', 'tf', resource_usage_tf),
    ('Featurewiz', 'fw', resource_usage_fw),
    ('PyCaret', 'pc', resource_usage_pc),
]
dataset_variants = [('Raw', 'raw'), ('Metrics', 'metrics')]


def _collect_measure(measure_suffix):
    """Return one measurement (e.g. 'time') for every framework/variant, in row order."""
    return [
        usage[f'{variant_key}_{framework_tag}_{measure_suffix}']
        for framework_label, framework_tag, usage in usage_by_framework
        for variant_label, variant_key in dataset_variants
    ]


performance_metrics = {
    'Method': [
        f'{framework_label} {variant_label}'
        for framework_label, framework_tag, usage in usage_by_framework
        for variant_label, variant_key in dataset_variants
    ],
    'Time (s)': _collect_measure('time'),
    'Memory Usage (MB)': _collect_measure('mem'),
    'Average CPU Usage (%)': _collect_measure('cpu'),
}

# Create the DataFrame
performance_df = pd.DataFrame(performance_metrics)

In [None]:
# Write the comparison table to CSV without the row index
performance_df.to_csv('FE_performance_metrics.csv', index=False)

## Accuracy and Interpretability Comparison

In [None]:
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Function to train Random Forest and evaluate metrics
def train_and_evaluate_rf(X, y, feature_set_name, random_state=42):
    """Cross-validate a scaled Random Forest on ``X``/``y`` and report its metrics.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn (numeric columns only).
    y : target values aligned with ``X``.
    feature_set_name : str
        Label used in the printed report.
    random_state : int or None, default 42
        Seed for both the KFold shuffle and the Random Forest, so metrics and
        importances are reproducible across runs. Pass ``None`` for unseeded runs.

    Returns
    -------
    (mae, rmse, r2, feature_importances)
        Metrics computed from 5-fold cross-validated predictions, plus the
        ``feature_importances_`` of a forest refit on the full data.
    """
    # Define the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('rf', RandomForestRegressor(n_jobs=-1, random_state=random_state))
    ])

    # Define 5-fold cross-validation.
    # NOTE(review): shuffle=True mixes rows from all dates across folds; confirm
    # this is acceptable for the date-indexed data evaluated with it.
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Out-of-fold predictions for every row
    y_pred = cross_val_predict(pipeline, X, y, cv=kf)

    # Calculate metrics
    mae = mean_absolute_error(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)

    # Print metrics
    print(f"Results for {feature_set_name}:")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"R-squared (R2): {r2:.4f}")

    # Train the model on the full dataset to get feature importances
    # (cross_val_predict fits clones, so the pipeline itself is still unfitted here)
    pipeline.fit(X, y)
    feature_importances = pipeline.named_steps['rf'].feature_importances_

    return mae, rmse, r2, feature_importances

In [None]:
# Convert every pickled feature set to a CSV file with the same base name
pickle_files = [
    'raw_features_ft.pkl',
    'metrics_features_ft.pkl',
    'raw_features_tf.pkl',
    'metrics_features_tf.pkl',
    'raw_features_fw.pkl',
    'metrics_features_fw.pkl',
    'raw_features_pc.pkl',
    'metrics_features_pc.pkl'
]

csv_files = []
for pickle_file in pickle_files:
    csv_file = pickle_file.replace('.pkl', '.csv')
    csv_files.append(csv_file)
    pickle_to_csv(pickle_file, csv_file)

In [None]:
# Load the baseline datasets and every engineered feature set, keyed by a short name.
# NOTE(review): 'raw_data.csv' and 'uni.csv' are not written by any cell in this
# notebook; they must already exist on disk - confirm how they are produced.
datasets = {
    'raw': pd.read_csv('raw_data.csv', index_col='Date'),
    'extended': pd.read_csv('uni.csv', index_col='Date'),
    'featuretools_raw': pd.read_csv('raw_features_ft.csv'),
    'featuretools_extended': pd.read_csv('metrics_features_ft.csv'),
    'tsfresh_raw': pd.read_csv('raw_features_tf.csv'),
    'tsfresh_extended': pd.read_csv('metrics_features_tf.csv'),
    'featurewiz_raw': pd.read_csv('raw_features_fw.csv'),
    'featurewiz_extended': pd.read_csv('metrics_features_fw.csv'),
    'pycaret_raw': pd.read_csv('raw_features_pc.csv'),
    'pycaret_extended': pd.read_csv('metrics_features_pc.csv')
}

In [None]:
# Clean the baseline datasets: remove the 'Unnamed: 0' column from each
raw = datasets['raw'].drop(columns=['Unnamed: 0'])
extended = datasets['extended'].drop(columns=['Unnamed: 0'])

datasets.update(raw=raw, extended=extended)

In [None]:
# Clean Featuretools datasets
def _restore_name_and_date(feature_matrix):
    """Replace 'unique_id' ("<Name>_<Date>") with 'Name' and a datetime 'Date' index.

    The id is split once from the right, so company names that themselves
    contain '_' are kept intact. Returns a new frame; the input is not modified.
    """
    id_parts = feature_matrix['unique_id'].str.rsplit('_', n=1, expand=True)
    restored = feature_matrix.drop(columns=['unique_id'])
    restored['Name'] = id_parts[0]
    restored['Date'] = pd.to_datetime(id_parts[1])
    return restored.set_index('Date')


ft_raw = _restore_name_and_date(datasets['featuretools_raw'])
ft_ext = _restore_name_and_date(datasets['featuretools_extended'])

datasets['featuretools_raw'] = ft_raw
datasets['featuretools_extended'] = ft_ext

In [None]:
# Clean TSFresh datasets: remove the 'Unnamed: 0' column and index the rows by Date
tf_raw = datasets['tsfresh_raw'].drop(columns=['Unnamed: 0']).set_index(['Date'])
tf_ext = datasets['tsfresh_extended'].drop(columns=['Unnamed: 0']).set_index(['Date'])

datasets.update(tsfresh_raw=tf_raw, tsfresh_extended=tf_ext)

In [None]:
# Clean Featurewiz datasets: index the rows by Date and remove the 'Unnamed: 0' column
fw_raw = datasets['featurewiz_raw'].set_index(['Date']).drop(columns=['Unnamed: 0'])
fw_ext = datasets['featurewiz_extended'].set_index(['Date']).drop(columns=['Unnamed: 0'])

datasets.update(featurewiz_raw=fw_raw, featurewiz_extended=fw_ext)

In [None]:
# Clean PyCaret datasets: index the rows by Date and remove the 'Unnamed: 0' and '1' columns
pycaret_columns_to_drop = ['Unnamed: 0', '1']
pc_raw = datasets['pycaret_raw'].set_index(['Date']).drop(columns=pycaret_columns_to_drop)
pc_ext = datasets['pycaret_extended'].set_index(['Date']).drop(columns=pycaret_columns_to_drop)

datasets.update(pycaret_raw=pc_raw, pycaret_extended=pc_ext)

In [None]:
# Function to drop all-zero columns and columns containing NaN or +-inf
def clean_dataset(df):
    """Return ``df`` restricted to the columns that are usable as-is.

    A column is kept only if it has at least one value different from 0,
    contains no NaN, and contains no +inf / -inf. The checks apply to every
    column, numeric or not; the input frame is left unchanged.
    """
    has_nonzero_value = df.ne(0).any(axis=0)
    has_missing_value = df.isna().any(axis=0)
    has_infinite_value = df.isin([np.inf, -np.inf]).any(axis=0)

    keep_column = has_nonzero_value & ~has_missing_value & ~has_infinite_value
    return df.loc[:, keep_column]

In [None]:
# Apply clean_dataset to every loaded dataset, keeping the same keys
cleaned_datasets = {name: clean_dataset(df) for name, df in datasets.items()}

In [None]:
# Initialize an empty results table: one row per metric, one column per dataset
# (columns are added by the evaluation loop)
metrics_index = [
    'MAE', 
    'RMSE', 
    'R2', 
    'Gini Feature Importances'
]
results_df = pd.DataFrame(index=metrics_index)

In [None]:
# Loop through the datasets and evaluate each one
for name, data in cleaned_datasets.items():
    # Separate features and target; only numeric columns are fed to the model
    X = data.drop(columns=['Return']).select_dtypes(include=[np.number])
    y = data['Return']

    # Train and evaluate the model
    mae, rmse, r2, feature_importances = train_and_evaluate_rf(X, y, name)

    # Store the importances as text in the same "[a b c]" layout that str(ndarray)
    # produces, but with summarisation disabled: by default numpy abbreviates
    # arrays of more than 1000 elements with "...", which would make the CSV
    # written below impossible to parse back into one value per feature.
    importances_text = np.array2string(
        np.asarray(feature_importances),
        threshold=np.size(feature_importances) + 1,
    )

    # Populate the results DataFrame
    results_df[name] = [
        mae,
        rmse,
        r2,
        importances_text
    ]

    # Print progress for each dataset
    print(f"Completed evaluation for: {name}")

# Save the results DataFrame to a CSV file
results_df.to_csv('rf_evaluation_summary.csv')

In [None]:
# Names of the cleaned datasets to write to disk
datasets_to_save = ['raw', 'extended', 'featuretools_raw', 'tsfresh_raw', 
                    'featurewiz_raw', 'pycaret_raw', 'tsfresh_extended', 
                    'featuretools_extended', 'featurewiz_extended', 'pycaret_extended']

# Write each one to 'cleaned_<name>.csv' (index included)
for name in datasets_to_save:
    cleaned_datasets[name].to_csv(f'cleaned_{name}.csv')

## Evaluate Importance Scores

In [None]:
# Reload the evaluation summary from CSV, using the metric names (first column) as the index;
# the feature importances come back as text
results_df = pd.read_csv('rf_evaluation_summary.csv',index_col=0)

In [None]:
# Collect the top 10 features (by Gini importance) for each dataset
TOP_N_FEATURES = 10
top_feature_frames = []

# Loop through the datasets
for name in results_df.columns:
    # Extract the Gini importances string
    gini_importances_str = results_df.loc['Gini Feature Importances', name]

    # Clean up the string ("[a b\n c]" -> "a b  c")
    gini_importances_str = gini_importances_str.replace('[', '').replace(']', '').replace('\n', ' ')

    # An abbreviated array ("a b ... y z") has lost most of its values and cannot be recovered
    if '...' in gini_importances_str:
        print(f"Gini importances were truncated when saved for dataset: {name}")
        continue

    gini_importances_list = [float(x) for x in gini_importances_str.split()]

    # Get the feature names from the corresponding dataset, using the same
    # numeric-column filter that was applied when the model was trained
    feature_names = (
        cleaned_datasets[name]
        .drop(columns=['Return'])
        .select_dtypes(include=[np.number])
        .columns.tolist()
    )

    # Ensure that the length of Gini importances matches the number of features
    if len(gini_importances_list) != len(feature_names):
        print(f"Length mismatch between Gini importances and features in dataset: {name}")
        continue

    # Create a DataFrame for the current dataset's features and Gini importances
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Gini Importance': gini_importances_list
    })

    # Keep the top features and tag them with the dataset name
    top_feature_frames.append(
        feature_importance_df.nlargest(TOP_N_FEATURES, 'Gini Importance').assign(Dataset=name)
    )

# Concatenate once at the end (an empty table if every dataset was skipped)
if top_feature_frames:
    top_features_df = pd.concat(top_feature_frames, ignore_index=True)
else:
    top_features_df = pd.DataFrame(columns=['Dataset', 'Feature', 'Gini Importance'])

# Reorder the columns
top_features_df = top_features_df[['Dataset', 'Feature', 'Gini Importance']]

In [None]:
# Write the per-dataset top-feature table to CSV (row index included)
top_features_df.to_csv('feature_importance.csv')