In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
import gc

## Load Feature Sets

In [None]:
# Cleaned feature sets, keyed by '<library>_<variant>'. Every CSV is read with
# its 'Date' column as the index. The insertion order below matters: the
# plotting code pairs these entries with `dataset_names` by position.
_cleaned_sources = (
    ('featuretools_raw', 'cleaned_featuretools_raw.csv'),
    ('featuretools_extended', 'cleaned_featuretools_extended.csv'),
    ('tsfresh_raw', 'cleaned_tsfresh_raw.csv'),
    ('tsfresh_extended', 'cleaned_tsfresh_extended.csv'),
    ('featurewiz_raw', 'cleaned_featurewiz_raw.csv'),
    ('featurewiz_extended', 'cleaned_featurewiz_extended.csv'),
    ('pycaret_raw', 'cleaned_pycaret_raw.csv'),
    ('pycaret_extended', 'cleaned_pycaret_extended.csv'),
)
datasets = {
    key: pd.read_csv(path, index_col='Date')
    for key, path in _cleaned_sources
}

In [None]:
# Human-readable plot labels, one per entry of `datasets` and listed in the
# same order: pca_plot looks them up by position (dataset_names[i]) while
# enumerating the dataset dict, so the two must stay in sync.
dataset_names = [
    "Featuretools Basic", "Featuretools Extended", 
    "TSFresh Basic", "TSFresh Extended",
    "Featurewiz Basic", "Featurewiz Extended",
    "PyCaret Basic", "PyCaret Extended"
]

In [None]:
def drop_cum_columns(df):
    """Return a copy of `df` without any column whose label contains 'CUM'.

    The match is a case-sensitive substring test on each column label; the
    input frame is not modified.
    """
    cumulative_labels = []
    for label in df.columns:
        if 'CUM' in label:
            cumulative_labels.append(label)

    return df.drop(columns=cumulative_labels)

In [None]:
# Clean the Featuretools extended set: first strip every 'CUM' column, then
# remove the explicitly listed MODE/MONTH/WEEKDAY/YEAR columns, and store the
# result back under the same key.
_ft_columns_to_remove = [
    'MODE(metrics_data.ID)',
    'MODE(metrics_data.Name)',
    'MONTH(Date)',
    'MODE(metrics_data.MONTH(Date))',
    'WEEKDAY(Date)',
    'YEAR(Date)',
    'MODE(metrics_data.WEEKDAY(Date))',
    'MODE(metrics_data.YEAR(Date))',
]
ft_ext = drop_cum_columns(datasets['featuretools_extended'])
ft_ext2 = ft_ext.drop(columns=_ft_columns_to_remove)
datasets['featuretools_extended'] = ft_ext2

In [None]:
# Clean the TSFresh extended set by dropping an explicit list of tsfresh
# features, then store the result back under the same key.
# NOTE: drop() is called without errors='ignore', so re-running this cell after
# it has already replaced datasets['tsfresh_extended'] raises KeyError.
tf_ext = datasets['tsfresh_extended']

tf_ext2 = tf_ext.drop(columns=['value__has_duplicate_min', 'value__sum_of_reoccurring_values', 
                              'value__sum_of_reoccurring_data_points', 'value__value_count__value_1', 
                              'value__value_count__value_-1','value__has_duplicate', 'value__longest_strike_below_mean', 
                              'value__count_above_mean', 'value__count_below_mean', 'value__last_location_of_maximum', 'value__first_location_of_maximum', 
                              'value__symmetry_looking__r_0.05', 'value__large_standard_deviation__r_0.2', 'value__number_cwt_peaks__n_1', 'value__number_peaks__n_5', 
                              'value__index_mass_quantile__q_0.2', 'value__index_mass_quantile__q_0.3', 'value__index_mass_quantile__q_0.4', 'value__index_mass_quantile__q_0.6', 
                              'value__index_mass_quantile__q_0.7', 'value__index_mass_quantile__q_0.8','value__index_mass_quantile__q_0.1', 'value__index_mass_quantile__q_0.9',
                              'value__last_location_of_minimum', 'value__first_location_of_minimum', 'value__number_cwt_peaks__n_5', 'value__number_peaks__n_1', 'value__number_peaks__n_3', 
                               'value__change_quantiles__f_agg_"var"__isabs_False__qh_0.2__ql_0.0', 'value__change_quantiles__f_agg_"var"__isabs_True__qh_0.2__ql_0.0', 
                               'value__change_quantiles__f_agg_"var"__isabs_False__qh_0.4__ql_0.2', 'value__change_quantiles__f_agg_"var"__isabs_True__qh_0.4__ql_0.2', 
                               'value__change_quantiles__f_agg_"var"__isabs_False__qh_0.6__ql_0.4', 'value__change_quantiles__f_agg_"mean"__isabs_True__qh_0.6__ql_0.4', 
                               'value__change_quantiles__f_agg_"var"__isabs_True__qh_0.6__ql_0.4', 'value__change_quantiles__f_agg_"var"__isabs_False__qh_0.8__ql_0.6', 
                               'value__change_quantiles__f_agg_"var"__isabs_True__qh_0.8__ql_0.6', 'value__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.8', 
                               'value__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.8', 'value__value_count__value_0', 'value__number_crossing_m__m_0', 
                               'value__number_crossing_m__m_-1', 'value__fourier_entropy__bins_3', 'value__fourier_entropy__bins_5', 'value__fourier_entropy__bins_10', 'value__fourier_entropy__bins_100'])
                     
datasets['tsfresh_extended'] = tf_ext2

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Variance filter applied to the numeric part of a feature set
def drop_low_variance(df, threshold=0.01):
    """Remove numeric columns whose variance does not exceed `threshold`.

    Non-numeric columns are never tested; they are kept and placed first in
    the returned frame, followed by the surviving numeric columns.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    other_part = df.select_dtypes(exclude=[np.number])

    # Fit the selector on numeric data only and read back its boolean mask
    variance_filter = VarianceThreshold(threshold=threshold)
    variance_filter.fit(numeric_part)
    surviving = numeric_part.columns[variance_filter.get_support()]

    return pd.concat([other_part, df[surviving]], axis=1)

In [None]:
# Build a parallel dict of variance-filtered frames (default threshold 0.01),
# keyed exactly like `datasets`; the frames in `datasets` are not modified.
cleaned_datasets = {name: drop_low_variance(df) for name, df in datasets.items()}

## Evaluate In-Set Redundancy via PCA

In [None]:
# Function to perform PCA and plot results
def pca_plot(datasets, dataset_names):
    """Fit a full PCA on each dataset and plot cumulative explained variance.

    Parameters
    ----------
    datasets : dict
        Mapping of dataset key -> DataFrame. Only the numeric columns of each
        frame are used, standardised with StandardScaler before the PCA fit.
    dataset_names : sequence of str
        Display labels, matched to `datasets` by iteration order.

    Returns
    -------
    exp_var_ratio : dict
        Dataset key -> array of cumulative explained variance, in percent.
    components : dict
        Dataset key -> number of leading components needed to reach 95 %.

    Side effects: prints one summary line per dataset, saves the figure to
    'Cumulative_variance.png' (dpi=1000) and shows it.
    """
    exp_var_ratio = {}
    components = {}

    # Plot for cumulative variance explained
    plt.figure(figsize=(14, 10))

    for i, (name, df) in enumerate(datasets.items()):
        # Drop non-numeric columns
        df_numeric = df.select_dtypes(include=[np.number])

        # Scale the data
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(df_numeric)

        # Perform PCA
        pca = PCA()
        pca.fit(scaled_data)

        # Calculate cumulative variance explained
        explained_variance = np.cumsum(pca.explained_variance_ratio_) * 100
        exp_var_ratio[name] = explained_variance

        # Plot against 1..n so the x position really is the number of
        # components used; plotting the bare array would start at x=0 and
        # disagree by one with the 1-based counts reported below.
        component_counts = np.arange(1, len(explained_variance) + 1)
        plt.plot(component_counts, explained_variance, label=dataset_names[i])

        # Find number of components to reach 95% variance
        # (argmax returns the first index where the condition holds; +1 turns
        # the 0-based index into a component count)
        components_95 = np.argmax(explained_variance >= 95) + 1
        components[name] = components_95
        print(f"{dataset_names[i]}: {components_95} components to reach 95% variance")

    plt.axhline(y=95, color='red', linestyle='--', linewidth=1)
    plt.xlabel('Number of Principal Components')
    plt.ylabel('Cumulative Variance Explained (%)')
    plt.xlim(0,200)
    plt.legend()
    plt.grid(True)
    plt.savefig('Cumulative_variance.png', dpi=1000)
    plt.show()

    return exp_var_ratio, components

In [None]:
# Run the PCA analysis on the variance-filtered sets. Returns, per dataset key,
# the cumulative explained-variance curve and the 95% component count.
exp_var_ratio, components = pca_plot(cleaned_datasets, dataset_names)

In [None]:
# Plot for percentage of principal components required for 95% variance
plt.figure(figsize=(10, 6))
# The denominator is the number of components PCA actually produced for each
# set (the length of its cumulative-variance curve). Counting the columns of the
# uncleaned `datasets` frames would also count non-numeric and low-variance
# columns that never entered the PCA, understating the percentage.
percent_of_components = [components[name] / len(exp_var_ratio[name]) * 100 for name in components]
sns.barplot(x=dataset_names, y=percent_of_components)
plt.xticks(rotation=45)
plt.ylabel('% of Principal Components for 95% Variance')
# Keep the original 0-55 window, but widen it rather than clip a taller bar
plt.ylim(0, max(55, max(percent_of_components, default=0) * 1.05))
plt.savefig('Principle_components.png')
plt.show()

## Evaluate Between-Set Redundancy with Spearman Correlation Coefficient and Similarity Metric

In [None]:
# Keep only the variance-filtered sets whose key contains 'extended'
# (substring match on the dict key); the '*_raw' sets are left out of the
# between-set comparison.
extended_datasets = {key: value for key, value in cleaned_datasets.items() if 'extended' in key}

In [None]:
# Display labels for the four extended sets, in the same order as the keys of
# `extended_datasets`. NOTE(review): not referenced by any later cell visible
# in this notebook - confirm whether it is still needed.
ext_dataset_names = ["Featuretools Extended","TSFresh Extended","Featurewiz Extended","PyCaret Extended"]

In [None]:
# Function to calculate similarity between two datasets
def calculate_similarity(set1, set2, set1_name, set2_name):
    """Directional similarity of `set1` towards `set2`.

    For every column of `set1` the largest absolute Spearman correlation with
    any column of `set2` is taken; the metric is the mean of those maxima, so
    calculate_similarity(a, b) and calculate_similarity(b, a) generally differ.
    Pairs whose correlation is NaN are ignored by the max/mean.

    Parameters
    ----------
    set1, set2 : pandas.DataFrame
        Feature frames with the same number of rows; values are compared by
        row position, not by index label.
    set1_name, set2_name : str
        Labels used only in the printed progress line.

    Returns
    -------
    float
        The similarity metric (NaN when no valid correlation exists).
    """
    # Pull each column out once, by position, so duplicated column labels
    # still yield one 1-D series per column.
    left_columns = [set1.iloc[:, pos] for pos in range(set1.shape[1])]
    right_columns = [set2.iloc[:, pos] for pos in range(set2.shape[1])]

    # Collect results in a float ndarray. An object-dtype DataFrame filled one
    # cell at a time through .loc is far slower and leaves the reductions
    # below operating on Python objects.
    rho = np.full((len(left_columns), len(right_columns)), np.nan, dtype=float)
    for row, left in enumerate(left_columns):
        for col, right in enumerate(right_columns):
            correlation, _ = spearmanr(left, right)
            rho[row, col] = correlation

    correlations = pd.DataFrame(rho, index=set1.columns, columns=set2.columns)

    # Calculate the similarity metric
    similarity_metric = float(correlations.abs().max(axis=1).mean())

    print(f"Calculated similarity between {set1_name} and {set2_name}: {similarity_metric}")

    return similarity_metric

In [None]:
# Assess across-set redundancy
def across_set_red(datasets):
    """Score every ordered pair of distinct datasets with calculate_similarity.

    Returns a dict mapping (first_key, second_key) to the similarity of the
    first set towards the second. Both directions are computed, and garbage
    collection is triggered after each pair.
    """
    scores = {}

    for first_key, first_frame in datasets.items():
        for second_key, second_frame in datasets.items():
            # A set is never compared with itself
            if first_key == second_key:
                continue

            print(f"Processing {first_key} vs {second_key}")
            scores[(first_key, second_key)] = calculate_similarity(
                first_frame, second_frame, first_key, second_key
            )

            gc.collect()

    return scores

In [None]:
# Plot heatmap for across-set redundancy
def plot_heatmap(similarity_metrics):
    """Draw the pairwise similarity scores as an annotated heatmap.

    `similarity_metrics` maps (row_set, column_set) to a score. Labels are
    sorted alphabetically, pairs without a score stay at 0, and the diagonal
    is masked. The colour scale is fixed to the range 0-0.1. The figure is
    saved to 'Across_set.png' (dpi=1000) and shown.
    """
    # Every set name that occurs on either side of a pair, alphabetically
    row_names = {pair[0] for pair in similarity_metrics.keys()}
    col_names = {pair[1] for pair in similarity_metrics.keys()}
    labels = sorted(row_names | col_names)
    position = {label: idx for idx, label in enumerate(labels)}
    size = len(labels)

    # Square score matrix, zero wherever no pair was scored
    matrix = np.zeros((size, size))
    for (row_set, col_set), score in similarity_metrics.items():
        matrix[position[row_set], position[col_set]] = score

    # Hide self-comparisons on the diagonal
    diagonal_mask = np.eye(size, dtype=bool)

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        matrix,
        xticklabels=labels,
        yticklabels=labels,
        annot=True,
        cmap='magma',
        fmt=".4f",
        mask=diagonal_mask,
        vmin=0,
        vmax=0.1,
    )
    plt.title('Across-Set Redundancy Analysis')
    plt.xlabel('Feature Set')
    plt.ylabel('Feature Set')
    plt.savefig('Across_set.png', dpi=1000)
    plt.show()

In [None]:
# Score every ordered pair of extended feature sets. This runs one Spearman
# correlation per feature pair, so the cell can take a long time on wide sets.
similarity_metrics = across_set_red(extended_datasets)

In [None]:
# Plot across-set redundancy (also writes 'Across_set.png')
plot_heatmap(similarity_metrics)

In [None]:
# Convert the {(set1, set2): score} dict into a one-column frame indexed by a
# (Set1, Set2) MultiIndex. NOTE: the name `df` is rebound to the reduced
# feature frame further down, so export this frame before running that cell.
df = pd.DataFrame.from_dict(similarity_metrics, orient='index', columns=['Similarity'])
df.index = pd.MultiIndex.from_tuples(df.index, names=['Set1', 'Set2'])

In [None]:
# Save the pairwise similarity table to CSV (the Set1/Set2 index is written too)
df.to_csv('similarity_metrics.csv')

## Create Optimal Feature Set

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
# Reduced feature sets to be merged into the optimal set. Every CSV is read
# with its 'Date' column as the index; the first entry supplies the
# Name/ID/Return columns when the sets are combined.
_reduced_sources = (
    ('raw', 'reduced_raw.csv'),
    ('extended', 'reduced_extended.csv'),
    ('featuretools_extended', 'reduced_featuretools_extended.csv'),
    ('tsfresh_extended', 'reduced_tsfresh_extended.csv'),
    ('featurewiz_extended', 'reduced_featurewiz_extended.csv'),
    ('pycaret_extended', 'reduced_pycaret_extended.csv'),
)
reduced_datasets = {
    key: pd.read_csv(path, index_col='Date')
    for key, path in _reduced_sources
}

In [None]:
# Combine Extended Feature Sets
def combine_features(datasets):
    """Merge several feature frames column-wise into one frame.

    Each frame is sorted by ('Date', 'Name') so rows line up. 'Name', 'ID'
    and 'Return' are taken from the first frame only and removed from every
    frame's feature block (silently skipped where absent). The result has
    those three columns first, then all feature columns in dataset order,
    indexed by the first frame's sorted Date index.
    """
    id_columns = ['Name', 'ID', 'Return']
    identifiers = None
    feature_blocks = []

    for frame in datasets.values():
        ordered = frame.sort_values(by=['Date', 'Name'])

        # Only the first dataset contributes the identifier/target columns
        if identifiers is None:
            identifiers = ordered[id_columns]

        feature_blocks.append(ordered.drop(columns=id_columns, errors='ignore'))

    # Feature columns side by side, then the identifier columns in front
    merged = pd.concat([identifiers, pd.concat(feature_blocks, axis=1)], axis=1)

    # Re-apply the first dataset's Date index
    merged.index = identifiers.index

    return merged

In [None]:
# Combine all reduced feature sets column-wise; Name/ID/Return come from the
# first entry of `reduced_datasets`.
combined_df = combine_features(reduced_datasets)

In [None]:
# Remove every column labelled 'index' - presumably a leftover positional index
# written into the CSVs upstream (TODO confirm). drop() has no errors='ignore',
# so re-running this cell without rebuilding combined_df raises KeyError.
combined_df = combined_df.drop(columns=['index'])

In [None]:
def drop_corr(df, threshold=0.9):
    """Drop features that are highly correlated with an earlier feature.

    The absolute Pearson correlation matrix of the numeric feature columns is
    computed and, scanning columns left to right, a column is removed when
    its correlation with any column before it exceeds `threshold`. 'Name',
    'ID' and 'Return' are excluded from the analysis and always kept, as are
    non-numeric feature columns.

    Columns are handled by position, so when the frame contains duplicated
    column labels the first copy survives and only the later, redundant
    copies are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Name', 'ID' and 'Return'.
    threshold : float, default 0.9
        Absolute correlation above which the later column is dropped.

    Returns
    -------
    pandas.DataFrame
        'Name', 'ID', 'Return' followed by the retained feature columns.
    """
    # Temporarily remove 'Return', 'Name', and 'ID' columns
    non_feature_columns = df[['Name', 'ID', 'Return']]
    features = df.drop(columns=['Return', 'Name', 'ID'])

    # Relabel columns 0..n-1 so each one has a unique handle. Looking columns
    # up by their original label returns a whole DataFrame for duplicated
    # labels, which made the old `any(...)` test truthy and removed every
    # copy of such a feature.
    positional = features.set_axis(range(features.shape[1]), axis=1)

    # Select only numerical columns for correlation analysis
    numeric = positional.select_dtypes(include=[np.number])

    # Keep only the strict upper triangle so each pair is inspected once and
    # a column is only ever compared with the columns before it
    corr_matrix = numeric.corr().abs()
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Positions of features with a correlation above the threshold
    # (NaN entries compare False and therefore never trigger a drop)
    redundant_mask = (upper_triangle > threshold).to_numpy().any(axis=0)
    redundant_positions = set(upper_triangle.columns[redundant_mask])

    # Drop the highly correlated features
    keep_positions = [pos for pos in range(features.shape[1]) if pos not in redundant_positions]
    reduced_df = features.iloc[:, keep_positions]

    # Add back the 'Name', 'ID', and 'Return' columns
    reduced_df = pd.concat([non_feature_columns, reduced_df], axis=1)

    return reduced_df

In [None]:
# Drop highly correlated features (default threshold 0.9). This rebinds `df`,
# which previously held the similarity table.
df = drop_corr(combined_df)

In [None]:
# Number of top-ranked features to keep and the seed for the MI estimator
N_TOP_FEATURES = 30
MI_RANDOM_STATE = 42

# Separate features and target
X = df.drop(columns=['Return', 'Name', 'ID'])
y = df['Return']

# Calculate mutual information scores. The estimator adds a small amount of
# random noise to continuous variables, so without a fixed random_state the
# scores - and therefore the selected feature set - change from run to run.
mi_scores = mutual_info_regression(X, y, random_state=MI_RANDOM_STATE)

# Create a DataFrame to store feature names and their corresponding MI scores
mi_df = pd.DataFrame({'Feature': X.columns, 'MI_Score': mi_scores})

# Sort features by MI score in descending order
mi_df = mi_df.sort_values(by='MI_Score', ascending=False)

# Select the top-scoring features
features = mi_df.head(N_TOP_FEATURES)

# Get the names of the selected features
selected_features = features['Feature'].values

# Recombine new dataset: identifier/target columns first, then the features
final_df = df[['Name', 'ID', 'Return'] + list(selected_features)]

# Ensure the dataset is indexed by 'Date'
final_df.index = df.index

In [None]:
# Write the selected feature set to disk, keeping the Date index as a column
final_df.to_csv('Optimal_feature_set.csv', index=True)