In [36]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Load the data
df = pd.read_csv('./processed_data/metafeatures.csv', index_col=0)

# Apply variance threshold
selector = VarianceThreshold(threshold=0.01)
reduced_data = selector.fit_transform(df)
reduced_df = pd.DataFrame(reduced_data, index=df.index, columns=df.columns[selector.get_support()])

# Function to remove highly correlated features
def remove_highly_correlated(df, threshold):
    corr_matrix = df.corr(method='spearman')  # Compute the Spearman correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(to_drop, axis=1)

# Apply the function to remove highly correlated features
threshold = 0.9  # Set your own threshold
final_df = remove_highly_correlated(reduced_df, threshold)

# Print the results
print("Original number of features:", len(df.columns))
print("Number of features after variance threshold:", len(reduced_df.columns))
print("Final number of features after removing high correlation:", len(final_df.columns))


final_df.to_csv("./processed_data/metafeatures_reduced_6.csv")

Original number of features: 61
Number of features after variance threshold: 58
Final number of features after removing high correlation: 33
