### Variance Threshold Method

In [2]:

import pandas as pd
from sklearn.feature_selection import VarianceThreshold
import os

# Sweep configuration: cathepsin variants, the variance cut-offs to try, and the
# index ("1".."6") that each cut-off contributes to the output file name.
types = ["B", "S", "D", "K"]
thresholds = [0.01, 0.1, 0.5, 0.6, 0.7, 0.8]
threshold_suffix = {cutoff: str(rank) for rank, cutoff in enumerate(thresholds, start=1)}

# Root folder for every variance-filtered dataset
output_dir = "./reduced_features/Variance"
os.makedirs(output_dir, exist_ok=True)

# One input CSV per cathepsin type; every threshold is applied to the same imputed frame
for t in types:
    # Header row is kept so the columns carry the feature names
    data = pd.read_csv(f"./features/input_cathepsin_{t}.csv")

    # Mean-impute missing values column by column.
    # NOTE(review): assumes every column is numeric (no coercion is done here) - confirm.
    column_means = data.mean()
    data = data.fillna(column_means)

    for threshold in thresholds:
        # Fit the variance filter for this cut-off and keep the surviving columns
        selector = VarianceThreshold(threshold=threshold)
        kept_values = selector.fit_transform(data)

        # The support mask maps the surviving array columns back to their names
        kept_columns = data.columns[selector.get_support()]
        data_reduced_df = pd.DataFrame(kept_values, columns=kept_columns)

        # Files are grouped in a per-type sub-folder, created on demand
        output_file_path = f"{output_dir}/{t}/reduced_desc_Variance_{t}_{threshold_suffix[threshold]}.csv"
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        # Persist without the pandas index and report the resulting width
        data_reduced_df.to_csv(output_file_path, index=False)
        print(f"Dataset for {t} with threshold {threshold} reduced to {kept_values.shape[1]} features. Saved to '{output_file_path}'.")


Dataset for B with threshold 0.01 reduced to 186 features. Saved to './reduced_features/Variance/B/reduced_desc_Variance_B_1.csv'.
Dataset for B with threshold 0.1 reduced to 141 features. Saved to './reduced_features/Variance/B/reduced_desc_Variance_B_2.csv'.
Dataset for B with threshold 0.5 reduced to 114 features. Saved to './reduced_features/Variance/B/reduced_desc_Variance_B_3.csv'.
Dataset for B with threshold 0.6 reduced to 113 features. Saved to './reduced_features/Variance/B/reduced_desc_Variance_B_4.csv'.
Dataset for B with threshold 0.7 reduced to 112 features. Saved to './reduced_features/Variance/B/reduced_desc_Variance_B_5.csv'.
Dataset for B with threshold 0.8 reduced to 108 features. Saved to './reduced_features/Variance/B/reduced_desc_Variance_B_6.csv'.
Dataset for S with threshold 0.01 reduced to 169 features. Saved to './reduced_features/Variance/S/reduced_desc_Variance_S_1.csv'.
Dataset for S with threshold 0.1 reduced to 139 features. Saved to './reduced_features/V

### Correlation-Based Feature Selection 

In [4]:
import os

import numpy as np
import pandas as pd


# Define parameters
types = ["B", "S", "D", "K"]
thresholds = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4]
threshold_suffix = {0.9: "1", 0.8: "2", 0.7: "3", 0.6: "4", 0.5: "5", 0.4: "6"}  # Mapping thresholds to suffixes

# Create output directory if it doesn't exist
output_dir = "./reduced_features/Correlation"
os.makedirs(output_dir, exist_ok=True)

for t in types:

    # Step 1: Load the CSV file with the header so columns keep their feature names
    file_path = f"./features/input_cathepsin_{t}.csv"
    data = pd.read_csv(file_path)

    # Step 2: Coerce every column to numeric; values that cannot be parsed become NaN
    data_numeric = data.apply(pd.to_numeric, errors='coerce')

    # Step 3: Replace NaN values with column averages
    data_numeric = data_numeric.fillna(data_numeric.mean())

    # Step 4: Absolute pairwise correlation between all columns
    correlation_matrix = data_numeric.corr().abs()

    # Step 5: Keep only the strict upper triangle (everything else becomes NaN), so each
    # column is compared solely against the columns that precede it in the file.
    upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

    # The files are written into a per-type sub-folder. It has to exist before
    # to_csv is called, otherwise the write fails on a fresh checkout.
    type_output_dir = f"{output_dir}/{t}"
    os.makedirs(type_output_dir, exist_ok=True)

    for threshold in thresholds:

        # A column is dropped when its absolute correlation with any earlier column
        # exceeds the threshold. NaN entries compare False and never trigger a drop.
        exceeds_threshold = (upper_triangle > threshold).any(axis=0)
        to_drop = upper_triangle.columns[exceeds_threshold].tolist()

        # Drop from the frame as loaded: the saved values are the original ones,
        # not the coerced / mean-imputed copy used to compute the correlations.
        data_reduced = data.drop(columns=to_drop)

        # Step 6: Save the reduced dataset with feature names
        output_file_path = f"{type_output_dir}/reduced_desc_Correlation_{t}_{threshold_suffix[threshold]}.csv"
        data_reduced.to_csv(output_file_path, index=False)
        print(f"Dataset for {t} with threshold {threshold} reduced to {data_reduced.shape[1]} features. Saved to '{output_file_path}'.")




Dataset for B with threshold 0.9 reduced to 168 features. Saved to './reduced_features/Correlation/B/reduced_desc_Correlation_B_1.csv'.
Dataset for B with threshold 0.8 reduced to 144 features. Saved to './reduced_features/Correlation/B/reduced_desc_Correlation_B_2.csv'.
Dataset for B with threshold 0.7 reduced to 127 features. Saved to './reduced_features/Correlation/B/reduced_desc_Correlation_B_3.csv'.
Dataset for B with threshold 0.6 reduced to 105 features. Saved to './reduced_features/Correlation/B/reduced_desc_Correlation_B_4.csv'.
Dataset for B with threshold 0.5 reduced to 81 features. Saved to './reduced_features/Correlation/B/reduced_desc_Correlation_B_5.csv'.
Dataset for B with threshold 0.4 reduced to 56 features. Saved to './reduced_features/Correlation/B/reduced_desc_Correlation_B_6.csv'.
Dataset for S with threshold 0.9 reduced to 172 features. Saved to './reduced_features/Correlation/S/reduced_desc_Correlation_S_1.csv'.
Dataset for S with threshold 0.8 reduced to 142 fe

### Recursive Feature Elimination — RFE (desc)

In [5]:

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import os

# Sweep configuration: cathepsin variants, how many features to keep on each pass,
# and the index ("1".."6") that each feature count contributes to the file name.
types = ["B", "S", "D", "K"]
num_features_list = [150, 130, 90, 50, 30, 20]
feature_suffix = {count: str(position) for position, count in enumerate(num_features_list, start=1)}

# Root folder for every RFE-reduced dataset
output_dir = "./reduced_features/RFE"
os.makedirs(output_dir, exist_ok=True)

# One input CSV per cathepsin type; every feature count is run on the same prepared frame
for t in types:
    # Header row is kept so the columns carry the feature names
    data = pd.read_csv(f"./features/input_cathepsin_{t}.csv")

    # Coerce every column to numeric (unparseable values become NaN), then mean-impute
    coerced = data.apply(pd.to_numeric, errors='coerce')
    data_numeric = coerced.fillna(coerced.mean())

    # No target column is read from the file; the row-wise mean of all features stands in.
    # NOTE(review): this target is a linear combination of the features themselves, so the
    # resulting ranking may not reflect relevance to any real endpoint - confirm intent.
    placeholder_target = data_numeric.mean(axis=1)

    # Estimator handed to RFE; the same instance is passed for every feature count
    model = LinearRegression()

    for num_features in num_features_list:
        # Recursive feature elimination down to the requested number of columns
        rfe = RFE(estimator=model, n_features_to_select=num_features)
        rfe.fit(data_numeric, placeholder_target)

        # support_ is the boolean mask of retained columns; the saved values come from
        # the frame as loaded, not from the coerced / imputed copy used for fitting
        selected_feature_names = data.columns[rfe.support_]
        data_reduced = data[selected_feature_names]

        # Files are grouped in a per-type sub-folder, created on demand
        output_file_path = f"{output_dir}/{t}/reduced_desc_RFE_{t}_{feature_suffix[num_features]}.csv"
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        # Persist without the pandas index and report where the file went
        data_reduced.to_csv(output_file_path, index=False)
        print(f"Dataset for {t} with {num_features} selected features saved to '{output_file_path}'.")


Dataset for B with 150 selected features saved to './reduced_features/RFE/B/reduced_desc_RFE_B_1.csv'.
Dataset for B with 130 selected features saved to './reduced_features/RFE/B/reduced_desc_RFE_B_2.csv'.
Dataset for B with 90 selected features saved to './reduced_features/RFE/B/reduced_desc_RFE_B_3.csv'.
Dataset for B with 50 selected features saved to './reduced_features/RFE/B/reduced_desc_RFE_B_4.csv'.
Dataset for B with 30 selected features saved to './reduced_features/RFE/B/reduced_desc_RFE_B_5.csv'.
Dataset for B with 20 selected features saved to './reduced_features/RFE/B/reduced_desc_RFE_B_6.csv'.
Dataset for S with 150 selected features saved to './reduced_features/RFE/S/reduced_desc_RFE_S_1.csv'.
Dataset for S with 130 selected features saved to './reduced_features/RFE/S/reduced_desc_RFE_S_2.csv'.
Dataset for S with 90 selected features saved to './reduced_features/RFE/S/reduced_desc_RFE_S_3.csv'.
Dataset for S with 50 selected features saved to './reduced_features/RFE/S/red