In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from boruta import BorutaPy

# Compatibility shim for code that still uses the `np.int` / `np.float` /
# `np.bool` aliases (presumably BorutaPy, imported above -- TODO confirm).
# NumPy deprecated these aliases in 1.20 and removed them in 1.24.  They were
# plain aliases of the Python builtins, so restore exactly those rather than a
# fixed-width type such as int32, and only when the running NumPy does not
# already define the name (NumPy 2.x ships its own `np.bool`).
for _alias, _builtin in (("int", int), ("float", float), ("bool", bool)):
    # Look in the module dict instead of using hasattr(): on NumPy 1.20-1.23
    # attribute access on a deprecated alias emits a DeprecationWarning.
    if _alias not in vars(np):
        setattr(np, _alias, _builtin)

def select_features(X, y, iterations=10):
    """Run six chained feature-selection pipelines and collect what each keeps.

    Each pipeline applies two or three selectors in sequence (Boruta, RFE,
    a pairwise-correlation filter, LassoCV); every selector only sees the
    columns kept by the previous one.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.  Column labels are what gets reported back.
    y : pandas.Series
        Target aligned with the rows of ``X`` (``.values`` is read from it).
    iterations : int, default 10
        How many times the full set of six pipelines is executed.

    Returns
    -------
    list of (str, list)
        One ``(pipeline label, selected column labels)`` tuple per pipeline
        per iteration, in execution order.

    Notes
    -----
    * Progress is printed as each pipeline finishes, always as the same
      ``(label, features)`` tuple that is appended to the result.
    * A selector that receives zero columns returns ``[]`` instead of fitting
      an estimator on an empty matrix, so one empty intermediate result no
      longer aborts the whole run.
    * Boruta on the full ``X`` is seeded (``random_state=42``) and four of the
      six pipelines start with it, so that result is computed once and reused.
      NOTE(review): this assumes BorutaPy is deterministic for a fixed seed.
    * The RFE forest is intentionally left unseeded, as in the original code,
      so RFE-based results may differ between iterations.
    """
    rfe_target = 5  # number of features RFE is asked to keep

    def boruta_selection(X, y):
        """Return the columns of ``X`` flagged in BorutaPy's ``support_`` mask."""
        if X.shape[1] == 0:
            return []
        forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
        boruta = BorutaPy(forest, n_estimators='auto', verbose=0, random_state=42)
        boruta.fit(X.values, y.values)
        return X.columns[boruta.support_].tolist()

    def rfe_selection(X, y):
        """Return the columns of ``X`` kept by recursive feature elimination."""
        n_columns = X.shape[1]
        if n_columns == 0:
            return []
        model = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
        # Never ask for more features than exist; the result is the same
        # (everything is kept) but the request stays within the input width.
        rfe = RFE(model, n_features_to_select=min(rfe_target, n_columns))
        rfe.fit(X.values, y.values)
        return X.columns[rfe.support_].tolist()

    def correlation_selection(X, threshold=0.95):
        """Drop each column whose |correlation| with an earlier column exceeds ``threshold``."""
        if X.shape[1] == 0:
            return []
        corr_matrix = X.corr().abs()
        # Keep only the strict upper triangle so every pair is inspected once
        # and a column is only compared against the columns to its left.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = {column for column in upper.columns if (upper[column] > threshold).any()}
        return [column for column in X.columns if column not in to_drop]

    def lasso_selection(X, y):
        """Return the columns of ``X`` with a non-zero LassoCV coefficient."""
        if X.shape[1] == 0:
            return []
        X_scaled = StandardScaler().fit_transform(X)
        lasso = LassoCV(cv=5, random_state=42).fit(X_scaled, y)
        return X.columns[lasso.coef_ != 0].tolist()

    # Uniform (frame, target) -> feature list interface for every selector.
    selectors = {
        'boruta': boruta_selection,
        'rfe': rfe_selection,
        'correlation': lambda frame, target: correlation_selection(frame),
        'lasso': lasso_selection,
    }

    # Label -> ordered selector names; order here is the execution order.
    pipelines = (
        ('Boruta + RFE + Correlation', ('boruta', 'rfe', 'correlation')),
        ('RFE + Boruta + Correlation', ('rfe', 'boruta', 'correlation')),
        ('RFE + Correlation', ('rfe', 'correlation')),
        ('Boruta + Correlation', ('boruta', 'correlation')),
        ('Boruta + Lasso', ('boruta', 'lasso')),
        ('Boruta + RFE + Lasso', ('boruta', 'rfe', 'lasso')),
    )

    boruta_cache = {}  # lazily filled with the Boruta result on the full X

    def run_pipeline(steps):
        """Apply ``steps`` in order, narrowing the frame after each one."""
        frame = X
        features = X.columns.tolist()
        for position, step in enumerate(steps):
            if step == 'boruta' and position == 0:
                if 'full' not in boruta_cache:
                    boruta_cache['full'] = boruta_selection(X, y)
                # Copy so callers can never mutate the cached list.
                features = list(boruta_cache['full'])
            else:
                features = selectors[step](frame, y)
            frame = frame[features]
        return features

    selected_features_list = []

    for _ in range(iterations):
        for label, steps in pipelines:
            result = (label, run_pipeline(steps))
            selected_features_list.append(result)
            print(result)

    return selected_features_list

# Read the training features and labels from space-delimited, header-less
# text files; the single label column is squeezed down to a Series.
X = pd.read_csv("../data/x_train.txt", delimiter=' ', header=None)
y = pd.read_csv("../data/y_train.txt", delimiter=' ', header=None).squeeze()

# Label the feature columns 0..n-1 so selections are reported by position.
X.columns = range(X.shape[1])

# One pass over all pipelines.
selected_features = select_features(X, y, iterations=1)

# Summarise what every pipeline kept, one three-line record per result.
for method, features in selected_features:
    print("\n".join((
        f"Method: {method}",
        f"Selected Features: {features}",
        "-" * 30,
    )))


('Boruta + RFE + Correlation', [8, 100, 101, 102, 105])
RFE + Boruta + Correlation [100, 101, 102, 103, 105]
('RFE + Correlation', [100, 101, 102, 103, 105])
('Boruta + Correlation', [0, 1, 2, 3, 4, 5, 7, 8, 9, 100, 101, 102, 103, 104, 105])
('Boruta + Lasso', [105])
('Boruta + RFE + Lasso', [105])
Method: Boruta + RFE + Correlation
Selected Features: [8, 100, 101, 102, 105]
------------------------------
Method: RFE + Boruta + Correlation
Selected Features: [100, 101, 102, 103, 105]
------------------------------
Method: RFE + Correlation
Selected Features: [100, 101, 102, 103, 105]
------------------------------
Method: Boruta + Correlation
Selected Features: [0, 1, 2, 3, 4, 5, 7, 8, 9, 100, 101, 102, 103, 104, 105]
------------------------------
Method: Boruta + Lasso
Selected Features: [105]
------------------------------
Method: Boruta + RFE + Lasso
Selected Features: [105]
------------------------------


In [5]:
import os
from datetime import datetime

# Output folder for the reports; created on first use.
save_dir = 'feature_selection'
os.makedirs(save_dir, exist_ok=True)

# Timestamp (to the second) keeps successive runs from overwriting each other.
current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = f"{save_dir}/selected_features_{current_time}.txt"

# Persist one three-line record per pipeline result, mirroring the printed summary.
with open(filename, 'w') as f:
    for method, features in selected_features:
        f.writelines([
            f"Method: {method}\n",
            f"Selected Features: {features}\n",
            "-" * 30 + "\n",
        ])