# Feature Selection

## Why feature selection?

Feature selection, or *variable selection*, is a frequently used technique in machine learning. It is the process of selecting a subset of highly relevant features, which benefits modelling in several ways, such as:

- Reduce the complexity, that is, improve the efficiency of training [1].

- Improve the prediction accuracy [1,2].




## Find the top *k* features most correlated with SalePrice (Naïve method)
(House price data train.csv from https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data)

The data have 1460 rows and 81 columns

This code was tested in Python 3.12.3

In [1]:
import pandas as pd
import numpy as np

import time
import psutil
import os,gc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [None]:

def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    The value is read through ``psutil`` for the PID reported by
    ``os.getpid()`` and converted from bytes using 1 MB = 1024 * 1024 bytes.
    """
    bytes_per_mb = 1024 * 1024
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb

def naive_feature_selection(X, y, k=20):
    """Pick the ``k`` numeric columns of ``X`` most correlated with ``y``.

    Only numeric columns are considered. Missing values in those columns are
    replaced by the column median before the absolute Pearson correlation
    with ``y`` is computed, one column at a time.

    Args:
        X: DataFrame of candidate features; non-numeric columns are ignored.
        y: Target values, one per row of ``X`` (matched to ``X`` by position).
        k: Number of features to keep. If ``k`` exceeds the number of numeric
            columns, all of them are returned; ``k <= 0`` returns ``[]``.

    Returns:
        List of column names ordered by increasing absolute correlation, so
        the most strongly correlated feature is last.
    """
    numeric = X.select_dtypes(include=[np.number])
    X_numeric = numeric.fillna(numeric.median())
    n_features = X_numeric.shape[1]

    # A slice like [-0:] would return *every* feature, so handle this up front.
    if k <= 0 or n_features == 0:
        return []

    target = np.asarray(y, dtype=float)
    target_ok = ~np.isnan(target)

    correlations = np.zeros(n_features)
    for i in range(n_features):
        column = X_numeric.iloc[:, i].to_numpy(dtype=float, na_value=np.nan)
        # A column that was entirely NaN has a NaN median and stays NaN.
        valid = ~np.isnan(column) & target_ok
        if valid.sum() > 1:
            # Constant columns have zero variance; corrcoef then divides 0 by 0.
            with np.errstate(divide="ignore", invalid="ignore"):
                r = np.corrcoef(column[valid], target[valid])[0, 1]
            # argsort places NaN last, which would rank an undefined
            # correlation as the strongest one; score it as 0 instead.
            correlations[i] = abs(r) if np.isfinite(r) else 0.0

    top_k_indices = np.argsort(correlations)[-k:]
    return X_numeric.columns[top_k_indices].tolist()

def load_and_evaluate(file_path, k=20, n_runs=100):
    """Benchmark naive feature selection over repeated random train/test splits.

    The CSV at ``file_path`` is loaded once. Each run then draws a new 80/20
    split (seeded with ``42 + run``), selects ``k`` features with
    ``naive_feature_selection``, fits a ``LinearRegression`` on them and
    records process memory before/after, elapsed time and test-set MSE.

    Args:
        file_path: Path to a CSV file containing ``SalePrice`` and ``Id``
            columns alongside the feature columns.
        k: Number of features to select in every run.
        n_runs: Number of repetitions; must be at least 1.

    Returns:
        Tuple ``(selected_features, mse, stats)`` where ``selected_features``
        and ``mse`` come from the final run and ``stats`` maps metric names to
        the mean / standard deviation (``np.std`` default) over all runs.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    # Without at least one run there is nothing to return or to average.
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    # Per-run measurements
    memory_before_list = []
    memory_after_list = []
    memory_increase_list = []
    runtime_list = []
    mse_list = []

    # Load data and separate the target from the features
    data = pd.read_csv(file_path)
    X = data.drop(columns=['SalePrice', 'Id'])
    y = data['SalePrice']

    # Fill missing numeric values with the column medians.
    # NOTE(review): the medians are computed on the full data set, i.e. before
    # the train/test split, so test rows influence the imputation. Confirm
    # whether that is acceptable for this benchmark.
    numeric = X.select_dtypes(include=[np.number])
    X_numeric = numeric.fillna(numeric.median())
    X_categorical = X.select_dtypes(exclude=[np.number])
    X = pd.concat([X_numeric, X_categorical], axis=1)

    # Any NaNs still present (e.g. in non-numeric columns) are replaced by 0
    if X.isna().any().any():
        X = X.fillna(0)

    # Monte Carlo style repetition to estimate memory use, runtime and MSE
    print(f"Running naive method {n_runs} times...")
    for run in range(n_runs):
        # Collect garbage so leftovers from the previous run are released
        gc.collect()
        mem_use_before = get_memory_usage()

        # perf_counter is monotonic and high resolution, which matters for
        # intervals of only a few milliseconds.
        start_time = time.perf_counter()
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42 + run
        )
        selected_features = naive_feature_selection(X_train, y_train, k)
        X_train_selected = X_train[selected_features]
        X_test_selected = X_test[selected_features]
        model = LinearRegression()
        model.fit(X_train_selected, y_train)
        y_pred = model.predict(X_test_selected)
        mse = mean_squared_error(y_test, y_pred)
        # Stop the clock before probing memory so the probe is not timed.
        elapsed = time.perf_counter() - start_time

        mem_use_after = get_memory_usage()

        # Store metrics
        memory_before_list.append(mem_use_before)
        memory_after_list.append(mem_use_after)
        memory_increase_list.append(mem_use_after - mem_use_before)
        runtime_list.append(elapsed)
        mse_list.append(mse)

    # Aggregate the per-run measurements
    stats = {
        'memory_before_mean (MB)': np.mean(memory_before_list),
        'memory_before_std': np.std(memory_before_list),
        'memory_after_mean (MB)': np.mean(memory_after_list),
        'memory_after_std': np.std(memory_after_list),
        'memory_increase_mean (MB)': np.mean(memory_increase_list),
        'memory_increase_std': np.std(memory_increase_list),
        'runtime_mean (Seconds)': np.mean(runtime_list),
        'runtime_std': np.std(runtime_list),
        'MSE_mean': np.mean(mse_list),
        'MSE_std': np.std(mse_list),
    }

    return selected_features, mse, stats

def generate_report(stats, method="Naive"):
    """Render the benchmark statistics as a Markdown report.

    Args:
        stats: Mapping holding the ``*_mean`` / ``*_std`` entries for memory
            before, memory after, memory increase, runtime and MSE.
        method: Name of the method, used in the report title.

    Returns:
        The report as a single string: a title, a section heading and a
        three-column table (metric, mean, standard deviation).
    """
    # (row label, key of the mean, key of the std, number format)
    table_rows = (
        ("Memory Before (MB)", 'memory_before_mean (MB)', 'memory_before_std', ".2f"),
        ("Memory After (MB)", 'memory_after_mean (MB)', 'memory_after_std', ".2f"),
        ("Memory Increase (MB)", 'memory_increase_mean (MB)', 'memory_increase_std', ".2f"),
        ("Runtime (seconds)", 'runtime_mean (Seconds)', 'runtime_std', ".6f"),
        ("MSE", 'MSE_mean', 'MSE_std', ".2f"),
    )

    lines = [
        f"# {method} Method Statistical Report",
        "",
        "## Summary Statistics",
        "",
        "| Metric | Mean | Standard Deviation |",
        "|--------|------|--------------------|",
    ]
    for label, mean_key, std_key, spec in table_rows:
        lines.append(
            f"| {label} | {stats[mean_key]:{spec}} | {stats[std_key]:{spec}} |"
        )

    # Every line, including the last one, is newline-terminated.
    return "\n".join(lines) + "\n"

In [None]:

if __name__ == "__main__":
    # Run a garbage collection before any measurement is taken.
    gc.collect()

    n_runs = 500
    file_path = "data/train.csv"
    selected_features, mse, stats = load_and_evaluate(file_path, k=20, n_runs=n_runs)

    # Persist the Markdown report next to the notebook.
    report = generate_report(stats, method="Naive")
    with open("naive_stats_report.md", "w") as report_file:
        report_file.write(report)
    print("Naive method report generated: naive_stats_report.md")

    # Echo the raw statistics, then the features chosen in the final run.
    print("\nSummary Statistics:")
    for key, value in stats.items():
        print(f"{key}: {value:.6f}")
    print(f"Selected features: {selected_features}")

Running naive method 500 times...
Naive method report generated: naive_stats_report.md

Summary Statistics:
memory_before_mean (MB): 181.911375
memory_before_std: 3.180536
memory_after_mean (MB): 181.927500
memory_after_std: 3.245049
memory_increase_mean (MB): 0.016125
memory_increase_std: 0.531319
runtime_mean (Seconds): 0.021531
runtime_std: 0.004977
MSE_mean: 1495907264.707669
MSE_std: 668916768.381671
Selected features: ['LotArea', 'HalfBath', '2ndFlrSF', 'WoodDeckSF', 'LotFrontage', 'OpenPorchSF', 'BsmtFinSF1', 'Fireplaces', 'MasVnrArea', 'GarageYrBlt', 'YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF', 'TotalBsmtSF', 'GarageArea', 'GarageCars', 'GrLivArea', 'OverallQual']


In [None]:
# List every column name of the training file, one per line.
# NOTE(review): the evaluation cell reads "data/train.csv" while this cell
# reads "train.csv"; confirm which location is intended.
file_path = "train.csv"
test = pd.read_csv(file_path)
print(*test.columns, sep="\n")

Id
MSSubClass
MSZoning
LotFrontage
LotArea
Street
Alley
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
OverallQual
OverallCond
YearBuilt
YearRemodAdd
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
Heating
HeatingQC
CentralAir
Electrical
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
KitchenQual
TotRmsAbvGrd
Functional
Fireplaces
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PavedDrive
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
PoolQC
Fence
MiscFeature
MiscVal
MoSold
YrSold
SaleType
SaleCondition
SalePrice


## References

1. Gareth James; Daniela Witten; Trevor Hastie; Robert Tibshirani (2013). "An Introduction to Statistical Learning". *Springer*. p. 204.
2. Kratsios, Anastasis; Hyndman, Cody (2021). "NEU: A Meta-Algorithm for Universal UAP-Invariant Feature Representation". *Journal of Machine Learning Research*. 22 (92): 1–51