### Find the top *k* features most correlated with SalePrice (optimized method)
(House price data train.csv from https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data)

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import time
import psutil
import os, gc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from codecarbon import OfflineEmissionsTracker

In [2]:
def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    # Two successive divisions by 1024: bytes -> KiB -> MiB.
    return rss_bytes / 1024 / 1024

def optimized_feature_selection(X, y, k=20):
    """Pick the ``k`` numeric columns of ``X`` most correlated with ``y``.

    Missing numeric values are imputed with the column median, the absolute
    correlation of every numeric column with ``y`` is computed in a single
    vectorised ``corrwith`` call, and undefined (NaN) correlations are
    treated as 0 so they sort last.

    Args:
        X: DataFrame of candidate features; non-numeric columns are ignored.
        y: Target Series sharing the index of ``X``.
        k: Maximum number of features to return.

    Returns:
        List of column names ordered from weakest to strongest absolute
        correlation (the best feature is last). Fewer than ``k`` names are
        returned when ``X`` has fewer numeric columns; an empty list is
        returned when ``k <= 0``.
    """
    # Guard: ``[-0:]`` is the full slice, so without this check k=0 would
    # return every numeric column instead of none.
    if k <= 0:
        return []

    # Select the numeric columns once and reuse them for the median fill.
    numeric = X.select_dtypes(include=[np.number])
    X_numeric = numeric.fillna(numeric.median())

    # corrwith computes all column/target correlations in one call;
    # fillna(0) replaces per-column NaN checks.
    correlations = X_numeric.corrwith(y).abs().fillna(0)

    # argsort on the plain ndarray yields positional indices; the last k
    # positions are the k largest correlations (ascending order).
    order = np.argsort(correlations.to_numpy())
    return X_numeric.columns[order[-k:]].tolist()

def load_and_evaluate(file_path, k=20, n_runs=100):
    """Benchmark the optimized feature selection over repeated random splits.

    Reads the CSV at ``file_path``, separates the ``SalePrice`` target from
    the features (dropping ``Id``), imputes missing values, and then repeats
    ``n_runs`` times: split 80/20, select ``k`` features on the training
    part, fit a ``LinearRegression`` and score it on the test part. Memory,
    runtime and MSE are recorded per run; emissions are tracked once around
    the whole loop.

    Args:
        file_path: Path to the training CSV.
        k: Number of features passed to ``optimized_feature_selection``.
        n_runs: Number of repetitions; must be at least 1.

    Returns:
        Tuple ``(selected_features, mse, stats)`` where ``selected_features``
        and ``mse`` come from the last run and ``stats`` is a dict of
        means/standard deviations over all runs plus the emissions total.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    # Without at least one run, the last-run return values would be undefined.
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    # Per-run metrics
    memory_before_list = []
    memory_after_list = []
    memory_increase_list = []
    runtime_list = []
    mse_list = []

    data = pd.read_csv(file_path)
    X = data.drop(columns=['SalePrice', 'Id'])
    y = data['SalePrice']

    # Median-impute numeric columns, then put them in front of the
    # non-numeric ones.
    # NOTE: the medians are computed on the full dataset, before the
    # train/test split, so test rows influence the imputed values.
    numeric = X.select_dtypes(include=[np.number])
    X_numeric = numeric.fillna(numeric.median())
    X_categorical = X.select_dtypes(exclude=[np.number])
    X = pd.concat([X_numeric, X_categorical], axis=1)

    # Any NaNs left (i.e. in non-numeric columns) are replaced with 0.
    if X.isna().any().any():
        X = X.fillna(0)

    print(f"Running optimized method {n_runs} times...")
    # CPU utilisation sampled over 1 second before the benchmark.
    cpu_use_start = psutil.cpu_percent(1)

    tracker = OfflineEmissionsTracker(
        project_name="optimised_feature_selection",
        output_file="optimised_emissions.csv",
        country_iso_code="GBR",
        log_level="warning"
    )
    tracker.start()
    try:
        for run in range(n_runs):
            gc.collect()  # clear memory before each run
            mem_use_before = get_memory_usage()
            # perf_counter is monotonic and high-resolution, which suits
            # millisecond-scale intervals better than wall-clock time.
            start_time = time.perf_counter()

            # A different seed per run gives a different random split.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42 + run
            )

            selected_features = optimized_feature_selection(X_train, y_train, k)

            model = LinearRegression()
            model.fit(X_train[selected_features], y_train)
            y_pred = model.predict(X_test[selected_features])
            mse = mean_squared_error(y_test, y_pred)

            # Stop the clock before probing memory so the probe is not timed.
            end_time = time.perf_counter()
            mem_use_after = get_memory_usage()

            memory_before_list.append(mem_use_before)
            memory_after_list.append(mem_use_after)
            memory_increase_list.append(mem_use_after - mem_use_before)
            runtime_list.append(end_time - start_time)
            mse_list.append(mse)
    finally:
        # Always stop the tracker, even if a run fails, so it does not keep
        # running in the background.
        emissions = tracker.stop()  # Emissions in kg CO2eq, or None

    cpu_use_end = psutil.cpu_percent(1)
    if emissions is None:
        # The tracker produced no measurement; report it as unavailable
        # rather than failing on the arithmetic below.
        scaled_emissions = float("nan")
    else:
        # NOTE(review): the scale factor is the difference between two
        # 1-second CPU samples taken before and after the benchmark; it can
        # be zero or negative, which yields zero or negative emissions.
        # Confirm this is the intended attribution.
        scaled_emissions = emissions * (cpu_use_end - cpu_use_start) / 100

    stats = {
        'memory_before_mean (MB)': np.mean(memory_before_list),
        'memory_before_std': np.std(memory_before_list),
        'memory_after_mean (MB)': np.mean(memory_after_list),
        'memory_after_std': np.std(memory_after_list),
        'memory_increase_mean (MB)': np.mean(memory_increase_list),
        'memory_increase_std': np.std(memory_increase_list),
        'runtime_mean (Seconds)': np.mean(runtime_list),
        'runtime_std': np.std(runtime_list),
        'MSE_mean': np.mean(mse_list),
        'MSE_std': np.std(mse_list),
        'emissions_total (kg CO2eq)': scaled_emissions
    }

    return selected_features, mse, stats

def generate_report(stats, method="Optimized"):
    """Render the benchmark statistics as a Markdown report.

    Args:
        stats: Mapping holding the mean/std entries for memory, runtime and
            MSE plus the ``'emissions_total (kg CO2eq)'`` entry.
        method: Name of the method, used in the report title.

    Returns:
        The report as a single newline-terminated string.
    """
    def table_row(label, mean_key, std_key, spec):
        # One Markdown table row: label, formatted mean, formatted std.
        mean_text = format(stats[mean_key], spec)
        std_text = format(stats[std_key], spec)
        return f"| {label} | {mean_text} | {std_text} |"

    lines = [
        f"# {method} Method Statistical Report",
        "",
        "## Summary Statistics",
        "",
        "| Metric | Mean | Standard Deviation |",
        "|--------|------|--------------------|",
        table_row("Memory Before (MB)", 'memory_before_mean (MB)', 'memory_before_std', ".2f"),
        table_row("Memory After (MB)", 'memory_after_mean (MB)', 'memory_after_std', ".2f"),
        table_row("Memory Increase (MB)", 'memory_increase_mean (MB)', 'memory_increase_std', ".2f"),
        # Runtimes are small, so they get six decimals instead of two.
        table_row("Runtime (seconds)", 'runtime_mean (Seconds)', 'runtime_std', ".6f"),
        table_row("MSE", 'MSE_mean', 'MSE_std', ".2f"),
        "",
        "## Environmental Impact",
        f"- Total Emissions (kg CO2eq): {stats['emissions_total (kg CO2eq)']:.6f}",
    ]
    return "\n".join(lines) + "\n"

In [3]:
if __name__ == "__main__":
    # Collect garbage once before the benchmark starts.
    gc.collect()

    file_path = "data/train.csv"
    n_runs = 500
    selected_features, mse, stats = load_and_evaluate(file_path, k=20, n_runs=n_runs)
    print(f"Selected features (last run): {selected_features}")

    # Persist the Markdown report next to the notebook.
    report = generate_report(stats, method="Optimized")
    with open("optimized_stats_report.md", "w") as report_file:
        report_file.write(report)
    print("Optimized method report generated: optimized_stats_report.md")

    # Echo every statistic with six decimals.
    print("\nSummary Statistics:")
    for metric_name, metric_value in stats.items():
        print(f"{metric_name}: {metric_value:.6f}")

Running naive method 500 times...


[codecarbon INFO @ 17:15:08] offline tracker init
 Mac OS detected: Please install Intel Power Gadget or enable PowerMetrics sudo to measure CPU



Selected features (last run): ['LotArea', 'HalfBath', '2ndFlrSF', 'WoodDeckSF', 'LotFrontage', 'OpenPorchSF', 'BsmtFinSF1', 'Fireplaces', 'MasVnrArea', 'GarageYrBlt', 'YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF', 'TotalBsmtSF', 'GarageArea', 'GarageCars', 'GrLivArea', 'OverallQual']
Optimized method report generated: optimized_stats_report.md

Summary Statistics:
memory_before_mean (MB): 185.325391
memory_before_std: 1.501888
memory_after_mean (MB): 185.371289
memory_after_std: 1.327574
memory_increase_mean (MB): 0.045898
memory_increase_std: 0.267205
runtime_mean (Seconds): 0.015336
runtime_std: 0.004588
MSE_mean: 1495907264.707669
MSE_std: 668916768.381671
emissions_total (kg CO2eq): 0.000008
