In [3]:
# %% Imports
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LassoLars, SGDRegressor
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor,
    BaggingRegressor, VotingRegressor, StackingRegressor
)
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline

def wrap_model(model):
    """Return a Pipeline that mean-imputes missing values, then applies ``model``.

    The pipeline has two named steps: ``'imputer'`` (a ``SimpleImputer`` with
    ``strategy='mean'``) followed by ``'model'`` (the estimator passed in).
    """
    mean_imputer = SimpleImputer(strategy='mean')
    steps = [
        ('imputer', mean_imputer),
        ('model', model),
    ]
    return Pipeline(steps)
# %% Config
# Single seed shared by NumPy, the train/test split and every seeded estimator.
SEED = 42
np.random.seed(SEED)

# Output layout: <BASE_RESULTS>/{metrics,predictions,plots}
BASE_RESULTS = "results_feature_subset"
os.makedirs(BASE_RESULTS, exist_ok=True)
for folder in ("metrics", "predictions", "plots"):
    subdir = os.path.join(BASE_RESULTS, folder)
    os.makedirs(subdir, exist_ok=True)

# Fixed feature order (most → least important); the training loop uses
# growing prefixes of this list as its feature subsets.
FEATURE_ORDER = [
    "DieWidth",
    "NumInstances",
    "NumValidPins",
    "DieHeight",
    "NumNets",
    "NumMovableInstances",
    "NetSkewness",
]

# %% Load Data
print("📂 Loading datasets...")
input_files = [
    "ispd18_global_features.csv",
    "ispd19_global_features.csv",
    "N28_global_features.csv",
]

# Read each CSV, then stack them into one frame with a fresh 0..n-1 index.
df_list = list(map(pd.read_csv, input_files))
df = pd.concat(df_list, ignore_index=True)
print(f"✅ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

# Regression target column; the feature columns are selected at split time.
TARGET = "TotalWirelength"
y = df[TARGET]
print(f"🎯 Target variable: {TARGET}")

# %% Train-test split
print("✂ Splitting dataset into train and test sets...")
# One 80/20 split over the full ordered feature set; every feature-subset
# experiment later slices columns out of these same two frames.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    df[FEATURE_ORDER],
    y,
    random_state=SEED,
    test_size=0.2,
)
print(f"✅ Train samples: {X_train_full.shape[0]}, Test samples: {X_test_full.shape[0]}")

# %% Define models
print("⚙ Initializing models...")

# Bare estimators keyed by display name; insertion order is the training order.
base_estimators = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(random_state=SEED),
    "Lasso": Lasso(random_state=SEED),
    "ElasticNet": ElasticNet(random_state=SEED),
    "LassoLars": LassoLars(),
    "SGDRegressor": SGDRegressor(random_state=SEED),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=SEED, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(random_state=SEED),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=200, random_state=SEED, n_jobs=-1),
    "AdaBoost": AdaBoostRegressor(random_state=SEED),
    "Bagging": BaggingRegressor(random_state=SEED, n_jobs=-1),
    "DecisionTree": DecisionTreeRegressor(random_state=SEED),
    "ExtraTree": ExtraTreeRegressor(random_state=SEED),
    "KNeighbors": KNeighborsRegressor(),
    "XGBoost": XGBRegressor(n_estimators=300, learning_rate=0.05, random_state=SEED, n_jobs=-1),
    "LightGBM": LGBMRegressor(n_estimators=300, learning_rate=0.05, random_state=SEED, n_jobs=-1),
}

# Every estimator goes through wrap_model so all of them get the same imputer.
models = {label: wrap_model(estimator) for label, estimator in base_estimators.items()}

# Weighted Voting over the wrapped tree/boosting models (equal weights).
weights = {"ExtraTrees": 0.25, "RandomForest": 0.25, "XGBoost": 0.25, "LightGBM": 0.25}
voting_models = [(name, models[name]) for name in weights]
models["WeightedVoting"] = VotingRegressor(
    voting_models,
    weights=list(weights.values()),
    n_jobs=-1,
)

# Stacking over the same four wrapped models, blended by a linear regression.
stacking_members = ["RandomForest", "ExtraTrees", "XGBoost", "LightGBM"]
stacking_estimators = [(name, models[name]) for name in stacking_members]
models["Stacking"] = StackingRegressor(
    estimators=stacking_estimators,
    final_estimator=LinearRegression(),
    n_jobs=-1,
)

print(f"✅ Total models: {len(models)}")

# %% Function: Evaluation
def evaluate_and_save(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, score it on the test split, and write its artifacts.

    Side effects (both paths are under ``BASE_RESULTS``):
      * ``predictions/{model_name}_predictions.csv`` with columns
        ``TrueValue`` and ``PredictedValue``;
      * ``plots/{model_name}_true_vs_pred.png``, a true-vs-predicted scatter
        with a dashed identity line.

    Args:
        model_name: Label used in the output file names, the plot title and
            the ``"Model"`` field of the returned record.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target. ``y_test`` may be any
            1-D array-like (Series, list, ndarray).

    Returns:
        dict with keys ``Model``, ``MSE``, ``RMSE``, ``R2``, ``TrainTime(s)``,
        ``InferTimePerSample(s)``, ``TrainSamples`` and ``TestSamples``.

    Raises:
        ValueError: if ``y_test`` is empty (per-sample time would divide by 0).
    """
    # Normalise once so the metrics, CSV and plot all use the same 1-D array.
    y_true = np.asarray(y_test).ravel()
    n_test = y_true.size
    if n_test == 0:
        raise ValueError("y_test is empty; cannot evaluate the model")

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    fit_start = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    y_pred = np.asarray(model.predict(X_test)).ravel()
    infer_time = (time.perf_counter() - predict_start) / n_test  # per sample

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    pred_df = pd.DataFrame({"TrueValue": y_true, "PredictedValue": y_pred})
    pred_path = os.path.join(BASE_RESULTS, "predictions", f"{model_name}_predictions.csv")
    pred_df.to_csv(pred_path, index=False)

    plot_path = os.path.join(BASE_RESULTS, "plots", f"{model_name}_true_vs_pred.png")
    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        ax.scatter(y_true, y_pred, alpha=0.6, edgecolor='k', s=40)
        # Identity line: points on it are perfect predictions.
        lo, hi = y_true.min(), y_true.max()
        ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
        ax.set_xlabel("True TotalWirelength")
        ax.set_ylabel("Predicted TotalWirelength")
        ax.set_title(f"{model_name}: True vs Predicted")
        fig.tight_layout()
        fig.savefig(plot_path)
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)

    return {
        "Model": model_name,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "TrainTime(s)": train_time,
        "InferTimePerSample(s)": infer_time,
        "TrainSamples": len(y_train),
        "TestSamples": n_test,
    }

# %% Loop over feature subsets
# For k = 1..len(FEATURE_ORDER), train every model on the first k features and
# record its metrics. Predictions and plots are tagged with k so that each
# feature subset keeps its own files instead of overwriting the previous one.
print("🚀 Starting training for feature subsets...")
comparison_records = []

for k in range(1, len(FEATURE_ORDER) + 1):
    top_features = FEATURE_ORDER[:k]
    print(f"\n=== Training with top {k} features: {top_features} ===")

    X_train = X_train_full[top_features]
    X_test = X_test_full[top_features]

    metrics_list = []
    for name, model in models.items():
        print(f"   ▶ {name}...")
        # The label passed here becomes the prediction/plot file-name prefix.
        run_label = f"{name}_{k}_features"
        metrics = evaluate_and_save(run_label, model, X_train, X_test, y_train, y_test)
        # Tables keep the plain model name so rows line up across feature counts.
        metrics["Model"] = name
        metrics_list.append(metrics)

        # Long-format record with stable keys; pivoted into R2@k / MSE@k below.
        comparison_records.append({
            "NumFeatures": k,
            "Model": name,
            "R2": metrics["R2"],
            "MSE": metrics["MSE"],
        })

    # Save metrics for this feature count
    metrics_df = pd.DataFrame(metrics_list)
    metrics_df.to_csv(os.path.join(BASE_RESULTS, "metrics", f"metrics_{k}_features.csv"), index=False)
    print(f"📄 Saved metrics for {k} features")

# %% Save R²/MSE comparison table
print("\n📊 Saving R²/MSE comparison table...")
comparison_df = pd.DataFrame(comparison_records).pivot(
    index="Model", columns="NumFeatures", values=["R2", "MSE"]
)
# Flatten the (metric, k) column MultiIndex into single "R2@k" / "MSE@k" headers.
comparison_df.columns = [f"{metric}@{n_feat}" for metric, n_feat in comparison_df.columns]
comparison_df.to_csv(os.path.join(BASE_RESULTS, "metrics", "r2_mse_comparison.csv"))
print("✅ All done!")


📂 Loading datasets...
✅ Dataset loaded: 8469 rows, 18 columns
🎯 Target variable: TotalWirelength
✂ Splitting dataset into train and test sets...
✅ Train samples: 6775, Test samples: 1694
⚙ Initializing models...
✅ Total models: 18
🚀 Starting training for feature subsets...

=== Training with top 1 features: ['DieWidth'] ===
   ▶ LinearRegression...
   ▶ Ridge...
   ▶ Lasso...


  model = cd_fast.enet_coordinate_descent(


   ▶ ElasticNet...
   ▶ LassoLars...
   ▶ SGDRegressor...
   ▶ RandomForest...
   ▶ GradientBoosting...
   ▶ ExtraTrees...
   ▶ AdaBoost...
   ▶ Bagging...
   ▶ DecisionTree...
   ▶ ExtraTree...
   ▶ KNeighbors...
   ▶ XGBoost...
   ▶ LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 6775, number of used features: 1
[LightGBM] [Info] Start training from score 102882775257.610031




   ▶ WeightedVoting...




   ▶ Stacking...




📄 Saved metrics for 1 features

=== Training with top 2 features: ['DieWidth', 'NumInstances'] ===
   ▶ LinearRegression...
   ▶ Ridge...
   ▶ Lasso...


  model = cd_fast.enet_coordinate_descent(


   ▶ ElasticNet...


  model = cd_fast.enet_coordinate_descent(


   ▶ LassoLars...
   ▶ SGDRegressor...
   ▶ RandomForest...
   ▶ GradientBoosting...
   ▶ ExtraTrees...
   ▶ AdaBoost...
   ▶ Bagging...
   ▶ DecisionTree...
   ▶ ExtraTree...
   ▶ KNeighbors...
   ▶ XGBoost...
   ▶ LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 487
[LightGBM] [Info] Number of data points in the train set: 6775, number of used features: 2
[LightGBM] [Info] Start training from score 102882775257.610031




   ▶ WeightedVoting...




   ▶ Stacking...




📄 Saved metrics for 2 features

=== Training with top 3 features: ['DieWidth', 'NumInstances', 'NumValidPins'] ===
   ▶ LinearRegression...
   ▶ Ridge...
   ▶ Lasso...


  model = cd_fast.enet_coordinate_descent(


   ▶ ElasticNet...


  model = cd_fast.enet_coordinate_descent(


   ▶ LassoLars...
   ▶ SGDRegressor...
   ▶ RandomForest...
   ▶ GradientBoosting...
   ▶ ExtraTrees...
   ▶ AdaBoost...
   ▶ Bagging...
   ▶ DecisionTree...
   ▶ ExtraTree...
   ▶ KNeighbors...
   ▶ XGBoost...
   ▶ LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 742
[LightGBM] [Info] Number of data points in the train set: 6775, number of used features: 3
[LightGBM] [Info] Start training from score 102882775257.610031




   ▶ WeightedVoting...




   ▶ Stacking...




📄 Saved metrics for 3 features

=== Training with top 4 features: ['DieWidth', 'NumInstances', 'NumValidPins', 'DieHeight'] ===
   ▶ LinearRegression...
   ▶ Ridge...
   ▶ Lasso...


  model = cd_fast.enet_coordinate_descent(


   ▶ ElasticNet...


  model = cd_fast.enet_coordinate_descent(


   ▶ LassoLars...
   ▶ SGDRegressor...
   ▶ RandomForest...
   ▶ GradientBoosting...
   ▶ ExtraTrees...
   ▶ AdaBoost...
   ▶ Bagging...
   ▶ DecisionTree...
   ▶ ExtraTree...
   ▶ KNeighbors...
   ▶ XGBoost...
   ▶ LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 859
[LightGBM] [Info] Number of data points in the train set: 6775, number of used features: 4
[LightGBM] [Info] Start training from score 102882775257.610031




   ▶ WeightedVoting...




   ▶ Stacking...




📄 Saved metrics for 4 features

=== Training with top 5 features: ['DieWidth', 'NumInstances', 'NumValidPins', 'DieHeight', 'NumNets'] ===
   ▶ LinearRegression...
   ▶ Ridge...
   ▶ Lasso...


  model = cd_fast.enet_coordinate_descent(


   ▶ ElasticNet...


  model = cd_fast.enet_coordinate_descent(


   ▶ LassoLars...
   ▶ SGDRegressor...
   ▶ RandomForest...
   ▶ GradientBoosting...
   ▶ ExtraTrees...
   ▶ AdaBoost...
   ▶ Bagging...
   ▶ DecisionTree...
   ▶ ExtraTree...
   ▶ KNeighbors...
   ▶ XGBoost...
   ▶ LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1114
[LightGBM] [Info] Number of data points in the train set: 6775, number of used features: 5
[LightGBM] [Info] Start training from score 102882775257.610031




   ▶ WeightedVoting...




   ▶ Stacking...




📄 Saved metrics for 5 features

=== Training with top 6 features: ['DieWidth', 'NumInstances', 'NumValidPins', 'DieHeight', 'NumNets', 'NumMovableInstances'] ===
   ▶ LinearRegression...
   ▶ Ridge...
   ▶ Lasso...


  model = cd_fast.enet_coordinate_descent(


   ▶ ElasticNet...


  model = cd_fast.enet_coordinate_descent(


   ▶ LassoLars...
   ▶ SGDRegressor...
   ▶ RandomForest...
   ▶ GradientBoosting...
   ▶ ExtraTrees...
   ▶ AdaBoost...
   ▶ Bagging...
   ▶ DecisionTree...
   ▶ ExtraTree...
   ▶ KNeighbors...
   ▶ XGBoost...
   ▶ LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001828 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1369
[LightGBM] [Info] Number of data points in the train set: 6775, number of used features: 6
[LightGBM] [Info] Start training from score 102882775257.610031




   ▶ WeightedVoting...




   ▶ Stacking...




📄 Saved metrics for 6 features

=== Training with top 7 features: ['DieWidth', 'NumInstances', 'NumValidPins', 'DieHeight', 'NumNets', 'NumMovableInstances', 'NetSkewness'] ===
   ▶ LinearRegression...
   ▶ Ridge...
   ▶ Lasso...


  model = cd_fast.enet_coordinate_descent(


   ▶ ElasticNet...


  model = cd_fast.enet_coordinate_descent(


   ▶ LassoLars...
   ▶ SGDRegressor...
   ▶ RandomForest...
   ▶ GradientBoosting...
   ▶ ExtraTrees...
   ▶ AdaBoost...
   ▶ Bagging...
   ▶ DecisionTree...
   ▶ ExtraTree...
   ▶ KNeighbors...
   ▶ XGBoost...
   ▶ LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003945 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1624
[LightGBM] [Info] Number of data points in the train set: 6775, number of used features: 7
[LightGBM] [Info] Start training from score 102882775257.610031




   ▶ WeightedVoting...




   ▶ Stacking...




📄 Saved metrics for 7 features

📊 Saving R²/MSE comparison table...
✅ All done!
