# In [None]:
# %% Imports
import pandas as pd
import numpy as np
import os
import time
import joblib
import psutil
import gc
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone

# %% Config
# One seed drives NumPy's global RNG and every seeded estimator/split below.
SEED = 42
np.random.seed(SEED)

# Regression target column.
TARGET = "TotalWirelength"

# Top 5 features identified; these are the only predictors used.
TOP_FEATURES = ["DieWidth", "NumInstances", "NumValidPins", "DieHeight", "NumNets"]

# Output directory for pickled models and CSV reports; created if missing.
BASE_RESULTS = "results_top5_ensemble"
os.makedirs(BASE_RESULTS, exist_ok=True)

# %% Load and prepare data
def load_data(input_files=None):
    """Load the feature CSVs and keep only the target and selected features.

    Args:
        input_files: Optional iterable of CSV paths to read. When omitted,
            the three default feature files (ispd18, ispd19, N28) are read
            relative to the current working directory.

    Returns:
        A DataFrame with the rows of all files stacked in the given order
        (index reset), restricted to the ``TARGET`` column followed by the
        ``TOP_FEATURES`` columns.

    Raises:
        FileNotFoundError: If any of the CSV files does not exist.
        ValueError: If ``input_files`` is empty (nothing to concatenate).
        KeyError: If a required column is absent from the combined data.
    """
    if input_files is None:
        input_files = (
            "ispd18_global_features.csv",
            "ispd19_global_features.csv",
            "N28_global_features.csv",
        )
    frames = [pd.read_csv(path) for path in input_files]
    combined = pd.concat(frames, ignore_index=True)
    return combined[[TARGET, *TOP_FEATURES]]

df = load_data()

# %% Initialize models
# Base learners. The tree-based models are seeded with SEED and use all cores.
knn = KNeighborsRegressor(n_neighbors=5)
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=SEED,
    n_jobs=-1,
)
xgb = XGBRegressor(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=5,
    random_state=SEED,
    n_jobs=-1,
)

# Voting ensemble built from the three base learners above.
ensemble = VotingRegressor(
    estimators=[("RF", rf), ("XGB", xgb), ("KNN", knn)],
    n_jobs=-1,
)

# %% Evaluation function
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data and time its predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix; must support ``len()``.
        y_test: True target values aligned with ``X_test``.

    Returns:
        dict with keys ``"R2"``, ``"MSE"`` and ``"InferTime_ms_per_sample"``
        (average prediction latency per test row, in milliseconds).
    """
    # perf_counter is monotonic and high-resolution, so a short predict() call
    # is not distorted by clock granularity or system clock adjustments.
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    elapsed_sec = time.perf_counter() - start_time
    infer_time = elapsed_sec / len(X_test) * 1000  # ms per sample

    return {
        "R2": r2_score(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "InferTime_ms_per_sample": infer_time,
    }

# %% Memory measurement
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB."""
    bytes_per_mb = 1024 * 1024
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb

# %% Main evaluation
results = []
X = df[TOP_FEATURES]
y = df[TARGET]

# Split data (80/20 hold-out, seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Impute missing values. The imputer is fitted on the training split only, so
# test-set statistics never influence the values used for training.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Train and evaluate each model
candidates = [
    ("RandomForest", rf),
    ("XGBoost", xgb),
    ("KNN", knn),
    ("Ensemble", ensemble),
]
for model_name, model in candidates:
    print(f"\n=== Evaluating {model_name} ===")

    # Train model. perf_counter is a monotonic, high-resolution clock and is
    # therefore preferred over time.time() for measuring durations.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_train

    # Evaluate accuracy and per-sample inference latency on the hold-out set
    metrics = evaluate_model(model, X_test, y_test)

    # Get model size: serialize the fitted model and read the file size back
    model_path = os.path.join(BASE_RESULTS, f"{model_name}.pkl")
    joblib.dump(model, model_path)
    size_mb = os.path.getsize(model_path) / (1024 * 1024)

    # Get RAM usage. NOTE: this is the RSS of the whole process at this point
    # (data and previously trained models included), not the footprint of
    # this model alone.
    ram_mb = get_memory_usage()

    # Key order defines the column order of the metrics CSV written later.
    results.append({
        "Model": model_name,
        "R2": metrics["R2"],
        "MSE": metrics["MSE"],
        "Size_MB": size_mb,
        "InferTime_ms_per_sample": metrics["InferTime_ms_per_sample"],
        "RAM_MB": ram_mb,
        "TrainTime_sec": train_time
    })

    print(f"R²: {metrics['R2']:.4f}")
    print(f"MSE: {metrics['MSE']:.2f}")
    print(f"Size: {size_mb:.2f} MB")
    print(f"Inference Time: {metrics['InferTime_ms_per_sample']:.4f} ms/sample")
    print(f"RAM Usage: {ram_mb:.2f} MB")
    print(f"Training Time: {train_time:.2f} sec")

# %% Save results
# Persist one row per evaluated model, then echo the table to stdout.
results_csv = os.path.join(BASE_RESULTS, "model_metrics.csv")
results_df = pd.DataFrame(results)
results_df.to_csv(results_csv, index=False)

print("\n=== Final Results ===", results_df, sep="\n")
print(f"\nResults saved to: {results_csv}")

# %% Feature Importance Analysis (for ensemble components)
print("\n=== Feature Importance ===")
importance_data = []

# rf and xgb here are the standalone instances fitted in the evaluation loop.
for model_name, model in [("RF", rf), ("XGB", xgb)]:
    # A single guarded lookup replaces the former if/else whose two branches
    # were identical (the else path could only raise AttributeError).
    importances = getattr(model, "feature_importances_", None)
    if importances is None:
        print(f"{model_name}: feature_importances_ not available, skipping")
        continue

    # Importances are paired with TOP_FEATURES positionally, which matches the
    # column order the models were trained on.
    importance_data.extend(
        {"Model": model_name, "Feature": feature, "Importance": importance}
        for feature, importance in zip(TOP_FEATURES, importances)
    )

# Explicit columns keep the CSV header stable even if every model was skipped.
importance_df = pd.DataFrame(
    importance_data, columns=["Model", "Feature", "Importance"]
)
importance_csv = os.path.join(BASE_RESULTS, "feature_importance.csv")
importance_df.to_csv(importance_csv, index=False)
print(importance_df)
print(f"\nFeature importance saved to: {importance_csv}")