# Regression Model Training Pipeline
This notebook is converted from a Python training script. It includes steps to load the preprocessed data, train multiple regression models, evaluate them, and save the results for further analysis.

In [None]:
# Import libraries and define helper functions
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# === Paths ===
# Both locations are absolute and machine-specific; edit them before running
# this pipeline on another machine.
# INPUT_PATH: CSV file passed to load_data in the entry-point guard.
INPUT_PATH = '/Users/hj/MLAdditive/data/preprocessed.csv'
# BASE_RESULTS_DIR: root output folder; evaluate_model creates one
# sub-folder per model name underneath it.
BASE_RESULTS_DIR = '/Users/hj/MLAdditive/results'



## Function: `load_data`
Loads preprocessed dataset from the CSV file.

In [None]:
def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame



## Function: `evaluate_model`
Fits the model on the training split, evaluates it on the test split (MAE, MSE, RMSE, R²), and saves the fitted model, the metrics, and two diagnostic plots.

In [None]:
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, score it on the test split, and save all artifacts.

    Everything is written to a per-model folder under ``BASE_RESULTS_DIR``
    (the model name lower-cased, spaces replaced by underscores):
    the fitted model (joblib pickle), ``evaluation_metrics.csv``,
    ``actual_vs_predicted.png`` and ``residuals_distribution.png``.

    Args:
        name: Display name of the model; also used for the output paths.
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target. ``y_test`` must offer
            ``min()`` / ``max()`` (presumably a pandas Series -- confirm
            against the caller).

    Returns:
        None. Results are printed and written to disk.
    """
    print(f"\n🧠 Training {name}...")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Hold-out metrics, kept in insertion order for the report table.
    mse = mean_squared_error(y_test, predictions)
    scores = {
        'MAE': mean_absolute_error(y_test, predictions),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'R2': r2_score(y_test, predictions),
    }

    # Per-model output folder.
    folder_name = name.replace(" ", "_").lower()
    out_dir = os.path.join(BASE_RESULTS_DIR, folder_name)
    os.makedirs(out_dir, exist_ok=True)

    # Persist the fitted estimator.
    model_path = os.path.join(out_dir, f"{name}_model.pkl")
    joblib.dump(model, model_path)

    # Persist and print the metrics table.
    results_df = pd.DataFrame({
        'Metric': list(scores.keys()),
        'Value': list(scores.values()),
    })
    metrics_path = os.path.join(out_dir, 'evaluation_metrics.csv')
    results_df.to_csv(metrics_path, index=False)

    print(f"📉 {name} Results:")
    print(results_df.to_string(index=False))

    # Scatter of actual vs predicted, with the identity line as reference.
    lowest, highest = y_test.min(), y_test.max()
    scatter_fig, scatter_ax = plt.subplots()
    scatter_ax.scatter(y_test, predictions, alpha=0.6)
    scatter_ax.plot([lowest, highest], [lowest, highest], 'r--')
    scatter_ax.set_xlabel('Actual Lifetime_years')
    scatter_ax.set_ylabel('Predicted Lifetime_years')
    scatter_ax.set_title(f'{name}: Actual vs Predicted')
    scatter_ax.grid(True)
    scatter_fig.tight_layout()
    scatter_fig.savefig(os.path.join(out_dir, 'actual_vs_predicted.png'))

    # Histogram of the residuals (actual minus predicted).
    residuals = y_test - predictions
    hist_fig, hist_ax = plt.subplots()
    hist_ax.hist(residuals, bins=20, edgecolor='black')
    hist_ax.set_xlabel('Residuals')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'{name}: Residuals Distribution')
    hist_ax.grid(True)
    hist_fig.tight_layout()
    hist_fig.savefig(os.path.join(out_dir, 'residuals_distribution.png'))



## Function: `train_all_models`
Splits the data into training and test sets (80/20, fixed random seed), then trains and evaluates the Ridge Regression, Random Forest, and Gradient Boosting models.

In [None]:
def train_all_models(df):
    """Train and evaluate every configured regressor on ``df``.

    The ``Lifetime_years`` column is the target; all other columns are used
    as features. The data is split 80/20 with a fixed seed, and each model is
    passed to ``evaluate_model`` in the order it is listed below.

    Args:
        df: DataFrame containing the features and a ``Lifetime_years`` column.
    """
    features = df.drop(columns='Lifetime_years')
    target = df['Lifetime_years']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Ridge's alpha is tuned by an inner 5-fold grid search.
    ridge_search = GridSearchCV(
        Ridge(),
        param_grid={'alpha': [0.1, 1.0, 10.0, 100.0]},
        cv=5,
        scoring='neg_mean_squared_error',
    )
    forest = RandomForestRegressor(
        n_estimators=100, max_depth=5, random_state=42
    )
    boosting = GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )

    candidates = [
        ("Ridge Regression", ridge_search),
        ("Random Forest", forest),
        ("Gradient Boosting", boosting),
    ]
    for label, estimator in candidates:
        evaluate_model(label, estimator, X_train, X_test, y_train, y_test)

# Entry point: load the preprocessed CSV and run the whole training pipeline.
# `df` is left bound at module level after the run.
if __name__ == "__main__":
    df = load_data(INPUT_PATH)
    train_all_models(df)
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer

# === Paths ===
# Both locations are absolute and machine-specific; edit them before running
# this pipeline on another machine.
# INPUT_PATH: CSV file passed to load_data in the entry-point guard.
INPUT_PATH = '/Users/hj/MLAdditive/data/preprocessed.csv'
# BASE_RESULTS_DIR: root output folder; evaluate_model creates one
# sub-folder per model name underneath it.
BASE_RESULTS_DIR = '/Users/hj/MLAdditive/results/regression'

# === Custom Scorers for CV ===


## Function: `rmse_scorer`
Computes the root mean squared error (RMSE). It is wrapped with `make_scorer` below so it can be used as a cross-validation metric.

In [None]:
def rmse_scorer(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# Scorer objects handed to cross-validation in evaluate_model.
# greater_is_better=False marks these as loss metrics, so scikit-learn
# reports them negated; evaluate_model flips the sign back (note the leading
# minus there) to report positive error values.
rmse_score = make_scorer(rmse_scorer, greater_is_better=False)
mae_score = make_scorer(mean_absolute_error, greater_is_better=False)



## Function: `load_data`
Loads preprocessed dataset from the CSV file.

In [None]:
def load_data(path):
    """Load the preprocessed dataset stored as CSV at ``path``."""
    dataset = pd.read_csv(path)
    return dataset



## Function: `evaluate_model`
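Fits the model, evaluates it on the test split (MAE, MSE, RMSE, R²), saves the fitted model, the metrics, and two diagnostic plots, and then runs 5-fold cross-validation on the full dataset.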

In [None]:
def evaluate_model(name, model, X_train, X_test, y_train, y_test, X_full, y_full):
    """Fit ``model``, score it on the test split, then cross-validate it.

    Everything is written to a per-model folder under ``BASE_RESULTS_DIR``
    (the model name lower-cased, spaces replaced by underscores):
    the fitted model (joblib pickle), ``evaluation_metrics.csv``,
    ``actual_vs_predicted.png``, ``residuals_distribution.png`` and
    ``crossval_metrics.csv``.

    Args:
        name: Display name of the model; also used for the output paths.
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target. ``y_test`` must offer
            ``min()`` / ``max()`` (presumably a pandas Series -- confirm
            against the caller).
        X_full, y_full: Complete feature matrix and target used for the
            5-fold cross-validation.

    Returns:
        None. Results are printed and written to disk.
    """
    # Imported locally so this function does not depend on edits to the
    # file-level import list.
    from sklearn.model_selection import cross_validate

    print(f"\n🧠 Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Eval metrics on the held-out split
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    out_dir = os.path.join(BASE_RESULTS_DIR, name.replace(" ", "_").lower())
    os.makedirs(out_dir, exist_ok=True)

    # Save model (fitted on the training split only)
    joblib.dump(model, os.path.join(out_dir, f"{name}_model.pkl"))

    # Save test split metrics
    test_metrics = pd.DataFrame({
        'Metric': ['MAE', 'MSE', 'RMSE', 'R2'],
        'Value': [mae, mse, rmse, r2]
    })
    test_metrics.to_csv(os.path.join(out_dir, 'evaluation_metrics.csv'), index=False)

    print(f"📉 {name} Test Results:\n{test_metrics.to_string(index=False)}")

    # Plot: Actual vs Predicted (dashed red line marks perfect predictions)
    plt.figure()
    plt.scatter(y_test, y_pred, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel('Actual Lifetime_years')
    plt.ylabel('Predicted Lifetime_years')
    plt.title(f'{name}: Actual vs Predicted')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, 'actual_vs_predicted.png'))

    # Plot: Residual Distribution (actual minus predicted)
    residuals = y_test - y_pred
    plt.figure()
    plt.hist(residuals, bins=20, edgecolor='black')
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.title(f'{name}: Residuals Distribution')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, 'residuals_distribution.png'))

    # === Cross-validation ===
    print(f"🔁 Running 5-Fold Cross-Validation for {name}...")

    # One multi-metric pass scores all three metrics on the same five fits,
    # instead of refitting the model five times per metric.
    cv_scores = cross_validate(
        model, X_full, y_full, cv=5,
        scoring={'r2': 'r2', 'mae': mae_score, 'rmse': rmse_score},
    )
    cv_r2 = cv_scores['test_r2']
    # mae_score / rmse_score are built with greater_is_better=False, so their
    # values come back negated; flip the sign to report positive errors.
    cv_mae = -cv_scores['test_mae']
    cv_rmse = -cv_scores['test_rmse']

    cv_df = pd.DataFrame({
        'Metric': ['CV_R2', 'CV_MAE', 'CV_RMSE'],
        'Mean': [cv_r2.mean(), cv_mae.mean(), cv_rmse.mean()],
        'Std': [cv_r2.std(), cv_mae.std(), cv_rmse.std()]
    })

    cv_df.to_csv(os.path.join(out_dir, 'crossval_metrics.csv'), index=False)

    print(f"📊 {name} Cross-Validation (5-Fold) Summary:\n{cv_df.to_string(index=False)}")



## Function: `train_all_models`
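Splits the data into training and test sets (80/20, fixed random seed), then trains and evaluates the Ridge Regression, Random Forest, and Gradient Boosting models, passing the full dataset along for cross-validation.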

In [None]:
def train_all_models(df):
    """Train, evaluate and cross-validate every configured regressor.

    The ``Lifetime_years`` column is the target; all other columns are used
    as features. The data is split 80/20 with a fixed seed. Each model is
    passed to ``evaluate_model`` together with the unsplit features and
    target, which that function uses for cross-validation.

    Args:
        df: DataFrame containing the features and a ``Lifetime_years`` column.
    """
    features = df.drop(columns='Lifetime_years')
    target = df['Lifetime_years']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Ridge's alpha is tuned by an inner 5-fold grid search.
    ridge_search = GridSearchCV(
        Ridge(),
        param_grid={'alpha': [0.1, 1.0, 10.0, 100.0]},
        cv=5,
        scoring='neg_mean_squared_error',
    )
    forest = RandomForestRegressor(
        n_estimators=100, max_depth=10, random_state=42
    )
    boosting = GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )

    candidates = [
        ("Ridge Regression", ridge_search),
        ("Random Forest", forest),
        ("Gradient Boosting", boosting),
    ]
    for label, estimator in candidates:
        evaluate_model(
            label, estimator, X_train, X_test, y_train, y_test, features, target
        )

# Entry point: load the preprocessed CSV and run the whole training pipeline.
# `df` is left bound at module level after the run.
if __name__ == "__main__":
    df = load_data(INPUT_PATH)
    train_all_models(df)


## Model Training Execution
This section runs the full pipeline using selected models and saves the outputs.

In [None]:
if __name__ == "__main__":
    # Execution is disabled in this cell; the original calls are kept below
    # for reference. `pass` gives the guard a body so the cell still parses
    # (an `if` containing only comments is an IndentationError).
    #     df = load_data(INPUT_PATH)
    #     train_all_models(df)
    pass
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer

# === Paths ===
# Both locations are absolute and machine-specific; edit them before running
# this pipeline on another machine.
# INPUT_PATH: location of the preprocessed CSV dataset.
INPUT_PATH = '/Users/hj/MLAdditive/data/preprocessed.csv'
# BASE_RESULTS_DIR: root output folder; evaluate_model creates one
# sub-folder per model name underneath it.
BASE_RESULTS_DIR = '/Users/hj/MLAdditive/results/regression'

# === Custom Scorers for CV ===
def rmse_scorer(y_true, y_pred):
    """Compute the root mean squared error of ``y_pred`` against ``y_true``."""
    mean_sq_err = mean_squared_error(y_true, y_pred)
    return np.sqrt(mean_sq_err)

# Scorer objects handed to cross-validation in evaluate_model.
# greater_is_better=False marks these as loss metrics, so scikit-learn
# reports them negated; evaluate_model flips the sign back (note the leading
# minus there) to report positive error values.
rmse_score = make_scorer(rmse_scorer, greater_is_better=False)
mae_score = make_scorer(mean_absolute_error, greater_is_better=False)

def load_data(path):
    """Return the contents of the CSV file at ``path`` as a DataFrame."""
    table = pd.read_csv(path)
    return table

def evaluate_model(name, model, X_train, X_test, y_train, y_test, X_full, y_full):
    """Fit ``model``, score it on the test split, then cross-validate it.

    Everything is written to a per-model folder under ``BASE_RESULTS_DIR``
    (the model name lower-cased, spaces replaced by underscores):
    the fitted model (joblib pickle), ``evaluation_metrics.csv``,
    ``actual_vs_predicted.png``, ``residuals_distribution.png`` and
    ``crossval_metrics.csv``.

    Args:
        name: Display name of the model; also used for the output paths.
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target. ``y_test`` must offer
            ``min()`` / ``max()`` (presumably a pandas Series -- confirm
            against the caller).
        X_full, y_full: Complete feature matrix and target used for the
            5-fold cross-validation.

    Returns:
        None. Results are printed and written to disk.
    """
    # Imported locally so this function does not depend on edits to the
    # file-level import list.
    from sklearn.model_selection import cross_validate

    print(f"\n🧠 Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Eval metrics on the held-out split
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    out_dir = os.path.join(BASE_RESULTS_DIR, name.replace(" ", "_").lower())
    os.makedirs(out_dir, exist_ok=True)

    # Save model (fitted on the training split only)
    joblib.dump(model, os.path.join(out_dir, f"{name}_model.pkl"))

    # Save test split metrics
    test_metrics = pd.DataFrame({
        'Metric': ['MAE', 'MSE', 'RMSE', 'R2'],
        'Value': [mae, mse, rmse, r2]
    })
    test_metrics.to_csv(os.path.join(out_dir, 'evaluation_metrics.csv'), index=False)

    print(f"📉 {name} Test Results:\n{test_metrics.to_string(index=False)}")

    # Plot: Actual vs Predicted (dashed red line marks perfect predictions)
    plt.figure()
    plt.scatter(y_test, y_pred, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel('Actual Lifetime_years')
    plt.ylabel('Predicted Lifetime_years')
    plt.title(f'{name}: Actual vs Predicted')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, 'actual_vs_predicted.png'))

    # Plot: Residual Distribution (actual minus predicted)
    residuals = y_test - y_pred
    plt.figure()
    plt.hist(residuals, bins=20, edgecolor='black')
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.title(f'{name}: Residuals Distribution')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, 'residuals_distribution.png'))

    # === Cross-validation ===
    print(f"🔁 Running 5-Fold Cross-Validation for {name}...")

    # One multi-metric pass scores all three metrics on the same five fits,
    # instead of refitting the model five times per metric.
    cv_scores = cross_validate(
        model, X_full, y_full, cv=5,
        scoring={'r2': 'r2', 'mae': mae_score, 'rmse': rmse_score},
    )
    cv_r2 = cv_scores['test_r2']
    # mae_score / rmse_score are built with greater_is_better=False, so their
    # values come back negated; flip the sign to report positive errors.
    cv_mae = -cv_scores['test_mae']
    cv_rmse = -cv_scores['test_rmse']

    cv_df = pd.DataFrame({
        'Metric': ['CV_R2', 'CV_MAE', 'CV_RMSE'],
        'Mean': [cv_r2.mean(), cv_mae.mean(), cv_rmse.mean()],
        'Std': [cv_r2.std(), cv_mae.std(), cv_rmse.std()]
    })

    cv_df.to_csv(os.path.join(out_dir, 'crossval_metrics.csv'), index=False)

    print(f"📊 {name} Cross-Validation (5-Fold) Summary:\n{cv_df.to_string(index=False)}")

def train_all_models(df):
    """Train, evaluate and cross-validate every configured regressor.

    The ``Lifetime_years`` column is the target; all other columns are used
    as features. The data is split 80/20 with a fixed seed. Each model is
    passed to ``evaluate_model`` together with the unsplit features and
    target, which that function uses for cross-validation.

    Args:
        df: DataFrame containing the features and a ``Lifetime_years`` column.
    """
    features = df.drop(columns='Lifetime_years')
    target = df['Lifetime_years']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Ridge's alpha is tuned by an inner 5-fold grid search.
    ridge_search = GridSearchCV(
        Ridge(),
        param_grid={'alpha': [0.1, 1.0, 10.0, 100.0]},
        cv=5,
        scoring='neg_mean_squared_error',
    )
    forest = RandomForestRegressor(
        n_estimators=100, max_depth=10, random_state=42
    )
    boosting = GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )

    candidates = [
        ("Ridge Regression", ridge_search),
        ("Random Forest", forest),
        ("Gradient Boosting", boosting),
    ]
    for label, estimator in candidates:
        evaluate_model(
            label, estimator, X_train, X_test, y_train, y_test, features, target
        )

