In [None]:
"""
03_baseline_benchmark.ipynb

This notebook establishes baseline regression performance
using DummyRegressor. This helps contextualize the
performance of more complex models in future notebooks.

Key Steps:
- Load cleaned data
- Define input features and targets
- Benchmark with DummyRegressor for DFI and TUNEL
- Evaluate using cross-validation with appropriate metrics
"""

'\n03_baseline_benchmark.ipynb\n\nThis notebook establishes baseline regression performance\nusing DummyRegressor. This helps contextualize the\nperformance of more complex models in future notebooks.\n\nKey Steps:\n- Load cleaned data\n- Define input features and targets\n- Benchmark with DummyRegressor for DFI and TUNEL\n- Evaluate using cross-validation with appropriate metrics\n'

In [None]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import (
    make_scorer, mean_squared_error, mean_absolute_error
    )

In [None]:
# ==========================================
# 1. Load Data and Define Feature Sets
# ==========================================

In [None]:
# Load the cleaned training frame and the cleaned hold-out frame, in that order.
# NOTE(review): unpickling can execute arbitrary code — only load pickles
# produced by this project's own pipeline.
data, test_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('03_cleaned_data.pkl', '03_cleaned_test_data.pkl')
)

In [None]:
# Columns used as model inputs (one per line so additions/removals diff cleanly).
input_features = [
    'Semen Volume',
    'Sperm Concentration',
    'Sperm Count',
    'Total Motility',
    'Progressive Motility',
    'Non-progressive Motility',
    'Immotile Sperm',
    'Abnormal Morphology',
    'Age',
    'TMS',
]

# Columns benchmarked as regression targets.
reg_targets = [
    'DFI',
    'TUNEL',
]

In [None]:
# Split each frame into its predictor columns and its regression-target columns.
X, y_reg = data.loc[:, input_features], data.loc[:, reg_targets]
X_test, y_reg_test = test_data.loc[:, input_features], test_data.loc[:, reg_targets]

In [None]:
# Report the dimensions of the training and test splits; the empty string
# produces the blank separator line between the two groups.
print(
    f"Features (X): {X.shape}",
    f"Regression Targets (y_reg): {y_reg.shape}",
    "",
    f"Features (X_test): {X_test.shape}",
    f"Regression Targets (y_reg_test): {y_reg_test.shape}",
    sep="\n",
)

Features (X): (8716, 10)
Regression Targets (y_reg): (8716, 2)

Features (X_test): (1000, 10)
Regression Targets (y_reg_test): (1000, 2)


In [None]:
# ==========================================
# 2. Baseline Regression with DummyRegressor
# ==========================================

In [None]:
# Define scorers.
# With greater_is_better=False, make_scorer negates the metric so that
# "higher is better" holds for model selection; the sign is flipped back
# when the results are reported below.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# 10-fold cross-validation (shuffled with a fixed seed so both scoring runs
# below see exactly the same folds).
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Iterate through regression targets
for target in reg_targets:
    y = y_reg[target]
    y_test = y_reg_test[target]
    model = DummyRegressor(strategy="mean")

    # Cross-validated baseline error, estimated on the training data only.
    mse = cross_val_score(model, X, y, cv=cv, scoring=mse_scorer)
    mae = cross_val_score(model, X, y, cv=cv, scoring=mae_scorer)

    # Hold-out evaluation: fit the baseline on the full training set and score
    # it once on the untouched test set. (Cross-validating *within* the test
    # set would re-fit the model on test folds and never measure how the
    # training-fitted baseline generalises.)
    # cross_val_score works on clones, so `model` is still unfitted here.
    model.fit(X, y)
    y_test_pred = model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print(f"\n📉 Regression Target: {target}")
    print(f"MAE: {np.mean(-mae):.4f}")
    print(f"MSE: {np.mean(-mse):.4f}")
    # RMSE is reported as the square root of the fold-averaged MSE.
    print(f"RMSE: {np.sqrt(np.mean(-mse)):.4f}")
    print()
    print(f"Test MAE: {test_mae:.4f}")
    print(f"Test MSE: {test_mse:.4f}")
    print(f"Test RMSE: {np.sqrt(test_mse):.4f}")


📉 Regression Target: DFI
MAE: 5.6826
MSE: 56.4389
RMSE: 7.5126

Test MAE: 5.7689
Test MSE: 58.7654
Test RMSE: 7.6659

📉 Regression Target: TUNEL
MAE: 4.1684
MSE: 37.4276
RMSE: 6.1178

Test MAE: 4.1373
Test MSE: 36.2495
Test RMSE: 6.0208
