In [1]:
# ==========================================
# Prostate Cancer Dataset (High Variance):
# load, split, standardize
# ==========================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --------------------------------
# 1. Load dataset
# --------------------------------

# Tab-separated file. Its unnamed index column is read in as "Unnamed: 0"
# and is discarded immediately.
url = "https://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data"
df = pd.read_csv(url, sep="\t").drop(columns=["Unnamed: 0"])

print("Shape:", df.shape)
print(df.head())


# --------------------------------
# 2. Target and features
# --------------------------------

# Features: everything except the target ("lpsa") and the predefined split
# flag ("train"), which is not used because the split below is drawn afresh.
X = df.drop(columns=["lpsa", "train"])
# Target: log PSA level.
y = df["lpsa"]


# --------------------------------
# 3. Train/test split
# --------------------------------

# 25% of the rows are held out; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


# --------------------------------
# 4. Standardize (VERY IMPORTANT for Ridge/Lasso)
# --------------------------------

# The scaling statistics are learned from the training rows only and then
# applied to both partitions. X_train / X_test are rebound to the scaled values.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("Training shape:", X_train.shape)

Shape: (97, 10)
     lcavol   lweight  age      lbph  svi       lcp  gleason  pgg45      lpsa  \
0 -0.579818  2.769459   50 -1.386294    0 -1.386294        6      0 -0.430783   
1 -0.994252  3.319626   58 -1.386294    0 -1.386294        6      0 -0.162519   
2 -0.510826  2.691243   74 -1.386294    0 -1.386294        7     20 -0.162519   
3 -1.203973  3.282789   58 -1.386294    0 -1.386294        6      0 -0.162519   
4  0.751416  3.432373   62 -1.386294    0 -1.386294        6      0  0.371564   

  train  
0     T  
1     T  
2     T  
3     T  
4     T  
Training shape: (72, 8)


In [2]:
from sklearn.linear_model import LinearRegression

# Unpenalized least-squares fit on the standardized training features.
# `fit` returns the estimator itself, so `ols` is the fitted model.
ols = LinearRegression().fit(X_train, y_train)

print("OLS Coefficients:", ols.coef_, sep="\n")

OLS Coefficients:
[ 0.73220775  0.17692005 -0.19087863  0.14205699  0.3441528  -0.22056052
  0.04767524  0.13155793]


In [None]:
from sklearn.linear_model import Ridge

# L2-penalized linear fit with a fixed penalty strength (alpha=10).
ridge = Ridge(alpha=10).fit(X_train, y_train)

print("Ridge Coefficients:", ridge.coef_, sep="\n")
# With the penalty applied, the weight is spread slightly more evenly across
# the features than in the unpenalized fit.

Ridge Coefficients:
[ 0.58589038  0.18464998 -0.13102284  0.11005215  0.28286256 -0.06806792
  0.05210259  0.09064348]


In [None]:
# ==========================================
# Regression: tune the penalized linear models, then compare on the test split
# ==========================================

# Uses `np`, `pd`, `X_train`, `X_test`, `y_train`, `y_test` from earlier cells.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV, Lasso, LassoCV, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# --------------------------------
# 1. Ridge tuning
# --------------------------------

# 100 log-spaced penalties between 1e-4 and 1e3, ranked by 4-fold
# cross-validated mean absolute error.
alpha_grid = {"alpha": np.logspace(-4, 3, 100)}
ridge_grid = GridSearchCV(
    estimator=Ridge(),
    param_grid=alpha_grid,
    scoring="neg_mean_absolute_error",
    cv=4,
    n_jobs=-1,
)
ridge_grid.fit(X_train, y_train)
best_ridge = ridge_grid.best_estimator_

print("Best Ridge alpha:", ridge_grid.best_params_)


# --------------------------------
# 2. Lasso tuning (faster + stable)
# --------------------------------

lasso_cv = LassoCV(
    alphas=np.logspace(-4, 4, 300),
    max_iter=20000,
    cv=4,
    random_state=42,
    n_jobs=-1,
)
lasso_cv.fit(X_train, y_train)

# Only the selected alpha is carried forward. This estimator is still unfitted
# at this point; it is trained in the comparison loop below.
best_lasso = Lasso(alpha=lasso_cv.alpha_, max_iter=20000)

print("Best Lasso alpha:", lasso_cv.alpha_)


# Elastic Net: L1 and L2 penalties combined. The search covers l1_ratio values
# from 0.01 to 0.2 together with the alpha grid.
elastic = ElasticNetCV(
    l1_ratio=np.linspace(0.01, 0.2, 20),
    alphas=np.logspace(-4, 3, 100),
    max_iter=20000,
    cv=4,
    random_state=42,
)  # may perform the best, but Ridge can do better if its search range is changed
elastic.fit(X_train, y_train)

# --------------------------------
# 3. Model comparison
# --------------------------------

models = {
    "Linear Regression": LinearRegression(),
    "Ridge (Tuned)": best_ridge,
    "Lasso (Tuned)": best_lasso,
    "Elastic": elastic,
    "Decision Tree": DecisionTreeRegressor(max_depth=3, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=3, random_state=42),
}

# Every entry is (re)fitted on the training split and scored on the held-out
# split. For `elastic` this repeats its whole internal cross-validation.
results = []
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    results.append(dict(Model=name, MAE=mae))

results_df = pd.DataFrame(results).sort_values(by="MAE")

print("\nModel Comparison (Sorted by MAE):\n")
print(results_df)

Best Ridge alpha: {'alpha': 3.944206059437656}
Best Lasso alpha: 0.0644710210732387

Model Comparison (Sorted by MAE):

               Model       MAE
3            Elastic  0.476072
1      Ridge (Tuned)  0.480390
0  Linear Regression  0.496802
2      Lasso (Tuned)  0.501044
5      Random Forest  0.632334
4      Decision Tree  0.815808


In [16]:
# Uses `np`, `pd`, `LinearRegression`, `mean_absolute_error`, the train/test
# arrays, and the tuned `best_ridge`, `best_lasso`, `elastic` from earlier cells.
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

# --------------------------------
# Kernel Ridge (RBF)
# --------------------------------

# 100 x 100 log-spaced grid over the penalty (alpha) and the RBF gamma.
# NOTE(review): this search uses cv=5 while the other searches visible in this
# notebook use cv=4 — confirm the difference is intentional.
kr_param_grid = {
    "alpha": np.logspace(-3, 2, 100),
    "gamma": np.logspace(-3, 2, 100),
}
kr_grid = GridSearchCV(
    estimator=KernelRidge(kernel="rbf"),
    param_grid=kr_param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
)
kr_grid.fit(X_train, y_train)
best_kernel_ridge = kr_grid.best_estimator_

print("Best Kernel Ridge params:", kr_grid.best_params_)


# --------------------------------
# Support Vector Regression (RBF)
# --------------------------------

# 50 x 50 log-spaced grid over C and gamma, crossed with three epsilon values.
svr_param_grid = {
    "C": np.logspace(-2, 3, 50),
    "gamma": np.logspace(-3, 2, 50),
    "epsilon": [0.01, 0.05, 0.1],
}
svr_grid = GridSearchCV(
    estimator=SVR(kernel="rbf"),
    param_grid=svr_param_grid,
    scoring="neg_mean_absolute_error",
    cv=4,
    n_jobs=-1,
)
svr_grid.fit(X_train, y_train)
best_svr = svr_grid.best_estimator_

print("Best SVR params:", svr_grid.best_params_)


# --------------------------------
# MLP (small network + regularized)
# --------------------------------

# Single hidden layer; the search covers its width, the L2 penalty (alpha)
# and the initial learning rate.
mlp_param_grid = {
    "hidden_layer_sizes": [(8,), (16,), (32,)],
    "alpha": [0.01, 0.1, 1, 3],
    "learning_rate_init": [0.0001, 0.001, 0.01],
}
# NOTE(review): early_stopping=True makes scikit-learn hold out part of each
# training fold as a validation set. With a training set this small that may
# leave too few rows for a stable stopping signal — confirm whether this
# explains the MLP's comparatively poor test MAE.
mlp_base = MLPRegressor(
    activation="relu",
    solver="adam",
    max_iter=10_000,
    early_stopping=True,
    random_state=42,
)
mlp_grid = GridSearchCV(
    estimator=mlp_base,
    param_grid=mlp_param_grid,
    scoring="neg_mean_absolute_error",
    cv=4,
    n_jobs=-1,
)
mlp_grid.fit(X_train, y_train)
best_mlp = mlp_grid.best_estimator_

print("Best MLP params:", mlp_grid.best_params_)


# --------------------------------
# Final model comparison
# --------------------------------

models = {
    "Linear Regression": LinearRegression(),
    "Ridge (Tuned)": best_ridge,
    "Lasso (Tuned)": best_lasso,
    "Elastic": elastic,
    "Kernel Ridge (RBF)": best_kernel_ridge,
    "SVR (RBF)": best_svr,
    "MLP": best_mlp,
}

# Every entry is (re)fitted on the training split and scored on the held-out
# split, including estimators that earlier steps had already fitted.
results = []
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    results.append(dict(Model=name, MAE=mae))

results_df = pd.DataFrame(results).sort_values(by="MAE")

print("\nExtended Model Comparison (Sorted by MAE):\n")
print(results_df)

Best Kernel Ridge params: {'alpha': 0.07390722033525779, 'gamma': 0.011497569953977356}
Best SVR params: {'C': 7.196856730011521, 'epsilon': 0.01, 'gamma': 0.016768329368110083}
Best MLP params: {'alpha': 3, 'hidden_layer_sizes': (32,), 'learning_rate_init': 0.001}

Extended Model Comparison (Sorted by MAE):

                Model       MAE
3             Elastic  0.476072
1       Ridge (Tuned)  0.480390
0   Linear Regression  0.496802
2       Lasso (Tuned)  0.501044
4  Kernel Ridge (RBF)  0.519625
5           SVR (RBF)  0.544697
6                 MLP  1.020992
