In [8]:
# === Imports ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# === Load processed data ===
train_data = pd.read_csv('Downloads/widsdatathon2025-university/processed_data/processed_train_data.csv')
test_data = pd.read_csv('Downloads/widsdatathon2025-university/processed_data/processed_test_data.csv')

# === Define features and target ===
features = [col for col in train_data.columns if col not in ["age", "participant_id"]]
X = train_data[features]
y = train_data["age"]

# === Log-transform the target for RMSE stability ===
y_log = np.log1p(y)  # log(1 + age)

# === Train/test split for evaluation ===
X_train, X_test, y_train_log, y_test = train_test_split(
    X, y_log,  # use log-transformed target
    test_size=0.15,
    random_state=42314,
    stratify=train_data["sex_Male"]
)

# === Scale features ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Ridge Regression with GridSearchCV ===
param_grid = {"alpha": np.linspace(4900, 5000, 200)}
ridge_model = Ridge()

grid_search = GridSearchCV(ridge_model, param_grid, cv=100, scoring="neg_mean_squared_error")
grid_search.fit(X_train_scaled, y_train_log)

# === Best model ===
best_alpha = grid_search.best_params_["alpha"]
best_mse = -grid_search.best_score_
print(f"Best Alpha: {best_alpha}")
print(f"Best Cross-Validated MSE (log-space): {best_mse:.3f}")

best_ridge = Ridge(alpha=best_alpha)
best_ridge.fit(X_train_scaled, y_train_log)

# === Predict and reverse log-transform ===
log_preds = best_ridge.predict(X_test_scaled)
y_pred = np.expm1(log_preds)  # reverse log1p

# === Evaluation in original scale ===
y_true_original = np.expm1(y_test)
rmse = np.sqrt(mean_squared_error(y_true_original, y_pred))
mae = mean_absolute_error(y_true_original, y_pred)
r2 = r2_score(y_true_original, y_pred)

print("\nModel Evaluation (original scale):")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")

# === Final Prediction for Kaggle Test Set ===
X_kaggle_scaled = scaler.transform(test_data[features])
kaggle_log_preds = best_ridge.predict(X_kaggle_scaled)
kaggle_preds = np.expm1(kaggle_log_preds)

# === Build submission DataFrame ===
submission_df = pd.DataFrame({
    "participant_id": test_data["participant_id"],
    "age": kaggle_preds
})

# === Save to Downloads folder ===
downloads_path = os.path.join(os.path.expanduser("~"), "Downloads")
submission_path = os.path.join(downloads_path, "submission_ridge_logtransform.csv")

submission_df.to_csv(submission_path, index=False)
print(f"\n✅ Submission saved to: {submission_path}")



Best Alpha: 4900.0
Best Cross-Validated MSE (log-space): 0.027

Model Evaluation (original scale):
RMSE: 2.1169
MAE: 1.7103
R²: 0.5998

✅ Submission saved to: /Users/rosaligonzalez/Downloads/submission_ridge_logtransform.csv
