In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# --- 1. DATA LOADING (Using uploaded files: train.csv and test.csv) ---

try:
    # Load the actual provided files
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    print("--- Actual Data Loaded ---")
    print(f"Train samples: {len(train_df)}, Test samples: {len(test_df)}")
    print("--------------------------")

except FileNotFoundError:
    # Fallback/Error handling if files are not accessible or named differently
    print("Error: train.csv or test.csv not found.")
    print("Please ensure the files are correctly named and accessible.")
    # Exiting gracefully if the files cannot be loaded
    exit()


# Store the test IDs for the final submission file
test_ids = test_df['Id']

# Separate features (X) and target (y)
X_train = train_df.drop(['Id', 'Recovery Index'], axis=1)
y_train = train_df['Recovery Index']
X_test = test_df.drop('Id', axis=1)


# --- 2. PREPROCESSING AND FEATURE ENGINEERING ---

def preprocess_and_engineer_features(df):
    """Applies categorical encoding and creates new linearly combined features."""

    # 1. Categorical Encoding (Lifestyle Activities: Yes -> 1, No -> 0)
    # Ensure there are no NaNs in this column before mapping
    df['Lifestyle Activities'] = df['Lifestyle Activities'].fillna('No')
    df['Lifestyle_Active'] = df['Lifestyle Activities'].apply(lambda x: 1 if x == 'Yes' else 0)
    df = df.drop('Lifestyle Activities', axis=1)

    # 2. Linear Feature Combinations (Avoiding polynomials/high order)
    # The new features are derived from linear operations (multiplication, division)
    # of the base features, which maintains the spirit of linear modeling when 
    # the new features are treated as independent inputs.

    T = df['Therapy Hours']
    H = df['Initial Health Score']
    S = df['Average Sleep Hours']
    F = df['Follow-Up Sessions']

    # Handle potential division by zero for stability
    epsilon = 1e-6 # Small constant to prevent division by zero

    # Ratio: Therapy hours per effective session count
    # Represents the intensity/efficiency of therapy hours relative to follow-up commitment
    df['Therapy_Per_FollowUp'] = T / (F + epsilon)

    # Product: Combined health momentum (Sleep quality * Initial condition)
    df['Sleep_Health_Product'] = S * H

    # Sum: Total effort in treatment
    # Simple aggregation of time/commitment resources
    df['Combined_Treatment_Hours'] = T + F

    # Ratio: Health score per hour of sleep (Efficiency of rest)
    df['Health_Per_Sleep'] = H / (S + epsilon)

    # Product: Direct Therapy Impact Score (Therapy hours * Initial condition)
    df['Therapy_Health_Product'] = T * H

    return df

# Apply preprocessing and feature engineering to both datasets
X_train_processed = preprocess_and_engineer_features(X_train.copy())
X_test_processed = preprocess_and_engineer_features(X_test.copy())

# Ensure columns are consistent after processing
# This is crucial in production to handle cases where one set might miss a category (not applicable here)
# or if columns were added/removed differently.
X_test_processed = X_test_processed[X_train_processed.columns]



--- Actual Data Loaded ---
Train samples: 8000, Test samples: 2000
--------------------------


In [17]:
import numpy as np
import pandas as pd
import time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV, ElasticNetCV
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer

# X_train_processed, X_test_processed, y_train, test_ids are expected above
X = X_train_processed.copy()
X_test = X_test_processed.copy()
y = y_train.values.ravel()

rng_seed = 42
# cross_val_predict requires a partitioning CV (no repeats) so use KFold with shuffle
cv = KFold(n_splits=10, shuffle=True, random_state=rng_seed)

# Ensure no NaNs remain (impute with median for numeric)
num_imputer = SimpleImputer(strategy="median")
X_imputed = pd.DataFrame(num_imputer.fit_transform(X), columns=X.columns, index=X.index)
X_test_imputed = pd.DataFrame(num_imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

n_features = X_imputed.shape[1]
k_select = min(12, max(3, n_features))

models = {
    "ridge": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("select", SelectKBest(score_func=f_regression, k=k_select)),
        ("ridge", RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0, 50.0], cv=5))
    ]),
    "elasticnet": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("select", SelectKBest(score_func=f_regression, k=k_select)),
        ("en", ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9], alphas=[0.01, 0.1, 1.0], cv=5, random_state=rng_seed, max_iter=5000))
    ]),
    "hgb": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("hgb", HistGradientBoostingRegressor(random_state=rng_seed, max_iter=800, learning_rate=0.05))
    ])
}

oof_dict = {}
rmse_dict = {}
start = time.time()
for name, pipe in models.items():
    print(f"Computing OOF for {name} ...")
    # use X_imputed.values so pipeline receives numpy array consistently
    oof = cross_val_predict(pipe, X_imputed.values, y, cv=cv, n_jobs=-1, method="predict")
    rmse = np.sqrt(mean_squared_error(y, oof))
    oof_dict[name] = oof
    rmse_dict[name] = rmse
    print(f"{name} OOF RMSE: {rmse:.4f}")

print(f"OOF compute finished in {time.time()-start:.1f}s")

# Build OOF matrix for stacking
oof_matrix = np.vstack([oof_dict[m] for m in oof_dict]).T
print("Base models OOF RMSEs:", rmse_dict)

# Stacking meta-model (train on OOF predictions)
meta = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5)
meta.fit(oof_matrix, y)
oof_meta_pred = meta.predict(oof_matrix)
rmse_meta = np.sqrt(mean_squared_error(y, oof_meta_pred))
print(f"Stacking meta OOF RMSE: {rmse_meta:.4f}")

# Fit base models on full training data and produce test predictions
test_preds_models = []
for name, pipe in models.items():
    pipe.fit(X_imputed.values, y)
    preds = pipe.predict(X_test_imputed.values)
    test_preds_models.append(preds)
test_matrix = np.vstack(test_preds_models).T

# Use stacking meta to combine test predictions
test_preds_stack = meta.predict(test_matrix)

# As a fallback also compute simple weighted average (inverse-rmse^2) and compare
rmse_vals = np.array([rmse_dict[m] for m in oof_dict])
weights = 1.0 / (rmse_vals**2 + 1e-12)
weights = weights / weights.sum()
test_preds_weighted = (test_matrix * weights).sum(axis=1)

# Choose final test preds: stacking (preferred) but also print both RMSE on OOF proxy
oof_weighted = (oof_matrix * weights).sum(axis=1)
rmse_ens_weighted = np.sqrt(mean_squared_error(y, oof_weighted))
print(f"Weighted ensemble OOF RMSE: {rmse_ens_weighted:.4f}")

final_test_preds = test_preds_stack  # stacking tends to perform better

# Clip to training target range; do NOT aggressively round before submission (rounding can increase RMSE)
y_min, y_max = y.min(), y.max()
final_test_preds = np.clip(final_test_preds, y_min, y_max)

# If the competition requires integer targets, round at submission; otherwise keep float.
# Keep as integer only if necessary:
final_test_preds_int = np.rint(final_test_preds).astype(int)

submission = pd.DataFrame({
    "Id": test_ids,
    "Recovery Index": final_test_preds_int
})
submission.to_csv("submission_ensemble_improved.csv", index=False)
print("Saved submission_ensemble_improved.csv — first rows:")
print(submission.head())

import joblib
joblib.dump({"models": models, "meta": meta, "weights": weights, "rmse": rmse_dict}, "ensemble_models.joblib", compress=3)


Computing OOF for ridge ...
ridge OOF RMSE: 2.0441
Computing OOF for elasticnet ...
elasticnet OOF RMSE: 2.0476
Computing OOF for hgb ...
hgb OOF RMSE: 2.2307
OOF compute finished in 7.4s
Base models OOF RMSEs: {'ridge': 2.0441044578324297, 'elasticnet': 2.0475641538575147, 'hgb': 2.2306827053024123}
Stacking meta OOF RMSE: 2.0437
Weighted ensemble OOF RMSE: 2.0652
Saved submission_ensemble_improved.csv — first rows:
     Id  Recovery Index
0  6253              55
1  4685              23
2  1732              48
3  4743              31
4  4522              43


['ensemble_models.joblib']