In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Seed passed as `random_state` to the ElasticNet estimators built in the
# cells below, so the same value is used everywhere in this notebook.
RANDOM_STATE = 42

In [7]:
# Read the pre-processed training table from disk.
df_train = pd.read_csv('../data/train_processed_v2.csv')

# Base predictor columns, in the order they are fed to the model.
original_features = [
    'Therapy Hours',
    'Initial Health Score',
    'Lifestyle Activities',
    'Average Sleep Hours',
    'Follow-Up Sessions',
]

# One interaction column per unordered pair of base features, named
# "<A>_x_<B>" with spaces in A and B replaced by underscores
# (e.g. 'Therapy_Hours_x_Initial_Health_Score'). Pairs are produced in the
# same order as the base list, first feature varying slowest.
interaction_features = [
    f"{left.replace(' ', '_')}_x_{right.replace(' ', '_')}"
    for pos, left in enumerate(original_features)
    for right in original_features[pos + 1:]
]

# Full design matrix = base columns followed by the interaction columns.
features = [*original_features, *interaction_features]
target = 'Recovery Index'

X = df_train.loc[:, features]
y = df_train[target]

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

Features (X) shape: (8000, 15)
Target (y) shape: (8000,)


In [8]:
# --- Model 2: Elastic Net ---
# Tunes an ElasticNet (on standardized features) with a 10-fold grid search,
# prints the coefficients of the refit best model, and reports its
# cross-validated RMSE.

print("Starting Model 2: Elastic Net Tuning...")

# Chain scaling and the model so the scaler is fit on the training part of
# each CV fold only, never on the held-out part.
pipe_elastic = Pipeline([
    ('scale', StandardScaler()),
    ('model', ElasticNet(random_state=RANDOM_STATE, max_iter=1000))
])

# Parameters to search:
#   'model__alpha'    - the total strength of the penalty.
#   'model__l1_ratio' - the L1/L2 mix (0=Ridge, 1=Lasso, 0.5=Equal).
#
# Earlier coarse grid, kept for reference only:
#   'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0] with the same l1_ratio values.
# NOTE(review): a recorded run of this notebook selected alpha=0.0017, the
# largest alpha in the grid below - consider extending the range upward.
param_grid = {
    'model__alpha': [0.0013, 0.0014, 0.0015, 0.0016, 0.0017],
    'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
}

# Grid search scored directly on (negated) RMSE. Scoring on negated MSE and
# taking sqrt(-best_score_) afterwards yields the root of the *mean MSE*,
# which is not the average per-fold RMSE that is reported below.
search_rmse = GridSearchCV(
    pipe_elastic,
    param_grid,
    cv=10,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1
)

search_rmse.fit(X, y)

# --- Inspect feature coefficients of the best model ---

print("\n--- Feature Coefficients from Best Model ---")

# Best pipeline found by the search (refit on all of X, y by GridSearchCV).
best_pipeline = search_rmse.best_estimator_

# The trained ElasticNet step inside that pipeline.
best_model = best_pipeline.named_steps['model']

# Column labels for the coefficients (X is a DataFrame).
feature_names = X.columns

# Coefficients are expressed per unit of the *standardized* features, because
# the 'scale' step runs before the model.
coefficients = best_model.coef_

coef_series = pd.Series(coefficients, index=feature_names)
# Order by absolute magnitude, largest first.
coef_series_sorted = coef_series.abs().sort_values(ascending=False)

print("Coefficients (sorted by importance):")
# Show the signed coefficients in the magnitude-sorted order.
print(coef_series[coef_series_sorted.index])

# --- End of coefficient inspection ---

print("\n--- Elastic Net Results (RMSE) ---")
print(f"Best Parameters found: {search_rmse.best_params_}")
# best_score_ is the mean negated RMSE over the 10 folds; flip the sign.
best_rmse = -search_rmse.best_score_
print(f"Average 10-Fold RMSE (from best model): {best_rmse:.3f}")

Starting Model 2: Elastic Net Tuning...

--- Feature Coefficients from Best Model ---
Coefficients (sorted by importance):
Initial Health Score                           17.554420
Therapy Hours                                   7.260035
Average Sleep Hours                             0.650876
Follow-Up Sessions                              0.437480
Lifestyle Activities                            0.213933
Average_Sleep_Hours_x_Follow-Up_Sessions        0.114854
Therapy_Hours_x_Lifestyle_Activities            0.109860
Initial_Health_Score_x_Average_Sleep_Hours      0.109287
Therapy_Hours_x_Average_Sleep_Hours             0.085550
Therapy_Hours_x_Initial_Health_Score            0.016002
Therapy_Hours_x_Follow-Up_Sessions              0.010321
Initial_Health_Score_x_Lifestyle_Activities     0.000000
Initial_Health_Score_x_Follow-Up_Sessions      -0.000000
Lifestyle_Activities_x_Average_Sleep_Hours      0.000000
Lifestyle_Activities_x_Follow-Up_Sessions       0.000000
dtype: float64

--- El

In [9]:
# --- Elastic Net Submission Cell ---
# Retrains the tuned Elastic Net pipeline on the full training set, predicts
# on the processed test set, and writes the submission CSV.

print("Loading test data...")
# Keep the raw test frame under its own name so re-running this cell never
# depends on an already-subsetted X_test.
test_processed = pd.read_csv('../data/test_processed_v2.csv')
test_ids = pd.read_csv('../data/test_ids.csv')
print("Test data and IDs loaded.")
X_test = test_processed[features]

# Fail early, before training, if predictions could not be paired with ids.
if len(X_test) != len(test_ids):
    raise ValueError(
        f"Row count mismatch: test features have {len(X_test)} rows "
        f"but test ids have {len(test_ids)} rows."
    )

# Best parameters from the grid search in the previous cell.
best_params = search_rmse.best_params_
print(f"\nTraining a new model with best params: {best_params}")

# Same pipeline definition as the one that was tuned.
# NOTE: with GridSearchCV's default refit=True, search_rmse.best_estimator_
# is already fit on all of X, y with these parameters; the explicit retrain
# is kept so this cell shows exactly what produces the submission.
final_elastic_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('model', ElasticNet(random_state=RANDOM_STATE, max_iter=1000))
])

# Apply the tuned hyper-parameters.
final_elastic_pipe.set_params(**best_params)

final_elastic_pipe.fit(X, y)
print("Model trained successfully.")

print("\nGenerating predictions...")
predictions = final_elastic_pipe.predict(X_test)

print("Creating submission file...")
submission_df = pd.DataFrame({'Id': test_ids['Id'], 'Recovery Index': predictions})
submission_path = '../submission/submission_final.csv'
# to_csv does not create missing directories; make sure the target exists.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)

print(f"\nElastic Net submission file saved to: {submission_path}")
submission_df.head()

Loading test data...
Test data and IDs loaded.

Training a new model with best params: {'model__alpha': 0.0017, 'model__l1_ratio': 1.0}
Model trained successfully.

Generating predictions...
Creating submission file...

Elastic Net submission file saved to: ../submission/submission_final.csv


Unnamed: 0,Id,Recovery Index
0,6253,54.691281
1,4685,22.589949
2,1732,47.936289
3,4743,31.287771
4,4522,42.957485
