In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import cross_val_score

RANDOM_STATE = 42

In [2]:
df_train = pd.read_csv('../data/train_processed_v3.csv')

original_features = [
    'Therapy Hours', 'Initial Health Score', 'Lifestyle Activities', 
    'Average Sleep Hours', 'Follow-Up Sessions'
]

squared_features = [
    'Therapy Hours_sq', 'Initial Health Score_sq', 
    'Average Sleep Hours_sq', 'Follow-Up Sessions_sq'
]

interaction_features = [
    'Therapy_Hours_x_Initial_Health_Score',
    'Therapy_Hours_x_Lifestyle_Activities',
    'Therapy_Hours_x_Average_Sleep_Hours',
    'Therapy_Hours_x_Follow-Up_Sessions',
    'Initial_Health_Score_x_Lifestyle_Activities',
    'Initial_Health_Score_x_Average_Sleep_Hours',
    'Initial_Health_Score_x_Follow-Up_Sessions',
    'Lifestyle_Activities_x_Average_Sleep_Hours',
    'Lifestyle_Activities_x_Follow-Up_Sessions',
    'Average_Sleep_Hours_x_Follow-Up_Sessions'
]

interaction_squared_features = [
    'Therapy_Hours_x_Initial_Health_Score_sq',
    'Therapy_Hours_x_Lifestyle_Activities_sq',
    'Therapy_Hours_x_Average_Sleep_Hours_sq',
    'Therapy_Hours_x_Follow-Up_Sessions_sq',
    'Initial_Health_Score_x_Lifestyle_Activities_sq',
    'Initial_Health_Score_x_Average_Sleep_Hours_sq',
    'Initial_Health_Score_x_Follow-Up_Sessions_sq',
    'Lifestyle_Activities_x_Average_Sleep_Hours_sq',
    'Lifestyle_Activities_x_Follow-Up_Sessions_sq',
    'Average_Sleep_Hours_x_Follow-Up_Sessions_sq'
]

features = original_features + interaction_features
target = 'Recovery Index'

X = df_train[features]
y = df_train[target]

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

Features (X) shape: (8000, 15)
Target (y) shape: (8000,)


In [3]:
# --- Model 2: Elastic Net ---

print("Starting Model 2: Elastic Net Tuning...")

#Create a Pipeline to chain scaling and the model
pipe_elastic = Pipeline([
    ('scale', StandardScaler()),
    ('model', ElasticNet(random_state=RANDOM_STATE, max_iter=1000))
])

#Define the parameters to search
#'model__alpha': The total strength of the penalty.
#'model__l1_ratio': The mix (0=Ridge, 1=Lasso, 0.5=Equal)

""" param_grid = {
    'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
    'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
} """

param_grid = {
    'model__alpha': [0.0005, 0.0008, 0.001, 0.0012, 0.0015],
    'model__l1_ratio': [0.8, 0.85, 0.9, 0.95, 1.0]
}

#Set up GridSearchCV for RMSE
#automatically uses k fold to test all parameter combinations
search_rmse = GridSearchCV(
    pipe_elastic,
    param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

search_rmse.fit(X, y)

print("\n--- Elastic Net Results (RMSE) ---")
print(f"Best Parameters found: {search_rmse.best_params_}")
best_rmse = np.sqrt(-search_rmse.best_score_)
print(f"Average 10-Fold RMSE (from best model): {best_rmse:.3f}")

#Set up GridSearchCV for R-squared
search_r2 = GridSearchCV(
    pipe_elastic,
    param_grid,
    cv=10,
    scoring='r2',
    n_jobs=-1
)
search_r2.fit(X, y)

print("\n--- Elastic Net Results (R-squared) ---")
print(f"Average 10-Fold R-squared (from best model): {search_r2.best_score_:.3f}")

Starting Model 2: Elastic Net Tuning...

--- Elastic Net Results (RMSE) ---
Best Parameters found: {'model__alpha': 0.0015, 'model__l1_ratio': 1.0}
Average 10-Fold RMSE (from best model): 2.046

--- Elastic Net Results (R-squared) ---
Average 10-Fold R-squared (from best model): 0.989


In [4]:
"""
# --- Cell 5: Coefficient Inspection and Manual Feature Selection ---

print("Inspecting coefficients from the best model...")

# 1. Get the list of all 15 feature names
# (Make sure this 'features' list is the one for your v4 data)
all_15_features = X.columns.tolist()

# 2. Get the "best_estimator" from your GridSearch
# This is the pipeline that was trained on the best params
best_lasso_pipeline = search_rmse.best_estimator_

# 3. Extract the final model and its coefficients
# We access the step named 'model' inside the pipeline
final_model = best_lasso_pipeline.named_steps['model']
coefficients = final_model.coef_

# 4. Create a Series to see the names and coefficients together
feature_importance = pd.Series(coefficients, index=all_15_features).sort_values(ascending=False)

print("\n--- Feature Importance (Coefficients) ---")
print(feature_importance)

# 5. --- Create Your New "v7" Feature List ---
# We will "manually" select features that have a coefficient > 0.01 (or some small number)
# We are ignoring the sign (abs()) and just looking at the magnitude
threshold = 0.01 
features_v7 = feature_importance[abs(feature_importance) > threshold].index.tolist()

print(f"\n--- Selected {len(features_v7)} 'Strong' Features ---")
print(features_v7)

# 6. --- Test Your New "v7" Model ---
print("\nTesting a new Linear Regression on 'v7' features...")

X_v7 = X[features_v7]  # Create a new X with only the strong features
y_v7 = y

# Test it with a simple, new Linear Regression
# (We don't need Lasso, we already did the selection!)
model_v7 = LinearRegression()

# Set up scoring
scoring = 'neg_mean_squared_error'

# Perform 10-fold cross-validation
scores = cross_val_score(model_v7, X_v7, y_v7, cv=10, scoring=scoring, n_jobs=-1)

# Calculate and print the results
rmse_scores = np.sqrt(-scores)
avg_rmse = np.mean(rmse_scores)

print("\n--- 'v7' Model (Manual Selection) Results ---")
print(f"Average 10-Fold RMSE: {avg_rmse:.4f}") # Using 4 decimal places!
"""

'\n# --- Cell 5: Coefficient Inspection and Manual Feature Selection ---\n\nprint("Inspecting coefficients from the best model...")\n\n# 1. Get the list of all 15 feature names\n# (Make sure this \'features\' list is the one for your v4 data)\nall_15_features = X.columns.tolist()\n\n# 2. Get the "best_estimator" from your GridSearch\n# This is the pipeline that was trained on the best params\nbest_lasso_pipeline = search_rmse.best_estimator_\n\n# 3. Extract the final model and its coefficients\n# We access the step named \'model\' inside the pipeline\nfinal_model = best_lasso_pipeline.named_steps[\'model\']\ncoefficients = final_model.coef_\n\n# 4. Create a Series to see the names and coefficients together\nfeature_importance = pd.Series(coefficients, index=all_15_features).sort_values(ascending=False)\n\nprint("\n--- Feature Importance (Coefficients) ---")\nprint(feature_importance)\n\n# 5. --- Create Your New "v7" Feature List ---\n# We will "manually" select features that have a coe

In [5]:
# --- Elastic Net Submission Cell ---

print("Loading test data...")
X_test = pd.read_csv('../data/test_processed_v3.csv')
test_ids = pd.read_csv('../data/test_ids.csv')
print("Test data and IDs loaded.")
X_test = X_test[features]

#Get the best parameters from the search
best_params = search_rmse.best_params_
print(f"\nTraining a new model with best params: {best_params}")

final_elastic_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('model', ElasticNet(random_state=RANDOM_STATE, max_iter=1000))
])

#Set the best parameters 
final_elastic_pipe.set_params(**best_params)

final_elastic_pipe.fit(X, y)
print("Model trained successfully.")

print("\nGenerating predictions...")
predictions = final_elastic_pipe.predict(X_test)

print("Creating submission file...")
submission_df = pd.DataFrame({'Id': test_ids['Id'], 'Recovery Index': predictions})
""" submission_df['Recovery Index'] = submission_df['Recovery Index'].round().astype(int) """
submission_path = '../submission/submission_model2.csv'
submission_df.to_csv(submission_path, index=False)

print(f"\nElastic Net submission file saved to: {submission_path}")
submission_df.head()

Loading test data...
Test data and IDs loaded.

Training a new model with best params: {'model__alpha': 0.0015, 'model__l1_ratio': 1.0}
Model trained successfully.

Generating predictions...
Creating submission file...

Elastic Net submission file saved to: ../submission/submission_model2.csv


Unnamed: 0,Id,Recovery Index
0,6253,54.691266
1,4685,22.588652
2,1732,47.93666
3,4743,31.288429
4,4522,42.957409
