In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error

In [11]:
# Read the training and test CSV files from the working directory
train_df = pd.read_csv('processed_train.csv')
test_df = pd.read_csv('processed_test.csv')

# Sanity check: print a heading followed by the first rows of each frame
for heading, frame in (
    ("Training Data Head:", train_df),
    ("\nTest Data Head:", test_df),
):
    print(heading)
    print(frame.head())

Training Data Head:
     Id  Therapy Hours  Initial Health Score  Lifestyle Activities  \
0  9255       0.006455             -1.183844             -0.989307   
1  1562      -1.149747             -1.241506              1.010808   
2  1671      -1.149747              0.661318             -0.989307   
3  6088      -1.149747             -1.356828             -0.989307   
4  6670       1.162658             -1.299167             -0.989307   

   Average Sleep Hours  Follow-Up Sessions  Recovery Index  
0             0.269888            0.134041              36  
1             0.269888            0.483562              25  
2             0.269888           -0.914520              59  
3            -0.318906           -1.264041              22  
4             1.447477           -1.613561              40  

Test Data Head:
     Id  Therapy Hours  Initial Health Score  Lifestyle Activities  \
0  6253       0.006455             -0.030618             -0.989307   
1  4685      -1.149747             -

In [12]:
# Target column the model is trained to predict
y = train_df['Recovery Index']

# Training features: every column except the row identifier and the target
X = train_df.drop(columns=['Id', 'Recovery Index'])

# Test features: only 'Id' is removed
# (assumes the test file carries no 'Recovery Index' column -- TODO confirm)
X_test = test_df.drop(columns='Id')

In [13]:
# Base estimator; alpha and l1_ratio are not set here because the search supplies them
elastic_net = ElasticNet(random_state=42, max_iter=10000)

# Candidate hyperparameters: 6 alpha values x 8 l1_ratio values = 48 combinations
# NOTE(review): the recorded run selected alpha=0.001, the smallest value in this grid,
# so the optimum may lie below the searched range -- consider extending it downward.
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1, 10, 100],
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
)

# Shuffled 10-fold splitter with a fixed seed so the folds are reproducible
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by negative MSE (higher is better)
grid_search = GridSearchCV(
    elastic_net,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=kfold,
    n_jobs=-1,   # run the fits in parallel on all available CPU cores
    verbose=2,   # log every individual fit
)

In [14]:
# Run the cross-validated search over every candidate on the training data
grid_search.fit(X, y)

# Report the winning hyperparameters and their CV score
# (the score is negative MSE, so values closer to zero are better)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters found: ", best_params)
print("Best CV score (Negative MSE): ", best_score)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.3; total time=   0.0s
[CV] END ..........................alpha=0.001

In [15]:
# Keep a handle on the estimator that the search selected as best
best_elastic_net = grid_search.best_estimator_

# Show the selected model's repr on the line below its heading
print("Final Model:", best_elastic_net, sep="\n")

Final Model:
ElasticNet(alpha=0.001, l1_ratio=1, max_iter=10000, random_state=42)


In [16]:
# Predict the test set, then round to whole numbers and cast to int
# (the original author notes that the competition description asks for integers)
raw_predictions = best_elastic_net.predict(X_test)
predictions = np.round(raw_predictions).astype(int)

# Assemble the submission: one row per test Id with its predicted Recovery Index
submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'Recovery Index': predictions,
})

# Write the submission to disk without the DataFrame index
output_path = 'elastic_net_submission.csv'
submission_df.to_csv(output_path, index=False)

print("'elastic_net_submission.csv' created successfully!")
print(submission_df.head())

'elastic_net_submission.csv' created successfully!
     Id  Recovery Index
0  6253              55
1  4685              23
2  1732              48
3  4743              31
4  4522              43


In [17]:
# Cross-validated predictions for every training row from the tuned model, using the
# same 10-fold splitter as the search. They are rounded to whole numbers but stay
# floating point because no integer cast is applied here.
cv_predictions = np.round(cross_val_predict(best_elastic_net, X, y, cv=kfold))

# Side-by-side table of actual vs. predicted values per training Id
comparison_df = pd.DataFrame({
    'Id': train_df['Id'],
    'Actual Recovery Index': y,
    'Predicted Recovery Index': cv_predictions,
})

# Signed error (actual - predicted): positive means the model under-predicted.
# This is not an absolute difference, so negative values are expected.
comparison_df['Delta'] = y - cv_predictions

# Persist the comparison table without the DataFrame index
comparison_df.to_csv('elastic_net_comparison.csv', index=False)

print("'elastic_net_comparison.csv' created successfully with a 'Delta' column!")
print(comparison_df.head())

# RMSE of the rounded cross-validated predictions against the true targets
mse = mean_squared_error(y, cv_predictions)
rmse = np.sqrt(mse)
print(f"\nCross-Validated RMSE on Training Data: {rmse}")

'elastic_net_comparison.csv' created successfully with a 'Delta' column!
     Id  Actual Recovery Index  Predicted Recovery Index  Delta
0  9255                     36                      34.0    2.0
1  1562                     25                      26.0   -1.0
2  1671                     59                      58.0    1.0
3  6088                     22                      22.0    0.0
4  6670                     40                      41.0   -1.0

Cross-Validated RMSE on Training Data: 2.061583129539044


In [18]:
# Summary statistics for every numeric column of the comparison table
describe_df = comparison_df.describe()

# Put the 'Delta' statistics in the first column (index 0) when that column exists;
# otherwise the column order is left untouched
cols = list(describe_df.columns)
if 'Delta' in cols:
    cols = ['Delta'] + [name for name in cols if name != 'Delta']
    describe_df = describe_df[cols]

# Save the summary, keeping the statistic names (count, mean, ...) as the index column
describe_df.to_csv("elastic_net_describe.csv", index=True)