In [5]:
# Import necessary libraries
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import KFold, cross_val_predict

# Read the already-preprocessed training set from the working directory
data = pd.read_csv('processed_train.csv')

# Preview the top five rows as a sanity check on the load
data.head(5)

Unnamed: 0,Id,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index
0,9255,0.006455,-1.183844,-0.989307,0.269888,0.134041,36
1,1562,-1.149747,-1.241506,1.010808,0.269888,0.483562,25
2,1671,-1.149747,0.661318,-0.989307,0.269888,-0.91452,59
3,6088,-1.149747,-1.356828,-0.989307,-0.318906,-1.264041,22
4,6670,1.162658,-1.299167,-0.989307,1.447477,-1.613561,40


In [6]:
# Target (y) is the 'Recovery Index' column
y = data['Recovery Index']

# Features (X) are every remaining column except the 'Id' key and the target
X = data.drop(['Id', 'Recovery Index'], axis=1)

print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

Shape of features (X): (8000, 5)
Shape of target (y): (8000,)


In [7]:
# Cross-validation splitter: 10 shuffled folds, seeded so the fold
# assignment is the same on every run.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# LassoCV evaluates a path of alpha values with the splitter above and keeps
# the best one. The seed is passed for explicitness.
# NOTE(review): per the scikit-learn docs random_state only matters when
# selection='random'; with the default settings the fold seed above is what
# makes the search repeatable.
lasso_cv = LassoCV(cv=cv, random_state=42).fit(X, y)

# Regularisation strength selected by the cross-validated search
best_alpha = lasso_cv.alpha_
print(f"Best alpha found: {best_alpha}")

Best alpha found: 0.017563374300956593


In [8]:
# Fit a plain Lasso on the full training set using the alpha chosen by the search
final_lasso_model = Lasso(alpha=best_alpha, random_state=42).fit(X, y)

print("Lasso model trained successfully.")

Lasso model trained successfully.


In [9]:
# Path (relative to the working directory) the fitted model is written to
filename = 'lasso_regression.joblib'

# Serialise the fitted estimator with joblib so it can be reloaded later
joblib.dump(value=final_lasso_model, filename=filename)

print(f"Model saved successfully to '{filename}'")

Model saved successfully to 'lasso_regression.joblib'


In [14]:
# Read the preprocessed test set
processed_test_df = pd.read_csv('processed_test.csv')

# Feature matrix for prediction: everything except the 'Id' key
X_test = processed_test_df.drop(['Id'], axis=1)

# Keep the Ids aside; they are written back out in the submission file
test_ids = processed_test_df['Id']

print("Processed test data loaded successfully.")
X_test.head()

Processed test data loaded successfully.


Unnamed: 0,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions
0,0.006455,-0.030618,-0.989307,0.858683,-0.91452
1,-1.149747,-1.356828,1.010808,-1.496494,1.182603
2,0.777257,-0.780215,1.010808,0.269888,0.134041
3,0.391856,-1.587474,1.010808,0.858683,0.134041
4,0.777257,-0.953199,-0.989307,-1.496494,0.483562


In [15]:
# Score the test features with the final fitted Lasso model
test_predictions = final_lasso_model.predict(X_test)

# Show the first five raw (unrounded) predictions
print("Sample predictions:", test_predictions[0:5])

Sample predictions: [54.7303158  22.64754386 47.87822528 31.27519256 43.04379962]


In [16]:
# Pair each test Id with its prediction, rounding the prediction to the
# nearest integer in the same step.
submission_df = pd.DataFrame({
    'Id': test_ids,
    'Recovery Index': test_predictions,
}).assign(**{
    'Recovery Index': lambda frame: frame['Recovery Index'].round().astype(int),
})

# Write the submission without the DataFrame index column
submission_df.to_csv('lasso_submission.csv', index=False)

print("Submission file 'lasso_submission.csv' created successfully.")
submission_df.head()

Submission file 'lasso_submission.csv' created successfully.


Unnamed: 0,Id,Recovery Index
0,6253,55
1,4685,23
2,1732,48
3,4743,31
4,4522,43


In [17]:
print("Generating cross-validated predictions on the training data...")

# Rebuild the 10-fold splitter with the same settings as the alpha search so
# this cell does not rely on `cv` surviving unchanged from an earlier cell.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Out-of-fold predictions: each training row is predicted by a clone of the
# model fitted on the folds that do NOT contain that row.
train_predictions = cross_val_predict(final_lasso_model, X, y, cv=cv)

# Base the comparison table on a fresh read of 'processed_train.csv'.
# The predictions are matched to rows by position, so this assumes the file
# still has the same rows, in the same order, as when `data` was loaded.
comparison_df = pd.read_csv('processed_train.csv')

# Rounded prediction and signed error (actual - predicted) for every row
comparison_df['Predicted Recovery Index'] = train_predictions.round().astype(int)
comparison_df['Delta'] = comparison_df['Recovery Index'] - comparison_df['Predicted Recovery Index']

# Single reorder: Id, actual, predicted and Delta first, then the remaining
# columns in their existing order.
leading_cols = ['Id', 'Recovery Index', 'Predicted Recovery Index', 'Delta']
feature_cols = [col for col in comparison_df.columns if col not in leading_cols]
comparison_df = comparison_df[leading_cols + feature_cols]

# Save the comparison DataFrame to a new CSV file
comparison_df.to_csv('lasso_comparison.csv', index=False)

print("\n'lasso_comparison.csv' created successfully.")
print("This file compares the actual vs. predicted values for your training data.")
comparison_df.head()

Generating cross-validated predictions on the training data...

'lasso_comparison.csv' created successfully.
This file compares the actual vs. predicted values for your training data.


Unnamed: 0,Id,Recovery Index,Predicted Recovery Index,Delta,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions
0,9255,36,34,2,0.006455,-1.183844,-0.989307,0.269888,0.134041
1,1562,25,26,-1,-1.149747,-1.241506,1.010808,0.269888,0.483562
2,1671,59,58,1,-1.149747,0.661318,-0.989307,0.269888,-0.91452
3,6088,22,22,0,-1.149747,-1.356828,-0.989307,-0.318906,-1.264041
4,6670,40,41,-1,1.162658,-1.299167,-0.989307,1.447477,-1.613561


In [18]:
# Summary statistics (count, mean, std, quartiles, ...) for every numeric
# column of the comparison table.
describe_df = comparison_df.describe()

cols = list(describe_df.columns)

# Move "Delta" to the FIRST position (index 0) so the error statistics lead
# the summary. The guard keeps the cell working if "Delta" is absent.
if 'Delta' in cols:
    cols = ['Delta'] + [col for col in cols if col != 'Delta']
    describe_df = describe_df[cols]

# index=True keeps the statistic names (count, mean, ...) as the first CSV column
describe_df.to_csv("lasso_describe.csv", index=True)