In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate

# Shared seed constant. NOTE(review): presumably meant to be passed as
# `random_state` to splitters/estimators; none of the cells visible here
# reference it yet — confirm whether it should be wired in or removed.
RANDOM_STATE = 42

In [2]:
# Load the processed training set and preview its first rows.
df_train = pd.read_csv('../data/train_processed_v3.csv')
print(df_train.head())

# Raw predictor columns as they appear in the CSV.
base_features = [
    'Therapy Hours',
    'Initial Health Score',
    'Lifestyle Activities',
    'Average Sleep Hours',
    'Follow-Up Sessions',
]

# Pre-computed "A_x_B" interaction columns selected for the model
# (only a subset of the interaction columns present in the file).
interaction_features = [
    'Therapy_Hours_x_Initial_Health_Score',
    'Therapy_Hours_x_Follow-Up_Sessions',
    'Initial_Health_Score_x_Lifestyle_Activities',
    'Average_Sleep_Hours_x_Follow-Up_Sessions',
]

# Final model inputs: raw predictors first, then the interaction terms.
features = base_features + interaction_features
target = 'Recovery Index'

# Design matrix and response vector used by every downstream cell.
X = df_train[features]
y = df_train[target]

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

   Therapy Hours  Initial Health Score  Lifestyle Activities  \
0              5                    49                     0   
1              2                    48                     1   
2              2                    81                     0   
3              2                    46                     0   
4              8                    47                     0   

   Average Sleep Hours  Follow-Up Sessions  Recovery Index  \
0                    7                   5              36   
1                    7                   6              25   
2                    7                   2              59   
3                    6                   1              22   
4                    9                   0              40   

   Therapy_Hours_x_Initial_Health_Score  Therapy_Hours_x_Lifestyle_Activities  \
0                                   245                                     0   
1                                    96                                     2   

In [3]:
# --- Model 1: Linear Regression ---
# Estimates out-of-sample RMSE and R-squared for a plain LinearRegression
# using 10-fold cross-validation on the full training set (X, y).

print("Starting Model 1: Linear Regression...")

# Initialize the model
model_lr = LinearRegression()

# Score both metrics in a single cross-validation pass so each fold is fitted
# once instead of once per metric. scikit-learn scorers follow a
# "higher is better" convention, so the RMSE scorer returns negated values;
# the sign is flipped back below.
scoring = {'rmse': 'neg_root_mean_squared_error', 'r2': 'r2'}

# Perform 10-fold cross-validation (cv=10 on a regressor uses consecutive,
# unshuffled folds, so the split is deterministic).
cv_results = cross_validate(model_lr, X, y, cv=10, scoring=scoring, n_jobs=-1)

# Per-fold RMSE (positive) and its mean across folds
rmse_scores = -cv_results['test_rmse']
avg_rmse = np.mean(rmse_scores)

print("\n--- Linear Regression Results ---")
print(f"RMSE scores for 10 folds: {rmse_scores.round(3)}")
print(f"Average 10-Fold RMSE: {avg_rmse:.3f}")

# R-squared, computed on the same folds as the RMSE above
r2_scores = cv_results['test_r2']
avg_r2 = np.mean(r2_scores)
print(f"Average 10-Fold R-squared: {avg_r2:.3f}")

Starting Model 1: Linear Regression...

--- Linear Regression Results ---
RMSE scores for 10 folds: [1.998 2.094 2.142 2.009 2.013 2.022 2.027 2.109 2.078 1.954]
Average 10-Fold RMSE: 2.045
Average 10-Fold R-squared: 0.989


In [4]:
# --- Baseline Submission Cell ---
# Fits a fresh LinearRegression on all training rows, predicts the processed
# test set, and writes an integer-rounded submission CSV keyed by test Id.

print("Loading test data...")
# Select exactly the training feature columns, in the same order as `features`.
X_test = pd.read_csv('../data/test_processed_v3.csv')[features]
test_ids = pd.read_csv('../data/test_ids.csv')

# Ids and predictions are paired purely by row position, so the two files must
# have the same number of rows. Check before training to fail fast with a
# clear message.
if len(test_ids) != len(X_test):
    raise ValueError(
        f"test_ids has {len(test_ids)} rows but the test features have "
        f"{len(X_test)} rows; they must match one-to-one."
    )
print("Test data and IDs loaded.")


print("\nTraining a new model on ALL training data...")
# Create a fresh instance of the model (independent of the one used for CV)
final_lr_model = LinearRegression()

# Train it on 100% of the training data
final_lr_model.fit(X, y)
print("Model trained successfully.")


print("\nGenerating predictions...")
# Use the trained model to predict on the test data
predictions = final_lr_model.predict(X_test)


print("Creating submission file...")
submission_df = pd.DataFrame({'Id': test_ids['Id'], 'Recovery Index': predictions})
# The submission stores the target as whole numbers.
submission_df['Recovery Index'] = submission_df['Recovery Index'].round().astype(int)

submission_path = '../submission/submission_model1.csv'
# to_csv does not create missing parent directories, so make sure it exists.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)

print(f"\nBaseline submission file saved to: {submission_path}")
submission_df.head()

Loading test data...
Test data and IDs loaded.

Training a new model on ALL training data...
Model trained successfully.

Generating predictions...
Creating submission file...

Baseline submission file saved to: ../submission/submission_model1.csv


Unnamed: 0,Id,Recovery Index
0,6253,55
1,4685,23
2,1732,48
3,4743,31
4,4522,43
