In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Fixed seed for reproducibility: pass it as `random_state` to any estimator or
# splitter that draws random numbers.
RANDOM_STATE = 42

In [None]:
# Column roles for the regression problem: five predictors and one response.
target = 'Recovery Index'
features = [
    'Therapy Hours',
    'Initial Health Score',
    'Lifestyle Activities',
    'Average Sleep Hours',
    'Follow-Up Sessions',
]

# Read the preprocessed training table and echo its first rows as a sanity check.
df_train = pd.read_csv('../data/train_processed.csv')
print(df_train.head())

# Design matrix (X) and response vector (y) consumed by the modelling cells.
X, y = df_train[features], df_train[target]

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

In [None]:
# --- Model 1: Linear Regression ---
# Estimates out-of-sample error of an ordinary least-squares model with
# 10-fold cross-validation and reports per-fold RMSE, mean RMSE and mean R-squared.

print("Starting Model 1: Linear Regression...")

# Initialize the model
model_lr = LinearRegression()

# scikit-learn scorers follow a "greater is better" convention, so MSE is
# exposed negated. It is converted to RMSE below (sign flip, then square root).
# Recent scikit-learn versions also ship 'neg_root_mean_squared_error'; MSE is
# kept here so that `scores` / `mse_scores` remain available to later cells.
scoring = 'neg_mean_squared_error'

# Evaluate both metrics in ONE cross-validation pass: every fold is fitted once
# (instead of once per metric) and RMSE / R-squared come from the same fits.
cv_results = cross_validate(model_lr, X, y, cv=10, scoring=[scoring, 'r2'], n_jobs=-1)

# Calculate and print the results
scores = cv_results[f'test_{scoring}']  # negated MSE, one value per fold
mse_scores = -scores  # Flip the sign back to positive
rmse_scores = np.sqrt(mse_scores)
avg_rmse = np.mean(rmse_scores)

print("\n--- Linear Regression Results ---")
print(f"RMSE scores for 10 folds: {rmse_scores.round(3)}")
print(f"Average 10-Fold RMSE: {avg_rmse:.3f}")

# R-squared
r2_scores = cv_results['test_r2']
avg_r2 = np.mean(r2_scores)
print(f"Average 10-Fold R-squared: {avg_r2:.3f}")

In [None]:
# --- Baseline Submission Cell ---
# Refits linear regression on the full training set, predicts the test set and
# writes an (Id, Recovery Index) CSV for submission.

print("Loading test data...")
X_test = pd.read_csv('../data/test_processed.csv')

test_ids = pd.read_csv('../data/test_ids.csv')
# Predictions are paired with IDs purely by row position, so the two files must
# describe the same rows; fail early with a clear message if they do not.
if len(test_ids) != len(X_test):
    raise ValueError(
        f"test_ids has {len(test_ids)} rows but test data has {len(X_test)} rows"
    )
print("Test data and IDs loaded.")


print("\nTraining a new model on ALL training data...")
# Create a fresh instance of the model
final_lr_model = LinearRegression()

# Train it on 100% of the training data
final_lr_model.fit(X, y)
print("Model trained successfully.")


print("\nGenerating predictions...")
# Predict on exactly the columns the model was fitted on, in the same order.
# Selecting `features` guards against extra or reordered columns in the test
# CSV and raises a KeyError naming any column that is missing.
predictions = final_lr_model.predict(X_test[features])


print("Creating submission file...")
submission_df = pd.DataFrame({'Id': test_ids['Id'], 'Recovery Index': predictions})
# The submission stores the target as whole numbers: round to nearest, then cast.
submission_df['Recovery Index'] = submission_df['Recovery Index'].round().astype(int)

submission_path = '../submission/submission_model1.csv'
# Create the output folder on first run so a fresh checkout does not fail here.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)

print(f"\nBaseline submission file saved to: {submission_path}")
submission_df.head()