In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Read the preprocessed train and test tables from the Kaggle input directory
train_df = pd.read_csv('/kaggle/input/processed-data/processed_train.csv')
test_df = pd.read_csv('/kaggle/input/processed-data/processed_test.csv')

# Target column first, then the feature matrix: every training column
# except the row identifier and the target itself
y = train_df['Recovery Index']
X = train_df.drop(columns=['Id', 'Recovery Index'])

# Test features: the test table without 'Id' (test_df itself keeps 'Id'
# so it can be attached to the predictions later)
X_test = test_df.drop(columns='Id')

print("Data loaded and prepared.")
X.head()

Data loaded and prepared.


Unnamed: 0,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions
0,0.006455,-1.183844,-0.989307,0.269888,0.134041
1,-1.149747,-1.241506,1.010808,0.269888,0.483562
2,-1.149747,0.661318,-0.989307,0.269888,-0.91452
3,-1.149747,-1.356828,-0.989307,-0.318906,-1.264041
4,1.162658,-1.299167,-0.989307,1.447477,-1.613561


In [3]:
# Hyperparameter grid to search exhaustively.
# 4 * 3 * 2 * 3 * 3 = 216 candidate combinations; with 10-fold CV that is 2160 fits.
param_grid_extended = {
    'n_estimators': [200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4],
    'min_samples_leaf': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0]
}

# Base estimator; the fixed random_state keeps the (subsampled) boosting runs reproducible
gbr = GradientBoostingRegressor(random_state=42)

# Exhaustive search over the grid, scored by negated MSE (higher is better).
# verbose=1 keeps the "Fitting ... totalling ... fits" summary but drops the
# per-fit "[CV] END ..." lines that verbose=2 emits -- one per fit, i.e. 2160
# lines here, which buries the results in the notebook output.
# This will take a while to run, which is normal for a search of this size.
grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid_extended,
                           cv=10, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)

# Fit GridSearchCV to the data
grid_search.fit(X, y)

# Best hyperparameter combination found by the search
best_params = grid_search.best_params_
print(f"Best parameters found from extended search: {best_params}")

# best_score_ is the mean negated MSE over the CV folds, so this is the
# square root of the mean fold MSE (not the mean of the per-fold RMSEs).
best_score = np.sqrt(-grid_search.best_score_)
print(f"Best cross-validated RMSE from extended search: {best_score:.4f}")


Fitting 10 folds for each of 216 candidates, totalling 2160 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200, subsample=0.8; total time=   1.5s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200, subsample=0.8; total time=   1.4s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200, subsample=0.8; total time=   1.5s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200, subsample=0.9; total time=   1.5s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200, subsample=0.9; total time=   1.4s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200, subsample=1.0; total time=   1.4s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200, subsample=1.0; total time=   1.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=200, subsample=1.0; total time=   1.4s
[CV] END learning_rate=

In [4]:
# Build the final regressor from the grid-search winners and fit it on the
# full training set in one step (fit() returns the estimator itself)
final_model = GradientBoostingRegressor(random_state=42, **best_params).fit(X, y)

print("Final model trained successfully with the best parameters.")

Final model trained successfully with the best parameters.


In [5]:
# Output path for the serialized model
filename = 'boosting_method.joblib'

# Persist the fitted estimator to disk with joblib
joblib.dump(value=final_model, filename=filename)

print(f"Model saved successfully to '{filename}'")

Model saved successfully to 'boosting_method.joblib'


In [6]:
# Raw (float) model outputs for the test features
raw_test_predictions = final_model.predict(X_test)

# The Recovery Index should be an integer: round to nearest, then cast
test_predictions = np.round(raw_test_predictions).astype(int)

# Pair each test 'Id' with its rounded prediction
submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'Recovery Index': test_predictions,
})

# Write the submission without the DataFrame index column
submission_df.to_csv('boosting_method_submission.csv', index=False)

print("Submission file 'boosting_method_submission.csv' created successfully.")
submission_df.head()

Submission file 'boosting_method_submission.csv' created successfully.


Unnamed: 0,Id,Recovery Index
0,6253,55
1,4685,22
2,1732,48
3,4743,31
4,4522,43


In [7]:
# Out-of-fold predictions for every training row (10-fold CV), so each
# prediction comes from a model that did not see that row during fitting
cv_raw_predictions = cross_val_predict(final_model, X, y, cv=10)

# Round to the nearest integer, as is done for the submission predictions
train_cv_predictions = np.round(cv_raw_predictions).astype(int)

# Side-by-side table of true vs. predicted values
comparison_df = pd.DataFrame({
    'Actual Recovery Index': y,
    'Predicted Recovery Index': train_cv_predictions,
})

# Delta = actual - predicted (positive means the model under-predicted)
residuals = comparison_df['Actual Recovery Index'] - comparison_df['Predicted Recovery Index']
comparison_df['Delta'] = residuals

# Write the comparison table without the DataFrame index column
comparison_df.to_csv('boosting_method_comparison.csv', index=False)

print("Comparison file 'boosting_method_comparison.csv' created successfully.")
comparison_df.head()

Comparison file 'boosting_method_comparison.csv' created successfully.


Unnamed: 0,Actual Recovery Index,Predicted Recovery Index,Delta
0,36,35,1
1,25,26,-1
2,59,58,1
3,22,21,1
4,40,41,-1


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the residuals
delta_series = comparison_df['Delta']
delta_description = delta_series.describe()

# Persist the summary; the statistic names are written as the index column
delta_description.to_csv('boosting_method_describe.csv')

print("Descriptive statistics file 'boosting_method_describe.csv' created successfully.")
print(delta_description)

Descriptive statistics file 'boosting_method_describe.csv' created successfully.
count    8000.000000
mean       -0.000125
std         2.103789
min        -8.000000
25%        -1.000000
50%         0.000000
75%         1.000000
max         9.000000
Name: Delta, dtype: float64


In [9]:
from sklearn.metrics import mean_squared_error

# Pull the actual and predicted values out of the comparison table
actual_values = comparison_df['Actual Recovery Index']
predicted_values = comparison_df['Predicted Recovery Index']

# Calculate RMSE
# mean_squared_error is called with its defaults and returns the MSE,
# so the square root is taken explicitly to obtain the RMSE.
mse = mean_squared_error(actual_values, predicted_values)
rmse = np.sqrt(mse)

# Print the result, formatted to 4 decimal places
# (same precision as the grid-search RMSE report)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Root Mean Squared Error (RMSE): 2.1036575291620068
