In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the datasets
try:
    train_df = pd.read_csv('processed_train.csv')
    test_df = pd.read_csv('processed_test.csv')
    print("Data loaded successfully.")
    print("Training data shape:", train_df.shape)
    print("Test data shape:", test_df.shape)
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure 'processed_train.csv' and 'processed_test.csv' are in the same directory as this notebook.")

# Display the first few rows to confirm they are loaded correctly
print("Training Data Head:")
print(train_df.head())
print("\nTest Data Head:")
print(test_df.head())

Data loaded successfully.
Training data shape: (8000, 7)
Test data shape: (2000, 6)
Training Data Head:
     Id  Therapy Hours  Initial Health Score  Lifestyle Activities  \
0  9255       0.006455             -1.183844             -0.989307   
1  1562      -1.149747             -1.241506              1.010808   
2  1671      -1.149747              0.661318             -0.989307   
3  6088      -1.149747             -1.356828             -0.989307   
4  6670       1.162658             -1.299167             -0.989307   

   Average Sleep Hours  Follow-Up Sessions  Recovery Index  
0             0.269888            0.134041              36  
1             0.269888            0.483562              25  
2             0.269888           -0.914520              59  
3            -0.318906           -1.264041              22  
4             1.447477           -1.613561              40  

Test Data Head:
     Id  Therapy Hours  Initial Health Score  Lifestyle Activities  \
0  6253       0.006455

In [5]:
# Prepare the training data
X_train = train_df.drop(['Id', 'Recovery Index'], axis=1)
y_train = train_df['Recovery Index']

# Prepare the test data
X_test = test_df.drop('Id', axis=1)

print("Data prepared for modeling.")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)

Data prepared for modeling.
X_train shape: (8000, 5)
y_train shape: (8000,)
X_test shape: (2000, 5)


In [6]:
# Define the parameter grid to search
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Initialize the KNN Regressor model
knn = KNeighborsRegressor()

# Set up K-Fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=kf,
                           scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=2)

# Fit GridSearchCV to the data
print("Starting hyperparameter tuning with GridSearchCV...")
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("\nBest Hyperparameters found:")
print(grid_search.best_params_)
print("\nBest Cross-Validation RMSE Score:")
# We negate the score because scoring was 'neg_root_mean_squared_error'
print(-grid_search.best_score_)

Starting hyperparameter tuning with GridSearchCV...
Fitting 10 folds for each of 42 candidates, totalling 420 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=  

In [7]:
# Get the best estimator from the grid search
final_knn_model = grid_search.best_estimator_

print("Final KNN model with best parameters:")
print(final_knn_model)

# The model is already trained on the full data by GridSearchCV when refit=True (default),
# so we can directly use it for predictions.
print("\nFinal model has been trained on the entire training dataset.")

Final KNN model with best parameters:
KNeighborsRegressor(metric='euclidean', n_neighbors=15, weights='distance')

Final model has been trained on the entire training dataset.


In [8]:
# Make predictions on the test data
print("Generating predictions on the test set...")
test_predictions = final_knn_model.predict(X_test)

# Round predictions to the nearest integer as per the problem description
test_predictions_rounded = np.round(test_predictions).astype(int)

print("Predictions generated successfully.")
print("First 5 predictions:", test_predictions_rounded[:5])

Generating predictions on the test set...
Predictions generated successfully.
First 5 predictions: [53 25 45 34 43]


In [9]:
# Create the submission DataFrame
submission_df = pd.DataFrame({'Id': test_df['Id'], 'Recovery Index': test_predictions_rounded})

# Save the submission file
submission_df.to_csv('knn_submission.csv', index=False)

print("'knn_submission.csv' has been created successfully!")
submission_df.head()

'knn_submission.csv' has been created successfully!


Unnamed: 0,Id,Recovery Index
0,6253,53
1,4685,25
2,1732,45
3,4743,34
4,4522,43


In [13]:
# Generate cross-validated predictions for the training data
print("Generating cross-validated predictions for the training set...")
train_cv_predictions = cross_val_predict(final_knn_model, X_train, y_train, cv=kf)

# Round the predictions
train_cv_predictions_rounded = np.round(train_cv_predictions).astype(int)

print("Cross-validated predictions generated successfully.")
print("First 5 CV predictions:", train_cv_predictions_rounded[:5])

Generating cross-validated predictions for the training set...
Cross-validated predictions generated successfully.
First 5 CV predictions: [34 27 56 23 41]


In [14]:
# Create the comparison DataFrame
comparison_df = pd.DataFrame({
    'Actual': y_train,
    'Predicted': train_cv_predictions_rounded
})

# Calculate the Delta (difference)
comparison_df['Delta'] = comparison_df['Actual'] - comparison_df['Predicted']

# Save the comparison file
comparison_df.to_csv('knn_comparison.csv', index=False)

print("'knn_comparison.csv' has been created successfully!")
comparison_df.head()

'knn_comparison.csv' has been created successfully!


Unnamed: 0,Actual,Predicted,Delta
0,36,34,2
1,25,27,-2
2,59,56,3
3,22,23,-1
4,40,41,-1


In [15]:
# Get descriptive statistics for the 'Delta' column
delta_description = comparison_df['Delta'].describe()

# Save the description to a CSV file
delta_description.to_csv('knn_describe.csv')

print("'knn_describe.csv' has been created successfully!")
print("\nStatistics for the 'Delta' (Actual - Predicted) column:")
print(delta_description)

'knn_describe.csv' has been created successfully!

Statistics for the 'Delta' (Actual - Predicted) column:
count    8000.000000
mean       -0.004250
std         2.815133
min       -12.000000
25%        -2.000000
50%         0.000000
75%         2.000000
max        12.000000
Name: Delta, dtype: float64
