# K-Nearest Neighbor Regression with Hyperparameter Optimization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report, r2_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
from scipy.special import softmax

# Load the pre-split datasets (train / validation / test), in that order.
train_data, val_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ("Train_Set.csv", "Validation_Set.csv", "Test_Set.csv")
)


def _split_features_target(frame):
    """Return ``(features, target)`` using the "Severity" column as target."""
    return frame.drop(columns=["Severity"]), frame["Severity"]


# Training-only and test feature/target pairs.
X, y = _split_features_target(train_data)
X_test, y_test = _split_features_target(test_data)

# Train and validation rows stacked row-wise (training rows first); the
# original row labels of each frame are kept as-is.
combined_data = pd.concat([train_data, val_data])
X_combined, y_combined = _split_features_target(combined_data)

# Fold labels for PredefinedSplit, one per row of the stacked train+validation
# data: -1 keeps a row out of every test fold (always used for training),
# 0 assigns the row to the single validation fold.
split_index = [-1] * len(train_data)
split_index += [0] * len(val_data)
predefined_split = PredefinedSplit(test_fold=split_index)

# Search space for KNeighborsRegressor. Each hyperparameter lists a single
# candidate, so the search below evaluates exactly one configuration.
param_grid_knn = dict(
    n_neighbors=[3],
    weights=['distance'],
    p=[1],
)

# Randomized search scored by negative MSE on the predefined validation fold.
grid_search_knn = RandomizedSearchCV(
    estimator=KNeighborsRegressor(),
    param_distributions=param_grid_knn,
    n_iter=1,
    scoring='neg_mean_squared_error',
    cv=predefined_split,
    n_jobs=-1,
)
grid_search_knn.fit(X_combined, y_combined)

# With the default refit behaviour, best_estimator_ is the winning
# configuration refit on all rows passed to fit (train + validation).
best_knn_model = grid_search_knn.best_estimator_
print("Best parameters for KNN:", grid_search_knn.best_params_)

# Predict on the held-out test set with the optimized KNeighborsRegressor.
# The regressor returns continuous values; the rounded copy maps them onto
# integer labels so they can be compared with y_test as class predictions.
# NOTE: ndarray.round() rounds exact halves to the nearest even integer.
y_pred_knn = best_knn_model.predict(X_test)
y_pred_knn_rounded = y_pred_knn.round().astype(int)

# Classes are the distinct target values present in the test set.
classes = sorted(y_test.unique())
n_classes = len(classes)
class_values = np.asarray(classes)

# Absolute distance from each continuous prediction to each class value,
# shape (n_samples, n_classes).
distance_matrix = np.abs(y_pred_knn[:, np.newaxis] - class_values)

# Convert distances to pseudo-probabilities using softmax over the classes:
# the lower the distance, the higher the "probability". Each row sums to 1.
pseudo_prob_matrix = softmax(-distance_matrix, axis=1)

# One-vs-rest indicator matrix for the ROC curves, one column per class.
# Built by direct comparison rather than label_binarize, because
# label_binarize returns a single column when there are exactly two classes,
# which breaks per-class column indexing (y_test_binarized[:, i]).
y_test_binarized = (
    np.asarray(y_test)[:, np.newaxis] == class_values
).astype(int)

# Draw one one-vs-rest ROC curve per class, scoring each sample with the
# pseudo-probability of that class.
plt.figure(figsize=(12, 10))

for i, class_label in enumerate(classes):
    true_column = y_test_binarized[:, i]
    score_column = pseudo_prob_matrix[:, i]
    fpr, tpr, _ = roc_curve(true_column, score_column)
    roc_auc = auc(fpr, tpr)
    plt.plot(
        fpr,
        tpr,
        lw=2,
        label=f'ROC curve for class {class_label} (area = {roc_auc:.2f})',
    )

# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')

# Axes, labels and legend.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for KNN Regressor')
plt.legend(loc="lower right")
plt.grid()
plt.show()

# Evaluation
# NOTE: every metric below is computed on the *rounded* predictions, so the
# regression metrics describe the integer-valued output, not the raw
# continuous output of the regressor.
mae_knn = mean_absolute_error(y_test, y_pred_knn_rounded)
mse_knn = mean_squared_error(y_test, y_pred_knn_rounded)
r2 = r2_score(y_test, y_pred_knn_rounded)
accuracy = accuracy_score(y_test, y_pred_knn_rounded)


# Results: regression-style metrics first ...
print(f"Optimized KNeighbors Regressor - MAE: {mae_knn}")
print(f"Optimized KNeighbors Regressor - MSE: {mse_knn}")
print(f"R-squared (R²): {r2:.4f}")

# ... then the classification view, so that only classification metrics
# appear under the "Classification Report:" header.
print("Classification Report:")
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred_knn_rounded))