# KNN

### Evaluation Function

In [1]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

def evaluate_model(y_true, y_pred, X_test, model_name='LinearRegression'):
    """Score log-scale predictions after converting them back to the original scale.

    Parameters
    ----------
    y_true : array-like
        Observed targets on the log scale.
    y_pred : array-like
        Predicted targets on the log scale, aligned with ``y_true``.
    X_test : 2-D array-like exposing ``.shape``
        Feature matrix the predictions were made from. Only its shape is
        read: rows give the sample count, columns the predictor count used
        for adjusted R².
    model_name : str, default 'LinearRegression'
        Label for the single column of the returned frame. The default is
        kept for backward compatibility; pass e.g. ``'KNN'`` so the table
        names the model that was actually evaluated.

    Returns
    -------
    pandas.DataFrame
        One column named ``model_name`` indexed by 'MAE', 'MSE', 'RMSE',
        'R^2' and 'Adjusted R²'. 'Adjusted R²' is NaN when
        ``n - p - 1 <= 0`` because the statistic is undefined there.
    """
    n = X_test.shape[0]  # number of samples
    p = X_test.shape[1]  # number of predictors

    # The resale prices were logged during data processing, so exponentiate
    # them back to report the metrics in dollars rather than log units.
    y_true = np.exp(y_true)
    y_pred = np.exp(y_pred)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # Adjusted R² needs a positive residual degrees of freedom; without this
    # guard n == p + 1 divides by zero and n < p + 1 yields a meaningless value.
    dof = n - p - 1
    if dof > 0:
        adj_r2 = 1 - ((1 - r2) * (n - 1) / dof)
    else:
        adj_r2 = np.nan

    metrics = pd.Series({
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R^2': r2,
        'Adjusted R²': adj_r2
    })

    return metrics.to_frame(name=model_name)

### KNN Function

In [17]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

def knn_regression_tuning(X_train, X_test, y_train, y_test, k_values=None):
    """Grid-search a standardized KNN regressor and print its test-set metrics.

    Parameters
    ----------
    X_train, X_test : 2-D array-like
        Training and held-out feature matrices.
    y_train, y_test : array-like
        Training and held-out targets. They are passed unchanged to the
        estimator and to ``evaluate_model``.
    k_values : list of int, optional
        Candidate values for ``n_neighbors``. Defaults to
        ``[3, 5, 7, 9, 11]`` when None.

    Returns
    -------
    None
        The best hyperparameters and the metrics table are printed.
    """
    if k_values is None:
        k_values = [3, 5, 7, 9, 11]

    # Scaling lives inside the pipeline so the search fits it together with the regressor.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsRegressor())
    ])

    # Only n_neighbors and weights are tuned; the distance exponent `p` is not
    # part of this grid and stays at the estimator's default.
    param_grid = {
        'knn__n_neighbors': k_values,
        'knn__weights': ['uniform', 'distance']
    }

    # GridSearchCV with 5-fold CV and neg MSE scoring
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

    # Fit to training data
    grid_search.fit(X_train, y_train)

    # Best model from grid search
    best_model = grid_search.best_estimator_

    # Predict on test data
    y_pred = best_model.predict(X_test)

    # Evaluate performance (evaluate_model is defined earlier in the notebook)
    metrics = evaluate_model(y_test, y_pred, X_test)
    # evaluate_model labels its single column 'LinearRegression' by default;
    # relabel it so the printed table names the model that was actually fitted.
    metrics.columns = ['KNN']

    # Print best params and evaluation results
    print("Best hyperparameters:", grid_search.best_params_)
    print(metrics)

## df_base

In [8]:
import pandas as pd

# Load the pre-split base dataset (features and targets) in one pass.
# NOTE(review): pickle can execute code on load — only read files this project produced.
X_base_train, X_base_test, y_base_train, y_base_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "X_base_train.pkl",
        "X_base_test.pkl",
        "y_base_train.pkl",
        "y_base_test.pkl",
    )
)

### KNN

In [18]:
# Tune and score KNN on the base split; k_values is omitted so the default grid is used.
knn_regression_tuning(
    X_train=X_base_train,
    X_test=X_base_test,
    y_train=y_base_train,
    y_test=y_base_test,
)

Best hyperparameters: {'knn__n_neighbors': 5, 'knn__weights': 'distance'}
             LinearRegression
MAE              5.853455e+04
MSE              6.183791e+09
RMSE             7.863708e+04
R^2              8.261335e-01
Adjusted R²      8.259876e-01


## df_post_covid

In [20]:
# Load the pre-split post-COVID dataset (features and targets) in one pass.
# NOTE(review): pickle can execute code on load — only read files this project produced.
X_post_train, X_post_test, y_post_train, y_post_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "X_post_train.pkl",
        "X_post_test.pkl",
        "y_post_train.pkl",
        "y_post_test.pkl",
    )
)

### KNN

In [21]:
# Tune and score KNN on the post-COVID split; k_values is omitted so the default grid is used.
knn_regression_tuning(
    X_train=X_post_train,
    X_test=X_post_test,
    y_train=y_post_train,
    y_test=y_post_test,
)

Best hyperparameters: {'knn__n_neighbors': 7, 'knn__weights': 'distance'}
             LinearRegression
MAE              4.552422e+04
MSE              3.972737e+09
RMSE             6.302965e+04
R^2              8.974342e-01
Adjusted R²      8.972500e-01
