# ImmoEliza Project - Part 3: Regression 

# Step 5-6: Model Application & Evaluation (KNN Regressor)

In [1]:
# Import separately scaled features & target

import pandas as pd

X_train = pd.read_csv('./data/3_Scaled_Features_Train.csv')
X_test = pd.read_csv('./data/3_Scaled_Features_Test.csv')
y_train = pd.read_csv('./data/3_Target_Train.csv')
y_test = pd.read_csv('./data/3_Target_Test.csv')

In [2]:
X_train.columns

Index(['commune_encoded', 'living_area', 'building_condition', 'terrace',
       'equipped_kitchen', 'subtype_of_property', 'garden'],
      dtype='object')

In [3]:
# Import libraries

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

### KNN Model Initialization (Regressor) & Training

In [4]:
# Initialize the KNN regressor
knn = KNeighborsRegressor(n_neighbors=5)  # You can try different values for n_neighbors

# Train the model
knn.fit(X_train, y_train)

### Make Predictions

In [5]:
# Make predictions on both train and test data
y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

### Optimize K (Optional)

In [6]:
"""# Try different values of k
k_values = range(1, 21)  # Test k from 1 to 20
mse_values = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    mse_values.append(mean_squared_error(y_test, y_pred))

# Plot or print the best k
best_k = k_values[mse_values.index(min(mse_values))]
print(f"Best k: {best_k}")"""


'# Try different values of k\nk_values = range(1, 21)  # Test k from 1 to 20\nmse_values = []\n\nfor k in k_values:\n    knn = KNeighborsRegressor(n_neighbors=k)\n    knn.fit(X_train, y_train)\n    y_pred = knn.predict(X_test)\n    mse_values.append(mean_squared_error(y_test, y_pred))\n\n# Plot or print the best k\nbest_k = k_values[mse_values.index(min(mse_values))]\nprint(f"Best k: {best_k}")'

### Hyperparameter Tuning (Optional)

In [7]:
"""# Tune hyperparameters
knn = KNeighborsRegressor(n_neighbors=best_k, weights='distance')
knn.fit(X_train, y_train)
y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

# Calculate MSE and R² for train data
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)

# Calculate MSE, MAE and R² for test data
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)

# Print the results
print(f"Training MSE: {mse_train}")
print(f"Training R²: {r2_train}")
print(f"Test MSE: {mse_test}")
print(f"Test R²: {r2_test}")
print(f"Test MAE: {mae}")"""

'# Tune hyperparameters\nknn = KNeighborsRegressor(n_neighbors=best_k, weights=\'distance\')\nknn.fit(X_train, y_train)\ny_pred_train = knn.predict(X_train)\ny_pred_test = knn.predict(X_test)\n\n# Calculate MSE and R² for train data\nmse_train = mean_squared_error(y_train, y_pred_train)\nr2_train = r2_score(y_train, y_pred_train)\n\n# Calculate MSE, MAE and R² for test data\nmse_test = mean_squared_error(y_test, y_pred_test)\nr2_test = r2_score(y_test, y_pred_test)\nmae = mean_absolute_error(y_test, y_pred_test)\n\n# Print the results\nprint(f"Training MSE: {mse_train}")\nprint(f"Training R²: {r2_train}")\nprint(f"Test MSE: {mse_test}")\nprint(f"Test R²: {r2_test}")\nprint(f"Test MAE: {mae}")'

### Use Cross-Validation to Find the best k

In [8]:
"""from sklearn.model_selection import cross_val_score

k_values = range(1, 28)  # Test k from 1 to 20

# Cross-validation to find the best k
cv_scores = []
for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores.append(-scores.mean())  # Negative because cross_val_score returns negative MSE

# Plot or print the best k based on cross-validation
best_k_cv = k_values[cv_scores.index(min(cv_scores))]
print(f"Best k (using cross-validation): {best_k_cv}")"""

'from sklearn.model_selection import cross_val_score\n\nk_values = range(1, 28)  # Test k from 1 to 20\n\n# Cross-validation to find the best k\ncv_scores = []\nfor k in k_values:\n    knn = KNeighborsRegressor(n_neighbors=k)\n    scores = cross_val_score(knn, X, y, cv=5, scoring=\'neg_mean_squared_error\')\n    cv_scores.append(-scores.mean())  # Negative because cross_val_score returns negative MSE\n\n# Plot or print the best k based on cross-validation\nbest_k_cv = k_values[cv_scores.index(min(cv_scores))]\nprint(f"Best k (using cross-validation): {best_k_cv}")'

In [9]:
# Tune hyperparameters
knn = KNeighborsRegressor(n_neighbors=24, weights='distance')
knn.fit(X_train, y_train)
y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

# Calculate MSE and R² for train data
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
mae = mean_absolute_error(y_test, y_pred_test)

# Calculate MSE and R² for test data
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

# Print the results
print(f"Training MSE: {mse_train}")
print(f"Training R²: {r2_train}")
print(f"Test MSE: {mse_test}")
print(f"Test R²: {r2_test}")
print(f"Test MAE: {mae}")

Training MSE: 244650050.331399
Training R²: 0.9930719929293508
Test MSE: 10674533472.197187
Test R²: 0.7140905153339875
Test MAE: 63138.96689789522


### Confirm K

In [10]:
# Confirm k with cross-validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(knn, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
mean_cv_mse = -scores.mean()

print("Cross-Validated MSE for k:", mean_cv_mse)


Cross-Validated MSE for k: 10798466471.962687


In [11]:
# Confirm k with r²

from sklearn.model_selection import cross_val_score
scores_r2 = cross_val_score(knn, X_train, y_train, scoring='r2', cv=5)
mean_cv_r2 = scores_r2.mean()

print("Cross-Validated R² for k:", mean_cv_r2)


Cross-Validated R² for k: 0.6939575694617033


Training MSE: 206546610.847818
Training R²: 0.9941510072103644
Test MSE: 14077834562.528383
Test R²: 0.622935612552125

Both test values and their comparison to training results indicate overfitting, meaning the model learns noise and specific patterns 

In [12]:
"""for column in X_train.columns:
    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=X_train[column], y=y_train)
    plt.title(f"{column} vs Price")
    plt.xlabel(column)
    plt.ylabel("Price")
    plt.show()
"""

'for column in X_train.columns:\n    plt.figure(figsize=(8, 5))\n    sns.scatterplot(x=X_train[column], y=y_train)\n    plt.title(f"{column} vs Price")\n    plt.xlabel(column)\n    plt.ylabel("Price")\n    plt.show()\n'

### Residual Analysis
Residual analysis can reveal systematic errors in the predictions that indicate missing or less useful features

In [13]:
# Calculate residuals (difference between the actual and predicted values)
residuals = y_test - y_pred_test

In [14]:
# Visualize residuals
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred_test, y=residuals)
plt.axhline(0, color='red', linestyle='--')
plt.title("Residuals vs Predicted Values")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.show()

ValueError: Per-column arrays must each be 1-dimensional

<Figure size 1000x600 with 0 Axes>

What you see: This scatter plot shows the residuals (the differences between the actual and predicted values) on the vertical axis and the predicted values on the horizontal axis.

Interpretation:

* The horizontal red dashed line indicates where the residuals should ideally be centered (around 0). This line helps you see if there are systematic deviations in the residuals.
* A good model typically has residuals scattered randomly around this line without any obvious pattern.
* In this case, the residuals appear to increase as the predicted values rise, especially with higher predicted values. This suggests that the model has heteroscedasticity (variance of residuals is not constant), meaning it performs worse for higher predicted values.
* Ideally, you want this plot to show no clear pattern or structure. If there’s a trend (like here), it indicates that the model might not be capturing something in the data that it should, especially for higher values.

In [None]:
# Histogram of residuals
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=30)
plt.title("Distribution of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

What you see: This is a histogram with a kernel density estimate (KDE) curve, showing the distribution of residuals.

Interpretation:

* The spike around 0 suggests that most residuals are close to 0, indicating that the model’s predictions are generally quite accurate for most instances.
* However, you can see that the residuals have a heavy skew, with a long tail on the right side. This points to the fact that there are some predictions that are off by a large amount, and the model is not doing well on those instances.
* Ideally, the residuals should follow a normal distribution, and this plot looks non-normal, which might indicate a bias in the model or the presence of outliers that the model cannot handle well.

What to Do Next?

* Check for outliers: The skewness in the distribution suggests that some points may be far off, which could influence the model’s accuracy.
* Consider transforming the data: If you suspect non-linearity or heteroscedasticity, you could try transforming the target variable (e.g., applying a log transformation).
* Tune your model: You might need to adjust your KNN model’s parameters, such as the number of neighbors.
* Model evaluation: Consider using other residual diagnostics, like the Q-Q plot, to check for normality of residuals or running cross-validation to confirm if this trend holds across different subsets of the data.

### Inversing log transformation of target

In [None]:
import numpy as np

# Inverse predicted values
#y_pred_original = np.expm1(y_pred_test)

# Inverse original log tranformed values
#y_test_original = np.expm1(y_test)

In [None]:
# Calculate mse and r²

#mse_original = mean_squared_error(y_test_original, y_pred_original)
#r2_original = r2_score(y_test_original, y_pred_original)

#print(f'MSE (Original scale): {mse_original}')
#print(f'R² (Original scale): {r2_original}')


### Permutation Importance

* This method measures the importance of a feature by randomly shuffling its values, breaking the relationship between the feature and the target.
* You then check how much the model's performance (e.g., MSE or R²) drops. A large drop indicates the feature is important, while a small drop suggests it isn't.

In [None]:
from sklearn.inspection import permutation_importance
import numpy as np

# Calculate permutation importance
perm_importance = permutation_importance(knn, X_test, y_test, scoring='neg_mean_squared_error', random_state=42)

# Display feature importance
for i, feature in enumerate(X_test.columns):
    print(f"Feature: {feature}, Importance: {perm_importance.importances_mean[i]:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Sort features by importance
sorted_idx = np.argsort(perm_importance.importances_mean)
plt.barh(X_test.columns[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel('Permutation Importance')
plt.ylabel('Features')
plt.title('Feature Importance for KNN Regressor')
plt.show()