<a href="https://colab.research.google.com/github/MumbuaFaithK/ai-and-data-projects/blob/main/California_KNN_Model_Deployment_Faith_M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# California Housing Price Prediction using KNN

In [None]:
!pip install -U scikit-learn



## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error


## 2. Load Dataset

In [None]:
# Fetch the California housing data as pandas objects:
# X holds the feature columns, y holds the target.
X, y = fetch_california_housing(
    as_frame=True,
    return_X_y=True,
)

# Show the first five feature rows as a sanity check
X.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


## 3. Train-Test Split (80/20)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 4. Preprocessing with ColumnTransformer

In [None]:
# Every column of X is numeric, so one transformer branch handles them all
numeric_features = X.columns

# Numeric branch: fill missing values with the column mean, then standardize
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Route the numeric columns through the numeric branch
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

## 5. Build Full Pipeline

In [None]:
# Chain preprocessing and the KNN regressor into a single estimator so that
# imputation and scaling are fitted together with the model.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('knn', KNeighborsRegressor()),
    ]
)

## 6. Define Hyperparameter Grid

In [None]:
# Search space for the 'knn' pipeline step (4 x 2 x 2 = 16 combinations)
param_grid = {
    # odd neighbour counts from 3 through 9
    'knn__n_neighbors': list(range(3, 10, 2)),
    # equal votes vs. votes weighted by inverse distance
    'knn__weights': ['uniform', 'distance'],
    # Minkowski power: 1 = Manhattan, 2 = Euclidean
    'knn__p': [1, 2],
}

## 7. GridSearchCV with 5-Fold Cross-Validation

In [None]:
# Configure the exhaustive search: every combination in param_grid is scored
# by R² under 5-fold cross-validation, using all available CPU cores.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)


## 8. Train the Model

In [None]:
# Run the cross-validated search on the training split only;
# the test split stays untouched until evaluation.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


0,1,2
,estimator,Pipeline(step...Regressor())])
,param_grid,"{'knn__n_neighbors': [3, 5, ...], 'knn__p': [1, 2], 'knn__weights': ['uniform', 'distance']}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,9
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,1
,metric,'minkowski'
,metric_params,
,n_jobs,


## 9. Model Evaluation

In [None]:
# Evaluate the tuned pipeline on the held-out test set.
#
# grid_search was already fitted in section 8, so it is reused here instead of
# rebuilding the grid and repeating all 80 fits. With refit=True (the
# GridSearchCV default) best_estimator_ is the winning pipeline retrained on
# the whole training split.
best_model = grid_search.best_estimator_

# Predictions must exist before any metric is computed
y_pred = best_model.predict(X_test)

# Test-set metrics; RMSE is derived from the MSE already computed
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report the chosen hyperparameters, the cross-validated score, and test scores
print("Best Parameters:", grid_search.best_params_)
print("Best CV R² Score:", grid_search.best_score_)
print("Test R² Score:", r2)
print("Test MSE:", mse)
print("Test RMSE:", rmse)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
R² Score: 0.72210916268423
MSE: 0.3641506481894662
RMSE: 0.6034489607162036
Best Parameters: {'knn__n_neighbors': 9, 'knn__p': 1, 'knn__weights': 'distance'}
Best CV R² Score: 0.731266870986164
Test R² Score: 0.72210916268423
Test MSE: 0.3641506481894662
Test RMSE: 0.6034489607162036


## 10. Save the Trained Pipeline

In [None]:
# Serialize the tuned pipeline (preprocessing + KNN) to disk with pickle.
# Only unpickle this file from a trusted source: loading a pickle can execute code.
with open('california_knn_pipeline.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)

print("Final model saved as 'california_knn_pipeline.pkl'")

Final model saved as 'california_knn_pipeline.pkl'


# Conclusion

### Summary of Results

- The K-Nearest Neighbors model was trained and optimized using GridSearchCV.
- The best model used the following parameters:
  - `n_neighbors`: **9**
  - `weights`: **`'distance'`**
  - `p`: **1** (Manhattan distance)
- The final R² score on the test set was **0.722** (best cross-validated R²: 0.731), indicating that the model explains about 72% of the variance in housing prices.
- The model was successfully saved as a `.pkl` file and is ready for deployment.
