<a href="https://colab.research.google.com/github/ND3MW4/Introducton_to_Shell_Scripting_workshop/blob/main/MLOPs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📦 Import libraries

In [None]:
import pandas as pd
import pickle
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error

# 1. Load dataset

In [None]:
# Load the California housing data as a (features, target) pair; as_frame=True
# asks scikit-learn for pandas objects rather than NumPy arrays.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)

# 2. Train-test split

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed random_state
# makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Preprocessing: Imputation + Scaling for numerical features

In [None]:
# Every column of X is numerical, so one transformer handles all of them.
numeric_features = X.columns

# Two-stage numeric preprocessing: fill missing values with the column mean,
# then standardize each feature.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# 4. Combine preprocessing using ColumnTransformer

In [None]:
# Apply the numeric transformer to the columns listed in numeric_features.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
    ]
)

# 5. Build pipeline: preprocessing + KNN

In [None]:
# End-to-end estimator: preprocessing first, then a K-nearest-neighbours
# regressor. The step names are used as prefixes when tuning hyperparameters.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("knn", KNeighborsRegressor()),
    ]
)

# 6. Define hyperparameter grid

In [None]:
# Candidate KNN settings (4 x 2 x 2 = 16 combinations). The "knn__" prefix
# addresses the parameters of the pipeline step named "knn".
param_grid = dict(
    knn__n_neighbors=[3, 5, 7, 9],
    knn__weights=["uniform", "distance"],
    knn__p=[1, 2],
)


# 7. Apply GridSearchCV with 5-fold cross-validation

In [None]:
# Exhaustive search over param_grid: each candidate is scored by R² using
# 5-fold cross-validation. n_jobs=-1 runs the fits in parallel and verbose=1
# prints a short progress summary.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# 8. Fit the model

In [None]:
# Run the cross-validated search on the training split only, so the test
# split stays untouched until the final evaluation.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


# 9. Evaluate on test set

In [None]:
# The search was already fitted in step 8, so reuse it here rather than
# repeating all 80 cross-validation fits. best_estimator_ is the pipeline
# refit with the winning hyperparameters.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Score the predictions against the held-out test targets.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # derive RMSE from the MSE above instead of recomputing it

Fitting 5 folds for each of 16 candidates, totalling 80 fits


# 10. Print results

In [None]:
# Report the selected hyperparameters, their cross-validation score, and the
# metrics measured on the held-out test split, one "label value" line each.
summary_rows = [
    ("Best Parameters:", grid_search.best_params_),
    ("Best CV R² Score:", grid_search.best_score_),
    ("Test R² Score:", r2),
    ("Test MSE:", mse),
    ("Test RMSE:", rmse),
]
for metric_label, metric_value in summary_rows:
    print(metric_label, metric_value)

Best Parameters: {'knn__n_neighbors': 9, 'knn__p': 1, 'knn__weights': 'distance'}
Best CV R² Score: 0.731266870986164
Test R² Score: 0.72210916268423
Test MSE: 0.3641506481894662
Test RMSE: 0.6034489607162036


# 11. Save the pipeline

In [None]:
# Serialize the fitted pipeline (preprocessing + KNN) so it can be reloaded
# later without retraining. Pickle files can execute arbitrary code when
# loaded, so only unpickle this file from a trusted location.
with open('california_knn_pipeline.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)

print("📦 Final pipeline saved to 'california_knn_pipeline.pkl'")

📦 Final pipeline saved to 'california_knn_pipeline.pkl'
