<a href="https://colab.research.google.com/github/MarianBolous/AceGPT-v2/blob/main/california_rf_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# California Housing Regression with scikit‑learn & MLflow (Google Colab)

End‑to‑end ML pipeline **ready to run in Colab**:
1. Install the dependencies in the Colab VM
2. Mount Google Drive for persistent MLflow runs
3. Expose the MLflow UI via **ngrok**
4. Train & register a RandomForest model
5. Optional: serve the model behind a public URL


## 0  Install dependencies

In [None]:
!pip -q install --upgrade pip
!pip -q install scikit-learn==1.4.2 mlflow pyngrok pandas numpy matplotlib seaborn


## 1  Mount Google Drive (to keep MLflow artifacts)

In [None]:
import os

from google.colab import drive

# Attach Google Drive to the VM so files written under it persist.
drive.mount('/content/drive')

# Direct MLflow tracking to a file store located on the mounted Drive.
os.environ['MLFLOW_TRACKING_URI'] = 'file:///content/drive/MyDrive/mlruns'


## 2  Launch MLflow UI via ngrok

In [None]:
import os
import socket
import subprocess
import time
from getpass import getpass

from pyngrok import ngrok

MLFLOW_UI_PORT = 5000

# 🔑 The ngrok token is taken from the NGROK_AUTHTOKEN environment variable or
# prompted for interactively, so it is never saved inside the notebook.
ngrok_token = os.environ.get('NGROK_AUTHTOKEN') or getpass('ngrok authtoken: ')
ngrok.set_auth_token(ngrok_token)

# Pass the store location explicitly so the UI serves the same store that the
# training cell logs to, instead of relying on the CLI's default location.
# Falls back to './mlruns' when MLFLOW_TRACKING_URI has not been set.
backend_store_uri = os.environ.get('MLFLOW_TRACKING_URI', './mlruns')
mlflow_ui = subprocess.Popen([
    'mlflow', 'ui',
    '--backend-store-uri', backend_store_uri,
    '--port', str(MLFLOW_UI_PORT),
    '--host', '0.0.0.0',
])

# Wait until the server accepts connections instead of sleeping blindly;
# fail fast if the process dies (e.g. port already in use) or never comes up.
startup_deadline = time.monotonic() + 30
while True:
    if mlflow_ui.poll() is not None:
        raise RuntimeError(
            f'mlflow ui exited during startup (exit code {mlflow_ui.returncode})')
    try:
        socket.create_connection(('127.0.0.1', MLFLOW_UI_PORT), timeout=1).close()
        break
    except OSError:
        if time.monotonic() >= startup_deadline:
            mlflow_ui.terminate()
            raise TimeoutError(
                f'mlflow ui did not start listening on port {MLFLOW_UI_PORT}') from None
        time.sleep(0.5)

# Open a public HTTP tunnel to the local UI and show its address.
public_url = ngrok.connect(addr=MLFLOW_UI_PORT, proto='http')
print('MLflow UI 👉', public_url.public_url)


## 3  Imports & experiment setup

In [None]:
import os, subprocess, time
import mlflow, mlflow.sklearn
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from mlflow.models import infer_signature
import pandas as pd
import numpy as np

# Name of the MLflow experiment used by this notebook; set_experiment selects
# it as the active experiment for the runs started in later cells.
EXPERIMENT_NAME = 'DS-Method-California-Housing'
mlflow.set_experiment(EXPERIMENT_NAME)


## 4  Load data, build pipeline, train & log

In [None]:
# --- Data -------------------------------------------------------------------
# Fetch the dataset as pandas objects and save summary statistics of the
# features to a CSV that is logged as an artifact further down.
raw = fetch_california_housing(as_frame=True)
X_full = raw.data
y_full = raw.target
feature_summary = X_full.describe()
feature_summary.to_csv('feature_summary.csv')

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    test_size=0.20,
    random_state=42,
)

# --- Model ------------------------------------------------------------------
# Scaling step followed by a seeded random forest regressor.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42)),
])

# Hyper-parameter candidates for the 'rf' step of the pipeline.
param_grid = dict(
    rf__n_estimators=[120, 240],
    rf__max_depth=[None, 15],
    rf__min_samples_split=[2, 4],
)

# --- Train, evaluate and log inside a single MLflow run -----------------------
with mlflow.start_run(run_name='rf_regressor_colab'):
    # 3-fold grid search scored by (negated) mean absolute error.
    gscv = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    gscv.fit(X_train, y_train)

    best = gscv.best_estimator_
    mlflow.log_params(gscv.best_params_)

    # Score the selected pipeline on the held-out rows.
    y_pred = best.predict(X_test)
    metrics = dict(
        MAE=mean_absolute_error(y_test, y_pred),
        RMSE=root_mean_squared_error(y_test, y_pred),
        R2=r2_score(y_test, y_pred),
    )
    mlflow.log_metrics(metrics)

    # Feature importances of the fitted forest, largest first.
    fitted_forest = best.named_steps['rf']
    fi = pd.Series(
        fitted_forest.feature_importances_,
        index=X_full.columns,
    ).sort_values(ascending=False)
    fi.to_csv('feature_importance.csv')
    mlflow.log_artifact('feature_importance.csv', artifact_path='insight')
    mlflow.log_artifact('feature_summary.csv', artifact_path='eda')

    # The first five test rows serve as both the signature sample and the
    # input example stored with the model.
    sample_inputs = X_test.head(5)
    signature = infer_signature(sample_inputs, best.predict(sample_inputs))
    mlflow.sklearn.log_model(
        best,
        artifact_path='model',
        registered_model_name='CaliforniaRFRegressor',
        signature=signature,
        input_example=sample_inputs,
    )

print('✅ Training run complete — check the MLflow UI link above')


## 5  (Optional) Serve the model with ngrok

In [None]:
# Optional cell: uncomment every line below to serve the registered model and
# expose it through a second ngrok tunnel on port 9000.
# Relies on `subprocess` and `time` imported in earlier cells, and on the model
# registered as 'CaliforniaRFRegressor' by the training cell.
# NOTE(review): the URI pins version 1; re-running the training cell presumably
# registers newer versions — confirm and adjust the version number as needed.
# from pyngrok import ngrok
# model_uri = 'models:/CaliforniaRFRegressor/1'
# proc = subprocess.Popen(['mlflow', 'models', 'serve', '-m', model_uri, '-p', '9000', '--host', '0.0.0.0'])
# time.sleep(5)
# endpoint = ngrok.connect(9000, 'http')
# print('Model endpoint 👉', endpoint.public_url + '/invocations')
