In [None]:
import os
import warnings
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

warnings.filterwarnings("ignore")

### Configure MLflow Tracking and Experiment

In [None]:
import mlflow

# Run metadata (metrics, parameters, tags, ...) is persisted in the local SQLite file "mlflow.db".
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

# All runs started below are grouped under the "nyc-taxi-experiment" experiment.
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

In [None]:
# `get_artifact_uri()` implicitly starts a run when none is active. End that run
# right away; otherwise it stays active and the later `with mlflow.start_run():`
# cells fail because a run is already active.
artifact_uri = mlflow.get_artifact_uri()
mlflow.end_run()
artifact_uri

In [None]:
# Forward slashes resolve on Windows, Linux and macOS alike (a backslash path only works on Windows).
# A fixed `random_state` makes the 1% samples, and every metric derived from them, reproducible.
df = pd.read_parquet('data/yellow_tripdata_2023-01.parquet').sample(frac=0.01, random_state=42)
df_val = pd.read_parquet('data/yellow_tripdata_2023-02.parquet').sample(frac=0.01, random_state=42)
df.head()

In [None]:
# Work on an independent (deep) copy so the sampled raw frame `df` stays untouched.
df_copy = df.copy(deep=True)

In [None]:
# Summary (columns, dtypes, non-null counts) of the sampled training frame.
df_copy.info()

In [None]:
# Trip duration in minutes, derived from the pickup/drop-off timestamps.
df_copy['duration'] = (
    (df_copy['tpep_dropoff_datetime'] - df_copy['tpep_pickup_datetime']).dt.total_seconds() / 60
)
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df_copy = df_copy.loc[df_copy['duration'].between(1, 60)]
# The pickup and drop-off zone IDs are the only model features.
df_train = df_copy[['PULocationID', 'DOLocationID']]

In [None]:
# One-hot encode the two location columns: the first category of each column is
# dropped, and categories unseen during fitting are ignored at transform time.
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
encoder.fit(df_train)
df_train_encoded = encoder.transform(df_train)
df_train_encoded.shape

In [None]:
# Feature matrix and target vector (trip duration in minutes) for training.
X_train = df_train_encoded
y_train = df_copy['duration'].values

In [None]:
# Baseline model: ordinary least squares on the one-hot encoded locations.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train);

In [None]:
# RMSE on the training set. The square root of the MSE is taken explicitly because
# the `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_train = lr_model.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred_train))

In [None]:
# Build the validation target/features WITHOUT overwriting `df_val`, so this cell
# can be re-run safely (previously `df_val` was replaced by a two-column frame and
# a second execution failed on the missing timestamp columns).
val_duration = (df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']).dt.total_seconds() / 60
# Same filter as for training: trips of 1 to 60 minutes, bounds inclusive.
val_mask = val_duration.between(1, 60)

y_val = val_duration[val_mask].values
# Reuse the encoder fitted on the training data; never refit on validation data.
X_val = encoder.transform(df_val.loc[val_mask, ['PULocationID', 'DOLocationID']])

In [None]:
# Validation RMSE of the linear model. np.sqrt(MSE) replaces `squared=False`,
# which was deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_val = lr_model.predict(X_val)
np.sqrt(mean_squared_error(y_val, y_pred_val))

In [None]:
# Persist the model together with its encoder. The target folder is created first,
# because writing into a missing directory raises FileNotFoundError.
os.makedirs('models', exist_ok=True)
joblib.dump((lr_model, encoder), 'models/lin_reg.bin')

In [None]:
# Lasso baseline and its validation RMSE. np.sqrt(MSE) replaces `squared=False`,
# which was deprecated in scikit-learn 1.4 and removed in 1.6.
lasso = Lasso(alpha=0.001).fit(X_train, y_train)
y_pred_val = lasso.predict(X_val)
np.sqrt(mean_squared_error(y_val, y_pred_val))

### MLflow Run: Track Lasso Regression Experiment

In [None]:
# Track one Lasso training run in MLflow: author tag, data paths, alpha and validation RMSE.
with mlflow.start_run():

    mlflow.set_tag('data-scientist', 'mohamed')

    mlflow.log_param('train-data-path', r'data\yellow_tripdata_2023-01.parquet')
    mlflow.log_param('val-data-path', r'data\yellow_tripdata_2023-02.parquet')

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lasso = Lasso(alpha=alpha).fit(X_train, y_train)

    y_pred_val = lasso.predict(X_val)
    # np.sqrt(MSE) replaces `squared=False`, which was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    mlflow.log_metric("rmse", rmse)

### Hyperparameter Tuning with Hyperopt + XGBoost

| Term              | What it is                              | What it stands for / does                       |
| ----------------- | --------------------------------------- | ----------------------------------------------- |
| `fmin`            | Optimization function                   | Runs the tuning loop                            |
| `tpe`             | Algorithm                               | Tree-structured Parzen Estimator (Bayesian opt) |
| `hp`              | Hyperparameter search space constructor | Defines what values to try                      |
| `STATUS_OK`       | Status flag                             | Tells Hyperopt the trial succeeded              |
| `Trials`          | Trial tracker                           | Stores all results and parameter combinations   |
| `scope`           | Type-caster                             | Converts floats to int, wraps Python functions  |

We're importing the entire stack for **automated, intelligent hyperparameter tuning** using **Bayesian optimization** with **XGBoost**. Each piece is a cog in the tuning machine.


In [None]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
# Wrap the train/validation features and labels in XGBoost's DMatrix container.
train = xgb.DMatrix(data=X_train, label=y_train)
val = xgb.DMatrix(data=X_val, label=y_val)

Steps of the objective function:

- Receive hyperparameters

- Train the model

- Return a loss value (e.g., RMSE)

In [None]:
def objective(params):
    """Train one XGBoost model with `params` and report its validation RMSE.

    Every call is tracked as its own MLflow run: the tag ``model=xgboost``,
    the received parameters and the resulting ``rmse`` metric are logged.
    Training and evaluation use the module-level ``train`` / ``val`` DMatrix
    objects and the ``y_val`` target array.

    Args:
        params: Dictionary of XGBoost training parameters for this trial.

    Returns:
        A dict with ``loss`` (the validation RMSE) and ``status`` (``STATUS_OK``),
        the format consumed by Hyperopt's ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(val, 'validation')],
            early_stopping_rounds=50,
            verbose_eval=False
        )
        y_pred = booster.predict(val)
        # np.sqrt(MSE) replaces `squared=False`, which was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metric('rmse', rmse)
        return {'loss': rmse, 'status': STATUS_OK}

In [None]:
search_space = {
    # `quniform` produces discrete steps spaced by q from low to high: 3.0, 4.0, ..., 20.0
    'max_depth': scope.int(hp.quniform('max_depth', 3, 20, 1)),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.3)),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1), # searches in this range: [exp(-6), exp(-1)] = [0.002, 0.368]
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42,
}

# Keep a reference to the Trials object: it stores the parameters, loss and runtime
# of every evaluation, and is only useful for later analysis/plotting if it is
# not thrown away after `fmin` returns.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest, # guides the search intelligently
    max_evals=50, # runs the objective function 50 times. Each time with a different set of parameters
    trials=trials
)

In [None]:
# Display the best hyperparameter assignment returned by `fmin`.
best_result

### [mlflow auto-logging](https://mlflow.org/docs/latest/tracking/autolog#:~:text=The%20following%20libraries%20support%20autologging%3A)

`mlflow.xgboost.autolog()` must be called before training (e.g., `xgb.train(...)`) begins.

**Best practice:** place `mlflow.xgboost.autolog()` outside of `mlflow.start_run()` at the very top of the script. 


In [None]:
# Enable autologging before training starts so MLflow records this training call itself.
mlflow.xgboost.autolog()

# Hyperparameters used for this training run.
params = dict(
    colsample_bytree=0.811456438126501,
    learning_rate=0.23687492640963337,
    max_depth=17,
    min_child_weight=0.6041334208397435,
    objective='reg:squarederror',
    reg_alpha=0.05731229437746139,
    reg_lambda=0.020764792800741835,
    seed=42,
    subsample=0.7453194874678659,
)

# Train for up to 300 rounds, stopping early after 50 rounds without improvement
# on the validation set; progress is printed every 20 rounds.
booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=300,
    evals=[(val, 'validation')],
    early_stopping_rounds=50,
    verbose_eval=20,
)

### Logging models in mlflow

-	**Log as an artifact** (requires saving the model to disk manually first)

	```py
	mlflow.log_artifact(local_path="models/my_model.joblib", artifact_path="joblib_models")
	```
	This tells MLflow to log `"models/my_model.joblib"` inside `"mlruns/<current_exp_id>/<current_run_id>/artifacts/joblib_models"`

	The `artifact_path` is just a folder name inside the MLflow UI. If not provided, it will store it in `"mlruns/<current_exp_id>/<current_run_id>/artifacts/"`

	Use `mlflow.log_artifacts(local_dir="models", artifact_path=...)` to log your entire models folder; to log a single file, use `mlflow.log_artifact()` instead.

<br>

-	**Log using `log_model` method** (more automatic; no manual saving)

	```python
	mlflow.<framework>.log_model(model, artifact_path=)
	```
	Or for custom pipelines:
	```python
	mlflow.pyfunc.log_model(python_model=YourCustomWrapper(...), artifact_path=)
	```

In [None]:
# disable now to avoid logging the model twice
mlflow.xgboost.autolog(disable=True)

# Train with the chosen hyperparameters and log parameters, metric, the fitted
# encoder and the model manually inside a single MLflow run.
with mlflow.start_run():
    best_params = {
        'colsample_bytree': 0.811456438126501,
        'learning_rate': 0.23687492640963337,
        'max_depth': 17,
        'min_child_weight': 0.6041334208397435,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.05731229437746139,
        'reg_lambda': 0.020764792800741835,
        'seed': 42,
        'subsample': 0.7453194874678659
    }

    mlflow.log_params(best_params)

    # NOTE(review): with only 10 boosting rounds, `early_stopping_rounds=50` can
    # never trigger here. Confirm the short run is intentional.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=10,
        evals=[(val, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=20
    )

    y_pred = booster.predict(val)
    # np.sqrt(MSE) replaces `squared=False`, which was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric('rmse', rmse)

    # The encoder is saved to disk first and then attached to the run as an artifact;
    # make sure the local folder exists before writing into it.
    os.makedirs("models", exist_ok=True)
    joblib.dump(encoder, "models/encoder.joblib")
    mlflow.log_artifact("models/encoder.joblib", artifact_path="preprocessors")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

### Loading models in mlflow

-	**Load model as a PyFuncModel**

	```py
	mlflow.pyfunc.load_model(logged_model_path)
	```

-	**Load model using its framework**

	```py
	mlflow.<framework>.load_model(logged_model_path)
	```

In [None]:
# URI of a previously logged model (snippet copied from the MLflow UI).
# The run ID is specific to the local tracking database.
logged_model = 'runs:/52594083daf440e3851dc88a08a3e62f/models_mlflow'

# Load it through the framework-agnostic pyfunc flavor.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)
loaded_model

In [None]:
# Load the same logged model through its native XGBoost flavor.
xgboost_model = mlflow.xgboost.load_model(model_uri=logged_model)
xgboost_model

In [None]:
# Evaluate the model that was just loaded from MLflow (`xgboost_model`), not the
# in-memory `booster` left over from training, so the check covers loading too.
y_pred = xgboost_model.predict(val)
# np.sqrt(MSE) replaces `squared=False`, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_val, y_pred))

### Accessing the details from MLflow using the MLflow client

In [None]:
from mlflow.tracking import MlflowClient

# Client bound to the same tracking store that was configured for the runs above.
client = MlflowClient(MLFLOW_TRACKING_URI)
client.search_experiments()

In [None]:
# Up to five runs of experiment "1" with an RMSE below 7.0, best (lowest) first.
runs = client.search_runs(
    experiment_ids=['1'],
    filter_string="metrics.rmse < 7.0",
    order_by=['metrics.rmse ASC'],
    max_results=5,
)

for run in runs:
    run_rmse = run.data.metrics['rmse']
    print(f"Run id: {run.info.run_id}, rmse: {run_rmse:.2f}")

In [None]:
# Name of the registered model used by the model-registry cells below.
model_name = "nyc-taxi-regressor"

In [None]:
# List the latest versions of the registered model together with their "status" tag.
latest_versions = client.get_latest_versions(model_name)
for version in latest_versions:
    # `.get` keeps the listing from crashing with a KeyError on versions that
    # were registered without a "status" tag.
    print(f"version: {version.version}, status: {version.tags.get('status', 'n/a')}")

In [None]:
# Preview the current local timestamp in ISO format, with a space between date and time.
datetime.today().isoformat(sep=' ')

In [None]:
# The version number lives in a single variable so the description can never
# disagree with the version that is actually updated (it used to say "version 4"
# while updating version 2).
model_version = 2
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"Model version {model_version} is updated at {datetime.today().isoformat(sep=' ')}"
)

### Adding an MLflow Model to the Model Registry

<img src="https://i.ytimg.com/vi/TKHU7HAvGH8/maxresdefault.jpg" alt="model registry" width="800" height="450">

In [None]:
# Register the model logged by an existing run under `model_name`.
# NOTE(review): the artifact path here is "models", while the XGBoost run in this
# notebook logs under "models_mlflow". Confirm that this run ID really stores
# its model under "models".
run_id = "7228072c97ea4e55a95204869efbb6cc"
model_uri = "runs:/" + run_id + "/models"
mlflow.register_model(
    model_uri=model_uri,
    name=model_name,
    tags=dict(model="RandomForest", status="staging"),
)