In [2]:
!pip install hyperopt xgboost mlflow boto3 -q


This code sets up the hyperparameter tuning process using hyperopt, logs the parameters and metrics to MLflow, and runs the optimization. After running the optimization, you will be able to see the results in the MLflow UI.

### 1: Import Libraries and Suppress Warnings


In [8]:
import mlflow
import mlflow.xgboost
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler

# Suppress specific warnings
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')
warnings.filterwarnings('ignore', category=UserWarning, module='_distutils_hack')

# Set MLflow tracking URI
EC2_PUBLIC_DNS='ec2-16-16-217-131.eu-north-1.compute.amazonaws.com'
mlflow.set_tracking_uri(f"http://{EC2_PUBLIC_DNS}:5000")
mlflow.set_experiment("xgboost_hyperparameter_tuning")


<Experiment: artifact_location='s3://mlflow-artifacts-capstone-mlops/4', creation_time=1722115105130, experiment_id='4', last_update_time=1722115105130, lifecycle_stage='active', name='xgboost_hyperparameter_tuning', tags={}>

### 2: Load Data and Preprocess


In [9]:
# Load the cleaned data
cleaned_df = pd.read_csv('../data/cleaned_car_data.csv')

# Define features (X) and target variable (y)
X = cleaned_df.drop(columns=['price'])  # Exclude 'price'
y = cleaned_df['price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the splits
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optional: Check the first few rows of the scaled data
print("First few rows of the scaled training data:")
print(X_train_scaled[:5])


Training data shape: (86392, 19)
Testing data shape: (21599, 19)
First few rows of the scaled training data:
[[ 0.9171049  -1.06487922  0.4380831  -1.8997133   2.4206718   3.05252791
  -0.21935334 -0.24096514 -0.44371169 -0.21594036 -0.36700329 -0.24611195
  -0.25485438 -0.37469737 -0.39570361 -1.1391218   1.83331115 -0.16062932
   0.88124322]
 [-1.41870071  0.30215763 -2.09510711  1.04440611 -1.20361943 -0.32759733
  -0.21935334 -0.24096514  2.25371569 -0.21594036 -0.36700329 -0.24611195
  -0.25485438 -0.37469737 -0.39570361  0.87786925 -0.54546114 -0.16062932
   0.88124322]
 [ 0.9171049  -0.96480263  0.35364342 -0.75735782 -0.84119031 -0.32759733
  -0.21935334 -0.24096514 -0.44371169  4.63090817 -0.36700329 -0.24611195
  -0.25485438 -0.37469737 -0.39570361  0.87786925 -0.54546114 -0.16062932
   0.88124322]
 [ 0.9171049  -0.99095472  0.35364342  0.30141068 -1.20361943 -0.32759733
  -0.21935334 -0.24096514 -0.44371169 -0.21594036 -0.36700329 -0.24611195
   3.92380939 -0.37469737 -0.395

### 3: Define Hyperparameter Space and Objective Function


In [10]:
space = {
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 100),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'seed': 42,
    'objective': 'reg:squarederror'
}

def objective(params):
    with mlflow.start_run(nested=True):
        # Convert hyperparameters to int if they are passed as floats
        params["max_depth"] = int(params["max_depth"])
        params["n_estimators"] = int(params["n_estimators"])

        # Log hyperparameters
        mlflow.log_params(params)

        # Train the model
        model = xgb.XGBRegressor(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            learning_rate=params["learning_rate"],
            subsample=params["subsample"],
            seed=params["seed"],
            objective=params["objective"]
        )
        model.fit(X_train_scaled, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test_scaled)
        rmse = root_mean_squared_error(y_test, y_pred)
        mlflow.log_metric("rmse", rmse)

        # Log the model
        mlflow.xgboost.log_model(model, "model")

        return {'loss': rmse, 'status': STATUS_OK}


### 4: Run Hyperparameter Optimization


In [11]:
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust this number based on how long you want to search
    trials=trials
)

print()
print()

print("Best parameters:", best_params)


100%|██████████| 50/50 [07:44<00:00,  9.30s/trial, best loss: 2285.035402316803]


Best parameters: {'learning_rate': 0.09455111298980684, 'max_depth': 9.0, 'min_child_weight': 0.3730492049381335, 'n_estimators': 500.0, 'subsample': 0.9989341273723211}
