#### 2: Import Libraries


In [5]:
import os

import mlflow
import mlflow.sklearn
import mlflow.xgboost
import pandas as pd
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


#### 3: Set Up MLflow


In [3]:
# Resolve the MLflow tracking server.
# The EC2 public DNS name below is only the fallback: when the
# MLFLOW_TRACKING_URI environment variable is set (and non-empty) it takes
# precedence, so the notebook can target another server without code edits.
EC2_PUBLIC_DNS='ec2-16-16-217-131.eu-north-1.compute.amazonaws.com'
DEFAULT_TRACKING_URI = f"http://{EC2_PUBLIC_DNS}:5000"
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI") or DEFAULT_TRACKING_URI)

# Select the experiment that runs are recorded under. This is left as the
# cell's last expression so the returned Experiment object is displayed.
mlflow.set_experiment("xgboost_optimized_model")


2024/07/27 22:13:03 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_optimized_model' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-artifacts-capstone-mlops/5', creation_time=1722118383817, experiment_id='5', last_update_time=1722118383817, lifecycle_stage='active', name='xgboost_optimized_model', tags={}>

#### 4: Load and Prepare Data


In [6]:
# Read the pre-cleaned car dataset from disk
cleaned_df = pd.read_csv('../data/cleaned_car_data.csv')

# Target is the 'price' column; every remaining column is used as a feature
y = cleaned_df['price']
X = cleaned_df.drop('price', axis=1)

# Hold out 20% of the rows for testing (seed fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both partitions
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the first three scaled training rows
print("First few rows of the scaled training data:")
print(X_train_scaled[:3])


Training data shape: (86392, 19)
Testing data shape: (21599, 19)
First few rows of the scaled training data:
[[ 0.9171049  -1.06487922  0.4380831  -1.8997133   2.4206718   3.05252791
  -0.21935334 -0.24096514 -0.44371169 -0.21594036 -0.36700329 -0.24611195
  -0.25485438 -0.37469737 -0.39570361 -1.1391218   1.83331115 -0.16062932
   0.88124322]
 [-1.41870071  0.30215763 -2.09510711  1.04440611 -1.20361943 -0.32759733
  -0.21935334 -0.24096514  2.25371569 -0.21594036 -0.36700329 -0.24611195
  -0.25485438 -0.37469737 -0.39570361  0.87786925 -0.54546114 -0.16062932
   0.88124322]
 [ 0.9171049  -0.96480263  0.35364342 -0.75735782 -0.84119031 -0.32759733
  -0.21935334 -0.24096514 -0.44371169  4.63090817 -0.36700329 -0.24611195
  -0.25485438 -0.37469737 -0.39570361  0.87786925 -0.54546114 -0.16062932
   0.88124322]]


#### 5: Set Optimized Hyperparameters


In [7]:
# Hyperparameter values obtained from the hyperopt search, together with the
# objective and evaluation-metric settings passed to the regressor
params = dict(
    learning_rate=0.09455111298980684,
    max_depth=9,
    min_child_weight=0.3730492049381335,
    n_estimators=500,
    subsample=0.9989341273723211,
    objective='reg:squarederror',
    eval_metric='rmse',
)


#### 6: Train the Model and Log with MLflow


In [8]:
# Start an MLflow run
with mlflow.start_run():
    # Initialize and train the model
    model = xgb.XGBRegressor(**params)
    model.fit(X_train_scaled, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_scaled)

    # Calculate the RMSE
    rmse = root_mean_squared_error(y_test, y_pred)

    # Log parameters
    mlflow.log_params(params)

    # Log the RMSE metric
    mlflow.log_metric("rmse", rmse)

    # Log the model
    mlflow.xgboost.log_model(model, artifact_path="xgboost_model")

    print(f"RMSE: {rmse}")




RMSE: 2291.7581220397637
