#### 2: Import Libraries


In [9]:
import mlflow
import mlflow.xgboost
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import pandas as pd


#### 3: Set Up MLflow


In [3]:
# Host name of the EC2 machine serving MLflow; the tracking server is reached over HTTP on port 5000
EC2_PUBLIC_DNS = "ec2-16-16-217-131.eu-north-1.compute.amazonaws.com"
mlflow.set_tracking_uri("http://" + EC2_PUBLIC_DNS + ":5000")

# Select the experiment that subsequent runs are recorded under.
# Kept as the last expression so the notebook displays the returned Experiment.
mlflow.set_experiment("xgboost_optimized_model")


<Experiment: artifact_location='s3://mlflow-artifacts-capstone-mlops/5', creation_time=1722118383817, experiment_id='5', last_update_time=1722118383817, lifecycle_stage='active', name='xgboost_optimized_model', tags={}>

In [20]:
from dotenv import load_dotenv
import os

# Load key/value pairs from a local .env file into the process environment
load_dotenv()

# Retrieve AWS credentials (each is None when the variable is not defined)
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# The values read above already live in os.environ, so there is nothing to
# write back. Assigning a missing (None) value to os.environ would fail with
# "TypeError: str expected, not NoneType", so report missing variables
# explicitly instead of crashing with an unrelated-looking error.
missing_aws_vars = [
    var_name
    for var_name, var_value in (
        ('AWS_ACCESS_KEY_ID', aws_access_key_id),
        ('AWS_SECRET_ACCESS_KEY', aws_secret_access_key),
    )
    if not var_value
]
if missing_aws_vars:
    print(
        f"Warning: {', '.join(missing_aws_vars)} not set in the environment or .env file; "
        "artifact uploads will only work if AWS credentials are available from another source."
    )


#### 4: Load and Prepare Data


In [6]:
# Read the cleaned car data from disk
cleaned_df = pd.read_csv('../../data/cleaned_car_data.csv')

# 'price' is the target; every other column is used as a feature
y = cleaned_df['price']
X = cleaned_df.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each split
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Show the first training row as a quick sanity check of the feature columns
print("First few rows of the training data:")
print(X_train.head(1))


Training data shape: (84888, 19)
Testing data shape: (21223, 19)
First few rows of the training data:
         year  mileage  enginesize    tax   mpg  make_bmw  make_cclass   
71413  2020.0   4750.0         1.5  145.0  44.8     False        False  \

       make_focus  make_ford  make_hyundi  make_merc  make_skoda  make_toyota   
71413       False      False        False      False       False        False  \

       make_vauxhall  make_vw  transmission_Manual  transmission_Semi-Auto   
71413          False    False                 True                   False  \

       fueltype_Hybrid  fueltype_Petrol  
71413            False             True  


#### 5: Set Optimized Hyperparameters


In [7]:
# Hyperparameters selected by the hyperopt search.
# Integer-valued settings (tree depth, number of estimators) are written as
# plain int literals; the remaining entries fix the objective and metric.
params = dict(
    learning_rate=0.09455111298980684,
    max_depth=9,
    min_child_weight=0.3730492049381335,
    n_estimators=500,
    subsample=0.9989341273723211,
    objective='reg:squarederror',
    eval_metric='rmse',
)


#### 6: Train the Model and Log with MLflow


In [None]:
# Train the tuned XGBoost regressor, evaluate it on the held-out test set, and
# record the parameters, metrics and fitted model in a single MLflow run.
with mlflow.start_run():
    # Initialize and train the model
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate metrics.
    # RMSE is computed as the square root of the MSE rather than by passing
    # squared=False: that keyword is deprecated and later removed in newer
    # scikit-learn releases, while this form works on every version.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log parameters and metrics
    mlflow.log_params(params)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2_score", r2)

    # Log the model.
    # The signature is inferred from real model inputs and the predictions the
    # model produced for them, so the recorded output schema describes what
    # predict() returns instead of the dtype of the training target column.
    signature = mlflow.models.infer_signature(X_test, y_pred)
    input_example = X_train[:1]  # single-row sample stored alongside the model
    mlflow.xgboost.log_model(
        model,
        artifact_path="xgboost_model",
        signature=signature,
        input_example=input_example,
    )

    print(f"RMSE: {rmse}, MAE: {mae}, R2 Score: {r2}")