In [0]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [0]:
# Resolve the user name from the notebook context exposed by dbutils.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
current_user = notebook_context.userName().get()

# Default dataset location, built from that user's workspace folder.
default_path = f"/Workspace/Users/{current_user}/PosTechChallenge3/dados/CarFeatures.csv"

In [0]:
# Notebook parameters as (widget name, default value, label) triples.
widget_specs = [
    ("training_data_path", default_path, "Path to training data"),
    ("experiment_name", "/dev-mlops-experiment", "MLflow experiment name"),
    ("model_name", "car_features_model", "Model Name"),
]
for widget_name, widget_default, widget_label in widget_specs:
    dbutils.widgets.text(widget_name, widget_default, label=widget_label)

# Read the current value of each widget into a plain variable.
training_data_path = dbutils.widgets.get("training_data_path")
experiment_name = dbutils.widgets.get("experiment_name")
model_name = dbutils.widgets.get("model_name")

# Select the MLflow experiment used by the runs started below.
mlflow.set_experiment(experiment_name)

In [0]:
# Load the raw dataset from the path selected in the widget.
df = pd.read_csv(training_data_path)

# Missing-value treatment.
# The fill values are computed from the freshly loaded frame and applied in a
# single DataFrame.fillna call whose result is assigned back. The previous
# pattern, df[col].fillna(..., inplace=True), is chained assignment: it emits
# a FutureWarning on pandas 2.x and leaves df unchanged under Copy-on-Write,
# which would silently pass NaNs on to the model.
fill_values = {
    "Engine Fuel Type": "Unknown",
    "Engine HP": df["Engine HP"].median(),
    "Engine Cylinders": df["Engine Cylinders"].median(),
    "Number of Doors": df["Number of Doors"].mode()[0],  # most frequent value
    "Market Category": "Unknown",
}
df = df.fillna(fill_values)

# Remove rows that are exact duplicates after imputation.
df = df.drop_duplicates()

# Features and targets: the three target columns are predicted jointly,
# every other column is a predictor.
target_columns = ["MSRP", "city mpg", "highway MPG"]
X = df.drop(columns=target_columns)
y = df[target_columns]

# Numeric predictors are listed explicitly; all remaining predictor columns
# are handled as categorical.
num_features = ["Engine HP", "Engine Cylinders", "Popularity", "Year", "Number of Doors"]
numeric_lookup = set(num_features)
cat_features = [name for name in X.columns if name not in numeric_lookup]

# Standardise numeric columns and one-hot encode categorical ones
# (dense output, unknown categories ignored at transform time).
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

column_steps = [
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator: a random forest with a fixed seed.
base_forest = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)

# Wrap the forest in MultiOutputRegressor so the pipeline predicts all target columns.
multi_target_regressor = MultiOutputRegressor(base_forest)

# Full pipeline: preprocessing followed by the multi-output regressor.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", multi_target_regressor),
]
model = Pipeline(steps=pipeline_steps)

In [0]:
with mlflow.start_run():
    # Fit on the training split and predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Log metrics: MAE, RMSE and R2 for each target column.
    for target_idx, target_name in enumerate(y.columns):
        actual = y_test.iloc[:, target_idx]
        predicted = y_pred[:, target_idx]

        # Compute all three scores first, then log them in the same order.
        scores = (
            (f"{target_name}_mae", mean_absolute_error(actual, predicted)),
            (f"{target_name}_rmse", np.sqrt(mean_squared_error(actual, predicted))),
            (f"{target_name}_r2", r2_score(actual, predicted)),
        )
        for metric_key, metric_value in scores:
            mlflow.log_metric(metric_key, metric_value)

    # Save the fitted pipeline as a run artifact and register it in the
    # model registry under the configured name.
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        registered_model_name=model_name,
    )

In [0]:
from mlflow.tracking import MlflowClient

# Resolve the newest registered version of the model and publish its URI so
# downstream job tasks can load exactly that version.
client = MlflowClient()

# Look versions up by name rather than with get_latest_versions(stages=["None"]):
# the stage-based API is deprecated in MLflow and is not available for
# Unity Catalog models.
registered_versions = client.search_model_versions(f"name='{model_name}'")
if not registered_versions:
    # Fail with a clear message instead of an IndexError on an empty result.
    raise RuntimeError(
        f"No registered versions found for model '{model_name}'; "
        "cannot publish a model URI."
    )

# Version identifiers may be strings, so compare them numerically.
latest_version = max(registered_versions, key=lambda mv: int(mv.version)).version
model_uri = f"models:/{model_name}/{latest_version}"

# Expose the result to other tasks in the same job run.
dbutils.jobs.taskValues.set("model_uri", model_uri)
dbutils.jobs.taskValues.set("model_name", model_name)
dbutils.jobs.taskValues.set("model_version", latest_version)

# End the notebook, returning the model URI as its exit value.
dbutils.notebook.exit(model_uri)