In [0]:
import pandas as pd
import numpy as np

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [0]:
# Default source table offered by the "training_dataset" widget
default_table = "postech_ml.car_features"

# Notebook parameters exposed as text widgets: (widget name, default value, label)
widget_specs = [
    ("training_dataset", default_table, "Training Dataset"),
    ("experiment_name", "/dev-mlops-experiment", "MLflow Experiment Name"),
    ("model_name", "car_features_model", "Model Name"),
]
for widget_name, widget_default, widget_label in widget_specs:
    dbutils.widgets.text(widget_name, widget_default, label=widget_label)

# Read back the current value of each widget
training_dataset = dbutils.widgets.get("training_dataset")
experiment_name = dbutils.widgets.get("experiment_name")
model_name = dbutils.widgets.get("model_name")

# Make the chosen experiment the active one for the MLflow calls that follow
mlflow.set_experiment(experiment_name)

In [0]:
# Read the training dataset from the configured table into pandas
df_raw = spark.read.table(training_dataset)
df = df_raw.toPandas()

# Missing-value treatment.
# Each filled column is assigned back explicitly. The previous
# `df[col].fillna(..., inplace=True)` form is chained in-place assignment:
# pandas 2.x emits a FutureWarning for it and, with Copy-on-Write enabled,
# it only modifies a temporary Series and leaves `df` untouched.
# NOTE(review): the median/mode are computed on the full dataset, i.e. before
# the train/test split performed later — confirm this is acceptable.
df["engine_fuel_type"] = df["engine_fuel_type"].fillna("Unknown")
df["engine_hp"] = df["engine_hp"].fillna(df["engine_hp"].median())
df["engine_cylinders"] = df["engine_cylinders"].fillna(df["engine_cylinders"].median())
df["number_of_doors"] = df["number_of_doors"].fillna(df["number_of_doors"].mode()[0])
df["market_category"] = df["market_category"].fillna("Unknown")

# Remove fully duplicated rows
df = df.drop_duplicates()

# Targets (three regression outputs) and features (every other column)
y = df[["msrp", "city_mpg", "highway_mpg"]]
X = df.drop(columns=["msrp", "city_mpg", "highway_mpg"])

# Numeric features are listed explicitly; all remaining feature columns are treated as categorical
num_features = ["engine_hp", "engine_cylinders", "popularity", "year", "number_of_doors"]
cat_features = [name for name in X.columns if name not in num_features]

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: standard-scale the numeric columns, one-hot encode the categorical ones
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

# Full pipeline: preprocessing followed by a multi-output wrapper around a random forest
forest = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", MultiOutputRegressor(forest)),
])

In [0]:
with mlflow.start_run():
    # Fit the pipeline on the training split and predict the held-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Log MAE, RMSE and R² separately for each target column
    for i, col in enumerate(y.columns):
        actual = y_test.iloc[:, i]
        predicted = y_pred[:, i]
        target_metrics = {
            f"{col}_mae": mean_absolute_error(actual, predicted),
            f"{col}_rmse": np.sqrt(mean_squared_error(actual, predicted)),
            f"{col}_r2": r2_score(actual, predicted),
        }
        for metric_key, metric_value in target_metrics.items():
            mlflow.log_metric(metric_key, metric_value)

    # Log the fitted pipeline and register it under the configured model name
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        registered_model_name=model_name,
    )

In [0]:
client = MlflowClient()

# Resolve the newest registered version of the model by name.
# `search_model_versions` replaces `get_latest_versions(..., stages=["None"])`:
# the stage-based lookup is deprecated in MLflow and is not supported for
# models registered in Unity Catalog.
registered_versions = client.search_model_versions(f"name='{model_name}'")
if not registered_versions:
    # Fail with an explicit message instead of an IndexError on an empty list
    raise RuntimeError(
        f"No registered versions found for model '{model_name}'; "
        "cannot publish a model URI."
    )

# Compare versions numerically, since they may be returned as strings.
newest_version = max(registered_versions, key=lambda mv: int(mv.version))
# Kept as a one-element list so any existing `latest[0]` usage keeps working
latest = [newest_version]
model_uri = f"models:/{model_name}/{latest[0].version}"

# Expose the model reference to downstream job tasks
dbutils.jobs.taskValues.set("model_uri", model_uri)
dbutils.jobs.taskValues.set("model_name", model_name)
dbutils.jobs.taskValues.set("model_version", latest[0].version)

# End the notebook, returning the model URI as its exit value
dbutils.notebook.exit(model_uri)