In [None]:
import os

# Location of the cleaned CSV and of the directory where models/artifacts are
# saved; adjust both to match your environment.
FILE_PATH = "/data/processed/concatenado_clean_20251125_202250.csv"
ARTIFACT_DIR = "/data/artifacts"

# Create the artifact directory up front; an existing directory is not an error.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib
import mlflow
import mlflow.sklearn
import numpy as np

# Point MLflow at the tracking server: the MLFLOW_TRACKING_URI environment
# variable takes precedence, otherwise the default address below is used.
mlflow_tracking = os.environ.get('MLFLOW_TRACKING_URI', 'http://mlflow:5000')
mlflow.set_tracking_uri(mlflow_tracking)
print(f'MLflow tracking URI: {mlflow.get_tracking_uri()}')

# Read the cleaned dataset; the 'DATA' column is parsed as a date, day first.
df = pd.read_csv(
    FILE_PATH,
    parse_dates=['DATA'],
    dayfirst=True,
    low_memory=False,
)
display(df.head())

# Modelling columns, plus identifying columns that are carried along for export.
EXTRA_COLUMNS = ['ESTACAO', 'DATA', 'HORA_UTC']
FEATURES = ['TEMPERATURA_BULBO_SECO', 'PRESSAO_ATM_EST', 'RADIACAO_GLOBAL']
TARGET = 'UMIDADE_REL'

# Keep only the rows where every feature and the target are present. Missing
# values in EXTRA_COLUMNS do not cause a row to be dropped. The resulting
# column order is: features, target, extra columns.
df_model = df.dropna(subset=FEATURES + [TARGET])[
    FEATURES + [TARGET] + EXTRA_COLUMNS
].copy()
print(f'Used rows: {df_model.shape[0]}')

# Plain NumPy arrays for scikit-learn.
X = df_model[FEATURES].to_numpy()
y = df_model[TARGET].to_numpy()

# Hold out 20% of the rows using a seeded random split (not a chronological one).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features, then fit a random forest regressor on them.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

# Fit on the training split and score on the held-out split.
preds = pipeline.fit(X_train, y_train).predict(X_test)
mae = mean_absolute_error(y_test, preds)
print(f'MAE (test): {mae}')

# Log to MLflow - experiment tracking
print('\nüìä Registrando experimento no MLflow...')
with mlflow.start_run(run_name='rf_umidade'):
    # Log the model parameters. They are read from the fitted estimator rather
    # than repeated as literals, so the logged values cannot drift from the
    # pipeline definition.
    rf_step = pipeline.named_steps['rf']
    mlflow.log_param('model', type(rf_step).__name__)
    mlflow.log_param('n_estimators', rf_step.n_estimators)
    mlflow.log_param('features', str(FEATURES))
    mlflow.log_param('target', TARGET)

    # Log the metrics
    mlflow.log_metric('mae', float(mae))

    # Save the fitted pipeline locally and attach it to the run as an artifact
    model_path = os.path.join(ARTIFACT_DIR, 'rf_model.pkl')
    joblib.dump(pipeline, model_path)
    mlflow.log_artifact(model_path, artifact_path='models')
    print('‚úì Modelo salvo como artefato no MLflow')

    # Save a CSV with the test-set predictions and attach it as an artifact
    out_df = pd.DataFrame(X_test, columns=FEATURES)
    out_df[TARGET + '_real'] = y_test
    out_df[TARGET + '_pred'] = preds
    preds_path = os.path.join(ARTIFACT_DIR, 'predictions_partial.csv')
    out_df.to_csv(preds_path, index=False)
    mlflow.log_artifact(preds_path, artifact_path='predictions')
    print('‚úì CSV de predi√ß√µes (teste) salvo como artefato no MLflow')

    # Keep the run_id for reference
    run_id = mlflow.active_run().info.run_id
    print(f'‚úì MLflow run_id: {run_id}')

# Save final predictions (full set) to ARTIFACT_DIR for use by the dashboard
print('\nüìÅ Exportando predi√ß√µes completas para CSV...')
# The pipeline was fitted on a plain NumPy array, so predict on an array too;
# passing a DataFrame makes scikit-learn warn about a feature-name mismatch.
# NOTE: this set includes the rows the model was trained on, so most of these
# predictions are in-sample and should not be read as an estimate of test error.
full_preds = pipeline.predict(df_model[FEATURES].values)
df_out = df_model.copy()
df_out[TARGET + '_pred'] = full_preds
final_predictions_path = os.path.join(ARTIFACT_DIR, 'predictions_full.csv')
df_out.to_csv(final_predictions_path, index=False)
print('‚úì Predictions exported to', final_predictions_path)

# Also save the model locally
joblib.dump(pipeline, os.path.join(ARTIFACT_DIR, 'rf_model_for_serving.pkl'))
print('‚úì Model saved in', ARTIFACT_DIR)
# Report the tracking URI actually in use instead of a hard-coded address, so
# the message stays correct when MLFLOW_TRACKING_URI overrides the default.
print(f'\n‚úÖ Processo conclu√≠do! Verifique o MLflow em {mlflow.get_tracking_uri()}')