In [1]:
import os

# Adjust these two paths per environment: the cleaned input CSV and the
# folder that receives the trained model and the exported prediction files.
ARTIFACT_DIR = "/data/artifacts"
FILE_PATH = "/data/processed/concatenado_clean_20251125_202250.csv"

# Create the artifact folder up front; an already existing folder is accepted.
os.makedirs(name=ARTIFACT_DIR, exist_ok=True)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib
import mlflow
import mlflow.sklearn
import numpy as np

# MLflow client setup: MLFLOW_TRACKING_URI is honoured when it is set,
# otherwise the default URI below is used.
mlflow_tracking = os.environ.get('MLFLOW_TRACKING_URI', 'http://mlflow:5000')
mlflow.set_tracking_uri(mlflow_tracking)
print('MLflow tracking URI:', mlflow.get_tracking_uri())

# S3/MinIO settings, published through os.environ.
# Each AWS_* key keeps a value that is already set; otherwise it takes the
# matching MINIO_* variable and, as a last resort, the literal default.
# NOTE(review): the 'minioadmin' fallbacks look like development credentials -
# confirm they are never relied on outside a local stack.
os.environ.setdefault('AWS_ACCESS_KEY_ID', os.getenv('MINIO_ACCESS_KEY_ID', 'minioadmin'))
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', os.getenv('MINIO_SECRET_ACCESS_KEY', 'minioadmin'))

# The endpoint is always rewritten so that trailing slashes are stripped, even
# when MLFLOW_S3_ENDPOINT_URL was already set.
s3_endpoint_url = os.environ.get('MLFLOW_S3_ENDPOINT_URL')
if s3_endpoint_url is None:
    s3_endpoint_url = os.environ.get('AWS_ENDPOINT_URL', 'http://minio:9000')
os.environ['MLFLOW_S3_ENDPOINT_URL'] = s3_endpoint_url.rstrip('/')
print('‚úì Vari√°veis S3 configuradas para MinIO')

# Load the cleaned dataset and parse the DATA column as dates.
# DATA is stored year-first (the captured output shows values such as
# 2023/05/03), so dayfirst=True must not be passed: with a year-first string
# it can make the parser read the last field as the month (swapping day and
# month whenever the day is <= 12) or leave the column as unparsed strings.
# NOTE(review): once DATA is a real datetime column it is written to CSV in
# ISO form (2023-05-03); confirm downstream readers accept that.
df = pd.read_csv(FILE_PATH, parse_dates=['DATA'], low_memory=False)
# display() is provided by the notebook environment (no import is visible here).
display(df.head())

# Columns carried along next to the modelling data; they are not model inputs.
EXTRA_COLUMNS = ['ESTACAO','DATA','HORA_UTC']
# Predictor columns and the column to be predicted.
FEATURES = ['TEMPERATURA_BULBO_SECO', 'PRESSAO_ATM_EST', 'RADIACAO_GLOBAL']
TARGET = 'UMIDADE_REL'

# Keep only the rows where every feature and the target are present, with the
# extra columns placed after the modelling columns.
model_columns = FEATURES + [TARGET]
complete_rows = df[model_columns].notna().all(axis=1)
df_model = df.loc[complete_rows, model_columns + EXTRA_COLUMNS].copy()
print('Used rows:', len(df_model))

# Plain arrays for scikit-learn: feature matrix and target vector.
y = df_model[TARGET].values
X = df_model[FEATURES].values

# Random 80/20 hold-out split with a fixed seed so the split is reproducible.
# NOTE(review): the split is random, not chronological; if the rows are a time
# series, consider a time-based split - confirm which one is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two-step estimator: standardise the features, then fit a random forest.
scaler_step = ('scaler', StandardScaler())
forest_step = ('rf', RandomForestRegressor(n_estimators=100, random_state=42))
pipeline = Pipeline(steps=[scaler_step, forest_step])

# Fit on the training part and report the mean absolute error on the hold-out.
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=preds)
print('MAE (test):', mae)

# Log to MLflow: record the parameters, the test metric and the run artifacts
# (serialized pipeline and test-set predictions) under one run.
print('\nüìä Registrando experimento no MLflow...')
with mlflow.start_run(run_name='rf_umidade') as run:
    # Model parameters. n_estimators is read back from the fitted forest so the
    # logged value cannot drift from the one the pipeline was built with.
    mlflow.log_param('model', 'RandomForestRegressor')
    mlflow.log_param('n_estimators', pipeline.named_steps['rf'].n_estimators)
    mlflow.log_param('features', str(FEATURES))
    mlflow.log_param('target', TARGET)

    # Metric: mean absolute error on the held-out split.
    mlflow.log_metric('mae', float(mae))

    # Serialize the pipeline locally first, then try to upload it. The upload
    # is best-effort on purpose: a failure is reported and the run continues,
    # because the local copy already exists.
    model_path = os.path.join(ARTIFACT_DIR, 'rf_model.pkl')
    joblib.dump(pipeline, model_path)
    try:
        mlflow.log_artifact(model_path, artifact_path='models')
        print('‚úì Modelo salvo como artefato no MLflow')
    except Exception as e:
        print(f'‚ö† Aviso: N√£o foi poss√≠vel salvar modelo no MLflow: {e}')
        print('  O modelo foi salvo localmente em:', model_path)

    # Test-set predictions next to the true values, written to CSV and
    # uploaded with the same best-effort handling as the model.
    out_df = pd.DataFrame(X_test, columns=FEATURES)
    out_df[TARGET + '_real'] = y_test
    out_df[TARGET + '_pred'] = preds
    preds_path = os.path.join(ARTIFACT_DIR, 'predictions_partial.csv')
    out_df.to_csv(preds_path, index=False)
    try:
        mlflow.log_artifact(preds_path, artifact_path='predictions')
        print('‚úì CSV de predi√ß√µes (teste) salvo como artefato no MLflow')
    except Exception as e:
        print(f'‚ö† Aviso: N√£o foi poss√≠vel salvar CSV no MLflow: {e}')
        print('  O CSV foi salvo localmente em:', preds_path)

    # Keep the run id for later reference; it comes from the run object
    # returned by start_run().
    run_id = run.info.run_id
    print(f'‚úì MLflow run_id: {run_id}')

# Export predictions for the full modelling set to ARTIFACT_DIR (the original
# note says this file is meant for the dashboard).
# NOTE: the full set includes the rows the model was trained on, so errors
# computed from this file are optimistic compared with the test MAE.
print('\nüìÅ Exportando predi√ß√µes completas para CSV...')
# The pipeline was fitted on a plain NumPy array, so predict on an array too;
# passing the DataFrame makes scikit-learn warn that X has feature names the
# estimator was fitted without.
full_preds = pipeline.predict(df_model[FEATURES].values)
df_out = df_model.copy()
df_out[TARGET + '_pred'] = full_preds
final_predictions_path = os.path.join(ARTIFACT_DIR, 'predictions_full.csv')
df_out.to_csv(final_predictions_path, index=False)
print('‚úì Predictions exported to', final_predictions_path)

# Second local copy of the fitted pipeline under the file name used for serving.
joblib.dump(pipeline, os.path.join(ARTIFACT_DIR, 'rf_model_for_serving.pkl'))
print('‚úì Model saved in', ARTIFACT_DIR)
# Report the tracking URI that is actually configured instead of a fixed URL,
# so the hint stays correct when MLFLOW_TRACKING_URI overrides the default.
print(f'\n‚úÖ Processo conclu√≠do! Verifique o MLflow em {mlflow.get_tracking_uri()}')

MLflow tracking URI: http://mlflow:5000
‚úì Vari√°veis S3 configuradas para MinIO


Unnamed: 0,ESTACAO,DATA,HORA_UTC,PRECIPITACAO_TOTAL,PRESSAO_ATM_EST,PRESSAO_MAX_1H,PRESSAO_MIN_1H,RADIACAO_GLOBAL,TEMPERATURA_BULBO_SECO,TEMPERATURA_PONTO_ORVALHO,TEMPERATURA_MAX_1H,TEMPERATURA_MIN_1H,ORVALHO_MAX_1H,ORVALHO_MIN_1H,UMIDADE_REL_MAX_1H,UMIDADE_REL_MIN_1H,UMIDADE_REL,VENTO_DIRECAO,VENTO_RAJADA_MAX,VENTO_VELOCIDADE
0,PETROLINA,2023/05/03,1400 UTC,17.2,971.7,972.4,971.7,,29.0,20.2,30.5,28.4,20.7,19.5,60.0,54.0,59.0,122.0,6.8,2.8
1,PETROLINA,2023/05/03,1500 UTC,15.4,970.8,971.9,970.8,,30.6,19.6,30.9,28.7,20.6,19.5,60.0,51.0,52.0,143.0,7.3,3.2
2,PETROLINA,2023/05/03,1600 UTC,9.8,969.7,970.8,969.7,,30.7,18.7,31.9,30.3,20.1,18.1,53.0,46.0,49.0,101.0,6.4,2.5
3,PETROLINA,2023/05/03,1700 UTC,0.0,968.6,969.7,968.6,,31.9,18.8,32.5,30.0,19.3,18.2,51.0,44.0,46.0,136.0,6.3,2.1
4,PETROLINA,2023/05/03,1800 UTC,0.0,968.2,968.6,968.2,,32.4,20.2,33.2,30.9,20.2,17.9,52.0,42.0,49.0,176.0,6.2,2.6


Used rows: 79437
MAE (test): 6.648561230698221

üìä Registrando experimento no MLflow...
‚úì Modelo salvo como artefato no MLflow
‚úì CSV de predi√ß√µes (teste) salvo como artefato no MLflow
‚úì MLflow run_id: 7f41672ecd7a42a3a42b4b5972d206e0
üèÉ View run rf_umidade at: http://mlflow:5000/#/experiments/0/runs/7f41672ecd7a42a3a42b4b5972d206e0
üß™ View experiment at: http://mlflow:5000/#/experiments/0

üìÅ Exportando predi√ß√µes completas para CSV...




‚úì Predictions exported to /data/artifacts/predictions_full.csv
‚úì Model saved in /data/artifacts

‚úÖ Processo conclu√≠do! Verifique o MLflow em http://mlflow:5000
