In [None]:
# Location of the cleaned input CSV and of the directory where models and
# other artifacts are written.
# Both keep the original local paths as defaults, but can be overridden
# without editing this cell by setting the AVD_FILE_PATH / AVD_ARTIFACT_DIR
# environment variables (an empty value falls back to the default).
import os

FILE_PATH = os.getenv('AVD_FILE_PATH') or (
    r"C:\Users\euque\OneDrive\Documentos\Cesar School\5° Período - CC Cesar School\Análise e Visualização de Dados\avd-projeto\data\processed\concatenado_clean_20251125_202250.csv"
)
ARTIFACT_DIR = os.getenv('AVD_ARTIFACT_DIR') or (
    r"C:\Users\euque\OneDrive\Documentos\Cesar School\5° Período - CC Cesar School\Análise e Visualização de Dados\avd-projeto\data\artifacts"
)

# Create the artifact directory up front; exist_ok=True makes re-running the
# cell a no-op when the directory is already there.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib
import mlflow
import mlflow.sklearn
import numpy as np

# Point the MLflow client at a tracking server: the MLFLOW_TRACKING_URI
# environment variable takes precedence, otherwise the default URL is used.
mlflow_tracking = os.environ.get('MLFLOW_TRACKING_URI', 'http://mlflow:5000')
mlflow.set_tracking_uri(mlflow_tracking)
# Echo the URI MLflow reports back, as a quick sanity check.
print('MLflow tracking URI:', mlflow.get_tracking_uri())

# Read the processed CSV into memory, parsing the DATA column as day-first
# dates, then preview the first rows in the notebook output.
df = pd.read_csv(
    FILE_PATH,
    parse_dates=['DATA'],
    dayfirst=True,
    low_memory=False,
)
display(df.head())

# Columns used by the model (FEATURES -> TARGET) and the identifying columns
# that are carried along for the exported results only.
EXTRA_COLUMNS = ['ESTACAO', 'DATA', 'HORA_UTC']
FEATURES = ['TEMPERATURA_BULBO_SECO', 'PRESSAO_ATM_EST', 'RADIACAO_GLOBAL']
TARGET = 'UMIDADE_REL'

# Keep only rows where every feature and the target are present; missing
# values in the extra columns do not cause a row to be dropped.
_required = FEATURES + [TARGET]
_complete_rows = df[_required].notna().all(axis=1)
df_model = df.loc[_complete_rows, _required + EXTRA_COLUMNS].copy()
print('Used rows:', len(df_model))

# Plain NumPy arrays for scikit-learn.
X = df_model[FEATURES].to_numpy()
y = df_model[TARGET].to_numpy()

# Random 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model: standardise the features, then a 100-tree random forest regressor.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

# Fit on the training split and report the mean absolute error on the
# held-out split.
preds = pipeline.fit(X_train, y_train).predict(X_test)
mae = mean_absolute_error(y_test, preds)
print('MAE (test):', mae)

# NOTE: the triple-quoted block below is a bare string expression, so the
# MLflow logging code inside it is never executed (it is kept as a disabled
# experiment). Turn it back into regular code to log the run to MLflow.
""" Teste MLFLOW
# Log to MLflow
with mlflow.start_run(run_name='rf_umidade'):
    mlflow.log_param('model', 'RandomForestRegressor')
    mlflow.log_param('features', FEATURES)
    mlflow.log_metric('mae', float(mae))
    # save model artifact and log
    model_path = os.path.join(ARTIFACT_DIR, 'rf_model.pkl')
    joblib.dump(pipeline, model_path)
    mlflow.log_artifact(model_path, artifact_path='models')
    # optional: save a small CSV with test set and predictions
    out_df = pd.DataFrame(X_test, columns=FEATURES)
    out_df[TARGET + '_real'] = y_test
    out_df[TARGET + '_pred'] = preds
    preds_path = os.path.join(ARTIFACT_DIR, 'predictions_partial.csv')
    out_df.to_csv(preds_path, index=False)
    mlflow.log_artifact(preds_path, artifact_path='predictions')
    run_id = mlflow.active_run().info.run_id
    print('MLflow run_id:', run_id) """

# Save final predictions (full set) to ARTIFACT_DIR for Power BI use (optional).
# Predict from a plain NumPy array: the pipeline was fitted on arrays (no
# column names), so passing a DataFrame here would make scikit-learn emit an
# "X has feature names, but ... was fitted without feature names" warning.
# NOTE(review): this covers every row in df_model, including the rows the
# model was trained on, so these predictions are not an out-of-sample estimate.
full_preds = pipeline.predict(df_model[FEATURES].values)

# Export the modelling frame plus a '<TARGET>_pred' column, without the index.
df_out = df_model.copy()
df_out[TARGET + '_pred'] = full_preds
final_predictions_path = os.path.join(ARTIFACT_DIR, 'predictions_full.csv')
df_out.to_csv(final_predictions_path, index=False)
print('Predictions exported to', final_predictions_path)

# Save the fitted pipeline (scaler + model) locally as well, for serving.
joblib.dump(pipeline, os.path.join(ARTIFACT_DIR, 'rf_model_for_serving.pkl'))
print('Model saved in', ARTIFACT_DIR)