In [None]:
# 02_Preprocessamento.ipynb - Etapa 2
import os, sys
sys.path.append(os.path.abspath('../'))
import pandas as pd
import numpy as np

from src.data_preparation import load_data, impute_missing_values, standardize_text_categories
from src.outlier_handler import cap_outliers_iqr
from src.feature_engineering import create_features, one_hot_encode, scale_numeric

# ================================================
# Load the raw data
# ================================================
path_raw = '../data/raw/delivery_time.csv'
df = load_data(path_raw)
print("Dimensões:", df.shape)
# A bare `df.head()` in the middle of a cell is evaluated and then discarded
# (only the last expression of a cell is echoed), so print the preview
# explicitly to make it visible.
print(df.head())

# ================================================
# Handle missing values
# ================================================
# Candidate column names handed to impute_missing_values as the numeric
# and the categorical groups, respectively.
numeric_candidates = [
    'distance_km',
    'package_weight_kg',
    'driver_experience_years',
    'num_stops',
    'customer_rating',
    'fuel_cost',
    'delivery_time_hours',
]
categorical_candidates = [
    'delivery_type',
    'vehicle_type',
    'traffic_condition',
    'weather',
    'time_of_day',
    'day_of_week',
    'is_priority',
    'package_fragile',
    'delivery_zone',
]

# Keep only the candidates that are actually present in the loaded frame,
# preserving the order in which they are listed above.
raw_columns = df.columns
num_cols = [name for name in numeric_candidates if name in raw_columns]
cat_cols = [name for name in categorical_candidates if name in raw_columns]

# NOTE(review): standardize_text_categories is imported at the top of this
# file but never called in this cell - confirm whether category text should
# be normalized before imputation/encoding.
df_imputed = impute_missing_values(df, num_cols, cat_cols)
remaining_nulls = df_imputed.isnull().sum().sum()
print("Valores faltantes após imputação:", remaining_nulls)

# ================================================
# Handle outliers (IQR)
# ================================================
# NOTE(review): num_cols may contain 'delivery_time_hours'; if that column
# is the modelling target, confirm that capping it is intended.
df_capped = cap_outliers_iqr(df_imputed, num_cols)

# ================================================
# Create new features
# ================================================
df_feat = create_features(df_capped)

# List the columns whose name contains one of the expected new-feature
# markers, as a quick check of what create_features produced.
feature_markers = ('avg_speed', 'time_per_km')
new_feature_cols = [
    col for col in df_feat.columns
    if any(marker in col for marker in feature_markers)
]
print(new_feature_cols)

# ================================================
# One-Hot Encoding
# ================================================
# Re-filter the categorical columns against the feature-engineered frame so
# that only columns still present in df_feat are passed to the encoder.
feat_columns = df_feat.columns
cat_cols = [name for name in cat_cols if name in feat_columns]

df_encoded = one_hot_encode(df_feat, cat_cols, drop_first=True)
encoded_shape = df_encoded.shape
print("Dimensões após encoding:", encoded_shape)

# ================================================
# Standardize numeric features
# ================================================
# Columns to scale: the original numeric columns that are still present
# after encoding, followed by the engineered features when they exist.
numeric_cols_present = [c for c in num_cols if c in df_encoded.columns]
extras = [c for c in ['avg_speed_kmh', 'time_per_km_h'] if c in df_encoded.columns]
numeric_cols_present += extras

# NOTE(review): numeric_cols_present may include 'delivery_time_hours' (and
# 'time_per_km_h', which by its name looks derived from it); if that column
# is the modelling target, confirm that scaling it here is intended.

# Single source of truth for the scaler location, so the path handed to
# scale_numeric and the one reported below cannot drift apart.
scaler_path = '../models/scaler.pkl'
# Make sure the destination directory exists before the scaler is saved,
# mirroring what this notebook already does for ../data/processed.
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)

df_scaled, scaler = scale_numeric(df_encoded, numeric_cols_present, save_path=scaler_path)
print("Scaler salvo em", scaler_path)

# ================================================
# Save the processed dataset
# ================================================
processed_path = '../data/processed/delivery_processed.csv'

# Create the output directory (derived from the target path) if needed,
# then write the scaled frame without the index column.
processed_dir = os.path.dirname(processed_path)
os.makedirs(processed_dir, exist_ok=True)
df_scaled.to_csv(processed_path, index=False)

print("Dataset processado salvo em:", processed_path)
final_shape = df_scaled.shape
print("Dimensões finais:", final_shape)
