In [None]:
# 02_Preprocessamento.ipynb
import os, sys
sys.path.append(os.path.abspath('../'))
import pandas as pd
import numpy as np

from src.data_preparation import load_data, impute_missing_values, standardize_text_categories
from src.outlier_handler import cap_outliers_iqr
from src.feature_engineering import create_features, one_hot_encode, scale_numeric

# ================================================
# Carregar dados brutos
# ================================================
path_raw = '../data/raw/delivery_time.csv'
df = load_data(path_raw)
print("Dimensões:", df.shape)
df.head()

# ================================================
# Tratar valores faltantes
# ================================================
num_cols = ['distance_km', 'package_weight_kg', 'driver_experience_years',
             'num_stops', 'customer_rating', 'fuel_cost', 'delivery_time_hours']

cat_cols = ['delivery_type', 'vehicle_type', 'traffic_condition', 'weather', 
            'time_of_day', 'day_of_week', 'is_priority', 'package_fragile', 'delivery_zone']

# Mantém apenas as colunas que existem
num_cols = [c for c in num_cols if c in df.columns]
cat_cols = [c for c in cat_cols if c in df.columns]

df_imputed = impute_missing_values(df, num_cols, cat_cols)
print("Valores faltantes após imputação:", df_imputed.isnull().sum().sum())

# ================================================
# Tratar outliers (IQR)
# ================================================
df_capped = cap_outliers_iqr(df_imputed, num_cols)

# ================================================
# Criar novas features
# ================================================
df_feat = create_features(df_capped)

# verificar novas colunas criadas
print([c for c in df_feat.columns if 'avg_speed' in c or 'time_per_km' in c])

# ================================================
# One-Hot Encoding
# ================================================
cat_cols = [c for c in cat_cols if c in df_feat.columns]
df_encoded = one_hot_encode(df_feat, cat_cols, drop_first=True)
print("Dimensões após encoding:", df_encoded.shape)

# ================================================
# Padronizar features numéricas
# ================================================
numeric_cols_present = [c for c in num_cols if c in df_encoded.columns]
extras = [c for c in ['avg_speed_kmh', 'time_per_km_h'] if c in df_encoded.columns]
numeric_cols_present += extras

df_scaled, scaler = scale_numeric(df_encoded, numeric_cols_present, save_path='../models/scaler.pkl')
print("Scaler salvo em ../models/scaler.pkl")

# ================================================
# Salvar dataset processado
# ================================================
os.makedirs('../data/processed', exist_ok=True)
processed_path = '../data/processed/delivery_processed.csv'
df_scaled.to_csv(processed_path, index=False)
print("Dataset processado salvo em:", processed_path)
print("Dimensões finais:", df_scaled.shape)


In [27]:
import pandas as pd

df_etapa2 = pd.read_csv(r"C:\Users\levie\asasas\projeto_ml\data\processed\delivery_processed.csv")

df_etapa2.head()


Unnamed: 0,delivery_id,distance_km,package_weight_kg,driver_experience_years,num_stops,customer_rating,fuel_cost,delivery_time_hours,avg_speed_kmh,time_per_km_h,...,package_fragile_Yes,package_fragile_no,package_fragile_yes,delivery_zone_ Suburbana,delivery_zone_ Urbana,delivery_zone_Rural,delivery_zone_Suburbana,delivery_zone_Urbana,delivery_zone_suburbana,delivery_zone_urbana
0,DEL00331,0.212253,2.647201,0.091559,3.108428,1.700263,-0.126139,0.384929,-0.186414,-0.131731,...,False,False,False,False,False,False,False,True,False,False
1,DEL02382,-0.465047,-0.674816,1.179863,-0.081985,1.554082,0.288737,0.005903,-0.249251,-0.105908,...,True,False,False,False,False,False,False,True,False,False
2,DEL00825,1.114955,-0.868911,-0.271209,-0.879588,0.092268,0.807332,-1.690263,1.800354,-0.20546,...,True,False,False,False,False,False,False,True,False,False
3,DEL00336,-0.160864,0.698782,0.817095,-0.879588,-1.745441,-0.216893,0.615012,-0.255975,-0.102167,...,True,False,False,False,False,False,True,False,False,False
4,DEL00551,-0.149922,-0.055204,-1.359513,-0.879588,0.781409,1.377787,0.436058,-0.240753,-0.110295,...,True,False,False,False,False,False,False,True,False,False


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ============================
# 1. Carregar os dados
# ============================
df1 = pd.read_csv(r"C:\Users\levie\asasas\projeto_ml\data\raw\delivery_time.csv")               
df2 = pd.read_csv(r"C:\Users\levie\asasas\projeto_ml\data\processed\delivery_processed.csv")  

# ============================
# 2. Comparar tamanho
# ============================
print("Tamanho dos DataFrames:")
print("Etapa 1:", df1.shape)
print("Etapa 2:", df2.shape)

# ============================
# 3. Comparar colunas removidas, adicionadas ou alteradas
# ============================
colunas_etapa1 = set(df1.columns)
colunas_etapa2 = set(df2.columns)

print("\nColunas removidas na etapa 2:")
print(colunas_etapa1 - colunas_etapa2)

print("\nColunas adicionadas na etapa 2:")
print(colunas_etapa2 - colunas_etapa1)

print("\nColunas mantidas:")
print(colunas_etapa1 & colunas_etapa2)

# ============================
# 4. Comparar estatísticas numéricas
# ============================
comparacao_stats = pd.concat(
    [df1.describe(include='all'), df2.describe(include='all')],
    axis=1,
    keys=['Etapa 1', 'Etapa 2']
)

print("\n===== ESTATÍSTICAS COMPARADAS =====")
display(comparacao_stats)

# ============================
# 5. Comparar quantidade de valores nulos
# ============================
nulos = pd.DataFrame({
    "Etapa 1": df1.isnull().sum(),
    "Etapa 2": df2.isnull().sum()
})

print("\n===== COMPARAÇÃO DE VALORES NULOS =====")
display(nulos)

# ============================
# 6. Gráfico: comparação de distribuições (para numéricos)
# SOMENTE COLUNAS QUE EXISTEM NOS DOIS DATAFRAMES
# ============================

numericas1 = df1.select_dtypes(include=['int64', 'float64']).columns
numericas2 = df2.select_dtypes(include=['int64', 'float64']).columns

# interseção = apenas colunas numéricas presentes nas duas etapas
numericas_comuns = list(set(numericas1) & set(numericas2))

print("Colunas numéricas comparadas:")
print(numericas_comuns)

for col in numericas_comuns:
    plt.figure(figsize=(10,5))
    sns.kdeplot(df1[col], label="Etapa 1 (bruto)", linewidth=2)
    sns.kdeplot(df2[col], label="Etapa 2 (limpo)", linewidth=2)
    plt.title(f"Distribuição antes x depois — {col}")
    plt.legend()
    plt.show()


# ============================
# 7. Correlação antes x depois
# ============================

corr1 = df1.select_dtypes(include=['int64', 'float64']).corr()
corr2 = df2.select_dtypes(include=['int64', 'float64']).corr()

plt.figure(figsize=(10,8))
sns.heatmap(corr1, annot=True, fmt=".2f", cmap="viridis")
plt.title("Correlação — Etapa 1 (bruto)")
plt.show()

plt.figure(figsize=(10,8))
sns.heatmap(corr2, annot=True, fmt=".2f", cmap="viridis")
plt.title("Correlação — Etapa 2 (limpo)")
plt.show()

