# 02_Preprocessamento.ipynb
---------------------------------
1. Imports e Configurações
---------------------------------

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Set seaborn's global plotting style to "whitegrid" for the figures in this notebook.
sns.set(style="whitegrid")


In [None]:
# Cell 2 — project paths.
# PROJECT_ROOT is the only machine-specific value; every data path below is derived
# from it, so moving the project means editing a single line.
PROJECT_ROOT = r"C:\Users\levie\asasas\projeto_ml"

# Raw input CSV.
RAW = os.path.join(PROJECT_ROOT, "data", "raw", "delivery_time.csv")
# CSV written after the negative-value correction step.
CLEAN = os.path.join(PROJECT_ROOT, "data", "processed", "delivery_clean.csv")
# CSV written after the StandardScaler step.
SCALED = os.path.join(PROJECT_ROOT, "data", "processed", "delivery_processed.csv")


---------------------------------
2. Caminhos dos Arquivos + Carregar arquivos
---------------------------------

In [None]:
# RAW, CLEAN and SCALED are defined once in the path-configuration cell above and are
# intentionally not re-declared here, so the file locations live in a single place.

# The raw file is mandatory: fail fast with a clear message when it is missing.
if not os.path.exists(RAW):
    raise FileNotFoundError(f"Arquivo RAW não encontrado: {RAW}")
df_raw = pd.read_csv(RAW)

# CLEAN is optional on a first run: fall back to a copy of the raw data so the
# preprocessing cells below can still execute and create the file.
if os.path.exists(CLEAN):
    df_clean = pd.read_csv(CLEAN)
else:
    print("WARNING: arquivo CLEAN não encontrado — usando cópia de raw como df_clean (execute células de preprocessamento).")
    df_clean = df_raw.copy()

# SCALED is optional as well: fall back to a copy of df_clean until the scaling cell runs.
if os.path.exists(SCALED):
    df_scaled = pd.read_csv(SCALED)
else:
    print("WARNING: arquivo SCALED não encontrado — usando df_clean como df_scaled (execute pré-processamento).")
    df_scaled = df_clean.copy()

# Quick sanity overview: shapes plus the first rows of each version.
print("Shapes — raw:", df_raw.shape, "clean:", df_clean.shape, "scaled:", df_scaled.shape)
display(df_raw.head())
display(df_clean.head())
display(df_scaled.head())



---------------------------------
4. Verificar Valores Negativos
---------------------------------

In [None]:
# Columns that are checked for negative entries.
cols_to_check = [
    "distance_km",
    "package_weight_kg",
    "driver_experience_years",
    "num_stops",
    "customer_rating",
    "fuel_cost",
    "delivery_time_hours"
]

# Report the number of negative entries per column. A column that is absent from
# df_clean is reported and skipped (same behaviour as the re-validation cell) instead
# of aborting the whole cell with a KeyError.
for col in cols_to_check:
    if col not in df_clean.columns:
        print(f"⚠️ Coluna '{col}' não encontrada em df_clean — pulando.")
        continue
    neg_count = (df_clean[col] < 0).sum()
    if neg_count > 0:
        print(f"⚠️ {col} possui {neg_count} valores negativos!")
    else:
        print(f"✔️ {col} está sem valores negativos.")


---------------------------------
5. Gráfico de Comparação (Raw vs Clean vs Scaled)
---------------------------------

In [None]:
# Column whose distribution is compared across the three dataset versions.
col = "distance_km"

# One shared axes; each version is overlaid as a filled KDE curve.
fig, ax = plt.subplots(figsize=(10, 5))
for stage_frame, stage_label in (
    (df_raw, "Raw (etapa 1)"),
    (df_clean, "Clean (etapa 2)"),
    (df_scaled, "Scaled (etapa 3)"),
):
    sns.kdeplot(stage_frame[col], label=stage_label, fill=True, ax=ax)

ax.set_title(f"Compara√ß√£o de Distribui√ß√µes ‚Äî {col}")
ax.legend()
plt.show()


---------------------------------
6. Resolver Valores Negativos
---------------------------------

In [None]:
# Make sure the folder that receives the cleaned CSV exists before writing to it.
clean_dir = os.path.dirname(CLEAN)
os.makedirs(clean_dir, exist_ok=True)

# Replace every value in the two listed columns by its absolute value.
sign_fix_cols = ["distance_km", "package_weight_kg"]
df_clean[sign_fix_cols] = df_clean[sign_fix_cols].abs()

# Persist the corrected frame (without the index column).
df_clean.to_csv(CLEAN, index=False)
print("Valores negativos corrigidos e arquivo salvo.")


---------------------------------
7. Revalidar Após a Correção
---------------------------------

In [None]:
import numpy as np

# Choose the columns to re-check: a non-empty `cols_fix` wins, otherwise `cols_to_check`
# is used; if neither name is defined, every numeric column of df_clean is checked.
cols_fix = globals().get('cols_fix')
if not cols_fix:
    cols_fix = globals().get('cols_to_check')
if cols_fix is None:
    cols_fix = list(df_clean.select_dtypes(include=[np.number]).columns)

print("Revalidando colunas:", cols_fix)
for col in cols_fix:
    if col in df_clean.columns:
        # Count the entries that are still below zero after the correction step.
        neg_remaining = (df_clean[col] < 0).sum()
        print(f"{col}: negativos restantes = {neg_remaining}")
    else:
        print(f"‚ö†Ô∏è Coluna '{col}' n√£o encontrada em df_clean ‚Äî pulando.")


In [None]:
# Cell 7.5 — build the SCALED dataset with StandardScaler and persist scaler + metadata.

from sklearn.preprocessing import StandardScaler
import joblib

# Every numeric column of df_clean is standardized.
# NOTE(review): this includes any numeric target column, and the scaler is fitted on
# the whole dataset (no train/test split is visible here) — confirm this is intended.
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
if not numeric_cols:
    # Fail with a readable message instead of an opaque error from inside scikit-learn.
    raise ValueError("df_clean não possui colunas numéricas para escalar.")

# Work on a copy so df_clean keeps the unscaled values.
df_scaled = df_clean.copy()

# Fit the scaler and overwrite the numeric columns with their standardized values.
scaler = StandardScaler()
df_scaled[numeric_cols] = scaler.fit_transform(df_clean[numeric_cols])

# The models folder lives under the project root, i.e. three levels above the SCALED
# file (<root>/data/processed/<file>); deriving it keeps it in sync with SCALED.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(SCALED)))
models_dir = os.path.join(project_root, "models")
scaler_path = os.path.join(models_dir, "scaler.pkl")
meta_path = os.path.join(models_dir, "preprocess_meta.pkl")

# Create the output folders if they do not exist yet.
os.makedirs(os.path.dirname(SCALED), exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# Save the scaled data and the fitted scaler.
df_scaled.to_csv(SCALED, index=False)
joblib.dump(scaler, scaler_path)

# Save the column list in fit order: position i of scaler.mean_ / scaler.scale_
# corresponds to numeric_cols[i].
meta = {"numeric_to_scale": numeric_cols}
joblib.dump(meta, meta_path)

print("✔️ Arquivo SCALED salvo!")
print("✔️ Scaler salvo em models/scaler.pkl")
print("✔️ Metadata salvo em models/preprocess_meta.pkl")
print("Colunas escaladas:", numeric_cols)


---------------------------------
8. Gráfico de Comparação Após Correção
---------------------------------

In [None]:
# Column whose distribution is compared after the correction and scaling steps.
col = "distance_km"

# One shared axes; each version is overlaid as a filled KDE curve.
fig, ax = plt.subplots(figsize=(10, 5))
for stage_frame, stage_label in (
    (df_raw, "Raw (etapa 1)"),
    (df_clean, "Clean corrigido (etapa 2)"),
    (df_scaled, "Scaled (etapa 3)"),
):
    sns.kdeplot(stage_frame[col], label=stage_label, fill=True, ax=ax)

ax.set_title(f"Distribui√ß√£o ‚Äî RAW vs CLEAN CORRIGIDO vs SCALED ‚Äî {col}")
ax.legend()
plt.show()


---------------------------------
9. Mostrar Estatísticas das Três Versões
---------------------------------

In [None]:
# Cell 09 — per-column mean of each dataset version, side by side.
# The "mean" row of describe() is selected by label rather than by position, so the
# table can never silently show a different statistic (e.g. "unique", which is row 1
# when a frame has no numeric columns); a missing "mean" row raises a KeyError instead.
pd.DataFrame({
    "Raw": df_raw.describe().loc["mean"],
    "Clean Corrigido": df_clean.describe().loc["mean"],
    "Scaled": df_scaled.describe().loc["mean"],
})


---------------------------------
10. Teste Scaler
---------------------------------

In [None]:
# ===========================================
# 🔎 CÉLULA ÚNICA — DIAGNÓSTICO COMPLETO DO SCALER
# ===========================================

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

# All artefact locations are derived from a single root, so this self-contained cell
# needs only one edit when the project folder changes.
PROJECT_ROOT = r"C:\Users\levie\asasas\projeto_ml"
RAW = os.path.join(PROJECT_ROOT, "data", "raw", "delivery_time.csv")
OUT_CLEAN = os.path.join(PROJECT_ROOT, "data", "processed", "delivery_clean.csv")
OUT_PROCESSED = os.path.join(PROJECT_ROOT, "data", "processed", "delivery_processed.csv")
SCALER_PATH = os.path.join(PROJECT_ROOT, "models", "scaler.pkl")
META_PATH = os.path.join(PROJECT_ROOT, "models", "preprocess_meta.pkl")

# --- column to diagnose
col = "distance_km"     # <- change this to inspect another column

# --- load the three dataset versions plus the fitted scaler and its metadata
df_raw = pd.read_csv(RAW)
df_clean = pd.read_csv(OUT_CLEAN)
df_processed = pd.read_csv(OUT_PROCESSED)
# NOTE: joblib.load unpickles arbitrary objects — only load files produced by this project.
scaler = joblib.load(SCALER_PATH)
meta = joblib.load(META_PATH)

# Column names stored by the scaling step; empty list when the key is missing.
numeric_to_scale = meta.get("numeric_to_scale", [])

print("\n==============================")
print("📌 INFORMAÇÕES INICIAIS")
print("==============================")
print("Coluna analisada:", col)
print("Colunas escaladas:", numeric_to_scale)
print("Existe no processed:", col in df_processed.columns)

# The column must exist in both RAW and CLEAN; stop the cell at the first frame missing it.
for stage_name, stage_frame in (("RAW", df_raw), ("CLEAN", df_clean)):
    if col not in stage_frame.columns:
        print(f"\n❌ ERRO: coluna {col} não existe no {stage_name}. Verifique o nome.")
        raise SystemExit()

# =======================================================
# TEST 1 — summary statistics before/after
# =======================================================
print("\n==============================")
print("üìä TESTE 1 ‚Äî Estat√≠sticas RAW vs CLEAN vs PROCESSED")
print("==============================")


def _four_stats(values):
    """Return (min, median, mean, max) of a Series, in that order."""
    return values.min(), values.median(), values.mean(), values.max()


orig = df_clean[col]
# `proc` stays None when the column is absent from the processed file.
proc = df_processed[col] if col in df_processed.columns else None

print("RAW   ‚Üí min / median / mean / max:", *_four_stats(df_raw[col]))
print("CLEAN ‚Üí min / median / mean / max:", *_four_stats(orig))

if proc is None:
    print("PROC  ‚Üí coluna n√£o est√° no df_processed (poss√≠vel one-hot).")
else:
    print("PROC  ‚Üí min / median / mean / max:", *_four_stats(proc))

# =======================================================
# TEST 2 — plot the distributions
# =======================================================
print("\n==============================")
print("üìà TESTE 2 ‚Äî Gr√°ficos de Distribui√ß√£o")
print("==============================")

# Two panels side by side: CLEAN on the left, PROCESSED on the right.
fig, (ax_clean, ax_proc) = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(orig.dropna(), kde=True, ax=ax_clean)
ax_clean.set_title(f"CLEAN ‚Äî {col}")

if proc is None:
    # Nothing to plot: show a placeholder message in the right-hand panel.
    ax_proc.text(0.3, 0.5, "Coluna n√£o est√° no processed", fontsize=14)
else:
    sns.histplot(proc.dropna(), kde=True, ax=ax_proc)
    ax_proc.set_title(f"PROCESSED (scaled) ‚Äî {col}")
plt.show()

# =======================================================
# TEST 3 — check whether the column became one-hot (dummy) columns
# =======================================================
print("\n==============================")
print("üß© TESTE 3 ‚Äî Verificando se a coluna virou dummies")
print("==============================")

# Processed columns named "<col>_<something>" are treated as dummies of `col`.
dummy_prefix = f"{col}_"
related = [name for name in df_processed.columns if name.startswith(dummy_prefix)]
if related:
    print("A coluna original virou as seguintes dummies:")
    print(related)
else:
    print("Nenhuma dummy encontrada ‚Äî √© uma coluna num√©rica normal.")

# =======================================================
# TEST 4 — invert the scaling and compare with CLEAN
# =======================================================
print("\n==============================")
print("🔄 TESTE 4 — Inversão do StandardScaler")
print("==============================")

if col not in numeric_to_scale:
    print("A coluna NÃO está em numeric_to_scale → não passou pelo scaler.")
elif col not in df_processed.columns:
    # Listed in the metadata but missing from the file: report it instead of a KeyError.
    print("A coluna está em numeric_to_scale, mas não existe no processed — inversão impossível.")
else:
    # Position of the column in the scaler's fitted parameters.
    idx = numeric_to_scale.index(col)
    scaled_vals = df_processed[col].to_numpy()
    # StandardScaler inverse for one column: x = z * scale + mean.
    inv = scaled_vals * scaler.scale_[idx] + scaler.mean_[idx]

    print("Valores invertidos (min/median/max):",
          np.nanmin(inv), np.nanmedian(inv), np.nanmax(inv))

    # Round-trip check: the inverted values should reproduce the CLEAN column.
    clean_vals = df_clean[col].to_numpy()
    if inv.shape != clean_vals.shape:
        print("⚠ CLEAN e PROCESSED têm tamanhos diferentes — não foi possível comparar a inversão.")
    elif np.allclose(inv, clean_vals, equal_nan=True):
        print("✔ A inversão reproduz os valores do CLEAN.")
    else:
        print("❌ A inversão NÃO reproduz os valores do CLEAN — scaler e processed estão fora de sincronia.")

# =======================================================
# FINAL DIAGNOSIS
# =======================================================
print("\n==============================")
print("üß† DIAGN√ìSTICO FINAL")
print("==============================")

dummy_flag = bool(related)
scaled_flag = col in numeric_to_scale

# Pick the verdict first, then print it line by line.
if dummy_flag:
    verdict_lines = (
        "‚úî A coluna virou v√°rias colunas one-hot ‚ö†Ô∏è",
        "Picos altos s√£o NORMAIS em distribui√ß√µes one-hot + scaler.",
    )
elif scaled_flag:
    verdict_lines = (
        "‚úî A coluna foi escalada corretamente.",
        "Se existe pico, provavelmente √©:",
        "- distribui√ß√£o muito concentrada (normal);",
        "- valores discretos;",
        "- ou outliers que foram limitados.",
    )
elif col in df_processed:
    # Present in the processed file but never listed for scaling.
    verdict_lines = (
        "‚ö† A coluna existe no processed mas N√ÉO est√° na lista numeric_to_scale.",
        "Isso pode gerar comportamento estranho no gr√°fico.",
    )
else:
    verdict_lines = (
        "‚ùå Algo est√° inconsistente: coluna n√£o aparece no processed.",
    )

for verdict_line in verdict_lines:
    print(verdict_line)
