# Training v3 - MLP com Otimização de Hiperparâmetros

## Objetivo
Treinar modelo MLP com busca de hiperparâmetros usando:
1. Análise Exploratória de `1_analise_exploratoria.ipynb`
2. Pré-processamento de `2_preprocess.ipynb`
3. Modelo Avançado de `v3_model.ipynb`

## Diferenças v3
- Busca de hiperparâmetros com múltiplos seeds
- Safe MSE com clipping para evitar overflow
- Validação temporal com TimeSeriesSplit
- Seleção de melhor combinação por métricas

---


In [1]:
print("="*70)
print("TRAINING V3 - MLP COM OTIMIZAÇÃO DE HIPERPARÂMETROS")
print("="*70)

print("\nNotebooks Reutilizados:")
print("  1. 1_analise_exploratoria.ipynb")
print("  2. 2_preprocess.ipynb")
print("  3. v3_model.ipynb")

print("\nEtapas deste notebook:")
print("  - Carregar dados e fazer feature engineering")
print("  - Definir espaço de busca de hiperparâmetros")
print("  - Treinar com TimeSeriesSplit e múltiplos seeds")
print("  - Selecionar melhor configuração por Competition Score")
print("  - Treinar modelo final e gerar submission")

print("\nNOTA: Este notebook implementa busca de parâmetros otimizada")


TRAINING V3 - MLP COM OTIMIZAÇÃO DE HIPERPARÂMETROS

Notebooks Reutilizados:
  1. 1_analise_exploratoria.ipynb
  2. 2_preprocess.ipynb
  3. v3_model.ipynb

Etapas deste notebook:
  - Carregar dados e fazer feature engineering
  - Definir espaço de busca de hiperparâmetros
  - Treinar com TimeSeriesSplit e múltiplos seeds
  - Selecionar melhor configuração por Competition Score
  - Treinar modelo final e gerar submission

NOTA: Este notebook implementa busca de parâmetros otimizada


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import ParameterSampler

plt.style.use('seaborn-v0_8-darkgrid')

DATA_DIR = Path('.')
TRAIN_DIR = DATA_DIR / 'train'
TEST_PATH = DATA_DIR / 'test.csv'

print("Importações carregadas")


Importações carregadas


In [3]:
## Safe MSE com clipping para evitar overflow

from sklearn import metrics

np.seterr(over='ignore', invalid='ignore')

_ORIG_MSE = metrics.mean_squared_error
PRED_MAX = 1e10
PRED_MIN = 0.0
LOG_MAX = 20.0
LOG_MIN = -20.0

def _safe_mse(y_true, y_pred, sample_weight=None, multioutput="uniform_average"):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.nan_to_num(y_true, nan=PRED_MIN, posinf=PRED_MAX, neginf=PRED_MIN)
    y_pred = np.nan_to_num(y_pred, nan=PRED_MIN, posinf=PRED_MAX, neginf=PRED_MIN)
    y_true = np.clip(y_true, PRED_MIN, PRED_MAX)
    y_pred = np.clip(y_pred, PRED_MIN, PRED_MAX)
    return _ORIG_MSE(y_true, y_pred, sample_weight=sample_weight, multioutput=multioutput)

metrics.mean_squared_error = _safe_mse

def competition_score(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    den = np.where(np.abs(y_true) < 1e-9, 1.0, np.abs(y_true))
    ape = np.abs(y_true - y_pred) / den
    frac_le1 = float(np.mean(ape <= 1.0))
    if 1.0 - frac_le1 > 0.30:
        return {"score": 0.0, "frac_le1": frac_le1}
    mask = ape <= 1.0
    if not np.any(mask):
        return {"score": 0.0, "frac_le1": 0.0}
    mape_subset = float(np.mean(ape[mask]))
    scaled_mape = float(mape_subset / max(frac_le1, 1e-12))
    return {"score": float(1.0 - scaled_mape), "frac_le1": frac_le1}

print("Safe MSE e métricas configuradas")


Safe MSE e métricas configuradas


In [4]:
## Carregar e Processar Dados

new_house = pd.read_csv(TRAIN_DIR / 'new_house_transactions.csv')
print(f"Shape original: {new_house.shape}")

m = (
    new_house['month']
    .astype(str)
    .str.strip()
    .str.replace(r'[\u2013\u2014]', '-', regex=True)
    .str.replace('.', '', regex=False)
    .str.replace('Sept', 'Sep', regex=False)
)

d = pd.to_datetime(m, format='%Y-%b', errors='coerce')
d = d.fillna(pd.to_datetime(m, format='%Y-%m', errors='coerce'))
d = d.fillna(pd.to_datetime(m, format='%b %Y', errors='coerce'))
d = d.fillna(pd.to_datetime(m, format='%Y %b', errors='coerce'))
d = d.fillna(pd.to_datetime(m, errors='coerce'))

new_house['date'] = d
new_house['sector_id'] = (
    new_house['sector']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype('int64')
)

before = new_house.shape[0]
new_house = new_house.dropna(subset=['date'])
after = new_house.shape[0]

print(f"Após parsing: {after}/{before} linhas válidas")
print(f"Período: {new_house['date'].min().date()} a {new_house['date'].max().date()}")
print(f"Setores: {new_house['sector_id'].nunique()}")


Shape original: (5433, 11)
Após parsing: 5433/5433 linhas válidas
Período: 2019-01-01 a 2024-07-01
Setores: 95


In [5]:
## Feature Engineering

df = new_house.copy()
df = df.sort_values(['sector_id', 'date']).reset_index(drop=True)

df['y'] = df['amount_new_house_transactions']
df['area'] = df['area_new_house_transactions']
df['price'] = df['price_new_house_transactions']
df['num'] = df['num_new_house_transactions']

df['price_x_area'] = df['price'] * df['area']
df['price_per_unit'] = df['price'] / (df['num'] + 1e-6)
df['area_per_unit'] = df['area'] / (df['num'] + 1e-6)

for lag in [1, 3, 6]:
    df[f'price_lag{lag}'] = df.groupby('sector_id')['price'].shift(lag)
    df[f'area_lag{lag}'] = df.groupby('sector_id')['area'].shift(lag)
    df[f'num_lag{lag}'] = df.groupby('sector_id')['num'].shift(lag)
    df[f'amount_lag{lag}'] = df.groupby('sector_id')['y'].shift(lag)

for window in [3, 6]:
    df[f'price_ma{window}'] = df.groupby('sector_id')['price'].transform(
        lambda x: x.rolling(window, min_periods=1).mean()
    )
    df[f'area_ma{window}'] = df.groupby('sector_id')['area'].transform(
        lambda x: x.rolling(window, min_periods=1).mean()
    )
    df[f'amount_ma{window}'] = df.groupby('sector_id')['y'].transform(
        lambda x: x.rolling(window, min_periods=1).mean()
    )

df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter
df['year'] = df['date'].dt.year

lag_cols = [c for c in df.columns if 'lag' in c or 'ma' in c]
for col in lag_cols:
    df[col] = df.groupby('sector_id')[col].ffill().bfill()

feature_cols = [
    'area', 'price', 'num', 'price_x_area', 'price_per_unit', 'area_per_unit',
    'price_lag1', 'price_lag3', 'price_lag6',
    'area_lag1', 'area_lag3', 'area_lag6',
    'num_lag1', 'num_lag3', 'num_lag6',
    'amount_lag1', 'amount_lag3', 'amount_lag6',
    'price_ma3', 'price_ma6', 'area_ma3', 'area_ma6', 'amount_ma3', 'amount_ma6',
    'month', 'quarter', 'year'
]

for col in feature_cols:
    if df[col].isna().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

df_clean = df[['sector_id', 'date', 'y'] + feature_cols].dropna(subset=['sector_id', 'date', 'y']).reset_index(drop=True)

print(f"Shape após feature eng: {df_clean.shape}")
print(f"Features: {len(feature_cols)}")


Shape após feature eng: (5433, 30)
Features: 27


In [None]:
## Busca de Hiperparâmetros com TimeSeriesSplit

X = df_clean[feature_cols].copy()
y = df_clean['y'].values
y_log = np.log1p(y)

preprocess = ColumnTransformer([
    ('scaler', StandardScaler(), feature_cols)
], remainder='drop')

tscv = TimeSeriesSplit(n_splits=5)

param_grid = {
    'hidden_layer_sizes': [(128, 64, 32), (256, 128, 64), (256, 128)],
    'activation': ['relu', 'tanh'],
    'alpha': [1e-5, 1e-4, 1e-3],
    'batch_size': [128, 256],
    'learning_rate_init': [0.001, 0.01]
}

param_sampler = ParameterSampler(param_grid, n_iter=12, random_state=42)
seeds = [42, 123, 456]

results = []
print("Buscando hiperparâmetros...\n")

for iter_idx, params in enumerate(param_sampler, start=1):
    best_for_params = None
    
    for seed in seeds:
        model = MLPRegressor(
            random_state=seed,
            early_stopping=True,
            max_iter=400,
            n_iter_no_change=30,
            validation_fraction=0.15,
            **params,
            solver="adam",
        )
        
        oof_pred = np.zeros_like(y, dtype=float)
        fold_scores = []
        
        for fold, (tr_idx, te_idx) in enumerate(tscv.split(X)):
            try:
                Xtr, Xte = X.iloc[tr_idx], X.iloc[te_idx]
                ytr_log = y_log[tr_idx]
                
                pipe = Pipeline([('prep', clone(preprocess)), ('mlp', clone(model))])
                pipe.fit(Xtr, ytr_log)
                
                yhat_log = pipe.predict(Xte)
                yhat_log = np.clip(yhat_log, LOG_MIN, LOG_MAX)
                yhat = np.expm1(yhat_log)
                yhat = np.clip(yhat, PRED_MIN, PRED_MAX)
                
                oof_pred[te_idx] = yhat
                fold_scores.append(mean_squared_error(y[te_idx], yhat))
                
            except Exception as e:
                fold_scores = None
                break
        
        if fold_scores is not None:
            oof_pred = np.clip(oof_pred, PRED_MIN, PRED_MAX)
            comp_score = competition_score(y, oof_pred)
            mse = mean_squared_error(y, oof_pred)
            
            result = {
                'params': {**params, 'random_state': seed},
                'mse': mse,
                'comp_score': comp_score['score']
            }
            results.append(result)
            
            if best_for_params is None or comp_score['score'] > best_for_params['comp_score']:
                best_for_params = result
    
    if best_for_params:
        print(f"Comb {iter_idx}: Score={best_for_params['comp_score']:.4f} MSE={best_for_params['mse']:.0f}")

best_result = sorted(results, key=lambda d: (-d['comp_score'], d['mse']))[0]
print(f"\nMelhor resultado:")
print(f"  Score: {best_result['comp_score']:.4f}")
print(f"  MSE: {best_result['mse']:.0f}")


Buscando hiperparâmetros...

Comb 1: Score=0.7079 MSE=132779519718414
Comb 2: Score=0.7820 MSE=399016297
Comb 3: Score=0.6844 MSE=103585631184632


In [None]:
## Treinar Modelo Final

print("Treinando modelo final...")

final_model = MLPRegressor(
    **{k: v for k, v in best_result['params'].items() if k != 'random_state'},
    random_state=best_result['params']['random_state'],
    early_stopping=True,
    max_iter=400,
    n_iter_no_change=30,
    validation_fraction=0.15,
    solver="adam"
)

final_pipe = Pipeline([('prep', preprocess), ('mlp', final_model)])
final_pipe.fit(X, y_log)

import joblib
joblib.dump(final_pipe, 'mlp_model_v3.joblib')
joblib.dump(feature_cols, 'feature_cols_v3.joblib')

print("Modelo final treinado e salvo")


In [None]:
## Gerar Submission

test_df = pd.read_csv(TEST_PATH)

print(f"Test shape: {test_df.shape}")

if 'id' in test_df.columns:
    test_ids = test_df['id'].values
    
    if set(feature_cols).issubset(set(test_df.columns)):
        X_test = test_df[feature_cols].copy()
        
        for col in feature_cols:
            if X_test[col].isna().sum() > 0:
                X_test[col] = X_test[col].fillna(X[col].median())
        
        y_test_log = final_pipe.predict(X_test)
        y_test_log = np.clip(y_test_log, LOG_MIN, LOG_MAX)
        y_test = np.expm1(y_test_log)
        y_test = np.clip(y_test, PRED_MIN, PRED_MAX)
        
        submission = pd.DataFrame({
            'id': test_ids,
            'amount_new_house_transactions': y_test
        })
        
        submission.to_csv('submission_v3.csv', index=False)
        print(f"Submission salva em 'submission_v3.csv'")
        print(f"  Shape: {submission.shape}")
        print(f"  Min: {submission['amount_new_house_transactions'].min():,.1f}")
        print(f"  Max: {submission['amount_new_house_transactions'].max():,.1f}")
        print(f"  Mean: {submission['amount_new_house_transactions'].mean():,.1f}")
    else:
        print("Features não encontradas em test.csv")
else:
    print("Coluna 'id' não encontrada em test.csv")


In [None]:
## Resumo Final

print("="*70)
print("RESUMO FINAL - TRAINING V3")
print("="*70)

print(f"\nDados: {df_clean.shape[0]} linhas, {len(feature_cols)} features")
print(f"Período: {df_clean['date'].min().date()} a {df_clean['date'].max().date()}")
print(f"Setores: {df_clean['sector_id'].nunique()}")

print(f"\nMelhor Configuração:")
print(f"  Hidden layers: {best_result['params']['hidden_layer_sizes']}")
print(f"  Activation: {best_result['params']['activation']}")
print(f"  Alpha: {best_result['params']['alpha']}")
print(f"  Batch size: {best_result['params']['batch_size']}")
print(f"  Learning rate: {best_result['params']['learning_rate_init']}")
print(f"  Seed: {best_result['params']['random_state']}")

print(f"\nPerformance:")
print(f"  Competition Score: {best_result['comp_score']:.4f}")
print(f"  MSE: {best_result['mse']:.0f}")

print(f"\nArquivos gerados:")
print(f"  mlp_model_v3.joblib")
print(f"  feature_cols_v3.joblib")
print(f"  submission_v3.csv")

print("\n" + "="*70)


In [None]:
## Salvar Métricas para Comparação

import json
from datetime import datetime

metrics_v3 = {
    "versao": "v3",
    "data": datetime.now().isoformat(),
    "modelo": "MLPRegressor com Busca de Hiperparâmetros",
    "validacao": "TimeSeriesSplit + Múltiplos Seeds",
    "features": len(feature_cols),
    "observacoes": df_clean.shape[0],
    "setores": df_clean['sector_id'].nunique(),
    "metricas_melhor_resultado": {
        "rmse": float(best_result['mse'] ** 0.5),
        "mse": float(best_result['mse']),
        "competition_score": float(best_result['comp_score'])
    },
    "melhor_configuracao": {
        "hidden_layers": best_result['params']['hidden_layer_sizes'],
        "activation": best_result['params']['activation'],
        "alpha": best_result['params']['alpha'],
        "batch_size": best_result['params']['batch_size'],
        "learning_rate": best_result['params']['learning_rate_init'],
        "seed": int(best_result['params']['random_state'])
    },
    "resumo_busca": {
        "total_combinacoes_testadas": len(results),
        "parametros_grid": {
            "hidden_layer_sizes": 3,
            "activation": 2,
            "alpha": 3,
            "batch_size": 2,
            "learning_rate_init": 2
        },
        "seeds_utilizados": 3
    }
}

with open('metricas_v3.json', 'w') as f:
    json.dump(metrics_v3, f, indent=2)

print("\nMétricas salvas em 'metricas_v3.json'")
print("\nResumo Final v3 (Melhor Resultado):")
print(f"  RMSE: {metrics_v3['metricas_melhor_resultado']['rmse']:.2f}")
print(f"  MSE: {metrics_v3['metricas_melhor_resultado']['mse']:.2f}")
print(f"  Competition Score: {metrics_v3['metricas_melhor_resultado']['competition_score']:.4f}")
print(f"\nTotal de combinações testadas: {metrics_v3['resumo_busca']['total_combinacoes_testadas']}")

