In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from scipy.optimize import differential_evolution
import joblib
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

# Carregar dados
train = pd.read_csv('../data/train_renomeado.csv')
test = pd.read_csv('../data/test_renomeado.csv')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

train = train.rename(columns={'ENTRADAS (v3)': 'Id'})
test = test.rename(columns={'ENTRADAS (v3)': 'Id'})

target_cols = train.columns[-11:].tolist()
print(f"Targets identificados: {len(target_cols)} colunas")


Targets identificados: 11 colunas


In [3]:
def create_features(df):
    """Função de feature engineering (mantida igual)"""
    df_new = df.copy()

    # 1. Tratamento de outliers e transformações
    skewed_cols = ['Emissões pesticidas', 'Urea', 'Macronutrientes']
    for col in skewed_cols:
        if col in df_new.columns:
            df_new[col] = np.log1p(np.abs(df_new[col]))

    # 2. Flags binárias para features com excesso de zeros
    zero_flag_cols = ['Energia da biomassa', 'Energia elétrica (kwh)', 
                      'Esterco animal (kg)', 'Esterco verde (kg)', 
                      'Micronutrientes', 'Transformation total, to and from']

    for col in zero_flag_cols:
        if col in df_new.columns:
            df_new[f'flag_{col}'] = (df_new[col] > 0).astype(int)

    # 3. Combinação de features correlacionadas
    if 'Calcário e gesso' in df_new.columns and 'Ocuppation, total' in df_new.columns:
        df_new['intensidade_calcario'] = df_new['Calcário e gesso'] / (df_new['Ocuppation, total'] + 1e-6)

    if 'Transformation total, to and from' in df_new.columns and 'Energia da biomassa' in df_new.columns:
        df_new['eficiencia_energetica'] = df_new['Transformation total, to and from'] / (df_new['Energia da biomassa'] + 1e-6)

    # 4. PCA para features de pesticidas
    pesticidas_cols = ['Fungicida, herbicida e pesticida', 'Emissões pesticidas']
    if all(col in df_new.columns for col in pesticidas_cols):
        pca = PCA(n_components=1)
        pesticidas_pca = pca.fit_transform(df_new[pesticidas_cols])
        df_new['pesticidas_pca'] = pesticidas_pca

    # 5. Criação de índices compostos
    nutrientes_cols = ['Macronutrientes', 'Micronutrientes']
    if all(col in df_new.columns for col in nutrientes_cols):
        df_new['nutrientes_total'] = df_new['Macronutrientes'] + df_new['Micronutrientes']

    if 'Urea' in df_new.columns and 'Ammonia e afins' in df_new.columns:
        df_new['impacto_fertilizantes'] = df_new['Urea'] + 0.7 * df_new['Ammonia e afins']

    # 6. Interações com a cultura (Seed)
    if 'Seed' in df_new.columns:
        df_new['biomassa_por_seed'] = df_new.groupby('Seed')['Energia da biomassa'].transform('mean')
        df_new['calcario_por_seed'] = df_new.groupby('Seed')['Calcário e gesso'].transform('mean')

    # 7. Transformações polinomiais para top features
    top_features = [
        'Transformation total, to and from',
        'Energia da biomassa',
        'Micronutrientes',
        'Esterco animal (kg)',
        'Fungicida, herbicida e pesticida'
    ]

    for col in top_features:
        if col in df_new.columns:
            df_new[f'{col}_sq'] = df_new[col] ** 2
            df_new[f'{col}_sqrt'] = np.sqrt(np.abs(df_new[col]))

    return df_new

In [4]:
def encode_categorical(train_df, test_df):
    """Função de encoding categórico (mantida igual)"""
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()

    if 'Seed' in train_encoded.columns:
        # Target Encoding
        target_encoder = {}
        for target in target_cols:
            for seed in train_encoded['Seed'].unique():
                mask = (train_encoded['Seed'] == seed)
                target_encoder[(seed, target)] = train_encoded.loc[mask, target].mean()

        # Aplicar encoding
        for col in target_cols:
            train_encoded[f'Seed_encoded_{col}'] = train_encoded['Seed'].apply(
                lambda x: target_encoder.get((x, col), 0)
            )
            test_encoded[f'Seed_encoded_{col}'] = test_encoded['Seed'].apply(
                lambda x: target_encoder.get((x, col), np.nan)
            )

            # Preencher valores faltantes
            global_mean = train_encoded[col].mean()
            test_encoded[f'Seed_encoded_{col}'] = test_encoded[f'Seed_encoded_{col}'].fillna(global_mean)

        # Remover coluna original
        train_encoded = train_encoded.drop(columns=['Seed'])
        test_encoded = test_encoded.drop(columns=['Seed'])

    return train_encoded, test_encoded


In [5]:
# Preprocessamento dos dados
train_featured = create_features(train)
test_featured = create_features(test)
train_encoded, test_encoded = encode_categorical(train_featured, test_featured)

# Preparar dados
train_ids = train_encoded['Id']
test_ids = test_encoded['Id']

X_train = train_encoded.drop(columns=['Id'] + target_cols)
y_train = train_encoded[target_cols]
X_test = test_encoded.drop(columns=['Id'])

# Garantir mesmas colunas no treino e teste
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0
X_test = X_test[X_train.columns]

# Normalização
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Garantir mesmas colunas no treino e teste
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0
X_test = X_test[X_train.columns]

# Normalização
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [7]:
# ==================== BASELINE PARA COMPARAÇÃO ====================

print("Treinando baseline para comparação...")

baseline_models = {
    "RandomForest": MultiOutputRegressor(
        RandomForestRegressor(
            n_estimators=300,
            max_depth=10,
            min_samples_split=5,
            max_features=0.8,
            random_state=42,
            n_jobs=-1
        )
    ),
    "XGBoost": MultiOutputRegressor(
        XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1
        )
    ),
    "SVR": MultiOutputRegressor(
        SVR(C=3.0, epsilon=0.1, kernel='rbf')
    )
}

Treinando baseline para comparação...


In [8]:
# Avaliar baseline
baseline_scores = {}
kf = KFold(n_splits=3, shuffle=True, random_state=42)

for name, model in baseline_models.items():
    scores = []
    for train_idx, val_idx in kf.split(X_train_scaled):
        X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_tr, y_val = y_train.values[train_idx], y_train.values[val_idx]
        
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        scores.append(mean_squared_error(y_val, y_pred))
    
    baseline_scores[name] = np.mean(scores)
    print(f"{name} Baseline MSE: {baseline_scores[name]:.5f}")

best_baseline_score = min(baseline_scores.values())
print(f"\nMelhor baseline MSE: {best_baseline_score:.5f}")

RandomForest Baseline MSE: 0.22349
XGBoost Baseline MSE: 0.22719
SVR Baseline MSE: 0.38741

Melhor baseline MSE: 0.22349


In [9]:
# ==================== OPTUNA OPTIMIZATION - VERSÃO CONSERVADORA ====================

def create_optimized_model(trial, model_type):
    """Cria modelo com otimização conservadora próxima aos valores baseline"""
    
    if model_type == 'RandomForest':
        params = {
            'n_estimators': trial.suggest_int('rf_n_estimators', 200, 600, step=50),
            'max_depth': trial.suggest_int('rf_max_depth', 8, 15),
            'min_samples_split': trial.suggest_int('rf_min_samples_split', 3, 10),
            'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 5),
            'max_features': trial.suggest_float('rf_max_features', 0.6, 0.9),
            'random_state': 42,
            'n_jobs': -1
        }
        return MultiOutputRegressor(RandomForestRegressor(**params))
    
    elif model_type == 'XGBoost':
        params = {
            'n_estimators': trial.suggest_int('xgb_n_estimators', 300, 800, step=50),
            'learning_rate': trial.suggest_float('xgb_learning_rate', 0.03, 0.15),
            'max_depth': trial.suggest_int('xgb_max_depth', 4, 10),
            'subsample': trial.suggest_float('xgb_subsample', 0.7, 0.9),
            'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.7, 0.9),
            'reg_alpha': trial.suggest_float('xgb_reg_alpha', 0.0, 1.0),
            'reg_lambda': trial.suggest_float('xgb_reg_lambda', 0.0, 1.0),
            'random_state': 42,
            'n_jobs': -1
        }
        return MultiOutputRegressor(XGBRegressor(**params))
    
    elif model_type == 'SVR':
        params = {
            'C': trial.suggest_float('svr_C', 1.0, 10.0),
            'epsilon': trial.suggest_float('svr_epsilon', 0.05, 0.3),
            'kernel': trial.suggest_categorical('svr_kernel', ['rbf', 'poly']),
        }
        if params['kernel'] == 'poly':
            params['degree'] = trial.suggest_int('svr_degree', 2, 4)
        return MultiOutputRegressor(SVR(**params))

def conservative_objective(trial):
    """Função objetivo conservadora com early stopping baseado no baseline"""
    
    model_type = trial.suggest_categorical('model_type', ['RandomForest', 'XGBoost', 'SVR'])
    model = create_optimized_model(trial, model_type)
    
    # Cross-validation mais rápido
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    cv_scores = []
    
    for train_idx, val_idx in kf.split(X_train_scaled):
        X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_tr, y_val = y_train.values[train_idx], y_train.values[val_idx]
        
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        cv_scores.append(mse)
        
        # Early stopping se muito pior que o baseline
        current_avg = np.mean(cv_scores)
        if current_avg > best_baseline_score * 3:  # Se 3x pior que baseline, parar
            break
    
    return np.mean(cv_scores)


In [10]:
# ==================== EXECUTAR OTIMIZAÇÃO CONSERVADORA ====================

print("\nIniciando otimização conservadora com Optuna...")

study = optuna.create_study(
    direction='minimize',
    sampler=TPESampler(seed=42, n_startup_trials=10),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=5, 
        n_warmup_steps=5, 
        interval_steps=1
    )
)

[I 2025-07-31 11:47:16,065] A new study created in memory with name: no-name-94027231-b7ce-498c-8abd-56c20e548914



Iniciando otimização conservadora com Optuna...


In [11]:
# Otimização mais conservadora
n_trials = 100  # Reduzido para evitar overfitting
study.optimize(conservative_objective, n_trials=n_trials, timeout=7200)  

print("Otimização concluída!")
print(f"Melhor MSE: {study.best_value:.5f}")
print(f"Baseline MSE: {best_baseline_score:.5f}")
print(f"Melhoria: {((best_baseline_score - study.best_value) / best_baseline_score * 100):.2f}%")
print(f"Melhores parâmetros: {study.best_params}")


[I 2025-07-31 11:47:31,436] Trial 0 finished with value: 0.22886881910062126 and parameters: {'model_type': 'XGBoost', 'xgb_n_estimators': 600, 'xgb_learning_rate': 0.04872223685309238, 'xgb_max_depth': 5, 'xgb_subsample': 0.7116167224336398, 'xgb_colsample_bytree': 0.8732352291549871, 'xgb_reg_alpha': 0.6011150117432088, 'xgb_reg_lambda': 0.7080725777960455}. Best is trial 0 with value: 0.22886881910062126.
[I 2025-07-31 11:47:40,827] Trial 1 finished with value: 0.23954259723779173 and parameters: {'model_type': 'XGBoost', 'xgb_n_estimators': 400, 'xgb_learning_rate': 0.051818996064852074, 'xgb_max_depth': 5, 'xgb_subsample': 0.7608484485919075, 'xgb_colsample_bytree': 0.8049512863264475, 'xgb_reg_alpha': 0.43194501864211576, 'xgb_reg_lambda': 0.2912291401980419}. Best is trial 0 with value: 0.22886881910062126.
[I 2025-07-31 11:48:03,846] Trial 2 finished with value: 0.2369299115424283 and parameters: {'model_type': 'RandomForest', 'rf_n_estimators': 350, 'rf_max_depth': 11, 'rf_min

Otimização concluída!
Melhor MSE: 0.20852
Baseline MSE: 0.22349
Melhoria: 6.70%
Melhores parâmetros: {'model_type': 'XGBoost', 'xgb_n_estimators': 650, 'xgb_learning_rate': 0.12115835832645282, 'xgb_max_depth': 5, 'xgb_subsample': 0.8362721722240525, 'xgb_colsample_bytree': 0.8440517050412732, 'xgb_reg_alpha': 0.9987313524203768, 'xgb_reg_lambda': 0.8401018015463615}


In [12]:
# ==================== CRIAR ENSEMBLE HÍBRIDO ====================

def get_hybrid_models(study, baseline_models):
    """Combina modelo otimizado com baselines comprovados"""
    models = {}
    
    # Adicionar baseline sempre (segurança)
    models.update(baseline_models)
    
    # Só adicionar modelo otimizado se for melhor que baseline
    if study.best_value < best_baseline_score * 1.1:  # Até 10% pior é aceitável
        best_params = study.best_params
        model_type = best_params['model_type']
        
        if model_type == 'RandomForest':
            rf_params = {k.replace('rf_', ''): v for k, v in best_params.items() if k.startswith('rf_')}
            rf_params['random_state'] = 42
            rf_params['n_jobs'] = -1
            models['RandomForest_Optimized'] = MultiOutputRegressor(RandomForestRegressor(**rf_params))
        
        elif model_type == 'XGBoost':
            xgb_params = {k.replace('xgb_', ''): v for k, v in best_params.items() if k.startswith('xgb_')}
            xgb_params['random_state'] = 42
            xgb_params['n_jobs'] = -1
            models['XGBoost_Optimized'] = MultiOutputRegressor(XGBRegressor(**xgb_params))
        
        elif model_type == 'SVR':
            svr_params = {}
            for k, v in best_params.items():
                if k.startswith('svr_'):
                    param_name = k.replace('svr_', '')
                    svr_params[param_name] = v
            models['SVR_Optimized'] = MultiOutputRegressor(SVR(**svr_params))
        
        print(f"✅ Modelo otimizado {model_type} adicionado ao ensemble")
    else:
        print("⚠️  Modelo otimizado não foi melhor que baseline - usando apenas baselines")
    
    return models

# Obter modelos híbridos
hybrid_models = get_hybrid_models(study, baseline_models)


✅ Modelo otimizado XGBoost adicionado ao ensemble


In [13]:
# ==================== TREINAR ENSEMBLE FINAL ====================

print(f"\nTreinando ensemble final com {len(hybrid_models)} modelos...")

n_splits = 50
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
oof_predictions = {name: np.zeros_like(y_train.values) for name in hybrid_models}
test_predictions = {name: np.zeros((X_test.shape[0], len(target_cols))) for name in hybrid_models}

for name, model in hybrid_models.items():
    print(f"\nTreinando {name}...")
    fold_test_preds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
        print(f"  Fold {fold+1}/{n_splits}")
        X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_tr, y_val = y_train.values[train_idx], y_train.values[val_idx]

        model.fit(X_tr, y_tr)
        val_preds = model.predict(X_val)
        oof_predictions[name][val_idx] = val_preds

        fold_mse = mean_squared_error(y_val, val_preds)
        print(f"  Fold MSE: {fold_mse:.5f}")

        fold_test_preds.append(model.predict(X_test_scaled))

    test_predictions[name] = np.mean(fold_test_preds, axis=0)
    full_mse = mean_squared_error(y_train, oof_predictions[name])
    print(f"{name} OOF MSE: {full_mse:.5f}")



Treinando ensemble final com 4 modelos...

Treinando RandomForest...
  Fold 1/50
  Fold MSE: 0.00338
  Fold 2/50
  Fold MSE: 0.00420
  Fold 3/50
  Fold MSE: 0.02920
  Fold 4/50
  Fold MSE: 0.06636
  Fold 5/50
  Fold MSE: 0.07890
  Fold 6/50
  Fold MSE: 0.00855
  Fold 7/50
  Fold MSE: 0.00368
  Fold 8/50
  Fold MSE: 0.00955
  Fold 9/50
  Fold MSE: 0.00035
  Fold 10/50
  Fold MSE: 0.00043
  Fold 11/50
  Fold MSE: 0.00001
  Fold 12/50
  Fold MSE: 0.00689
  Fold 13/50
  Fold MSE: 0.01107
  Fold 14/50
  Fold MSE: 0.00678
  Fold 15/50
  Fold MSE: 0.59527
  Fold 16/50
  Fold MSE: 2.35203
  Fold 17/50
  Fold MSE: 0.00154
  Fold 18/50
  Fold MSE: 0.00367
  Fold 19/50
  Fold MSE: 0.00106
  Fold 20/50
  Fold MSE: 0.06180
  Fold 21/50
  Fold MSE: 0.07586
  Fold 22/50
  Fold MSE: 0.00564
  Fold 23/50
  Fold MSE: 0.59416
  Fold 24/50
  Fold MSE: 0.02071
  Fold 25/50
  Fold MSE: 0.09790
  Fold 26/50
  Fold MSE: 0.00477
  Fold 27/50
  Fold MSE: 0.07993
  Fold 28/50
  Fold MSE: 0.00005
  Fold 29/50
  

In [14]:
# ==================== ENSEMBLE COM PESOS OTIMIZADOS ====================

print("\nOtimizando pesos do ensemble...")

all_oof_preds = [oof_predictions[name] for name in hybrid_models]
all_test_preds = [test_predictions[name] for name in hybrid_models]

def ensemble_mse(weights, preds_list, y_true):
    weighted_preds = np.zeros_like(y_true)
    for i, w in enumerate(weights):
        weighted_preds += w * preds_list[i]
    return mean_squared_error(y_true, weighted_preds)

optimal_weights = np.zeros((len(target_cols), len(hybrid_models)))

for target_idx in range(len(target_cols)):
    target_oof_preds = [preds[:, target_idx] for preds in all_oof_preds]

    def target_objective(weights):
        return ensemble_mse(weights, target_oof_preds, y_train.values[:, target_idx])

    bounds = [(0, 1)] * len(hybrid_models)
    result = differential_evolution(
        target_objective,
        bounds,
        strategy='best1bin',
        maxiter=50,  # Reduzido
        popsize=10,  # Reduzido
        tol=1e-4,
        seed=42
    )

    weights = result.x
    weights /= weights.sum()
    optimal_weights[target_idx] = weights



Otimizando pesos do ensemble...


In [15]:
# ==================== PREVISÕES FINAIS ====================

print("\nCriando previsões finais...")
final_predictions = np.zeros((X_test_scaled.shape[0], len(target_cols)))

for target_idx in range(len(target_cols)):
    for model_idx, model_name in enumerate(hybrid_models):
        weight = optimal_weights[target_idx, model_idx]
        final_predictions[:, target_idx] += weight * all_test_preds[model_idx][:, target_idx]

# Calcular MSE final do ensemble
ensemble_oof = np.zeros_like(y_train.values)
for target_idx in range(len(target_cols)):
    for model_idx, model_name in enumerate(hybrid_models):
        weight = optimal_weights[target_idx, model_idx]
        ensemble_oof[:, target_idx] += weight * all_oof_preds[model_idx][:, target_idx]

final_mse = mean_squared_error(y_train, ensemble_oof)
print(f"\n🎯 MSE Final do Ensemble: {final_mse:.5f}")
print(f"📊 Comparação - Baseline: {best_baseline_score:.5f}")
print(f"📈 Melhoria: {((best_baseline_score - final_mse) / best_baseline_score * 100):.2f}%")


Criando previsões finais...

🎯 MSE Final do Ensemble: 0.19614
📊 Comparação - Baseline: 0.22349
📈 Melhoria: 12.24%


In [16]:
# ==================== SUBMISSÃO ====================

submission = pd.DataFrame(final_predictions, columns=[f"target{i+1}" for i in range(11)])
submission.insert(0, 'Id', test_ids.values)
submission.iloc[:, 1:] = submission.iloc[:, 1:].clip(lower=0)

submission_file = '../submissions/optuna_conservative_predictions.csv'
submission.to_csv(submission_file, index=False)
print(f"\n💾 Submissão salva em: {submission_file}")

# ==================== SALVAR RESULTADOS ====================

print("Salvando modelos e resultados...")
for name, model in hybrid_models.items():
    joblib.dump(model, f'../models/{name}_conservative.pkl')
joblib.dump(scaler, '../models/scaler_conservative.pkl')
np.save('../models/optimal_weights_conservative.npy', optimal_weights)
joblib.dump(study, '../models/optuna_study_conservative.pkl')

# Relatório final
with open('../results/optuna_conservative_report.txt', 'w') as f:
    f.write("=== RELATÓRIO OTIMIZAÇÃO CONSERVADORA ===\n\n")
    f.write(f"Melhor Baseline MSE: {best_baseline_score:.5f}\n")
    f.write(f"Melhor Optuna MSE: {study.best_value:.5f}\n")
    f.write(f"MSE Final Ensemble: {final_mse:.5f}\n")
    f.write(f"Melhoria total: {((best_baseline_score - final_mse) / best_baseline_score * 100):.2f}%\n\n")
    f.write(f"Modelos no ensemble: {list(hybrid_models.keys())}\n")
    f.write(f"Pesos por target:\n")
    for i, target in enumerate(target_cols):
        f.write(f"  {target}: {optimal_weights[i]}\n")

print("✅ Processo completo! Verifique o score - deve ser melhor que 0.8")



💾 Submissão salva em: ../submissions/optuna_conservative_predictions.csv
Salvando modelos e resultados...
✅ Processo completo! Verifique o score - deve ser melhor que 0.8
