# Híbrido (ARIMA + MLP) - Produção Solar França

Estratégia: o SARIMA modela a componente linear e sazonal da série; o MLP modela os resíduos do SARIMA para capturar as não-linearidades restantes.

## Parâmetros SARIMA
- **order**: (1, 1, 1)
- **seasonal_order**: (2, 1, 1, 24)

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import os, json, warnings
warnings.filterwarnings('ignore')

# Notebook-wide setup: reproducible RNG, plotting defaults and the folder
# that receives every figure / CSV / JSON written further down.
np.random.seed(42)

# The style sheet is applied first so the explicit figure size below wins.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 6)})

output_dir = '../../out/solar_france/Hibrido'
os.makedirs(output_dir, exist_ok=True)

## 1. Carregar Dados

In [None]:
# Load the workbook, parse the timestamp column and use it as a sorted index.
df = pd.read_excel('../../data/solar_france.xlsx')
df['Date and Hour'] = pd.to_datetime(df['Date and Hour'])
df = df.set_index('Date and Hour')
df = df.sort_index()
df = df.dropna()  # drops every row that has a missing value in any column
y = df['Production']

# Chronological hold-out: the first 80% of rows train, the remainder tests.
train_size = int(len(y) * 0.8)
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

print(f'Train: {len(y_train)} | Test: {len(y_test)}')
print(f'Período treino: {y_train.index.min()} -> {y_train.index.max()}')
print(f'Período teste: {y_test.index.min()} -> {y_test.index.max()}')

## 2. Fase 1: Modelo SARIMAX

In [None]:
# SARIMAX specification: non-seasonal order and seasonal order with a
# seasonal period of 24 steps.
order = (1, 1, 1)
seasonal_order = (2, 1, 1, 24)

print(f'Treinando SARIMAX{order}x{seasonal_order}...')
print('Aguarde, isso pode levar alguns minutos...')

# Fit on the training window only; the stationarity / invertibility
# constraints are switched off so the optimiser is not restricted.
sarimax_model = SARIMAX(
    y_train,
    order=order,
    seasonal_order=seasonal_order,
    enforce_stationarity=False,
    enforce_invertibility=False
)

sarimax_fitted = sarimax_model.fit(disp=False, maxiter=200)
print('✓ SARIMAX ajustado!')
print(f'AIC: {sarimax_fitted.aic:.2f}, BIC: {sarimax_fitted.bic:.2f}')

# Multi-step forecast covering the whole test horizon.
sarimax_forecast = sarimax_fitted.forecast(steps=len(y_test))
# When the training DatetimeIndex has no usable frequency (for instance
# because dropna() left gaps), statsmodels labels the forecast with a plain
# integer index. Any later label-based operation against y_test (subtraction,
# reindex) would then silently produce NaN. The forecast has exactly
# len(y_test) steps, so re-labelling it positionally is always correct.
sarimax_forecast = pd.Series(
    np.asarray(sarimax_forecast),
    index=y_test.index,
    name='predicted_mean'
)
# In-sample one-step-ahead predictions, used later to build training residuals.
sarimax_train_pred = sarimax_fitted.fittedvalues

# Baseline metrics for SARIMAX alone
mse_sarimax = mean_squared_error(y_test, sarimax_forecast)
rmse_sarimax = np.sqrt(mse_sarimax)
mae_sarimax = mean_absolute_error(y_test, sarimax_forecast)
r2_sarimax = r2_score(y_test, sarimax_forecast)

print(f'\nMétricas SARIMAX (baseline):')
print(f'RMSE: {rmse_sarimax:.2f} | MAE: {mae_sarimax:.2f} | R²: {r2_sarimax:.4f}')

## 3. Calcular Resíduos

In [None]:
# Training residuals: the part of the series SARIMAX did not explain in-sample.
# NOTE(review): the earliest fitted values of a differenced model are usually
# dominated by initialisation - confirm whether a burn-in should be dropped.
residuals_train = y_train.sub(sarimax_train_pred)

# Test residual target: actual value minus the SARIMAX forecast.
residuals_test_target = y_test.sub(sarimax_forecast)

print(f'Resíduos treino - média: {residuals_train.mean():.4f}, std: {residuals_train.std():.2f}')
print(f'Resíduos teste - média: {residuals_test_target.mean():.4f}, std: {residuals_test_target.std():.2f}')

# Two stacked panels: residuals over time (top) and their histogram (bottom).
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
timeline_ax, histogram_ax = axes

timeline_ax.plot(residuals_train.index, residuals_train.values, alpha=0.7)
timeline_ax.axhline(y=0, color='r', linestyle='--')
timeline_ax.set_title('Resíduos SARIMAX - Treino')
timeline_ax.set_ylabel('Resíduo')
timeline_ax.grid(True, alpha=0.3)

histogram_ax.hist(residuals_train, bins=50, edgecolor='black', alpha=0.7)
histogram_ax.set_title('Distribuição dos Resíduos - Treino')
histogram_ax.set_xlabel('Resíduo')
histogram_ax.set_ylabel('Frequência')

plt.tight_layout()
plt.savefig(f'{output_dir}/sarimax_residuals.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Fase 2: MLP nos Resíduos

In [None]:
def make_features(idx, series, resids, n_lags=24):
    """
    Build the MLP design matrix for the timestamps in ``idx``.

    For every lag 1..n_lags the frame gets the lagged original series
    (``y_lag_k``) and the lagged residuals (``resid_lag_k``), followed by
    calendar columns (hour, day of week, month) and their sine / cosine
    encodings. Lags are positional shifts of ``series`` / ``resids`` that
    are then looked up at ``idx``. Rows containing any NaN are dropped
    before returning.
    """
    # Collect the lag columns first, interleaved as y_lag_k / resid_lag_k.
    lag_columns = {}
    for lag in range(1, n_lags + 1):
        lag_columns[f'y_lag_{lag}'] = series.shift(lag).reindex(idx)
        lag_columns[f'resid_lag_{lag}'] = resids.shift(lag).reindex(idx)
    features = pd.DataFrame(lag_columns, index=idx)

    # Raw calendar fields, each paired with the cycle length used to encode it.
    calendar_fields = (
        ('hour', idx.hour, 24),
        ('dow', idx.dayofweek, 7),
        ('month', idx.month, 12),
    )
    for field, values, _ in calendar_fields:
        features[field] = values

    # Cyclical encoding, added after all raw fields to keep the column order.
    for field, _, cycle in calendar_fields:
        angle = 2*np.pi*features[field]/cycle
        features[f'{field}_sin'] = np.sin(angle)
        features[f'{field}_cos'] = np.cos(angle)

    return features.dropna()

# Training design matrix; the target is re-indexed to the surviving rows
# because make_features drops rows whose lags are incomplete.
X_train = make_features(idx=y_train.index, series=y_train, resids=residuals_train, n_lags=24)
y_train_res = residuals_train.reindex(X_train.index)

# Test design matrix: lags of y come from the full (train + test) series,
# while residual lags inside the test window are filled with zeros.
# NOTE(review): zero-filled residual lags differ from what the MLP saw in
# training - confirm this is the intended way to avoid using test residuals.
y_complete = pd.concat([y_train, y_test])
zero_test_residuals = pd.Series(0, index=y_test.index)
resids_complete = pd.concat([residuals_train, zero_test_residuals])
X_test = make_features(idx=y_test.index, series=y_complete, resids=resids_complete, n_lags=24)
y_test_res = residuals_test_target.reindex(X_test.index)

print(f'X_train: {X_train.shape}, X_test: {X_test.shape}')
print(f'Features: {X_train.shape[1]}')

## 5. Normalização e Grid Search do MLP

In [None]:
# Standardise features with statistics learned from the training matrix only.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Hyper-parameter grid explored for the residual MLP.
param_grid = dict(
    hidden_layer_sizes=[(64,), (128,64), (128,64,32)],
    alpha=[0.0001, 0.001, 0.01],
    activation=['relu', 'tanh'],
    learning_rate_init=[0.001, 0.01],
)

# Base estimator: early stopping on a 10% validation fraction, fixed seed.
mlp = MLPRegressor(
    random_state=42,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    verbose=False,
)

# Ordered 5-fold splitter designed for time series.
tscv = TimeSeriesSplit(n_splits=5)

print('Grid Search para MLP nos resíduos...')
grid_mlp = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=tscv,
    n_jobs=-1,
    verbose=1,
)

# The search only sees the first `sample` training rows (at most 10000).
sample = min(10000, len(X_train_s))
search_X = X_train_s[:sample]
search_y = y_train_res.iloc[:sample]
grid_mlp.fit(search_X, search_y)

print('\nMelhores parâmetros MLP:', grid_mlp.best_params_)
print('Melhor CV MSE:', -grid_mlp.best_score_)

## 6. Treinar MLP Final e Combinar Previsões

In [None]:
# Refit the winning configuration on the complete training matrix
# (fit() returns the estimator itself).
best_mlp = grid_mlp.best_estimator_.fit(X_train_s, y_train_res)

# Restrict the actuals and the SARIMAX forecast to the rows kept in X_test.
y_test_aligned = y_test.reindex(X_test.index)
sarimax_forecast_aligned = sarimax_forecast.reindex(X_test.index)

# Residual correction predicted by the MLP for every test row.
residuals_pred_test = best_mlp.predict(X_test_s)

# Hybrid forecast = SARIMAX forecast + predicted residual.
y_pred_hybrid = sarimax_forecast_aligned.add(residuals_pred_test)

print('✓ Modelo Híbrido completo!')

## 7. Avaliação do Modelo Híbrido

In [None]:
# Error metrics for the hybrid forecast on the aligned test rows.
mse = mean_squared_error(y_test_aligned, y_pred_hybrid)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_aligned, y_pred_hybrid)

# MAPE is undefined where the actual value is zero. Dividing by (0 + 1e-10)
# would inflate each such term by ten orders of magnitude and make the mean
# meaningless, so rows with a zero actual are excluded from the percentage
# error. If every actual is zero the metric is reported as NaN.
actual_values = np.asarray(y_test_aligned, dtype=float)
predicted_values = np.asarray(y_pred_hybrid, dtype=float)
nonzero_actual = actual_values != 0
if nonzero_actual.any():
    pct_errors = np.abs(
        (actual_values[nonzero_actual] - predicted_values[nonzero_actual])
        / actual_values[nonzero_actual]
    )
    mape = np.mean(pct_errors) * 100
else:
    mape = float('nan')

r2 = r2_score(y_test_aligned, y_pred_hybrid)

# Side-by-side report: SARIMAX baseline, hybrid, and the relative improvement.
print('='*50)
print('COMPARAÇÃO DE MÉTRICAS')
print('='*50)
print('\nSARIMAX (baseline):')
print(f'RMSE: {rmse_sarimax:.2f} | MAE: {mae_sarimax:.2f} | R²: {r2_sarimax:.4f}')
print('\nHíbrido (SARIMAX + MLP):')
print(f'RMSE: {rmse:.2f} | MAE: {mae:.2f} | MAPE: {mape:.2f}% | R²: {r2:.4f}')
print('\nMelhoria:')
print(f'RMSE: {((rmse_sarimax - rmse)/rmse_sarimax*100):.2f}%')
print(f'MAE: {((mae_sarimax - mae)/mae_sarimax*100):.2f}%')

## 8. Visualizações

In [None]:
# Two stacked panels comparing actuals, the SARIMAX forecast and the hybrid.
fig, axes = plt.subplots(2, 1, figsize=(15, 12))
full_ax, zoom_ax = axes

# Top panel: every aligned test row.
full_ax.plot(y_test_aligned.index, y_test_aligned.values,
             label='Real', alpha=0.7, linewidth=2)
full_ax.plot(sarimax_forecast_aligned.index, sarimax_forecast_aligned.values,
             label='SARIMAX', alpha=0.7, linestyle='--')
full_ax.plot(y_test_aligned.index, y_pred_hybrid,
             label='Híbrido', alpha=0.7)
full_ax.set_title('Comparação: Real vs SARIMAX vs Híbrido')
full_ax.set_ylabel('Produção (MW)')
full_ax.legend()
full_ax.grid(True, alpha=0.3)

# Bottom panel: only the first 72 rows (fewer if the test set is shorter).
zoom_size = min(72, len(y_test_aligned))
zoom_ax.plot(y_test_aligned.index[:zoom_size], y_test_aligned.values[:zoom_size],
             label='Real', marker='o', alpha=0.7)
zoom_ax.plot(sarimax_forecast_aligned.index[:zoom_size],
             sarimax_forecast_aligned.values[:zoom_size],
             label='SARIMAX', marker='s', alpha=0.7, linestyle='--')
zoom_ax.plot(y_test_aligned.index[:zoom_size], y_pred_hybrid[:zoom_size],
             label='Híbrido', marker='^', alpha=0.7)
zoom_ax.set_title('Zoom - Primeiros 3 dias')
zoom_ax.set_ylabel('Produção (MW)')
zoom_ax.legend()
zoom_ax.grid(True, alpha=0.3)
# plt.xticks acts on the current axes, i.e. the last panel created.
plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig(f'{output_dir}/hybrid_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Diagnostics of the hybrid model's remaining error on the test rows.
residuals_hybrid = y_test_aligned - y_pred_hybrid

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(time_ax, hist_ax), (qq_ax, scatter_ax) = axes

# Top-left: residuals over time with a zero reference line.
time_ax.plot(y_test_aligned.index, residuals_hybrid)
time_ax.axhline(y=0, color='r', linestyle='--')
time_ax.set_title('Resíduos Modelo Híbrido')
time_ax.set_ylabel('Resíduo')
time_ax.grid(True, alpha=0.3)

# Top-right: residual histogram.
hist_ax.hist(residuals_hybrid, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_title('Distribuição dos Resíduos')
hist_ax.set_xlabel('Resíduo')

# Bottom-left: probability plot against a normal distribution.
from scipy import stats
stats.probplot(residuals_hybrid, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot')

# Bottom-right: actual vs predicted, with the identity line as reference.
scatter_ax.scatter(y_test_aligned, y_pred_hybrid, alpha=0.5)
diag_low = y_test_aligned.min()
diag_high = y_test_aligned.max()
scatter_ax.plot([diag_low, diag_high], [diag_low, diag_high], 'r--', lw=2)
scatter_ax.set_xlabel('Real')
scatter_ax.set_ylabel('Previsto')
scatter_ax.set_title('Real vs Previsto')
scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{output_dir}/hybrid_residuals.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Salvar Resultados

In [None]:
# Per-timestamp predictions written as CSV next to the figures.
prediction_columns = {
    'real': y_test_aligned,
    'sarimax': sarimax_forecast_aligned,
    'hibrido': y_pred_hybrid,
    'residual': residuals_hybrid
}
pred_df = pd.DataFrame(prediction_columns, index=y_test_aligned.index)
pred_df.to_csv(f'{output_dir}/hybrid_predictions.csv')

# Summary sections, assembled separately and then combined in a fixed key order.
sarimax_summary = {
    'order': order,
    'seasonal_order': seasonal_order,
    'aic': float(sarimax_fitted.aic),
    'bic': float(sarimax_fitted.bic)
}
hybrid_metrics = {
    'mse': float(mse),
    'rmse': float(rmse),
    'mae': float(mae),
    'mape': float(mape),
    'r2': float(r2)
}
baseline_metrics = {
    'rmse': float(rmse_sarimax),
    'mae': float(mae_sarimax),
    'r2': float(r2_sarimax)
}
# Relative error reduction of the hybrid versus SARIMAX alone, in percent.
improvement = {
    'rmse_pct': float((rmse_sarimax - rmse)/rmse_sarimax*100),
    'mae_pct': float((mae_sarimax - mae)/mae_sarimax*100)
}

results = {
    'model': 'Hybrid_SARIMAX_MLP',
    'dataset': 'solar_france',
    'sarimax_params': sarimax_summary,
    'mlp_params': grid_mlp.best_params_,
    'metrics': hybrid_metrics,
    'sarimax_baseline': baseline_metrics,
    'improvement': improvement,
    'n_features': X_train.shape[1],
    'train_size': len(X_train),
    'test_size': len(X_test)
}

# Serialise first, then write the text in one go.
results_json = json.dumps(results, indent=2)
with open(f'{output_dir}/hybrid_results.json', 'w') as f:
    f.write(results_json)

print(f'\n✓ Resultados salvos em {output_dir}/')
print('\n✓ Análise Híbrida (SARIMAX + MLP) concluída com sucesso!')