# Random Forest - Produção Solar França

Modelo de floresta aleatória com engenharia de features temporais, validação temporal e tuning de hiperparâmetros.

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os, json, warnings
warnings.filterwarnings('ignore')

# Directory that receives every artifact written by this notebook
# (figures, CSVs, JSON); created up front if it does not exist yet.
output_dir = '../../out/solar_france/RandomForest'
os.makedirs(output_dir, exist_ok=True)

# Global plotting defaults: seaborn-flavoured matplotlib style, a wide
# default figure size, and the 'husl' colour palette.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_palette('husl')

## 1. Carregar Dados

In [None]:
# Load the raw series from the Excel workbook
data_path = '../../data/solar_france.xlsx'
df = pd.read_excel(data_path)

# Parse the timestamp column, promote it to a chronologically sorted index,
# then discard every row that contains a missing value.
# NOTE(review): dropping rows may leave gaps in the timeline, in which case
# pd.infer_freq below can report None -- confirm against the real data.
df['Date and Hour'] = pd.to_datetime(df['Date and Hour'])
df = df.set_index('Date and Hour')
df = df.sort_index()
df = df.dropna()

# Quick sanity report on what was loaded
print('Shape:', df.shape)
print('Período:', df.index.min(), '->', df.index.max())
print('Frequência:', pd.infer_freq(df.index))
print('\nPrimeiras linhas:')
df.head()

## 2. Feature Engineering

In [None]:
def create_features(data, n_lags=48, windows=(3, 6, 12, 24, 48, 96)):
    """
    Build the feature matrix and the target series for the Random Forest.

    Every feature derived from ``Production`` is computed from strictly past
    rows, so the value being predicted never leaks into its own feature row.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Production`` column and be indexed by a
        ``DatetimeIndex`` (the calendar features read ``.hour``,
        ``.dayofweek``, ``.month``, ``.day`` and ``.quarter`` from it).
    n_lags : int, default 48
        Number of lag features. ``lag_i`` is ``Production`` shifted by ``i``
        rows (a positional shift, not a time-based one).
    windows : iterable of int, default (3, 6, 12, 24, 48, 96)
        Rolling-window lengths, in rows. For each length ``w`` the mean, std,
        max and min of the ``w`` rows *preceding* the current row are added.

    Returns
    -------
    X : pandas.DataFrame
        Features, indexed like ``data``. Leading rows without enough history
        contain NaN; dropping them is left to the caller.
    y : pandas.Series
        The ``Production`` column, unchanged.
    """
    X = pd.DataFrame(index=data.index)
    y = data['Production']

    # Lags: value of the target i rows earlier
    for i in range(1, n_lags + 1):
        X[f'lag_{i}'] = y.shift(i)

    # Rolling statistics. A plain y.rolling(w) window ends at the current row
    # and therefore contains the target itself; shifting by one row first
    # makes each window end at the previous observation.
    past = y.shift(1)
    for w in windows:
        window = past.rolling(w)
        X[f'roll_mean_{w}'] = window.mean()
        X[f'roll_std_{w}'] = window.std()
        X[f'roll_max_{w}'] = window.max()
        X[f'roll_min_{w}'] = window.min()

    # Calendar features taken from the index
    X['hour'] = data.index.hour
    X['dow'] = data.index.dayofweek
    X['month'] = data.index.month
    X['day'] = data.index.day
    X['quarter'] = data.index.quarter

    # Cyclical encoding: map each periodic field onto the unit circle using
    # the divisor matching its period (24 hours, 7 weekdays, 12 months)
    X['hour_sin'] = np.sin(2 * np.pi * X['hour'] / 24)
    X['hour_cos'] = np.cos(2 * np.pi * X['hour'] / 24)
    X['dow_sin'] = np.sin(2 * np.pi * X['dow'] / 7)
    X['dow_cos'] = np.cos(2 * np.pi * X['dow'] / 7)
    X['month_sin'] = np.sin(2 * np.pi * X['month'] / 12)
    X['month_cos'] = np.cos(2 * np.pi * X['month'] / 12)

    return X, y

# Build the features, then keep only the rows where every feature and the
# target are present (the lag/rolling columns are NaN at the start).
X, y = create_features(df, n_lags=48)
data = pd.concat([X, y.to_frame('target')], axis=1).dropna()
X = data.drop(columns='target')
y = data['target']

# Summary of the resulting design matrix
print(f'Features: {X.shape[1]}')
print(f'Amostras: {len(X)}')
print('\nExemplo de features:')
print(list(X.columns[:15]))

## 3. Divisão e Preparação

In [None]:
# Chronological 80/20 split: the first 80% of rows train the model and the
# remaining rows are held out for testing (no shuffling).
train_size = int(len(X) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

print(f'Treino: {len(X_train)} | Teste: {len(X_test)}')

# Standardisation (optional for a Random Forest, but may help). The scaler is
# fitted on the training rows only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Dados preparados!')

## 4. Grid Search

In [None]:
# Hyper-parameter grid (2 * 3 * 2 * 2 * 2 = 48 combinations)
param_grid = dict(
    n_estimators=[200, 400],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 0.5],
)

# Base estimator; the seed is fixed so repeated runs build the same forest
rf = RandomForestRegressor(random_state=42, n_jobs=-1, verbose=0)

# Time-series cross-validation with 5 splits
tscv = TimeSeriesSplit(n_splits=5)

print('Iniciando Grid Search...')
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=tscv,
    n_jobs=-1,
    verbose=1,
)

# To keep the search fast, run it on at most the first 10000 training rows
sample = min(10000, X_train_scaled.shape[0])
grid.fit(X_train_scaled[:sample], y_train.iloc[:sample])

# The scorer is the negated MSE, so flip the sign back for reporting
print('\nMelhores parâmetros:', grid.best_params_)
print('Melhor CV MSE:', -grid.best_score_)

## 5. Modelo Final

In [None]:
# Take the winning estimator from the search and refit it on the complete
# training set.
best_rf = grid.best_estimator_
best_rf.fit(X_train_scaled, y_train)

# Predictions for both partitions, in (train, test) order
y_pred_train, y_pred_test = (
    best_rf.predict(features) for features in (X_train_scaled, X_test_scaled)
)

print('Modelo treinado e previsões feitas!')

## 6. Avaliação

In [None]:
# Metrics on the held-out test set
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

# MAPE is undefined where the actual value is zero. Dividing by
# (actual + 1e-10) turns each such sample into a term on the order of
# 1e10 * |error|, which swamps the mean and makes the reported figure
# meaningless, so those samples are excluded instead.
# NOTE(review): solar production is presumably zero at night, which would make
# this the common case -- confirm against the data.
y_true_arr = np.asarray(y_test, dtype=float)
y_pred_arr = np.asarray(y_pred_test, dtype=float)
nonzero_mask = np.abs(y_true_arr) > 1e-10
if nonzero_mask.any():
    mape = np.mean(
        np.abs(
            (y_true_arr[nonzero_mask] - y_pred_arr[nonzero_mask])
            / y_true_arr[nonzero_mask]
        )
    ) * 100
else:
    # No non-zero actual value to compare against
    mape = float('nan')

print('='*50)
print('MÉTRICAS - RANDOM FOREST')
print('='*50)
print(f'RMSE: {rmse:.2f}')
print(f'MAE: {mae:.2f}')
print(f'MAPE: {mape:.2f}%')
print(f'R²: {r2:.4f}')

# Visualisation: time-series overlay on top, scatter plot below
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# Predicted vs. actual over the test period
axes[0].plot(y_test.index, y_test.values, label='Real', alpha=0.7)
axes[0].plot(y_test.index, y_pred_test, label='Previsto', alpha=0.7)
axes[0].set_title('Random Forest - Previsões vs Real')
axes[0].set_ylabel('Produção (MW)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Scatter of actual vs. predicted; the dashed red diagonal marks a perfect fit
axes[1].scatter(y_test, y_pred_test, alpha=0.5)
axes[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[1].set_xlabel('Real')
axes[1].set_ylabel('Previsto')
axes[1].set_title('Real vs Previsto')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{output_dir}/rf_predictions.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Importância de Features

In [None]:
# Feature importances reported by the fitted forest, most important first
importances = best_rf.feature_importances_
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
}).sort_values('importance', ascending=False)

print('\nTop 20 features mais importantes:')
print(feature_importance.head(20))

# Visualisation. top_n is clamped to the number of available features:
# head() may return fewer than 20 rows, and barh() raises on a length
# mismatch between the bar positions and the bar widths.
fig, ax = plt.subplots(figsize=(12, 8))
top_n = min(20, len(feature_importance))
top_features = feature_importance.head(top_n)
ax.barh(range(top_n), top_features['importance'])
ax.set_yticks(range(top_n))
ax.set_yticklabels(top_features['feature'])
ax.invert_yaxis()  # put the most important feature at the top
ax.set_xlabel('Importância')
ax.set_title(f'Top {top_n} Features Mais Importantes')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'{output_dir}/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# Persist the full importance table, not just the plotted top rows
feature_importance.to_csv(f'{output_dir}/feature_importance.csv', index=False)

## 8. Salvar Resultados

In [None]:
# Persist the test-set predictions next to the actual values
pred_df = pd.DataFrame(
    {'real': y_test, 'previsto': y_pred_test},
    index=y_test.index,
)
pred_df.to_csv(f'{output_dir}/rf_predictions.csv')

# Collect the selected hyper-parameters, the metrics (cast to built-in float)
# and basic run metadata into one record
results = dict(
    model='RandomForest',
    dataset='solar_france',
    best_params=grid.best_params_,
    metrics=dict(
        mse=float(mse),
        rmse=float(rmse),
        mae=float(mae),
        mape=float(mape),
        r2=float(r2),
    ),
    n_features=X.shape[1],
    train_size=len(X_train),
    test_size=len(X_test),
)

# Write the record as indented JSON
with open(f'{output_dir}/rf_results.json', 'w') as f:
    f.write(json.dumps(results, indent=2))

print(f'\n✓ Resultados salvos em {output_dir}/')