2. Pré-processamento dos Dados

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the consolidated dataset.
df = pd.read_csv('./data/generated/final-dataset.csv')

# Model inputs (generation per energy source) and the quantity to predict.
features = [
    'Coal (TWh)',
    'Gas (TWh)',
    'Oil (TWh)',
    'Nuclear (TWh)',
    'Hydro (TWh)',
    'Solar (TWh)',
    'Wind (TWh)',
]
target = 'Total (CO₂)'

# A missing generation figure is replaced by 0; rows lacking the target
# cannot be used for supervised learning and are discarded.
df = df.fillna(dict.fromkeys(features, 0))
df = df.dropna(subset=[target])

# Feature matrix and target vector.
X, y = df[features], df[target]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

3. Teste de Algoritmos de Regressão

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Candidate regressors, keyed by the label used in the results table.
models = {}
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["Gradient Boosting"] = GradientBoostingRegressor(n_estimators=100, random_state=42)
models["SVR"] = SVR(kernel='rbf')

# Fit every candidate on the scaled training split and score it on the
# scaled test split (RMSE and R²).
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)

    results[name] = {"RMSE": rmse, "R²": r2}

# Last expression of the cell: displays the metrics per model.
results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.inspection import permutation_importance

# Load the data.
df = pd.read_csv('./data/generated/final-dataset.csv')

energy_columns = [
    'Coal (TWh)',
    'Gas (TWh)',
    'Oil (TWh)',
    'Nuclear (TWh)',
    'Hydro (TWh)',
    'Solar (TWh)',
    'Wind (TWh)',
]
target = 'Total (CO₂)'

# 1. Pre-processing
# Missing generation values become 0; rows without a CO₂ total are dropped.
df = df.fillna(dict.fromkeys(energy_columns, 0))
df = df.dropna(subset=[target])

# 2. Feature engineering
# Total generation per row, followed by the share of each source in that total.
total_energy = df[energy_columns].sum(axis=1)
share_columns = [f'{col} %' for col in energy_columns]

df = df.assign(**{'Total Energy (TWh)': total_energy})
df = df.assign(
    **{
        share: df[source] / total_energy
        for source, share in zip(energy_columns, share_columns)
    }
)

# Rows with zero total generation produce inf/NaN shares; map both to 0.
df = df.replace([np.inf, -np.inf], 0)
df = df.fillna(0)

# 3. Feature selection: raw generation, total generation and the shares.
features = [*energy_columns, 'Total Energy (TWh)', *share_columns]

X, y = df[features], df[target]

# 4. Train/test split: 20% of the rows are held out, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Feature scaling: statistics are learned on the training split only and
# then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# 6. Train a Random Forest with hyper-parameter tuning.
# The grid is exhaustive: 3 * 3 * 3 * 3 = 81 parameter combinations, each one
# scored with 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    # Negated MSE: GridSearchCV maximises the score, so the lowest MSE wins.
    scoring='neg_mean_squared_error',
    # Explicitly request the refit (this is also the default): after the
    # search, the best configuration is retrained on the whole training split.
    refit=True,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

# Best parameters found by the search.
best_params = grid_search.best_params_
print(f"Melhores parâmetros: {best_params}")

# Final model: reuse the estimator that GridSearchCV already refit with the
# best parameters. It is a clone of `rf` (same random_state and n_jobs) fitted
# on the same X_train_scaled / y_train, so building and fitting a second
# RandomForestRegressor here would only repeat that training.
best_rf = grid_search.best_estimator_

# 7. Model evaluation on the held-out test split.
y_pred = best_rf.predict(X_test_scaled)

# RMSE is the square root of the MSE; MAE and R² come straight from sklearn.
mse = mean_squared_error(y_test, y_pred)
rmse, mae, r2 = (
    np.sqrt(mse),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

# One line per metric, preceded by a header.
print(
    "\nMétricas de Desempenho:",
    f"RMSE: {rmse:,.2f}",
    f"MAE: {mae:,.2f}",
    f"R²: {r2:.4f}",
    sep="\n",
)

# 8. Feature-importance analysis.
# Permutation importance is measured on the test split, with 10 shuffles
# per feature and a fixed seed.
result = permutation_importance(
    estimator=best_rf,
    X=X_test_scaled,
    y=y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

# Rank features from highest to lowest mean importance.
sorted_importances_idx = np.argsort(result.importances_mean)[::-1]
ranked_scores = result.importances[sorted_importances_idx]
ranked_names = X.columns[sorted_importances_idx]

# One column per feature, one row per permutation repeat.
importances = pd.DataFrame(ranked_scores.T, columns=ranked_names)

# Horizontal box plot of the importance distribution of each feature.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=importances, orient="h", ax=ax)
ax.set_title("Importância das Características (Permutação)")
ax.set_xlabel("Redução na pontuação R²")
fig.tight_layout()
fig.savefig('feature_importance.png', dpi=300)
plt.show()

# 9. Residual analysis: observed minus predicted, plotted against the prediction.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals, alpha=0.6, ax=ax)
# Zero line: points above it were under-predicted, points below over-predicted.
ax.axhline(y=0, color='r', linestyle='--')
ax.set(
    title="Análise de Resíduos",
    xlabel="Valores Preditos",
    ylabel="Resíduos",
)
fig.tight_layout()
fig.savefig('residuals_analysis.png', dpi=300)
plt.show()

# 10. Actual vs. predicted values.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, ax=ax)

# Identity line spanning the full target range (all rows, not only the test
# split); a perfect model would place every point on it.
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, 'r--')

ax.set(
    title="Valores Reais vs. Valores Preditos",
    xlabel="Valores Reais de CO₂",
    ylabel="Valores Preditos de CO₂",
)
fig.tight_layout()
fig.savefig('actual_vs_predicted.png', dpi=300)
plt.show()

# 11. Persist the fitted model and the scaler for later use. Both files are
# written to the current working directory; the scaler is stored too because
# the model was trained on scaled features.
import joblib

for artifact, filename in (
    (best_rf, 'co2_emission_rf_model.pkl'),
    (scaler, 'scaler.pkl'),
):
    joblib.dump(artifact, filename)

print("\nModelo e scaler salvos com sucesso!")