# In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load and prepare the data
df = pd.read_csv("sample_data/dataset_transformadores_mes_expandido.csv")
df['Data_Hora'] = pd.to_datetime(df['Data_Hora'])

# Feature engineering: calendar features taken from the timestamp.
df['Hora'] = df['Data_Hora'].dt.hour
df['Dia_Semana'] = df['Data_Hora'].dt.dayofweek

# Voltage change versus the previous reading of the same transformer.
# diff() works on row order, so it is computed on a chronologically sorted
# view (stable sort keeps ties in file order). The assignment realigns on the
# index, so the row order of `df` itself is left untouched.
# NOTE(review): the "_1h" suffix presumes consecutive readings are one hour
# apart - confirm against the dataset.
df['Variacao_Tensao_1h'] = (
    df.sort_values('Data_Hora', kind='stable')
    .groupby('Transformador')['Tensao_RMS']
    .diff(1)
)

# The tiny epsilon avoids a division by zero when consumption is exactly 0.
df['Razao_Tensao_Consumo'] = df['Tensao_RMS'] / (df['Consumo_kWh'] + 1e-10)

# Rule-based labels: 1 when the voltage is outside [210, 240] or the
# transformer temperature is above 90, otherwise 0.
# NOTE(review): both columns used here are also model inputs further down,
# so the evaluation measures agreement with this rule, not independent truth.
df['Anomalia_Real'] = np.where(
    (df['Tensao_RMS'] < 210) | (df['Tensao_RMS'] > 240) |
    (df['Temp_Transformador_C'] > 90),
    1, 0
)

# The first reading of every transformer has no predecessor, so its diff is
# NaN; drop those rows.
df.dropna(subset=['Variacao_Tensao_1h'], inplace=True)

# Feature selection: columns fed to the anomaly detector.
features = [
    'Tensao_RMS',
    'Temp_Transformador_C',
    'Consumo_kWh',
    'Variacao_Tensao_1h',
    'Razao_Tensao_Consumo',
]

# 2. Pre-processing: fit the scaler on the selected columns and transform
# them in one step; the rule-based label column is kept aside as `y`.
scaler = RobustScaler()
feature_frame = df[features]
X = scaler.fit_transform(feature_frame)
y = df['Anomalia_Real']

# 3. Final model: Isolation Forest hyper-parameters gathered in one place,
# then the estimator is built and fitted on the scaled feature matrix.
isolation_forest_params = {
    'n_estimators': 200,
    'max_samples': 512,
    'contamination': 0.05,
    'max_features': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}
final_model = IsolationForest(**isolation_forest_params)
final_model.fit(X)

# 4. Optimal threshold analysis
# decision_function() is lower for more abnormal samples, so the scores are
# negated before building the precision-recall curve (which expects larger
# values for the positive class).
y_scores = final_model.decision_function(X)
precision, recall, thresholds = precision_recall_curve(y, -y_scores)

# Find the threshold whose recall is closest to the target (~0.7).
target_recall = 0.7
# `recall` has one more entry than `thresholds`: the final point has no
# threshold attached. Searching only recall[:-1] guarantees that the chosen
# index is always valid for `thresholds`.
optimal_idx = np.argmin(np.abs(recall[:-1] - target_recall))
# Convert back from the negated-score space to decision_function space.
optimal_threshold = -thresholds[optimal_idx]

# 5. Final metrics
# The curve counts a sample as positive when -score >= threshold, i.e.
# score <= optimal_threshold; the comparison is inclusive so the boundary
# sample is flagged and the achieved recall matches the selected curve point.
y_pred_final = np.where(y_scores <= optimal_threshold, 1, 0)
print("\n🔍 Métricas Finais:")
print(classification_report(y, y_pred_final))

# 6. Feature importance via permutation importance on the anomaly scores
def _anomaly_score_drift(estimator, X_eval, y_reference):
    """Return the negated mean absolute change of the anomaly scores.

    The string scorer 'neg_mean_absolute_error' would call
    ``estimator.predict`` (which yields +1/-1 labels for an Isolation Forest)
    and compare those labels with the continuous reference scores. This
    scorer compares like with like by using ``decision_function`` instead.

    Args:
        estimator: fitted model exposing ``decision_function``.
        X_eval: feature matrix, possibly with one column permuted.
        y_reference: anomaly scores of the unpermuted data.

    Returns:
        A float that is 0 when the scores are unchanged and increasingly
        negative as they move away from the reference.
    """
    return -np.mean(np.abs(y_reference - estimator.decision_function(X_eval)))


# Importance of a feature = how far the anomaly scores move, on average,
# when that feature's column is shuffled.
result = permutation_importance(
    final_model, X, y_scores,
    n_repeats=10, random_state=42, n_jobs=-1,
    scoring=_anomaly_score_drift
)

feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': result.importances_mean
}).sort_values('Importance', ascending=False)

print("\n📌 Importância das Features:")
print(feature_importance)

# 7. Visualisations: score distribution (left) and precision-recall curve
# (right), each with a dashed red marker for the selected operating point.
plt.figure(figsize=(15, 5))

plt.subplot(1, 2, 1)
sns.histplot(-y_scores, bins=50, kde=True)
# The histogram shows the negated scores, while optimal_threshold lives in
# decision_function space, so it is negated to land on the same axis.
plt.axvline(x=-optimal_threshold, color='r', linestyle='--')
plt.title("Distribuição de Scores de Anomalia")
plt.xlabel('Anomaly Score')

plt.subplot(1, 2, 2)
plt.plot(recall, precision)
plt.axvline(x=target_recall, color='r', linestyle='--')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall')

plt.tight_layout()
plt.show()
# Attach the predictions and scores to the DataFrame.
# The score is negated so that larger values mean "more anomalous".
df['Anomalia_Prevista'] = y_pred_final
df['Anomaly_Score'] = -y_scores

# Rank transformers by criticality: number of flagged readings first,
# mean anomaly score as the tie-breaker, both in descending order.
per_transformer = df.groupby('Transformador').agg(
    Qtd_Anomalias=('Anomalia_Prevista', 'sum'),
    Score_Medio=('Anomaly_Score', 'mean'),
)
ranking_transformadores = per_transformer.sort_values(
    by=['Qtd_Anomalias', 'Score_Medio'],
    ascending=[False, False],
)

# Show the ten most critical transformers.
print("\n📊 Ranking de Transformadores mais Críticos para Preventiva:")
print(ranking_transformadores.head(10))

# Optional: bar chart of the ten transformers with the most flagged readings.
top_ranked = ranking_transformadores.reset_index().head(10)

plt.figure(figsize=(12, 6))
sns.barplot(
    data=top_ranked,
    x='Transformador',
    y='Qtd_Anomalias',
    palette='Reds_r',
)
plt.xlabel('Transformador')
plt.ylabel('Qtd de Anomalias Detectadas')
plt.title('Top 10 Transformadores mais Críticos (Qtd de Anomalias)')
# Tilt the tick labels so long transformer names do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# Output of the last run:
# FileNotFoundError: [Errno 2] No such file or directory: 'sample_data/dataset_transformadores_mes_expandido.csv'