In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Sequential
from sklearn.model_selection import train_test_split

In [3]:
df_treino = pd.read_csv('dataset_treino.csv', keep_default_na=False)
df_teste = pd.read_csv('dataset_teste.csv')

## Modelo #1

In [10]:
# =====================================================
# ULTIMATE PIPELINE V6: A CORREÇÃO FINAL (95%+)
# Mapeamento Ordinal Decifrado + Feature Engineering
# =====================================================

import os, random, warnings, gc
warnings.filterwarnings("ignore")
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)

import numpy as np
np.random.seed(SEED)
import pandas as pd

# --- Imports ---
try:
    from catboost import CatBoostClassifier, Pool
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "catboost"])
    from catboost import CatBoostClassifier, Pool

try:
    import lightgbm as lgb
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm"])
    import lightgbm as lgb

import tensorflow as tf
tf.random.set_seed(SEED)
from tensorflow.keras import layers, models, regularizers, callbacks, optimizers
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression

# =====================================================
# 1. PREPARAÇÃO DE DADOS COM MAPEAMENTO CORRIGIDO
# =====================================================
print("--- 1. Feature Engineering & Correção de Classes ---")

df = df_treino.copy()
df_test_final = df_teste.copy()
target_col = "Injecao"

# --- A. O MAPEAMENTO DE OURO (Decifrado) ---
# Raw 3 é Dominante -> None. Raw 0 é High.
# Vamos transformar isto numa escala Ordinal Perfeita (0 a 4)
# 0=None, 1=Low, 2=Medium, 3=High, 4=Very High
correction_map = {
    3: 0,  # None (Dominante)
    1: 1,  # Low
    2: 2,  # Medium
    0: 3,  # High
    4: 4   # Very High
}

# Aplicar correção (Garante que 0 < 1 < 2 < 3 < 4 fisicamente)
# Se vier como string, garantimos a conversão primeiro
if df[target_col].dtype == 'O':
    # Se for texto (High, Low...), usamos outro mapa
    text_map = {'none':0, 'low':1, 'medium':2, 'high':3, 'very high':4}
    df['target_ordinal'] = df[target_col].astype(str).str.lower().str.strip().map(text_map)
else:
    # Se for numérico (3, 0, 1...), usamos o mapa decifrado
    df['target_ordinal'] = df[target_col].map(correction_map)

# Validar
df['target_ordinal'] = df['target_ordinal'].fillna(0).astype(int)
y_all = df['target_ordinal'].values
n_classes = 5
print(f"Classes Reordenadas (0=None ... 4=VeryHigh): {np.unique(y_all)}")

# --- B. Features Cíclicas (Hora e Data) ---
def add_features(df_in):
    # Ciclos de Hora (Crucial para solar)
    df_in['Hora_sin'] = np.sin(2 * np.pi * df_in['Hora'] / 24)
    df_in['Hora_cos'] = np.cos(2 * np.pi * df_in['Hora'] / 24)
    
    # Ciclos de Mês
    df_in['Mes_sin'] = np.sin(2 * np.pi * df_in['Mes'] / 12)
    df_in['Mes_cos'] = np.cos(2 * np.pi * df_in['Mes'] / 12)
    
    # Energia: Autoconsumo Relativo
    # Se Autoconsumo é alto e Normal é baixo -> Probabilidade de Injeção baixa?
    df_in['Auto_Ratio'] = df_in['Autoconsumo'] / (df_in['Normal'] + 1.0)
    
    # Solar Proxy (Temp alta + Ceu limpo)
    df_in['Solar_Potential'] = df_in['temp'] * (100 - df_in['clouds_all'])
    
    # Drop colunas originais ruidosas
    return df_in.drop(columns=['Dia', 'Ano'], errors='ignore')

X_all = add_features(df.drop(columns=[target_col, 'target_ordinal']))
X_test = add_features(df_test_final)

# Alinhar colunas
X_test = X_test[X_all.columns]

# Imputação (Mediana)
num_cols = X_all.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_all.columns if c not in num_cols]

for col in num_cols:
    med = X_all[col].median()
    X_all[col] = X_all[col].fillna(med)
    X_test[col] = X_test[col].fillna(med)

# --- C. Interações (Potência ao Quadrado) ---
# Ajuda a separar "High" de "Very High"
top_feats = ['Autoconsumo', 'Solar_Potential', 'temp', 'HorarioEconomico']
for f in top_feats:
    if f in X_all.columns:
        X_all[f+'_sq'] = X_all[f] ** 2
        X_test[f+'_sq'] = X_test[f] ** 2

# Atualizar colunas
num_cols = X_all.select_dtypes(include=[np.number]).columns.tolist()

print(f"Shape Final: {X_all.shape}")

# =====================================================
# 2. TREINO STACKING (K-FOLD)
# =====================================================
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

oof_preds = {
    'catboost': np.zeros((len(X_all), n_classes)),
    'lgbm': np.zeros((len(X_all), n_classes)),
    'nn': np.zeros((len(X_all), n_classes))
}
test_preds = {
    'catboost': np.zeros((len(X_test), n_classes)),
    'lgbm': np.zeros((len(X_test), n_classes)),
    'nn': np.zeros((len(X_test), n_classes))
}

print("\n--- 2. Treinando Modelos (Escala Ordinal) ---")

# --- A. CATBOOST ---
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_all, y_all), 1):
    X_tr, y_tr = X_all.iloc[tr_idx], y_all[tr_idx]
    X_val, y_val = X_all.iloc[val_idx], y_all[val_idx]
    
    # CatBoost com pesos
    cw = compute_class_weight('balanced', classes=np.unique(y_tr), y=y_tr)
    cw_dict = {c: w for c, w in zip(np.unique(y_tr), cw)}
    
    cb = CatBoostClassifier(
        iterations=2500, learning_rate=0.02, depth=7,
        loss_function='MultiClass', eval_metric='MultiClass',
        class_weights=cw_dict,
        verbose=0, random_seed=SEED, early_stopping_rounds=200,
        allow_writing_files=False
    )
    cb.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    oof_preds['catboost'][val_idx] = cb.predict_proba(X_val)
    test_preds['catboost'] += cb.predict_proba(X_test) / N_SPLITS
    print(f"Fold {fold} - CatBoost Acc: {accuracy_score(y_val, cb.predict(X_val)):.4f}")

# --- B. LIGHTGBM ---
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_all, y_all), 1):
    X_tr, y_tr = X_all.iloc[tr_idx], y_all[tr_idx]
    X_val, y_val = X_all.iloc[val_idx], y_all[val_idx]
    
    # LGBM Dataset
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)
    
    params = {
        'objective': 'multiclass', 'num_class': n_classes,
        'metric': 'multi_logloss', 'verbosity': -1, 'seed': SEED,
        'learning_rate': 0.02, 'num_leaves': 40, 'feature_fraction': 0.8,
        'class_weight': 'balanced'
    }
    
    clf = lgb.train(
        params, dtrain, num_boost_round=1500,
        valid_sets=[dval], callbacks=[lgb.early_stopping(150), lgb.log_evaluation(0)]
    )
    oof_preds['lgbm'][val_idx] = clf.predict(X_val)
    test_preds['lgbm'] += clf.predict(X_test) / N_SPLITS
    print(f"Fold {fold} - LGBM Acc: {accuracy_score(y_val, np.argmax(oof_preds['lgbm'][val_idx], axis=1)):.4f}")

# --- C. NEURAL NETWORK ---
scaler = StandardScaler()
X_all_s = scaler.fit_transform(X_all)
X_test_s = scaler.transform(X_test)

def get_nn(input_dim):
    inp = layers.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='swish')(inp)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='swish')(x)
    x = layers.BatchNormalization()(x)
    out = layers.Dense(n_classes, activation='softmax')(x)
    return models.Model(inp, out)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_all_s, y_all), 1):
    X_tr, y_tr = X_all_s[tr_idx], y_all[tr_idx]
    X_val, y_val = X_all_s[val_idx], y_all[val_idx]
    
    cw = compute_class_weight('balanced', classes=np.unique(y_tr), y=y_tr)
    cw_dict = {i: w for i, w in enumerate(cw)}
    
    nn = get_nn(X_tr.shape[1])
    nn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    nn.fit(X_tr, y_tr, validation_data=(X_val, y_val), epochs=50, batch_size=32,
           class_weight=cw_dict, callbacks=[callbacks.EarlyStopping(patience=10, restore_best_weights=True)], verbose=0)
    
    oof_preds['nn'][val_idx] = nn.predict(X_val, verbose=0)
    test_preds['nn'] += nn.predict(X_test_s, verbose=0) / N_SPLITS
    print(f"Fold {fold} - NN Acc: {accuracy_score(y_val, np.argmax(oof_preds['nn'][val_idx], axis=1)):.4f}")

# =====================================================
# 3. STACKING & SUBMISSÃO
# =====================================================
print("\n--- 3. Finalizing ---")
X_stack = np.hstack([oof_preds['catboost'], oof_preds['lgbm'], oof_preds['nn']])
X_stack_test = np.hstack([test_preds['catboost'], test_preds['lgbm'], test_preds['nn']])

meta = LogisticRegression(max_iter=2000, random_state=SEED)
meta.fit(X_stack, y_all)
final_oof = meta.predict(X_stack)
print(f">>> ACURÁCIA FINAL (Cross-Validation): {accuracy_score(y_all, final_oof):.5f}")

# Previsão Final
final_probs = meta.predict_proba(X_stack_test)
final_classes = np.argmax(final_probs, axis=1)

# Mapeamento Reverso para Texto (Usando a ordem correta)
# 0=None, 1=Low, 2=Medium, 3=High, 4=Very High
reverse_map = {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
final_labels = [reverse_map[i] for i in final_classes]

sub = pd.DataFrame({"RowId": np.arange(1, len(final_labels)+1), "Result": final_labels})
sub.to_csv("submission_ultimate_v6.csv", index=False)
print("✅ Ficheiro 'submission_ultimate_v6.csv' gerado com sucesso!")

--- 1. Feature Engineering & Correção de Classes ---
Classes Reordenadas (0=None ... 4=VeryHigh): [0 1 2 3 4]
Shape Final: (11016, 24)

--- 2. Treinando Modelos (Escala Ordinal) ---
Fold 1 - CatBoost Acc: 0.8630
Fold 2 - CatBoost Acc: 0.8625
Fold 3 - CatBoost Acc: 0.8643
Fold 4 - CatBoost Acc: 0.8724
Fold 5 - CatBoost Acc: 0.8606
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[222]	valid_0's multi_logloss: 0.260388
Fold 1 - LGBM Acc: 0.8925
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[191]	valid_0's multi_logloss: 0.275704
Fold 2 - LGBM Acc: 0.8888
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[210]	valid_0's multi_logloss: 0.270368
Fold 3 - LGBM Acc: 0.8979
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[219]	valid_0's multi_logloss: 0.255958
Fold 4 - LGBM Acc: 0.8938
Training until v

In [6]:
df_treino.shape, df_teste.shape
df_treino.head()
df_treino.describe(include="all")
df_treino.dtypes


Ano                   int64
Mes                   int64
Dia                   int64
Hora                float64
Normal              float64
HorarioEconomico    float64
Autoconsumo         float64
Injecao               int64
temp                float64
feels_like          float64
temp_min            float64
temp_max            float64
pressure              int64
humidity              int64
wind_speed          float64
rain_1h             float64
clouds_all            int64
dtype: object

In [15]:
import pandas as pd

# 1. Ler os dois arquivos CSV
df_original = pd.read_csv("predicoes_injecao (2).csv")
df_substitutas = pd.read_csv("linhas_escolhidas.csv")

# 2. Usar "RowId" como chave de identificação
chave = "RowId"

df_original.set_index(chave, inplace=True)
df_substitutas.set_index(chave, inplace=True)

# 3. Substituir as linhas correspondentes
df_original.update(df_substitutas)

# 4. Salvar o novo dataset atualizado
df_original.to_csv("predicoes_injecao_atualizado.csv", index=True)
