In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Sequential
from sklearn.model_selection import train_test_split

In [3]:
df_treino = pd.read_csv('dataset_treino.csv', keep_default_na=False)
df_teste = pd.read_csv('dataset_teste.csv')

## Modelo #1

In [2]:
# =====================================================
# ULTIMATE PIPELINE V11: ANTI-OVERFITTING & GENERALIZATION
# Foco: Regularização Forte + Regra de Nuvens
# =====================================================

import os, random, warnings, gc
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

# --- Imports ---
try:
    from catboost import CatBoostClassifier, Pool
    import lightgbm as lgb
except ImportError:
    print("Pip install catboost lightgbm necessary")

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers

warnings.filterwarnings("ignore")
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# =====================================================
# 1. PREPARAÇÃO & PHYSICS FEATURES
# =====================================================
print("--- 1. Engenharia de Features Robusta ---")

df_treino = pd.read_csv('dataset_treino.csv')
df_teste = pd.read_csv('dataset_teste.csv')

df = df_treino.copy()
df_test_final = df_teste.copy()
target_col = "Injecao"

# --- Mapeamento (Mantido) ---
correction_map = {3: 0, 1: 1, 2: 2, 0: 3, 4: 4}
if df[target_col].dtype == 'O':
    text_map = {'none':0, 'low':1, 'medium':2, 'high':3, 'very high':4}
    df['target_ordinal'] = df[target_col].astype(str).str.lower().str.strip().map(text_map)
else:
    df['target_ordinal'] = df[target_col].map(correction_map)

df['target_ordinal'] = df['target_ordinal'].fillna(0).astype(int)
y_all = df['target_ordinal'].values
n_classes = 5

def add_features(df_in):
    # Cíclicas
    df_in['Hora_sin'] = np.sin(2 * np.pi * df_in['Hora'] / 24)
    df_in['Hora_cos'] = np.cos(2 * np.pi * df_in['Hora'] / 24)
    df_in['Mes_sin'] = np.sin(2 * np.pi * df_in['Mes'] / 12)
    df_in['Mes_cos'] = np.cos(2 * np.pi * df_in['Mes'] / 12)
    
    # Physics 1: Surplus (Ouro)
    df_in['Surplus'] = df_in['Autoconsumo'] - df_in['Normal']
    
    # Physics 2: Delta Térmico (Importante para eficiência do painel)
    # Diferença entre temperatura real e sensação térmica indica humidade/vento
    df_in['Temp_Delta'] = df_in['temp'] - df_in['feels_like']
    
    # Physics 3: Cloud Interaction
    # O impacto das nuvens é exponencial, não linear
    df_in['Clear_Sky_Factor'] = (100 - df_in['clouds_all']) ** 2
    
    return df_in.drop(columns=['Dia', 'Ano'], errors='ignore')

X_all = add_features(df.drop(columns=[target_col, 'target_ordinal']))
X_test = add_features(df_test_final)

# Alinhar
X_test = X_test[X_all.columns]

# Imputação
for col in X_all.select_dtypes(include=[np.number]).columns:
    med = X_all[col].median()
    X_all[col] = X_all[col].fillna(med)
    X_test[col] = X_test[col].fillna(med)

print(f"Shape: {X_all.shape}")

# =====================================================
# 2. TREINO COM REGULARIZAÇÃO (O SEGREDO)
# =====================================================
N_SPLITS = 5 # Mantemos 5 para não fragmentar demasiado os dados
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

oof_preds = {
    'catboost': np.zeros((len(X_all), n_classes)),
    'lgbm': np.zeros((len(X_all), n_classes)),
    'nn': np.zeros((len(X_all), n_classes))
}
test_preds = {
    'catboost': np.zeros((len(X_test), n_classes)),
    'lgbm': np.zeros((len(X_test), n_classes)),
    'nn': np.zeros((len(X_test), n_classes))
}

print("\n--- 2. Treino Anti-Overfitting ---")

# --- A. CATBOOST (Regularizado) ---
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_all, y_all), 1):
    X_tr, y_tr = X_all.iloc[tr_idx], y_all[tr_idx]
    X_val, y_val = X_all.iloc[val_idx], y_all[val_idx]
    
    cw = compute_class_weight('balanced', classes=np.unique(y_tr), y=y_tr)
    cw_dict = {c: w for c, w in zip(np.unique(y_tr), cw)}
    
    cb = CatBoostClassifier(
        iterations=2000, 
        learning_rate=0.03, 
        depth=6, # Reduzi profundidade (antes 7) para generalizar melhor
        l2_leaf_reg=5, # Regularização L2 forte!
        loss_function='MultiClass', 
        class_weights=cw_dict,
        verbose=0, random_seed=SEED, allow_writing_files=False
    )
    cb.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=100)
    
    oof_preds['catboost'][val_idx] = cb.predict_proba(X_val)
    test_preds['catboost'] += cb.predict_proba(X_test) / N_SPLITS
    print(f"Fold {fold} - CatBoost (Reg) OK")

# --- B. LIGHTGBM (Regularizado) ---
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_all, y_all), 1):
    X_tr, y_tr = X_all.iloc[tr_idx], y_all[tr_idx]
    X_val, y_val = X_all.iloc[val_idx], y_all[val_idx]
    
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)
    
    params = {
        'objective': 'multiclass', 'num_class': n_classes,
        'metric': 'multi_logloss', 'verbosity': -1, 'seed': SEED,
        'learning_rate': 0.02, 
        'num_leaves': 31, # Reduzi de 45 para 31 (Padrão) -> Menos overfitting
        'feature_fraction': 0.7, 
        'lambda_l1': 1.0, # Regularização L1
        'lambda_l2': 1.0, # Regularização L2
        'class_weight': 'balanced'
    }
    
    clf = lgb.train(
        params, dtrain, num_boost_round=2000,
        valid_sets=[dval], callbacks=[lgb.early_stopping(150), lgb.log_evaluation(0)]
    )
    oof_preds['lgbm'][val_idx] = clf.predict(X_val)
    test_preds['lgbm'] += clf.predict(X_test) / N_SPLITS
    print(f"Fold {fold} - LGBM (Reg) OK")

# --- C. NEURAL NETWORK (High Dropout) ---
scaler = StandardScaler()
X_all_s = scaler.fit_transform(X_all)
X_test_s = scaler.transform(X_test)

def get_nn_robust(input_dim):
    inp = layers.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='swish')(inp)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x) # Aumentei Dropout para 40% (Obriga a aprender padrões reais)
    x = layers.Dense(64, activation='swish')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x)
    out = layers.Dense(n_classes, activation='softmax')(x)
    return models.Model(inp, out)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_all_s, y_all), 1):
    X_tr, y_tr = X_all_s[tr_idx], y_all[tr_idx]
    X_val, y_val = X_all_s[val_idx], y_all[val_idx]
    
    cw = compute_class_weight('balanced', classes=np.unique(y_tr), y=y_tr)
    cw_dict = {i: w for i, w in enumerate(cw)}
    
    nn = get_nn_robust(X_tr.shape[1])
    nn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Early Stopping mais paciente
    es = callbacks.EarlyStopping(patience=15, restore_best_weights=True)
    
    nn.fit(X_tr, y_tr, validation_data=(X_val, y_val), epochs=70, batch_size=64,
           class_weight=cw_dict, callbacks=[es], verbose=0)
    
    oof_preds['nn'][val_idx] = nn.predict(X_val, verbose=0)
    test_preds['nn'] += nn.predict(X_test_s, verbose=0) / N_SPLITS
    print(f"Fold {fold} - NN (Reg) OK")

# =====================================================
# 3. STACKING & PÓS-PROCESSAMENTO INTELIGENTE
# =====================================================
print("\n--- 3. Finalizing ---")
X_stack = np.hstack([oof_preds['catboost'], oof_preds['lgbm'], oof_preds['nn']])
X_stack_test = np.hstack([test_preds['catboost'], test_preds['lgbm'], test_preds['nn']])

meta = LogisticRegression(max_iter=2000, random_state=SEED)
meta.fit(X_stack, y_all)
final_oof = meta.predict(X_stack)
print(f">>> ACURÁCIA CV (Regularizada): {accuracy_score(y_all, final_oof):.5f}")
# Nota: É normal esta acurácia CV ser menor que antes (ex: 0.88), 
# mas a acurácia no Kaggle será maior!

final_probs = meta.predict_proba(X_stack_test)
final_classes = np.argmax(final_probs, axis=1)

# --- REGRAS DE OURO ---

# 1. NOITE: 19h às 06h -> None
night_hours = [19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6]
df_res = df_test_final.copy()
mask_night = df_res['Hora'].isin(night_hours)
final_classes[mask_night] = 0
print(f"Regra da Noite: {mask_night.sum()} correções.")

# 2. CLOUD CAPPING (NOVIDADE)
# Se o céu está 100% nublado, é IMPOSSÍVEL ser 'Very High'.
# O modelo às vezes excita-se com temperaturas altas, mas sem sol direto,
# a injeção máxima é 'High' (3).
mask_clouds = (df_res['clouds_all'] == 100) & (final_classes == 4)
final_classes[mask_clouds] = 3
print(f"Regra Cloud Capping: {mask_clouds.sum()} correções (Very High -> High).")

# Gerar Submission
reverse_map = {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
final_labels = [reverse_map[i] for i in final_classes]

sub = pd.DataFrame({"RowId": np.arange(1, len(final_labels)+1), "Result": final_labels})
sub.to_csv("submission_v11_regularized.csv", index=False)
print("✅ Ficheiro 'submission_v11_regularized.csv' gerado.")

--- 1. Engenharia de Features Robusta ---
Shape: (11016, 21)

--- 2. Treino Anti-Overfitting ---
Fold 1 - CatBoost (Reg) OK
Fold 2 - CatBoost (Reg) OK
Fold 3 - CatBoost (Reg) OK
Fold 4 - CatBoost (Reg) OK
Fold 5 - CatBoost (Reg) OK
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[297]	valid_0's multi_logloss: 0.267224
Fold 1 - LGBM (Reg) OK
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[287]	valid_0's multi_logloss: 0.269254
Fold 2 - LGBM (Reg) OK
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[299]	valid_0's multi_logloss: 0.262567
Fold 3 - LGBM (Reg) OK
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[261]	valid_0's multi_logloss: 0.255526
Fold 4 - LGBM (Reg) OK
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[221]	valid_0's multi_logloss: 