In [5]:
import pandas as pd
import numpy as np
# Ensure these libraries are installed in your local environment
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping  # <<< CORRECTED IMPORT
from catboost import CatBoostClassifier 
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# --- 1. Data Loading and Initial Cleaning ---
# Read the raw train/test tables from the current working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Capture the integer-cast label column and the test identifiers before
# those columns are removed from the feature frames below.
y = df_train['Y'].astype(int)
test_ids = df_test['id']

# Strip the non-feature columns, then turn +/- infinity into NaN so that
# infinite and missing entries are handled the same way during preprocessing.
df_train = df_train.drop(columns=['id', 'Y']).replace([-np.inf, np.inf], np.nan)
df_test = df_test.drop(columns='id').replace([-np.inf, np.inf], np.nan)


# --- 2. Preprocessing Function: Missing Indicator and Random Sample Imputation ---
def preprocess_data(df_train, df_test, random_state=42):
    """
    Add missing-value indicator columns and fill NaNs by random sampling.

    The two frames are stacked (train rows first) and, for every numeric
    column that contains at least one NaN in the stacked frame:

    1. a ``<col>_is_missing`` column of 0/1 integers is appended, and
    2. each NaN (in train and test rows alike) is replaced by a value drawn
       with replacement from the non-missing values of that column in the
       TRAINING rows only. If the training rows hold no non-missing value
       for the column, the NaNs are filled with 0.0 instead.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training features.
    df_test : pandas.DataFrame
        Test features; stacked below ``df_train`` with ``pd.concat``.
    random_state : int or None, default 42
        Seed passed to ``np.random.default_rng`` for the imputation draws.
        A fixed seed makes repeated calls on the same input return identical
        frames; pass ``None`` to get a fresh, unseeded generator.

    Returns
    -------
    (X_train, X_test_proc) : tuple of pandas.DataFrame
        The first ``len(df_train)`` rows and the remaining rows of the stacked,
        processed frame. The index was reset on the stacked frame, so the test
        part keeps labels starting at ``len(df_train)``.
    """
    n_train = len(df_train)
    df_full = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

    # A local generator keeps the imputation reproducible without touching
    # (or depending on) NumPy's global random state.
    rng = np.random.default_rng(random_state)

    # Numeric columns with at least one NaN anywhere in the stacked frame.
    missing_cols = [
        col for col in df_full.columns
        if pd.api.types.is_numeric_dtype(df_full[col]) and df_full[col].isnull().any()
    ]

    for col in missing_cols:
        # Compute the mask once, before the column is modified, and reuse it
        # for both the indicator feature and the fill.
        is_missing = df_full[col].isnull()
        df_full[f'{col}_is_missing'] = is_missing.astype(int)

        # Donor pool: observed values from the training rows only, so no
        # test-set values are used to fill anything.
        donor_values = df_full[col].iloc[:n_train].dropna().to_numpy()

        if donor_values.size > 0:
            df_full.loc[is_missing, col] = rng.choice(
                donor_values, size=int(is_missing.sum()), replace=True
            )
        else:
            # Nothing observed in training for this column: constant fallback.
            df_full.loc[is_missing, col] = 0.0

    # Separate back into train and test by position.
    X_train = df_full.iloc[:n_train]
    X_test_proc = df_full.iloc[n_train:]

    return X_train, X_test_proc

# Build the model-ready matrices: X holds the processed training rows,
# X_test_proc the processed test rows.
X, X_test_proc = preprocess_data(df_train=df_train, df_test=df_test)

# Report both shapes as a quick sanity check on the added indicator columns.
print(
    f"Train features shape after preprocessing: {X.shape}",
    f"Test features shape after preprocessing: {X_test_proc.shape}",
    sep="\n",
)


# --- 3. Model Training and Prediction (Cross-Validation Ensemble) ---

# --- Model Parameters ---
# Boosting budget shared by both models, and the early-stopping patience.
N_ESTIMATORS = 500
EARLY_STOPPING_ROUNDS = 50

# LightGBM configuration; verbose=-1 silences the boosting log.
lgbm_params = dict(
    objective='binary',
    metric='auc',
    n_estimators=N_ESTIMATORS,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

# CatBoost configuration; verbose=0 silences the boosting log and
# allow_writing_files=False is set so training does not write files.
cat_params = dict(
    objective='Logloss',
    eval_metric='AUC',
    n_estimators=N_ESTIMATORS,
    learning_rate=0.05,
    random_seed=42,
    verbose=0,
    allow_writing_files=False,
    thread_count=-1,
)

# --- Cross-Validation Setup ---
N_SPLITS = 5
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Out-of-fold (one slot per training row) and test (one slot per test row)
# probability buffers, one pair per model.
oof_preds_lgbm = np.zeros(len(X))
oof_preds_cat = np.zeros(len(X))
test_preds_lgbm = np.zeros(len(X_test_proc))
test_preds_cat = np.zeros(len(X_test_proc))

# --- CV Loop ---
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    # Positional split of features and labels for this fold.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Model 1: LightGBM, early-stopped on the validation fold via callback.
    lgbm = LGBMClassifier(**lgbm_params)
    lgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)],
    )

    # Column 1 of predict_proba is kept as the score; each fold adds
    # 1/N_SPLITS of its test prediction to the running test average.
    oof_preds_lgbm[val_idx] = lgbm.predict_proba(X_val)[:, 1]
    test_preds_lgbm += lgbm.predict_proba(X_test_proc)[:, 1] / N_SPLITS

    # Model 2: CatBoost, early-stopped through its own fit() argument.
    cat = CatBoostClassifier(**cat_params)
    cat.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )

    oof_preds_cat[val_idx] = cat.predict_proba(X_val)[:, 1]
    test_preds_cat += cat.predict_proba(X_test_proc)[:, 1] / N_SPLITS

# --- Individual Model CV Scores ---
# ROC AUC of each model's out-of-fold predictions against the training labels.
lgbm_auc = roc_auc_score(y, oof_preds_lgbm)
cat_auc = roc_auc_score(y, oof_preds_cat)
print(
    f"\nLightGBM OOF AUC: {lgbm_auc:.5f}",
    f"CatBoost OOF AUC: {cat_auc:.5f}",
    sep="\n",
)

# --- 4. Ensemble Blending ---
# Scan w = 0.00, 0.01, ..., 1.00 (the LightGBM share of the blend) and keep
# the first weight that reaches the highest out-of-fold AUC.
best_auc, best_weight = 0, 0

weights = np.linspace(0, 1, 101)
for w in weights:
    blended_oof = w * oof_preds_lgbm + (1 - w) * oof_preds_cat
    auc = roc_auc_score(y, blended_oof)

    # Only a strict improvement replaces the current best.
    if not auc > best_auc:
        continue
    best_auc, best_weight = auc, w

print(
    f"\nBest Blending Weight (w) for LightGBM: {best_weight:.2f}",
    f"Ensemble OOF AUC (Max ROC AUC): {best_auc:.5f}",
    sep="\n",
)

# --- 5. Final Prediction and Submission ---
# Apply the selected weight to the fold-averaged test probabilities.
final_test_prediction = best_weight * test_preds_lgbm + (1 - best_weight) * test_preds_cat

# Write the id / probability pairs to the submission CSV (no index column).
submission = pd.DataFrame(data={'id': test_ids, 'Y': final_test_prediction})
submission.to_csv('euphoria_ensemble_submission.csv', index=False)

print("\nFinal submission file 'euphoria_ensemble_submission.csv' generated with the optimal blend.")

Train features shape after preprocessing: (63093, 28)
Test features shape after preprocessing: (38670, 28)

LightGBM OOF AUC: 0.79156
CatBoost OOF AUC: 0.79238

Best Blending Weight (w) for LightGBM: 0.33
Ensemble OOF AUC (Max ROC AUC): 0.79269

Final submission file 'euphoria_ensemble_submission.csv' generated with the optimal blend.


In [6]:
# Turn the blended probabilities into hard 0/1 labels: 1 where the
# probability is strictly above 0.5, otherwise 0.
binary_prediction = np.greater(final_test_prediction, 0.5).astype(int)

# Pair the test ids (captured when the test file was loaded) with the labels
# and write them out; note this rebinds `submission` to the binary frame.
submission = pd.DataFrame(data={'id': test_ids, 'Y': binary_prediction})
submission.to_csv('euphoria_ensemble_binary_submission.csv', index=False)