# In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from lightgbm.callback import early_stopping, log_evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.preprocessing import RobustScaler, QuantileTransformer, StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from scipy import stats
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import warnings
warnings.filterwarnings('ignore')

# Announce the pipeline with a ruled banner (three lines of output).
print("=" * 80, "NEURAL NETWORK META-LEARNER + LIGHTGBM STACKING", "=" * 80, sep="\n")

# Seed the NumPy and PyTorch global RNGs with the same fixed value so that
# repeated runs start from the same random state.
for seed_rng in (np.random.seed, torch.manual_seed):
    seed_rng(42)

# -----------------------------
# 1. Load data
# -----------------------------
print("\n[1/11] Loading data...")

# All three inputs are parsed as header-less CSV files from the working
# directory. The label file is read with a single column name and then
# squeezed, which collapses a one-column frame into a Series.
X = pd.read_csv('trainingData.txt', header=None)
label_frame = pd.read_csv('trainingTruth.txt', header=None, names=['label'])
y = label_frame.squeeze()
test_data = pd.read_csv('testData.txt', header=None)

for split_name, frame in (("Training", X), ("Test", test_data)):
    print(f"{split_name} data shape: {frame.shape}")

# -----------------------------
# 2. Data Preprocessing
# -----------------------------
print("\n[2/11] Preprocessing...")

# Coerce every column to a numeric dtype; anything unparseable becomes NaN.
# Empty strings in the training frame are mapped to NaN explicitly first.
X = X.replace('', np.nan).apply(pd.to_numeric, errors='coerce')
test_data = test_data.apply(pd.to_numeric, errors='coerce')

# Drop training rows whose label is missing and renumber the survivors so
# that X and y share a clean 0..n-1 index.
valid_mask = y.notna()
X = X.loc[valid_mask].reset_index(drop=True)
y = y.loc[valid_mask].reset_index(drop=True)

n_clean_rows, n_clean_cols = X.shape
print(f"After cleaning: {n_clean_rows} samples, {n_clean_cols} features")

# Per-class sample counts, ordered by class label.
class_counts = y.value_counts().sort_index()
print("\nClass distribution:")
for cls, count in class_counts.items():
    print(f"  Class {int(cls)}: {count} ({100*count/len(y):.2f}%)")

# Flag the data set as imbalanced when the largest class is more than 1.5x
# the size of the smallest one.
largest_class, smallest_class = class_counts.max(), class_counts.min()
is_imbalanced = (largest_class / smallest_class) > 1.5

# Shift labels down by one so they start at zero
# (assumes the file uses labels starting at 1 — TODO confirm).
y = y - 1  # Zero-based

# -----------------------------
# 3. Feature Engineering
# -----------------------------
print("\n[3/11] Feature engineering...")

# Percentage of missing entries in each training column, measured before
# any imputation takes place.
train_is_missing = X.isna()
missing_pct = train_is_missing.sum() / len(X) * 100

# Imputation strategies
# Column medians are learned from the training frame only and then reused
# unchanged to fill the gaps in the test frame.
imputer_median = SimpleImputer(strategy='median')
imputer_median.fit(X)
X_imputed = imputer_median.transform(X)
test_imputed = imputer_median.transform(test_data)

# Missing indicators
# 0/1 flags marking where a value was absent, restricted to the columns
# whose training missing-rate exceeds 5%. The same column mask is applied
# to the test frame so both indicator matrices line up.
important_missing_cols = (missing_pct > 5).values
missing_indicators = train_is_missing.values[:, important_missing_cols].astype(int)
test_missing_indicators = test_data.isna().values[:, important_missing_cols].astype(int)

# Statistical features
def create_statistical_features(data):
    """Summarise every row of a 2-D numeric array with ten statistics.

    The output has one row per input row and the following columns, in
    order: mean, standard deviation, median, minimum, maximum, range
    (max - min), skewness, kurtosis, 25th percentile, 75th percentile.
    All statistics are taken along axis 1 (across the columns of a row).
    """
    row_min = np.min(data, axis=1)
    row_max = np.max(data, axis=1)
    lower_quartile, upper_quartile = np.percentile(data, [25, 75], axis=1)

    per_row_stats = (
        np.mean(data, axis=1),
        np.std(data, axis=1),
        np.median(data, axis=1),
        row_min,
        row_max,
        row_max - row_min,
        stats.skew(data, axis=1),
        stats.kurtosis(data, axis=1),
        lower_quartile,
        upper_quartile,
    )
    # Each entry is a length-n vector; stack them side by side as columns.
    return np.column_stack(per_row_stats)

# Row-wise summary statistics computed on the imputed matrices.
row_features = create_statistical_features(X_imputed)
test_row_features = create_statistical_features(test_imputed)

print(f"Created {row_features.shape[1]} statistical features")
print(f"Created {missing_indicators.shape[1]} missingness indicators")

# Feature selection
# NOTE(review): mutual information is estimated from the labels of ALL
# training rows, including rows that later act as validation folds, so the
# cross-validated scores reported further down are mildly optimistic.
mi_scores = mutual_info_classif(X_imputed, y, random_state=42, n_neighbors=5)
mi_threshold = np.percentile(mi_scores, 20)  # Keep top 80%
selected_features = mi_scores > mi_threshold
if not selected_features.any():
    # Every score ties with the threshold (e.g. all zeros), so the strict
    # comparison keeps nothing. Fall back to all columns rather than hand
    # an empty matrix to the transformers below.
    selected_features = np.ones(mi_scores.shape, dtype=bool)

X_selected = X_imputed[:, selected_features]
test_selected = test_imputed[:, selected_features]

print(f"Selected {selected_features.sum()} features via mutual information")

# Quantile transformation (fitted on train, applied to test)
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=42)
X_quantile = quantile_transformer.fit_transform(X_selected)
test_quantile = quantile_transformer.transform(test_selected)

# PCA features
# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 50 is capped to keep small or narrow data sets from raising.
n_pca_components = min(50, *X_quantile.shape)
pca = PCA(n_components=n_pca_components, random_state=42)
X_pca = pca.fit_transform(X_quantile)
test_pca = pca.transform(test_quantile)

# Combine features: raw selected columns, their quantile-transformed
# versions, PCA projections, row statistics and missingness flags. Train and
# test use the same block order so the columns line up.
X_final = np.hstack([X_selected, X_quantile, X_pca, row_features, missing_indicators])
test_final = np.hstack([test_selected, test_quantile, test_pca, test_row_features, test_missing_indicators])

print(f"Final feature count: {X_final.shape[1]}")

# -----------------------------
# 4. Base Model Configurations
# -----------------------------
print("\n[4/11] Configuring diverse base models...")

# Settings shared by every base learner.
# NOTE(review): LightGBM documents `is_unbalance` for the binary and
# multiclassova objectives; presumably it has no effect under 'multiclass'.
# It is kept so the parameter set stays as before — confirm before relying
# on it for class re-weighting.
_shared_lgb_params = {
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_logloss',
    'verbose': -1,
    # Cast to a plain Python bool: the flag is computed from pandas/NumPy
    # values and would otherwise be passed on as a NumPy scalar.
    'is_unbalance': bool(is_imbalanced),
}

# Per-model overrides. The learners differ in tree size, regularisation,
# sub-sampling, boosting type and seed.
_per_model_lgb_params = [
    {
        'boosting_type': 'gbdt', 'learning_rate': 0.01, 'num_leaves': 127,
        'max_depth': 10, 'min_data_in_leaf': 15, 'feature_fraction': 0.8,
        'bagging_fraction': 0.8, 'bagging_freq': 5, 'lambda_l1': 0.5,
        'lambda_l2': 0.5, 'seed': 42,
    },
    {
        'boosting_type': 'gbdt', 'learning_rate': 0.008, 'num_leaves': 95,
        'max_depth': 12, 'min_data_in_leaf': 12, 'feature_fraction': 0.75,
        'bagging_fraction': 0.75, 'bagging_freq': 4, 'lambda_l1': 0.3,
        'lambda_l2': 0.7, 'seed': 123,
    },
    {
        'boosting_type': 'dart', 'learning_rate': 0.015, 'num_leaves': 80,
        'max_depth': 9, 'min_data_in_leaf': 18, 'feature_fraction': 0.85,
        # bagging_fraction only takes effect when bagging_freq is non-zero;
        # without the frequency below, the 0.85 row sub-sampling was
        # silently disabled for this model.
        'bagging_fraction': 0.85, 'bagging_freq': 3,
        'drop_rate': 0.1, 'skip_drop': 0.5,
        'seed': 456,
    },
    {
        'boosting_type': 'gbdt', 'learning_rate': 0.012, 'num_leaves': 110,
        'max_depth': 11, 'min_data_in_leaf': 10, 'feature_fraction': 0.9,
        'bagging_fraction': 0.9, 'bagging_freq': 2, 'lambda_l1': 0.1,
        'lambda_l2': 0.3, 'seed': 789,
    },
    {
        'boosting_type': 'gbdt', 'learning_rate': 0.02, 'num_leaves': 63,
        'max_depth': 8, 'min_data_in_leaf': 20, 'feature_fraction': 0.7,
        'bagging_fraction': 0.7, 'bagging_freq': 6, 'lambda_l1': 0.8,
        'lambda_l2': 1.0, 'seed': 2024,
    },
]

# One complete, independent parameter dict per base model.
base_model_configs = [
    {**_shared_lgb_params, **overrides} for overrides in _per_model_lgb_params
]

# -----------------------------
# 5. Generate Base Model Predictions (Stacking)
# -----------------------------
print("\n[5/11] Training base models and generating meta-features...")

n_folds = 10
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Containers indexed as [sample, base model, class]. Training rows are filled
# with out-of-fold probabilities (used to train the meta-learner); test rows
# are filled with the fold-averaged probabilities of each base model.
oof_predictions = np.zeros((len(X_final), len(base_model_configs), 4))
test_predictions = np.zeros((len(test_final), len(base_model_configs), 4))


def _fit_fold_and_predict(lgb_params, fit_rows, holdout_rows):
    """Train one booster on a fold; return (hold-out probs, test probs).

    The hold-out rows serve as the early-stopping validation set, and both
    predictions use the booster's recorded best iteration.
    """
    holdout_X = X_final[holdout_rows]
    fit_set = lgb.Dataset(X_final[fit_rows], label=y.iloc[fit_rows])
    holdout_set = lgb.Dataset(holdout_X, label=y.iloc[holdout_rows], reference=fit_set)

    booster = lgb.train(
        lgb_params,
        fit_set,
        num_boost_round=3000,
        valid_sets=[holdout_set],
        callbacks=[early_stopping(stopping_rounds=150), log_evaluation(period=0)],
    )

    stop_at = booster.best_iteration
    holdout_probs = booster.predict(holdout_X, num_iteration=stop_at)
    test_probs = booster.predict(test_final, num_iteration=stop_at)
    return holdout_probs, test_probs


total_base_models = len(base_model_configs)
for model_idx, params in enumerate(base_model_configs):
    print(f"\n--- Base Model {model_idx + 1}/{total_base_models} ---")

    fold_test_preds = []
    for fit_rows, holdout_rows in skf.split(X_final, y):
        holdout_probs, test_probs = _fit_fold_and_predict(params, fit_rows, holdout_rows)
        # Each training row is predicted exactly once, by the booster that
        # did not see it during fitting.
        oof_predictions[holdout_rows, model_idx, :] = holdout_probs
        fold_test_preds.append(test_probs)

    # The test prediction of a base model is the mean over its fold boosters.
    test_predictions[:, model_idx, :] = np.mean(fold_test_preds, axis=0)

    # Report how well this base model does on its own out-of-fold rows.
    base_pred_labels = oof_predictions[:, model_idx, :].argmax(axis=1)
    base_accuracy = accuracy_score(y, base_pred_labels)
    print(f"  Base Model {model_idx + 1} OOF Accuracy: {base_accuracy:.4f}")

# -----------------------------
# 6. Neural Network Meta-Learner Architecture
# -----------------------------
print("\n[6/11] Defining neural network meta-learner...")

class MetaLearner(nn.Module):
    """Feed-forward classifier that maps base-model outputs to class logits.

    The network is a plain stack of four hidden stages
    (Linear -> BatchNorm1d -> ReLU -> Dropout) with widths 256, 128, 64 and
    32, followed by a linear output layer. There are no skip/residual
    connections.

    Args:
        n_base_models: number of base models feeding the network.
        n_classes: number of classes; also the width of the output layer.
        dropout_rate: dropout probability used after every hidden stage.
    """

    def __init__(self, n_base_models, n_classes, dropout_rate=0.3):
        super().__init__()

        # The input is the concatenation of every base model's per-class
        # scores, hence n_base_models * n_classes features.
        flat_width = n_base_models * n_classes

        # Hidden stage 1
        self.fc1 = nn.Linear(flat_width, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(dropout_rate)

        # Hidden stage 2
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Hidden stage 3
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.dropout3 = nn.Dropout(dropout_rate)

        # Hidden stage 4
        self.fc4 = nn.Linear(64, 32)
        self.bn4 = nn.BatchNorm1d(32)
        self.dropout4 = nn.Dropout(dropout_rate)

        # Output projection to one logit per class.
        self.fc_out = nn.Linear(32, n_classes)

        self.relu = nn.ReLU()
        # Not applied in forward(): the network returns raw logits.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return raw class logits of shape [batch_size, n_classes].

        Everything after the batch dimension of ``x`` is flattened, so both
        [batch_size, n_base_models, n_classes] and an already flat
        [batch_size, n_base_models * n_classes] tensor are accepted.
        """
        hidden = x.view(x.size(0), -1)

        stages = (
            (self.fc1, self.bn1, self.dropout1),
            (self.fc2, self.bn2, self.dropout2),
            (self.fc3, self.bn3, self.dropout3),
            (self.fc4, self.bn4, self.dropout4),
        )
        for linear, batch_norm, dropout in stages:
            hidden = dropout(self.relu(batch_norm(linear(hidden))))

        return self.fc_out(hidden)

# -----------------------------
# 7. Train Meta-Learner
# -----------------------------
print("\n[7/11] Training neural network meta-learner...")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Prepare meta-features: one row per training sample holding the flattened
# out-of-fold class probabilities of every base model.
X_meta = oof_predictions.reshape(len(X_final), -1)
y_meta = y.values

n_meta_inputs = len(base_model_configs)     # base models feeding the net
n_meta_classes = oof_predictions.shape[2]   # classes per base model
meta_batch_size = 64

# Split for meta-learner validation; one network is trained per fold.
meta_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
meta_models = []
meta_val_scores = []

for meta_fold, (meta_train_idx, meta_val_idx) in enumerate(meta_skf.split(X_meta, y_meta)):
    print(f"\n  Meta-fold {meta_fold + 1}/5")

    X_meta_train, X_meta_val = X_meta[meta_train_idx], X_meta[meta_val_idx]
    y_meta_train, y_meta_val = y_meta[meta_train_idx], y_meta[meta_val_idx]

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_meta_train)
    y_train_tensor = torch.LongTensor(y_meta_train)
    X_val_tensor = torch.FloatTensor(X_meta_val)
    # The validation inputs never change, so move and reshape them once
    # instead of on every epoch.
    X_val_device = X_val_tensor.to(device).view(-1, n_meta_inputs, n_meta_classes)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    # BatchNorm1d cannot normalise a batch of one sample in training mode.
    # Drop the trailing batch only in the case where it would hold exactly
    # one row; otherwise every row is used, as before.
    train_loader = DataLoader(
        train_dataset,
        batch_size=meta_batch_size,
        shuffle=True,
        drop_last=(len(train_dataset) % meta_batch_size == 1),
    )

    # Initialize model
    meta_model = MetaLearner(n_base_models=n_meta_inputs, n_classes=n_meta_classes, dropout_rate=0.3)
    meta_model = meta_model.to(device)

    # Loss and optimizer. The scheduler halves the learning rate when the
    # validation accuracy has not improved for 10 epochs. The deprecated
    # `verbose` argument is omitted; False was its default.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(meta_model.parameters(), lr=0.001, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10)

    # Training loop with early stopping on validation accuracy.
    # Start below any reachable accuracy so the first epoch always records
    # a snapshot and best_model_state is never left undefined.
    best_val_acc = -1.0
    best_model_state = None
    patience_counter = 0
    max_patience = 30

    for epoch in range(200):
        meta_model.train()
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            batch_X = batch_X.view(batch_X.size(0), n_meta_inputs, n_meta_classes)

            optimizer.zero_grad()
            outputs = meta_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

        # Validation
        meta_model.eval()
        with torch.no_grad():
            val_outputs = meta_model(X_val_device)
            val_preds = torch.argmax(val_outputs, dim=1).cpu().numpy()
            val_acc = accuracy_score(y_meta_val, val_preds)

        scheduler.step(val_acc)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            # state_dict() hands back references to the live parameter and
            # buffer tensors, and dict.copy() is shallow. Clone every tensor
            # so later optimizer steps cannot overwrite the saved snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in meta_model.state_dict().items()
            }
        else:
            patience_counter += 1

        if patience_counter >= max_patience:
            break

    # Load best model (the weights from the epoch with the highest
    # validation accuracy, not the weights of the final epoch).
    meta_model.load_state_dict(best_model_state)
    meta_models.append(meta_model)
    meta_val_scores.append(best_val_acc)

    print(f"    Best validation accuracy: {best_val_acc:.4f}")

avg_meta_score = np.mean(meta_val_scores)
print(f"\n  Average meta-learner CV accuracy: {avg_meta_score:.4f} ± {np.std(meta_val_scores):.4f}")

# -----------------------------
# 8. Evaluate Meta-Learner on Full OOF
# -----------------------------
print("\n[8/11] Evaluating meta-learner on full out-of-fold predictions...")

# Score each row with the meta-model whose training fold did NOT contain it.
# Scoring all rows with a single fold model would mostly measure accuracy on
# that model's own training rows. meta_skf has a fixed integer random_state,
# so calling split() again reproduces the folds used during training.
# The held-out rows were still used for early stopping and model selection,
# so this estimate remains mildly optimistic.
if len(meta_models) != meta_skf.get_n_splits():
    raise RuntimeError(
        f"Expected {meta_skf.get_n_splits()} meta-models (one per fold), "
        f"found {len(meta_models)}"
    )

oof_meta_preds = np.empty(len(X_meta), dtype=np.int64)
with torch.no_grad():
    for fold_model, (_, fold_val_idx) in zip(meta_models, meta_skf.split(X_meta, y_meta)):
        fold_model.eval()
        fold_inputs = (
            torch.FloatTensor(X_meta[fold_val_idx])
            .to(device)
            .view(-1, len(base_model_configs), 4)
        )
        fold_logits = fold_model(fold_inputs)
        oof_meta_preds[fold_val_idx] = torch.argmax(fold_logits, dim=1).cpu().numpy()

meta_accuracy = accuracy_score(y, oof_meta_preds)
print(f"Meta-learner OOF Accuracy: {meta_accuracy:.4f}")

print("\nMeta-learner Classification Report:")
print(classification_report(y, oof_meta_preds,
                            target_names=[f'Class {i+1}' for i in range(4)],
                            digits=4))

# Compare with simple averaging: mean of the base models' out-of-fold
# probabilities, then the most probable class.
avg_oof_preds = np.mean(oof_predictions, axis=1)
avg_pred_labels = np.argmax(avg_oof_preds, axis=1)
avg_accuracy = accuracy_score(y, avg_pred_labels)
print(f"\nSimple averaging baseline: {avg_accuracy:.4f}")
print(f"Meta-learner improvement: {meta_accuracy - avg_accuracy:+.4f}")

# -----------------------------
# 9. Meta-Learner Predictions on Test Set
# -----------------------------
print("\n[9/11] Generating meta-learner predictions on test set...")

# Test meta-features: the fold-averaged base-model probabilities, flattened
# to one row per test sample.
X_test_meta = test_predictions.reshape(len(test_final), -1)

# Run every fold's meta-model on the same test tensor and collect the
# softmax probabilities of each one.
test_meta_probs = []
with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test_meta).to(device).view(-1, len(base_model_configs), 4)
    for ensemble_member in meta_models:
        ensemble_member.eval()
        member_logits = ensemble_member(X_test_tensor)
        test_meta_probs.append(torch.softmax(member_logits, dim=1).cpu().numpy())

# The final probability is the plain mean over the meta-models; labels are
# shifted back to start at 1 (undoing the zero-based shift used for training).
final_test_probs = np.mean(test_meta_probs, axis=0)
final_test_labels = final_test_probs.argmax(axis=1) + 1

# -----------------------------
# 10. Analysis and Diagnostics
# -----------------------------
print("\n[10/11] Prediction analysis...")

# Confidence of a prediction = probability assigned to its winning class.
prediction_confidence = final_test_probs.max(axis=1)
confidence_mean = prediction_confidence.mean()
confidence_std = prediction_confidence.std()
n_high_confidence = (prediction_confidence > 0.9).sum()
n_low_confidence = (prediction_confidence < 0.6).sum()

print(f"\nPrediction confidence: {confidence_mean:.4f} ± {confidence_std:.4f}")
print(f"High confidence (>0.9): {n_high_confidence} samples")
print(f"Low confidence (<0.6): {n_low_confidence} samples")

# How many test samples were assigned to each class, ordered by label.
print("\nPredicted class distribution:")
pred_counts = pd.Series(final_test_labels).value_counts().sort_index()
n_test_predictions = len(final_test_labels)
for cls, count in pred_counts.items():
    print(f"  Class {int(cls)}: {count} ({100*count/n_test_predictions:.2f}%)")

# -----------------------------
# 11. Save Results
# -----------------------------
print("\n[11/11] Saving results...")

# Main output: four class probabilities followed by the predicted label,
# tab-separated, one test sample per line.
probability_format = '\t'.join(['%.6f'] * 4)
output = np.column_stack([final_test_probs, final_test_labels])
np.savetxt(
    'testLabel_metalearner.txt',
    output,
    fmt=probability_format + '\t%d',
    delimiter='\t',
)

# Save detailed diagnostics: predicted label and its confidence, preceded by
# a header line written without a comment prefix.
diagnostics = np.column_stack([final_test_labels, prediction_confidence])
np.savetxt(
    'testLabel_diagnostics.txt',
    diagnostics,
    fmt='%d\t%.6f',
    header='predicted_label\tconfidence',
    comments='',
)

# Closing summary.
rule = "=" * 80
meta_gain = meta_accuracy - avg_accuracy
print("\n" + rule)
print("✓ COMPLETED")
print(rule)
print("\nResults saved to 'testLabel_metalearner.txt'")
print(f"Expected accuracy: {meta_accuracy:.4f}")
print(f"Improvement over simple averaging: {meta_gain:+.4f}")
print(rule)