In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from lightgbm.callback import early_stopping, log_evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Banner shown once at the start of the run.
banner_rule = "=" * 80
print(banner_rule)
print("ADVANCED LIGHTGBM MULTI-CLASS CLASSIFICATION")
print(banner_rule)

# -----------------------------
# 1. Load data
# -----------------------------
print("\n[1/10] Loading data...")

# All three files are read without a header row, so feature columns are
# identified by position only.
X = pd.read_csv('trainingData.txt', header=None)
test_data = pd.read_csv('testData.txt', header=None)

# The truth file is read as a one-column frame named 'label' and then
# collapsed with squeeze() so that `y` can be used as a Series.
truth_frame = pd.read_csv('trainingTruth.txt', header=None, names=['label'])
y = truth_frame.squeeze()

print(f"Training data shape: {X.shape}")
print(f"Test data shape: {test_data.shape}")

# -----------------------------
# 2. Advanced Data Preprocessing
# -----------------------------
print("\n[2/10] Advanced preprocessing...")

# Force every column to a numeric dtype. Values that cannot be parsed
# (including empty strings) become NaN and are imputed in a later step.
X = X.replace('', np.nan)
X = X.apply(pd.to_numeric, errors='coerce')
test_data = test_data.apply(pd.to_numeric, errors='coerce')

# Keep only the rows that have a label, and renumber both objects from zero
# so positional and label-based indexing agree afterwards.
valid_mask = y.notna()
X = X[valid_mask].reset_index(drop=True)
y = y[valid_mask].reset_index(drop=True)

n_samples, n_features = X.shape
print(f"After cleaning: {n_samples} samples, {n_features} features")

# Report how many samples each class has, in ascending label order.
print("\nClass distribution:")
class_counts = y.value_counts().sort_index()
for cls, count in class_counts.items():
    print(f"  Class {int(cls)}: {count} ({100*count/len(y):.2f}%)")

# Flag the data as imbalanced when the largest class is more than 1.5x the
# size of the smallest one.
imbalance_ratio = class_counts.max() / class_counts.min()
is_imbalanced = imbalance_ratio > 1.5

# Shift labels down by one so they start at zero (labels appear to be 1-based
# in the truth file; the prediction step adds the 1 back).
y = y - 1  # Zero-based

# -----------------------------
# 3. Smart Imputation with Multiple Strategies
# -----------------------------
print("\n[3/10] Multi-strategy imputation...")

# Percentage of missing values per feature, bucketed into "mostly missing"
# (>50%) and "partly missing" (>0% and <=50%).
missing_pct = X.isna().sum() / len(X) * 100
mostly_missing = missing_pct > 50
partly_missing = (missing_pct > 0) & ~mostly_missing
high_missing = missing_pct.index[mostly_missing.to_numpy()].tolist()
low_missing = missing_pct.index[partly_missing.to_numpy()].tolist()

print(f"Features with >50% missing: {len(high_missing)}")
print(f"Features with 0-50% missing: {len(low_missing)}")

# One imputer per strategy. Each is fitted on the training features only and
# then reused, unchanged, on the test features.
imputers = {
    strategy: SimpleImputer(strategy=strategy)
    for strategy in ('median', 'mean')
}

X_imputed_median = imputers['median'].fit_transform(X)
test_imputed_median = imputers['median'].transform(test_data)

# NOTE(review): the mean-imputed arrays are not referenced anywhere else in
# the visible part of this script.
X_imputed_mean = imputers['mean'].fit_transform(X)
test_imputed_mean = imputers['mean'].transform(test_data)

# Binary "was this value missing?" indicators (can be informative). Only
# columns whose training missingness exceeds 5% are kept, and the same column
# mask is applied to the test indicators.
important_missing_cols = (missing_pct > 5).values
missing_indicators = X.isna().astype(int).values[:, important_missing_cols]
test_missing_indicators = test_data.isna().astype(int).values[:, important_missing_cols]

print(f"Added {missing_indicators.shape[1]} missingness indicators")

# -----------------------------
# 4. Advanced Feature Engineering
# -----------------------------
print("\n[4/10] Feature engineering...")

# Work on copies of the median-imputed matrices so the imputed originals stay
# untouched.
X_base = X_imputed_median.copy()
test_base = test_imputed_median.copy()


def _row_statistics(data):
    """Return one column per summary statistic, computed across each row.

    Column order: mean, std, median, min, max, range, skewness, kurtosis,
    25th percentile, 75th percentile.
    """
    row_min = np.min(data, axis=1)
    row_max = np.max(data, axis=1)
    columns = (
        np.mean(data, axis=1),            # Mean
        np.std(data, axis=1),             # Std
        np.median(data, axis=1),          # Median
        row_min,                          # Min
        row_max,                          # Max
        row_max - row_min,                # Range
        stats.skew(data, axis=1),         # Skewness
        stats.kurtosis(data, axis=1),     # Kurtosis
        np.percentile(data, 25, axis=1),  # Q1
        np.percentile(data, 75, axis=1),  # Q3
    )
    return np.column_stack(columns)


# Statistical features per row, built the same way for train and test.
row_features = _row_statistics(X_base)
test_row_features = _row_statistics(test_base)

print(f"Created {row_features.shape[1]} statistical features")

# -----------------------------
# 5. Feature Selection and Transformation
# -----------------------------
print("\n[5/10] Feature selection and transformation...")

# Score every median-imputed feature by its mutual information with the label.
# NOTE(review): the scores are computed from the labels of ALL training rows,
# including rows that the cross-validation section later holds out as
# validation folds. The reported CV scores are therefore likely optimistic;
# moving the selection inside the fold loop would remove that leakage.
mi_scores = mutual_info_classif(X_base, y, random_state=42, n_neighbors=5)
mi_threshold = np.percentile(mi_scores, 25)  # Keep top 75%
selected_features = mi_scores > mi_threshold
if not selected_features.any():
    # When every score ties with the threshold the strict comparison keeps
    # nothing, and the transformers below cannot fit on zero columns. Fall
    # back to keeping the tied features instead.
    selected_features = mi_scores >= mi_threshold

print(f"Selected {selected_features.sum()} features based on mutual information")

X_selected = X_base[:, selected_features]
test_selected = test_base[:, selected_features]

# Map each selected feature onto a normal distribution. The transformer is
# fitted on the training rows and reused on the test rows.
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=42)
X_quantile = quantile_transformer.fit_transform(X_selected)
test_quantile = quantile_transformer.transform(test_selected)

# Create PCA features for additional representation. PCA rejects a component
# count larger than min(n_samples, n_features), so cap the requested 50 by the
# shape of the matrix it is fitted on.
n_pca_components = min(50, *X_quantile.shape)
pca = PCA(n_components=n_pca_components, random_state=42)
X_pca = pca.fit_transform(X_quantile)
test_pca = pca.transform(test_quantile)

print(f"PCA explained variance: {pca.explained_variance_ratio_.sum():.4f}")

# Combine all feature sets. The train and test stacks must list their parts
# in the same order so that columns line up.
X_final = np.hstack([
    X_selected,           # Selected original features
    X_quantile,           # Quantile-transformed features
    X_pca,                # PCA features
    row_features,         # Statistical features
    missing_indicators    # Missingness indicators
])

test_final = np.hstack([
    test_selected,
    test_quantile,
    test_pca,
    test_row_features,
    test_missing_indicators
])

print(f"Final feature count: {X_final.shape[1]}")

# -----------------------------
# 6. Optimized Hyperparameters
# -----------------------------
print("\n[6/10] Configuring optimized hyperparameters...")

# Settings that are identical in every candidate configuration.
# NOTE(review): LightGBM's parameter docs describe `is_unbalance` as applying
# to the binary and multiclassova objectives; confirm whether it has any
# effect with 'multiclass' before relying on it.
_shared_params = {
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_logloss',
    'verbose': -1,
    'is_unbalance': is_imbalanced,
}

# Per-configuration settings. Each entry is merged over the shared settings
# to form one complete parameter dict.
_config_overrides = [
    {
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 127,
        'max_depth': 10,
        'min_data_in_leaf': 15,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'lambda_l1': 0.5,
        'lambda_l2': 0.5,
        'min_gain_to_split': 0.001,
        'path_smooth': 1.0,
        'seed': 42,
    },
    {
        'boosting_type': 'gbdt',
        'learning_rate': 0.008,
        'num_leaves': 95,
        'max_depth': 12,
        'min_data_in_leaf': 12,
        'feature_fraction': 0.75,
        'bagging_fraction': 0.75,
        'bagging_freq': 4,
        'lambda_l1': 0.3,
        'lambda_l2': 0.7,
        'min_gain_to_split': 0.001,
        'path_smooth': 0.5,
        'seed': 123,
    },
    {
        # The only non-gbdt candidate: it sets the dart drop settings and
        # leaves min_gain_to_split / path_smooth unset.
        'boosting_type': 'dart',
        'learning_rate': 0.015,
        'num_leaves': 80,
        'max_depth': 9,
        'min_data_in_leaf': 18,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 3,
        'lambda_l1': 0.2,
        'lambda_l2': 0.5,
        'drop_rate': 0.1,
        'skip_drop': 0.5,
        'seed': 456,
    },
    {
        'boosting_type': 'gbdt',
        'learning_rate': 0.012,
        'num_leaves': 110,
        'max_depth': 11,
        'min_data_in_leaf': 10,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 2,
        'lambda_l1': 0.1,
        'lambda_l2': 0.3,
        'min_gain_to_split': 0.0005,
        'path_smooth': 1.5,
        'seed': 789,
    },
    {
        'boosting_type': 'gbdt',
        'learning_rate': 0.02,
        'num_leaves': 63,
        'max_depth': 8,
        'min_data_in_leaf': 20,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 6,
        'lambda_l1': 0.8,
        'lambda_l2': 1.0,
        'min_gain_to_split': 0.002,
        'path_smooth': 0.8,
        'seed': 2024,
    },
]

# Candidate configurations, in the order they are evaluated.
param_configs = [
    {**_shared_params, **overrides} for overrides in _config_overrides
]

# -----------------------------
# 7. Cross-Validation with Stratified Folds
# -----------------------------
print("\n[7/10] Cross-validation training...")

n_folds = 10  # More folds for better CV estimate
# Fixed random_state, so repeated calls to skf.split() yield the same folds.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)


def _fit_fold(params, train_idx, val_idx):
    """Train one booster on the training rows and score it on the held-out rows.

    Returns a ``(booster, accuracy)`` pair, where accuracy is measured on the
    validation rows using the booster's recorded best iteration.
    """
    val_features = X_final[val_idx]
    val_labels = y.iloc[val_idx]

    train_dataset = lgb.Dataset(X_final[train_idx], label=y.iloc[train_idx])
    valid_dataset = lgb.Dataset(val_features, label=val_labels, reference=train_dataset)

    # NOTE(review): one candidate uses 'dart' boosting; confirm the
    # early-stopping callback is active (and best_iteration meaningful) there.
    booster = lgb.train(
        params,
        train_dataset,
        num_boost_round=3000,
        valid_sets=[valid_dataset],
        callbacks=[
            early_stopping(stopping_rounds=150),
            log_evaluation(period=0),
        ],
    )

    probabilities = booster.predict(val_features, num_iteration=booster.best_iteration)
    predicted_labels = np.argmax(probabilities, axis=1)
    return booster, accuracy_score(val_labels, predicted_labels)


# One (mean accuracy, std accuracy, config index, fold models) tuple per
# candidate configuration.
all_config_scores = []

for config_idx, params in enumerate(param_configs):
    print(f"\n--- Configuration {config_idx + 1}/{len(param_configs)} ---")

    fold_results = [
        _fit_fold(params, train_idx, val_idx)
        for train_idx, val_idx in skf.split(X_final, y)
    ]
    fold_models = [booster for booster, _ in fold_results]
    fold_scores = [score for _, score in fold_results]

    avg_score = np.mean(fold_scores)
    std_score = np.std(fold_scores)
    print(f"  → CV Score: {avg_score:.4f} ± {std_score:.4f}")

    all_config_scores.append((avg_score, std_score, config_idx, fold_models))

# Select best configuration: highest mean fold accuracy, earliest one on ties.
best_score, best_std, best_config_idx, best_fold_models = max(
    all_config_scores, key=lambda entry: entry[0]
)
best_params = param_configs[best_config_idx]

print(f"\n✓ Best configuration: Config {best_config_idx + 1}")
print(f"  CV Score: {best_score:.4f} ± {best_std:.4f}")

# -----------------------------
# 8. Validation Metrics
# -----------------------------
print("\n[8/10] Computing validation metrics...")

# Regenerate the folds and pair each validation split with the model of the
# best configuration stored at the same position, then collect that model's
# predictions for its split.
fold_probabilities = []
fold_targets = []

for fold_model, (_, val_idx) in zip(best_fold_models, skf.split(X_final, y)):
    fold_probabilities.append(
        fold_model.predict(X_final[val_idx], num_iteration=fold_model.best_iteration)
    )
    fold_targets.append(y.iloc[val_idx].values)

# Rows are in fold order, not in the original sample order.
all_val_preds = np.vstack(fold_probabilities)
all_val_true = np.concatenate(fold_targets)
val_pred_labels = np.argmax(all_val_preds, axis=1)

overall_accuracy = accuracy_score(all_val_true, val_pred_labels)
print(f"\nOverall Validation Accuracy: {overall_accuracy:.4f}")

# One-vs-rest AUC per class, using that class's predicted probability column.
print("\nClass-wise AUC scores:")
for class_idx in range(4):
    is_this_class = (all_val_true == class_idx).astype(int)
    auc = roc_auc_score(is_this_class, all_val_preds[:, class_idx])
    print(f"  Class {class_idx+1} AUC: {auc:.4f}")

print("\nClassification Report:")
class_names = [f'Class {class_idx+1}' for class_idx in range(4)]
print(classification_report(all_val_true, val_pred_labels,
                            target_names=class_names,
                            digits=4))

# -----------------------------
# 9. Diverse Ensemble Prediction
# -----------------------------
print("\n[9/10] Generating diverse ensemble predictions...")

# Class-probability matrices for the test set, one per ensemble member.
ensemble_test_preds = []

# Strategy 1: Use all CV fold models
for fold_idx, model in enumerate(best_fold_models):
    test_pred = model.predict(test_final, num_iteration=model.best_iteration)
    ensemble_test_preds.append(test_pred)
    print(f"  CV fold model {fold_idx + 1}/{len(best_fold_models)}")

# Strategy 2: Train additional models with different seeds
additional_seeds = [2025, 3141, 9876, 5555, 7777]

# Number of boosting rounds for the full-data models: the mean round count of
# the fold models. best_iteration stays 0 when early stopping never ran
# (LightGBM disables it for 'dart' boosting), which would otherwise request
# zero rounds here. In that case use the number of rounds the fold model
# actually trained, and never go below one round.
fold_iterations = [
    m.best_iteration or m.current_iteration() for m in best_fold_models
]
avg_best_iter = max(1, int(np.mean(fold_iterations)))

for seed in additional_seeds:
    # Same hyperparameters as the winning configuration, different seed only.
    params_with_seed = best_params.copy()
    params_with_seed['seed'] = seed

    # No validation set here: these models train on every labelled row for a
    # fixed number of rounds.
    full_train = lgb.Dataset(X_final, label=y)
    model = lgb.train(params_with_seed, full_train, num_boost_round=avg_best_iter)

    test_pred = model.predict(test_final)
    ensemble_test_preds.append(test_pred)

print(f"  Total ensemble models: {len(ensemble_test_preds)}")

# Weighted ensemble (give more weight to CV models). The weight order must
# match the order in which predictions were appended above.
weights = [1.2] * len(best_fold_models) + [1.0] * len(additional_seeds)
weights = np.array(weights) / sum(weights)

test_pred_final = np.average(ensemble_test_preds, axis=0, weights=weights)
# Convert the zero-based argmax to a label by adding one.
test_labels = np.argmax(test_pred_final, axis=1) + 1

# Prediction statistics: confidence is the highest averaged class probability
# of each test row.
prediction_confidence = np.max(test_pred_final, axis=1)
print(f"\nPrediction confidence: {prediction_confidence.mean():.4f} ± {prediction_confidence.std():.4f}")

print("\nPredicted class distribution:")
pred_counts = pd.Series(test_labels).value_counts().sort_index()
for cls in pred_counts.index:
    print(f"  Class {int(cls)}: {pred_counts[cls]} ({100*pred_counts[cls]/len(test_labels):.2f}%)")

# -----------------------------
# 10. Save Results
# -----------------------------
print("\n[10/10] Saving results...")

# One row per test sample: the averaged class probabilities followed by the
# predicted label. The format string spells out every column (four floats and
# one integer), tab-separated.
output = np.column_stack([test_pred_final, test_labels])
np.savetxt(
    'testLabel_lightgbm_improved.txt',
    output,
    fmt='%.6f\t%.6f\t%.6f\t%.6f\t%d',
    delimiter='\t',
)

# Closing summary. The "expected accuracy" is the cross-validation score of
# the selected configuration.
closing_rule = "=" * 80
print("\n" + closing_rule)
print("✓ COMPLETED")
print(closing_rule)
print(f"\nResults saved to 'testLabel_lightgbm_improved.txt'")
print(f"Expected accuracy: {best_score:.4f} ± {best_std:.4f}")
print(closing_rule)

ADVANCED LIGHTGBM MULTI-CLASS CLASSIFICATION

[1/10] Loading data...
Training data shape: (27617, 411)
Test data shape: (13082, 411)

[2/10] Advanced preprocessing...
After cleaning: 27617 samples, 411 features

Class distribution:
  Class 1: 8874 (32.13%)
  Class 2: 6127 (22.19%)
  Class 3: 8483 (30.72%)
  Class 4: 4133 (14.97%)

[3/10] Multi-strategy imputation...
Features with >50% missing: 0
Features with 0-50% missing: 1
Added 0 missingness indicators

[4/10] Feature engineering...
Created 10 statistical features

[5/10] Feature selection and transformation...
