In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Opening banner for the run.
banner_rule = "=" * 80
print(banner_rule)
print("RANDOM FOREST MULTI-CLASS CLASSIFICATION (NOISE-ROBUST)")
print(banner_rule)

# -----------------------------
# 1. Load data
# -----------------------------
# All three inputs are read as header-less CSV files from the working
# directory. The truth file is read as a single column named 'label' and
# then squeezed, so y is no longer a one-column frame.
print("\n[1/8] Loading data...")
X = pd.read_csv('trainingData.txt', header=None)
labels_frame = pd.read_csv('trainingTruth.txt', header=None, names=['label'])
y = labels_frame.squeeze()
test_data = pd.read_csv('testData.txt', header=None)

for caption, frame in (("Training data", X), ("Test data", test_data)):
    print(f"{caption} shape: {frame.shape}")

# -----------------------------
# 2. Data Preprocessing
# -----------------------------
print("\n[2/8] Preprocessing data...")

# Force every column to a numeric dtype; anything that cannot be parsed
# as a number is coerced to NaN so the imputer can deal with it later.
X = X.replace('', np.nan).apply(pd.to_numeric, errors='coerce')
test_data = test_data.apply(pd.to_numeric, errors='coerce')

# Keep only the training rows that actually carry a label, and renumber
# the surviving rows from zero in both X and y.
valid_mask = y.notna()
X = X.loc[valid_mask].reset_index(drop=True)
y = y.loc[valid_mask].reset_index(drop=True)

n_samples, n_features = X.shape
print(f"After cleaning: {n_samples} samples, {n_features} features")

# Per-feature share of missing entries, expressed in percent.
missing_percentage = (X.isna().sum() / len(X)) * 100
features_with_missing = (missing_percentage > 0).sum()
print(f"Features with missing values: {features_with_missing}/{n_features}")
if features_with_missing > 0:
    # The mean is taken over the affected features only.
    affected = missing_percentage[missing_percentage > 0]
    print(f"  Max missing %: {missing_percentage.max():.2f}%")
    print(f"  Mean missing %: {affected.mean():.2f}%")

# Per-class sample counts, ordered by class label.
print("\nClass distribution:")
class_counts = y.value_counts().sort_index()
for cls, count in class_counts.items():
    print(f"  Class {int(cls)}: {count} samples ({100*count/len(y):.2f}%)")

# The data counts as imbalanced when the largest class is more than 1.5x
# the smallest one; in that case the forests get balanced class weights.
is_imbalanced = (class_counts.max() / class_counts.min()) > 1.5
class_weight = 'balanced' if is_imbalanced else None
if is_imbalanced:
    print("⚠️  Dataset appears imbalanced - will use balanced class weights")

# -----------------------------
# 3. Imputation
# -----------------------------
print("\n[3/8] Applying median imputation...")

# The per-feature medians are learned from the training features only;
# the same fitted imputer is then applied to both the training and the
# test features.
imputer = SimpleImputer(strategy='median')
imputer.fit(X)
X_imputed = imputer.transform(X)
test_imputed = imputer.transform(test_data)

n_imputed_features = X_imputed.shape[1]
print(f"Feature count: {n_imputed_features}")

# -----------------------------
# 4. Random Forest Hyperparameter Configurations
# -----------------------------
print("\n[4/8] Configuring Random Forest parameters (noise-robust)...")

# Intent behind the knobs that vary between candidates:
# - n_estimators: more trees average out more noise
# - max_depth: a depth cap keeps trees from fitting noise
# - min_samples_split / min_samples_leaf: larger values stop splits that
#   are supported by only a handful of (possibly noisy) samples
# - max_features: controls per-split randomness and tree decorrelation
# Bagging (bootstrap=True), the class weighting chosen during
# preprocessing, full parallelism and silent fitting are shared by all
# candidates.

# One row per candidate:
# (n_estimators, max_depth, min_samples_split, min_samples_leaf,
#  max_features, random_state)
_candidate_rows = [
    (500, 20, 20, 10, 'sqrt', 42),
    (500, 25, 15, 8, 'sqrt', 123),
    (600, 18, 25, 12, 'sqrt', 456),
    (400, 22, 18, 9, 'log2', 789),
]

rf_configs = [
    {
        'n_estimators': n_trees,
        'max_depth': depth,
        'min_samples_split': split_min,
        'min_samples_leaf': leaf_min,
        'max_features': feature_rule,
        'bootstrap': True,
        'class_weight': class_weight,
        'random_state': rng_seed,
        'n_jobs': -1,
        'verbose': 0,
    }
    for n_trees, depth, split_min, leaf_min, feature_rule, rng_seed in _candidate_rows
]

# -----------------------------
# 5. Cross-Validation Training
# -----------------------------
print("\n[5/8] Training with 5-fold cross-validation...")

n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# One entry per configuration:
# (mean accuracy, accuracy std, config index, fitted fold models, params)
all_config_scores = []

total_configs = len(rf_configs)
summary_keys = ('n_estimators', 'max_depth', 'min_samples_split', 'max_features')

for config_idx, params in enumerate(rf_configs):
    print(f"\n--- Configuration {config_idx + 1}/{total_configs} ---")
    print(", ".join(f"{key}: {params[key]}" for key in summary_keys))

    fold_scores = []
    fold_models = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_imputed, y)):
        # Fit on the training part of the fold ...
        model = RandomForestClassifier(**params)
        model.fit(X_imputed[train_idx], y.iloc[train_idx])

        # ... and score plain accuracy on the held-out part.
        accuracy = accuracy_score(y.iloc[val_idx], model.predict(X_imputed[val_idx]))

        fold_scores.append(accuracy)
        fold_models.append(model)

        print(f"  Fold {fold + 1}: Accuracy = {accuracy:.4f}")

    # Summarise the configuration by mean and spread of its fold accuracies.
    avg_score = np.mean(fold_scores)
    std_score = np.std(fold_scores)
    print(f"  → CV Score: {avg_score:.4f} ± {std_score:.4f}")

    all_config_scores.append((avg_score, std_score, config_idx, fold_models, params))

# The winner is the configuration with the highest mean fold accuracy;
# max() keeps the earliest entry when several share that value.
best_entry = max(all_config_scores, key=lambda entry: entry[0])
best_score, best_std, best_config_idx, best_fold_models, best_params = best_entry

print(f"\n✓ Best configuration: Config {best_config_idx + 1}")
print(f"  CV Score: {best_score:.4f} ± {best_std:.4f}")

# -----------------------------
# 6. Validation Metrics
# -----------------------------
print("\n[6/8] Detailed validation metrics...")

# Class labels in the column order used by predict_proba. Reading them from
# the fitted models avoids assuming "four classes labelled 1..4". Every fold
# model must agree on them, otherwise the stacked probability matrix would
# mix columns that stand for different classes.
class_labels = best_fold_models[0].classes_
for fold_model in best_fold_models[1:]:
    if not np.array_equal(fold_model.classes_, class_labels):
        raise ValueError("Fold models were fitted on different class sets; "
                         "per-class validation metrics would be misaligned")

all_val_preds = []
all_val_probs = []
all_val_true = []

# skf shuffles with a fixed integer random_state, so this second pass over
# split() is expected to reproduce the folds used during training: model k
# is scored on the validation rows of fold k.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_imputed, y)):
    X_val = X_imputed[val_idx]
    y_val = y.iloc[val_idx]

    model = best_fold_models[fold]
    y_val_pred = model.predict(X_val)
    y_val_probs = model.predict_proba(X_val)

    all_val_preds.extend(y_val_pred)
    all_val_probs.append(y_val_probs)
    all_val_true.extend(y_val.values)

# Pool the out-of-fold results of all folds into flat arrays.
all_val_probs = np.vstack(all_val_probs)
all_val_preds = np.array(all_val_preds)
all_val_true = np.array(all_val_true)

accuracy = accuracy_score(all_val_true, all_val_preds)

print(f"\nOverall Validation Accuracy: {accuracy:.4f}")

# One-vs-rest AUC per class: column `col` of the probability matrix is
# scored against the indicator "true label == class_labels[col]".
print("\nClass-wise AUC scores:")
auc_scores = []
for col, class_label in enumerate(class_labels):
    y_true_bin = (all_val_true == class_label).astype(int)
    auc = roc_auc_score(y_true_bin, all_val_probs[:, col])
    auc_scores.append(auc)
    print(f"  Class {int(class_label)} AUC: {auc:.4f}")

macro_auc = np.mean(auc_scores)
print(f"  Macro-average AUC: {macro_auc:.4f}")

# Passing labels explicitly keeps the display names aligned with the
# classes even if some class never shows up in the pooled predictions.
print("\nClassification Report:")
print(classification_report(all_val_true, all_val_preds,
                            labels=class_labels,
                            target_names=[f'Class {int(c)}' for c in class_labels],
                            digits=4))

# -----------------------------
# 7. Feature Importance
# -----------------------------
print("\n[7/8] Analyzing feature importance...")

# Impurity-based importances (feature_importances_) taken from the first
# fold model of the winning configuration only; the other folds are not
# consulted.
importance = best_fold_models[0].feature_importances_

# Rank features from most to least important, remembering their position.
importance_df = pd.DataFrame({
    'feature_idx': range(len(importance)),
    'importance': importance
}).sort_values('importance', ascending=False)

print("\nTop 20 most important features:")
top_features = importance_df.head(20)
for feature_pos, feature_score in zip(top_features['feature_idx'],
                                      top_features['importance']):
    print(f"  Feature {int(feature_pos)}: {feature_score:.6f}")

# Report how many features have an importance of exactly zero.
zero_importance = (importance == 0).sum()
if zero_importance > 0:
    print(f"\n⚠️  {zero_importance} features have zero importance (likely noise)")

# -----------------------------
# 8. Ensemble Prediction on Test Set
# -----------------------------
print("\n[8/8] Generating ensemble predictions on test set...")

# The winning configuration is refit on ALL training rows once per seed;
# only random_state differs between ensemble members.
ensemble_test_preds = []
seeds = [42, 123, 456, 789, 2024, 2025, 3141, 9876]

for seed_idx, seed in enumerate(seeds):
    print(f"  Training ensemble model {seed_idx + 1}/{len(seeds)} (seed={seed})...")

    params_with_seed = best_params.copy()
    params_with_seed['random_state'] = seed

    model = RandomForestClassifier(**params_with_seed)
    model.fit(X_imputed, y)

    test_probs = model.predict_proba(test_imputed)
    ensemble_test_preds.append(test_probs)

# Average the class probabilities over all ensemble members.
test_pred_final = np.mean(ensemble_test_preds, axis=0)

# Map the winning probability column back to its class label through
# classes_ rather than assuming labels are 1..K. All members were fitted on
# the same y, so the last model's classes_ describes every member's columns.
test_winning_col = np.argmax(test_pred_final, axis=1)
test_labels = model.classes_[test_winning_col]

# Confidence = averaged probability of the predicted class.
prediction_confidence = np.max(test_pred_final, axis=1)

# Agreement = fraction of members whose own top class matches the ensemble's.
# member_votes has shape (n_models, n_samples); one vectorized comparison
# replaces a per-sample, per-model Python loop.
member_votes = np.argmax(np.asarray(ensemble_test_preds), axis=2)
ensemble_agreement = (member_votes == test_winning_col).mean(axis=0)

print(f"\nTest prediction confidence statistics:")
print(f"  Mean: {prediction_confidence.mean():.4f}")
print(f"  Median: {np.median(prediction_confidence):.4f}")
print(f"  Min: {prediction_confidence.min():.4f}")
print(f"  Max: {prediction_confidence.max():.4f}")

print(f"\nTest ensemble agreement statistics:")
print(f"  Mean agreement: {ensemble_agreement.mean():.4f}")
print(f"  High agreement (>0.8): {(ensemble_agreement > 0.8).sum()} samples")
print(f"  Low agreement (<0.5): {(ensemble_agreement < 0.5).sum()} samples")

# Show class distribution
print("\nTest predicted class distribution:")
pred_counts = pd.Series(test_labels).value_counts().sort_index()
for cls in pred_counts.index:
    print(f"  Class {int(cls)}: {pred_counts[cls]} samples ({100*pred_counts[cls]/len(test_labels):.2f}%)")

# Compare the predicted class shares with the training class shares; a
# class that was never predicted counts as 0%.
print("\nClass distribution comparison (Train → Test):")
for cls in sorted(class_counts.index):
    train_pct = 100 * class_counts[cls] / len(y)
    test_pct = 100 * pred_counts.get(cls, 0) / len(test_labels)
    diff = test_pct - train_pct
    print(f"  Class {int(cls)}: {train_pct:.1f}% → {test_pct:.1f}% (Δ {diff:+.1f}%)")

# -----------------------------
# 9. Save Test Results
# -----------------------------
# Main output: one row per test sample, tab-separated, holding the averaged
# class probabilities followed by the predicted label. The column formats
# are derived from the probability matrix width, so the write does not fail
# when the number of classes is not four.
n_prob_cols = test_pred_final.shape[1]
output = np.column_stack([test_pred_final, test_labels])
np.savetxt('testLabel_rf.txt', output,
           fmt=['%.6f'] * n_prob_cols + ['%d'],
           delimiter='\t')

# Companion file: label, confidence and ensemble agreement per sample, with
# a header line written verbatim (comments='' suppresses the '# ' prefix).
np.savetxt('testLabel_rf_confidence.txt',
           np.column_stack([test_labels, prediction_confidence, ensemble_agreement]),
           fmt='%d\t%.6f\t%.6f',
           header='predicted_label\tconfidence\tensemble_agreement',
           comments='')

print(f"\n✓ Test predictions saved to 'testLabel_rf.txt'")
print(f"✓ Confidence metrics saved to 'testLabel_rf_confidence.txt'")

# -----------------------------
# 10. Blind Data Prediction
# -----------------------------
print("\n" + "="*80)
print("[9/9] Generating predictions for BLIND dataset...")
print("="*80)

# Best-effort step: a missing blind file, or any other failure while
# processing it, is reported and the script carries on.
try:
    blind_data = pd.read_csv('blindData.txt', header=None)
    print(f"\nBlind data shape: {blind_data.shape}")

    # Same numeric coercion and the same fitted imputer as the other inputs.
    blind_data = blind_data.apply(pd.to_numeric, errors='coerce')
    blind_imputed = imputer.transform(blind_data)

    print(f"Preprocessed blind data shape: {blind_imputed.shape}")
    print(f"\nGenerating ensemble predictions with {len(seeds)} models...")

    ensemble_blind_preds = []

    # NOTE(review): these fits use the same parameters, seeds and training
    # data as the test-set ensemble in section 8, so each model is trained a
    # second time here; keeping the fitted models from section 8 would avoid
    # that at the cost of holding them in memory.
    for seed_idx, seed in enumerate(seeds):
        print(f"  Model {seed_idx + 1}/{len(seeds)}...", end='\r')

        params_with_seed = best_params.copy()
        params_with_seed['random_state'] = seed

        model = RandomForestClassifier(**params_with_seed)
        model.fit(X_imputed, y)

        blind_probs = model.predict_proba(blind_imputed)
        ensemble_blind_preds.append(blind_probs)

    print(f"  Completed all {len(seeds)} models    ")

    # Average the class probabilities over all ensemble members.
    blind_pred_final = np.mean(ensemble_blind_preds, axis=0)

    # Map the winning column to its class label through classes_ instead of
    # assuming labels are 1..K (all members were fitted on the same y).
    blind_winning_col = np.argmax(blind_pred_final, axis=1)
    blind_labels = model.classes_[blind_winning_col]

    # Confidence = averaged probability of the predicted class; agreement =
    # fraction of members whose own top class matches the ensemble's,
    # computed in one vectorized comparison over (n_models, n_samples).
    blind_confidence = np.max(blind_pred_final, axis=1)
    blind_member_votes = np.argmax(np.asarray(ensemble_blind_preds), axis=2)
    blind_agreement = (blind_member_votes == blind_winning_col).mean(axis=0)

    print(f"\nBlind prediction confidence statistics:")
    print(f"  Mean: {blind_confidence.mean():.4f}")
    print(f"  Median: {np.median(blind_confidence):.4f}")
    print(f"  Min: {blind_confidence.min():.4f}")
    print(f"  Max: {blind_confidence.max():.4f}")

    print(f"\nBlind ensemble agreement statistics:")
    print(f"  Mean agreement: {blind_agreement.mean():.4f}")
    print(f"  High agreement (>0.8): {(blind_agreement > 0.8).sum()} samples")
    print(f"  Low agreement (<0.5): {(blind_agreement < 0.5).sum()} samples")

    print("\nBlind predicted class distribution:")
    blind_pred_counts = pd.Series(blind_labels).value_counts().sort_index()
    for cls in blind_pred_counts.index:
        print(f"  Class {int(cls)}: {blind_pred_counts[cls]} samples ({100*blind_pred_counts[cls]/len(blind_labels):.2f}%)")

    # Class shares side by side; a class never predicted counts as 0%.
    print("\nClass distribution comparison (Train → Test → Blind):")
    for cls in sorted(class_counts.index):
        train_pct = 100 * class_counts[cls] / len(y)
        test_pct = 100 * pred_counts.get(cls, 0) / len(test_labels)
        blind_pct = 100 * blind_pred_counts.get(cls, 0) / len(blind_labels)
        print(f"  Class {int(cls)}: {train_pct:.1f}% → {test_pct:.1f}% → {blind_pct:.1f}%")

    # Save predictions: probabilities followed by the label, tab-separated.
    # The column formats follow the probability matrix width rather than a
    # fixed four columns.
    blind_output = np.column_stack([blind_pred_final, blind_labels])
    np.savetxt('blindLabel_rf.txt', blind_output,
               fmt=['%.6f'] * blind_pred_final.shape[1] + ['%d'],
               delimiter='\t')

    np.savetxt('blindLabel_rf_confidence.txt',
               np.column_stack([blind_labels, blind_confidence, blind_agreement]),
               fmt='%d\t%.6f\t%.6f',
               header='predicted_label\tconfidence\tensemble_agreement',
               comments='')

    print(f"\n✓ Blind predictions saved to 'blindLabel_rf.txt'")
    print(f"✓ Blind confidence metrics saved to 'blindLabel_rf_confidence.txt'")
    print(f"  - {len(blind_labels)} predictions generated")

except FileNotFoundError:
    print("\n⚠️  'blindData.txt' not found - skipping blind data prediction")
except Exception as e:
    print(f"\n⚠️  Error processing blind data: {e}")

# -----------------------------
# Final Summary
# -----------------------------
print("\n" + "="*80)
print("✓ ALL PREDICTIONS COMPLETED SUCCESSFULLY")
print("="*80)
print(f"\nFiles generated:")
print(f"  1. testLabel_rf.txt ({len(test_labels)} predictions)")
print(f"  2. testLabel_rf_confidence.txt (test confidence scores)")

# blind_labels only exists when the blind step got as far as labelling its
# samples. Catch exactly the NameError raised when it does not, instead of
# a bare except that would also hide unrelated failures and interrupts.
try:
    n_blind_predictions = len(blind_labels)
except NameError:
    n_blind_predictions = None

if n_blind_predictions is not None:
    print(f"  3. blindLabel_rf.txt ({n_blind_predictions} predictions)")
    print(f"  4. blindLabel_rf_confidence.txt (blind confidence scores)")

# Headline numbers come from the cross-validation and validation sections.
print(f"\nModel Performance Summary:")
print(f"  - Algorithm: Random Forest (Noise-Robust)")
print(f"  - Ensemble size: {len(seeds)} models")
print(f"  - Expected validation accuracy: {best_score:.4f} ± {best_std:.4f}")
print(f"  - Expected macro-average AUC: {macro_auc:.4f}")
print(f"  - Best configuration: Config {best_config_idx + 1}")
print(f"  - Noise resistance: bagging, depth limits, min_samples constraints")
print("\n" + "="*80)

RANDOM FOREST MULTI-CLASS CLASSIFICATION (NOISE-ROBUST)

[1/8] Loading data...
Training data shape: (27617, 411)
Test data shape: (13082, 411)

[2/8] Preprocessing data...
After cleaning: 27617 samples, 411 features
Features with missing values: 1/411
  Max missing %: 0.50%
  Mean missing %: 0.50%

Class distribution:
  Class 1: 8874 samples (32.13%)
  Class 2: 6127 samples (22.19%)
  Class 3: 8483 samples (30.72%)
  Class 4: 4133 samples (14.97%)
⚠️  Dataset appears imbalanced - will use balanced class weights

[3/8] Applying median imputation...
Feature count: 411

[4/8] Configuring Random Forest parameters (noise-robust)...

[5/8] Training with 5-fold cross-validation...

--- Configuration 1/4 ---
n_estimators: 500, max_depth: 20, min_samples_split: 20, max_features: sqrt
  Fold 1: Accuracy = 0.8240
  Fold 2: Accuracy = 0.8143
  Fold 3: Accuracy = 0.8083
  Fold 4: Accuracy = 0.8099
  Fold 5: Accuracy = 0.8102
  → CV Score: 0.8133 ± 0.0057

--- Configuration 2/4 ---
n_estimators: 500