In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

# Import XGBoost
from xgboost import XGBClassifier

print("‚úì All libraries imported successfully")


‚úì All libraries imported successfully


In [2]:
train_raw = pd.read_csv("../data/train.csv")
test_raw = pd.read_csv("../data/test.csv")
sample_sub = pd.read_csv("../data/sample_submission.csv")

In [3]:
train_raw = train_raw.drop(columns=["id", "Feature_5"])
test_raw = test_raw.drop(columns=["id", "Feature_5"])


In [4]:
# Use median instead of mode for better imputation
median_values = train_raw.median()

train = train_raw.fillna(median_values)
test = test_raw.fillna(median_values)

# Additional handling for any remaining NaNs
train = train.fillna(train.mean())
test = test.fillna(train.mean())


In [5]:
print(train.isnull().sum())
print(test.isnull().sum())


Feature_1      0
Feature_2      0
Feature_3      0
Feature_4      0
Feature_6      0
Outage_Risk    0
dtype: int64
Feature_1    0
Feature_2    0
Feature_3    0
Feature_4    0
Feature_6    0
dtype: int64


In [6]:
# Prepare features and target
X = train.drop(columns=["Outage_Risk"])
y = train["Outage_Risk"]

# Check class distribution (to handle imbalance)
print(f"Class distribution:")
print(y.value_counts())
print(f"Class imbalance ratio: {y.value_counts()[0] / y.value_counts()[1]:.2f}:1")

# Feature Engineering - Create interaction terms and polynomial features
print("\nüìä Feature Engineering...")
X_engineered = X.copy()

# Create interaction terms for important features
X_engineered['Feature_1_x_Feature_2'] = X['Feature_1'] * X['Feature_2']
X_engineered['Feature_1_x_Feature_3'] = X['Feature_1'] * X['Feature_3']
X_engineered['Feature_3_x_Feature_4'] = X['Feature_3'] * X['Feature_4']

# Create polynomial features
X_engineered['Feature_1_squared'] = X['Feature_1'] ** 2
X_engineered['Feature_3_squared'] = X['Feature_3'] ** 2

# Normalize features
scaler = StandardScaler()
X_engineered_scaled = scaler.fit_transform(X_engineered)
X_engineered_scaled = pd.DataFrame(X_engineered_scaled, columns=X_engineered.columns)

print(f"Original features: {X.shape[1]}, Engineered features: {X_engineered_scaled.shape[1]}")


Class distribution:
Outage_Risk
0    5422
1    2078
Name: count, dtype: int64
Class imbalance ratio: 2.61:1

üìä Feature Engineering...
Original features: 5, Engineered features: 10


In [7]:
# ============================================================
# IMPROVED CROSS-VALIDATION WITH ALL TECHNIQUES
# ============================================================
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

print("\nüîÑ Running 5-Fold Cross-Validation with Best Techniques...")
print("="*60)

# Calculate class weights to handle imbalance
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
print(f"Class weights: {class_weight_dict}")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = {
    'Logistic Regression': [],
    'Random Forest': [],
    'Gradient Boosting': [],
    'XGBoost': [],
    'Voting Ensemble': [],
    'Super Ensemble': []
}

optimal_thresholds = {model: [] for model in results.keys()}

for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_engineered_scaled, y), 1):
    X_train, X_val = X_engineered_scaled.iloc[train_idx], X_engineered_scaled.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Logistic Regression with class weights
    lr = LogisticRegression(C=0.5, max_iter=1000, class_weight='balanced', random_state=42)
    lr.fit(X_train, y_train)
    lr_probs = lr.predict_proba(X_val)[:, 1]
    lr_auc = roc_auc_score(y_val, lr_probs)
    results['Logistic Regression'].append(lr_auc)
    
    # Random Forest with class weights
    rf = RandomForestClassifier(
        n_estimators=200, 
        max_depth=12,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    rf_probs = rf.predict_proba(X_val)[:, 1]
    rf_auc = roc_auc_score(y_val, rf_probs)
    results['Random Forest'].append(rf_auc)
    
    # Gradient Boosting with scale_pos_weight
    gb = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        random_state=42
    )
    gb.fit(X_train, y_train)
    gb_probs = gb.predict_proba(X_val)[:, 1]
    gb_auc = roc_auc_score(y_val, gb_probs)
    results['Gradient Boosting'].append(gb_auc)
    
    # XGBoost with scale_pos_weight
    xgb = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=class_weight_dict[0] / class_weight_dict[1],
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
        verbosity=0
    )
    xgb.fit(X_train, y_train)
    xgb_probs = xgb.predict_proba(X_val)[:, 1]
    xgb_auc = roc_auc_score(y_val, xgb_probs)
    results['XGBoost'].append(xgb_auc)
    
    # Voting Ensemble
    voting = VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(C=0.5, max_iter=1000, class_weight='balanced', random_state=42)),
            ('rf', RandomForestClassifier(n_estimators=150, max_depth=10, class_weight='balanced', random_state=42, n_jobs=-1)),
            ('gb', GradientBoostingClassifier(n_estimators=150, learning_rate=0.05, max_depth=4, random_state=42))
        ],
        voting='soft'
    )
    voting.fit(X_train, y_train)
    voting_probs = voting.predict_proba(X_val)[:, 1]
    voting_auc = roc_auc_score(y_val, voting_probs)
    results['Voting Ensemble'].append(voting_auc)
    
    # Super Ensemble (average all models)
    ensemble_probs = (lr_probs + rf_probs + gb_probs + xgb_probs + voting_probs) / 5
    super_auc = roc_auc_score(y_val, ensemble_probs)
    results['Super Ensemble'].append(super_auc)
    
    print(f"Fold {fold_num}: LR={lr_auc:.4f} | RF={rf_auc:.4f} | GB={gb_auc:.4f} | XGB={xgb_auc:.4f} | Vote={voting_auc:.4f} | Super={super_auc:.4f}")

print("="*60)



üîÑ Running 5-Fold Cross-Validation with Best Techniques...
Class weights: {0: np.float64(0.6916267060125415), 1: np.float64(1.8046198267564966)}
Fold 1: LR=0.7027 | RF=0.6817 | GB=0.6880 | XGB=0.6847 | Vote=0.6988 | Super=0.6950
Fold 2: LR=0.7373 | RF=0.7049 | GB=0.7186 | XGB=0.7249 | Vote=0.7356 | Super=0.7304
Fold 3: LR=0.7008 | RF=0.6744 | GB=0.6740 | XGB=0.6694 | Vote=0.6899 | Super=0.6862
Fold 4: LR=0.7217 | RF=0.6930 | GB=0.6965 | XGB=0.6978 | Vote=0.7140 | Super=0.7110
Fold 5: LR=0.7183 | RF=0.6969 | GB=0.6956 | XGB=0.7038 | Vote=0.7124 | Super=0.7091


In [8]:
# Train final Super Ensemble on all data and generate predictions
print("\nüöÄ Training final Super Ensemble on full dataset...")

# Scale test data
X_test_engineered = test.copy()
X_test_engineered['Feature_1_x_Feature_2'] = test['Feature_1'] * test['Feature_2']
X_test_engineered['Feature_1_x_Feature_3'] = test['Feature_1'] * test['Feature_3']
X_test_engineered['Feature_3_x_Feature_4'] = test['Feature_3'] * test['Feature_4']
X_test_engineered['Feature_1_squared'] = test['Feature_1'] ** 2
X_test_engineered['Feature_3_squared'] = test['Feature_3'] ** 2

X_test_scaled = scaler.transform(X_test_engineered)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_engineered.columns)

# Train all models on full data
lr_final = LogisticRegression(C=0.5, max_iter=1000, class_weight='balanced', random_state=42)
lr_final.fit(X_engineered_scaled, y)

rf_final = RandomForestClassifier(n_estimators=200, max_depth=12, class_weight='balanced', random_state=42, n_jobs=-1)
rf_final.fit(X_engineered_scaled, y)

gb_final = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, subsample=0.8, random_state=42)
gb_final.fit(X_engineered_scaled, y)

xgb_final = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=class_weight_dict[0] / class_weight_dict[1],
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    verbosity=0
)
xgb_final.fit(X_engineered_scaled, y)

voting_final = VotingClassifier(
    estimators=[
        ('lr', lr_final),
        ('rf', rf_final),
        ('gb', gb_final)
    ],
    voting='soft'
)
voting_final.fit(X_engineered_scaled, y)

# Generate predictions
lr_test_probs = lr_final.predict_proba(X_test_scaled)[:, 1]
rf_test_probs = rf_final.predict_proba(X_test_scaled)[:, 1]
gb_test_probs = gb_final.predict_proba(X_test_scaled)[:, 1]
xgb_test_probs = xgb_final.predict_proba(X_test_scaled)[:, 1]
voting_test_probs = voting_final.predict_proba(X_test_scaled)[:, 1]

# Super Ensemble - Average all predictions
test_probs = (lr_test_probs + rf_test_probs + gb_test_probs + xgb_test_probs + voting_test_probs) / 5

print("‚úì Super Ensemble predictions generated")
print(f"Mean prediction: {test_probs.mean():.4f}")
print(f"Min prediction: {test_probs.min():.4f}, Max prediction: {test_probs.max():.4f}")



üöÄ Training final Super Ensemble on full dataset...
‚úì Super Ensemble predictions generated
Mean prediction: 0.3382
Min prediction: 0.0673, Max prediction: 0.9616


In [9]:
# Save improved submission
import os
from pathlib import Path

sample_sub["Outage_Risk"] = test_probs

# Create submissions directory if it doesn't exist
submission_dir = Path("../submissions")
submission_dir.mkdir(parents=True, exist_ok=True)

submission_path = submission_dir / "final_submission_improved.csv"
sample_sub.to_csv(submission_path, index=False)
print(f"‚úì Improved submission saved to {submission_path}")


‚úì Improved submission saved to ..\submissions\final_submission_improved.csv


In [10]:
# Prepare features for cross-validation
X = train.drop(columns=["Outage_Risk"])
y = train["Outage_Risk"]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts()}")


Features shape: (7500, 5)
Target shape: (7500,)
Target distribution:
Outage_Risk
0    5422
1    2078
Name: count, dtype: int64


In [11]:
# Display comprehensive results
print("\nüìà CROSS-VALIDATION RESULTS SUMMARY")
print("="*70)

for model_name, scores in results.items():
    mean_auc = np.mean(scores)
    std_auc = np.std(scores)
    print(f"{model_name:20s} | Mean AUC: {mean_auc:.4f} | Std Dev: {std_auc:.4f} | Scores: {[f'{s:.4f}' for s in scores]}")

print("="*70)

# Find best model
best_model = max(results.items(), key=lambda x: np.mean(x[1]))
print(f"\nüèÜ BEST MODEL: {best_model[0]} with Mean AUC = {np.mean(best_model[1]):.4f}")
print(f"Improvement over Logistic Regression: {(np.mean(best_model[1]) - np.mean(results['Logistic Regression'])) * 100:.2f}%")



üìà CROSS-VALIDATION RESULTS SUMMARY
Logistic Regression  | Mean AUC: 0.7162 | Std Dev: 0.0134 | Scores: ['0.7027', '0.7373', '0.7008', '0.7217', '0.7183']
Random Forest        | Mean AUC: 0.6902 | Std Dev: 0.0109 | Scores: ['0.6817', '0.7049', '0.6744', '0.6930', '0.6969']
Gradient Boosting    | Mean AUC: 0.6945 | Std Dev: 0.0145 | Scores: ['0.6880', '0.7186', '0.6740', '0.6965', '0.6956']
XGBoost              | Mean AUC: 0.6961 | Std Dev: 0.0186 | Scores: ['0.6847', '0.7249', '0.6694', '0.6978', '0.7038']
Voting Ensemble      | Mean AUC: 0.7101 | Std Dev: 0.0155 | Scores: ['0.6988', '0.7356', '0.6899', '0.7140', '0.7124']
Super Ensemble       | Mean AUC: 0.7063 | Std Dev: 0.0151 | Scores: ['0.6950', '0.7304', '0.6862', '0.7110', '0.7091']

üèÜ BEST MODEL: Logistic Regression with Mean AUC = 0.7162
Improvement over Logistic Regression: 0.00%
