In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc

# Make sure every output directory exists before any cell writes into it.
for output_dir in ('./outputs/figures', './outputs/models', './outputs/data'):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

In [6]:
def load_or_create_data():
    """Return the test split ``(X_test, y_test)``, using the on-disk cache when possible.

    The cached arrays ``./outputs/data/X_test.npy`` and
    ``./outputs/data/y_test.npy`` are returned if both can be read. Otherwise
    the split is rebuilt from ``./data/higgs_sample.csv`` (read without a
    header row; column 0 is used as the target, the remaining columns as
    features), written back to the cache and returned.

    Returns:
        tuple: ``(X_test, y_test)`` as NumPy arrays.

    Raises:
        FileNotFoundError: if the cache is unusable and the source CSV is missing.
    """
    x_cache = './outputs/data/X_test.npy'
    y_cache = './outputs/data/y_test.npy'

    # Only a missing or unreadable cache counts as a cache miss. A bare
    # ``except:`` would also swallow KeyboardInterrupt, MemoryError and plain
    # programming errors, and silently rebuild the data.
    try:
        X = np.load(x_cache)
        y = np.load(y_cache)
    except (OSError, ValueError, EOFError) as exc:
        if not isinstance(exc, FileNotFoundError):
            # The files exist but are damaged: say so instead of hiding it.
            print(f"Cached test data could not be read ({exc!r})")
    else:
        print("Loaded existing test data")
        return X, y

    # Rebuild outside the ``except`` block so that a failure here is reported
    # on its own rather than chained to the cache miss.
    print("Creating new test data")
    from sklearn.preprocessing import MinMaxScaler  # local import, as in the original cell

    df = pd.read_csv('./data/higgs_sample.csv', header=None)
    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values

    # Split first: the row partition depends only on random_state and y, so it
    # is the same partition the original code produced.
    X_train, X_test, _, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training rows only, so test-set minima/maxima do
    # not leak into preprocessing. Scaled test values can therefore fall
    # slightly outside [0, 1].
    scaler = MinMaxScaler().fit(X_train)
    X_test = scaler.transform(X_test)

    # Do not rely on another cell having created the cache directory.
    Path('./outputs/data').mkdir(parents=True, exist_ok=True)
    np.save(x_cache, X_test)
    np.save(y_cache, y_test)
    return X_test, y_test

In [7]:
def generate_roc_comparison(models, X_test, y_test):
    """Plot one ROC curve per model on shared axes and save the figure.

    Args:
        models: Mapping of display name -> fitted classifier. Each classifier
            must expose ``predict_proba`` or ``decision_function``.
        X_test: Feature matrix passed to every model.
        y_test: True binary labels for ``X_test``.

    Side effects:
        Writes ``./outputs/figures/roc_comparison.png`` and prints a
        confirmation message.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        for name, model in models.items():
            if hasattr(model, "predict_proba"):
                y_score = model.predict_proba(X_test)[:, 1]
            else:
                # Margin-based models (e.g. SVMs). A ROC curve depends only on
                # the ranking of the scores, so the raw margins are used
                # directly. The former min-max rescaling divided by zero when
                # every margin was identical and did not change the curve.
                y_score = model.decision_function(X_test)

            fpr, tpr, _ = roc_curve(y_test, y_score)
            roc_auc = auc(fpr, tpr)
            ax.plot(fpr, tpr, lw=2.5, label=f'{name} (AUC = {roc_auc:.3f})')

        # Diagonal = performance of a random classifier.
        ax.plot([0, 1], [0, 1], 'k--', lw=2)
        ax.set_xlabel('False Positive Rate', fontsize=12, weight='bold')
        ax.set_ylabel('True Positive Rate', fontsize=12, weight='bold')
        ax.set_title('ROC Curve Comparison', fontsize=14, weight='bold')
        if models:
            # Skip the legend when nothing is labelled; matplotlib would warn.
            ax.legend(loc="lower right", fontsize=10)
        ax.grid(True, alpha=0.3)
        fig.savefig('./outputs/figures/roc_comparison.png', dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if a model raised while scoring.
        plt.close(fig)
    print("Generated ROC comparison")

def plot_anova_feature_importance(X, y, feature_names, k=15, top_n=20):
    """Compute ANOVA F-scores, plot the highest-scoring features and persist the selector.

    Args:
        X: 2-D feature matrix (samples x features).
        y: Class labels aligned with the rows of ``X``.
        feature_names: Sequence of names, indexed by feature column.
        k: Number of features the saved ``SelectKBest`` keeps (or ``'all'``).
            Clamped to the number of columns in ``X``.
        top_n: Maximum number of bars shown in the plot.

    Side effects:
        Writes ``./outputs/figures/feature_importance.png``, dumps the fitted
        selector to ``./outputs/feature_selector.pkl`` and prints a
        confirmation message.
    """
    n_features = np.shape(X)[1]
    # SelectKBest rejects k larger than the number of features.
    effective_k = k if k == 'all' else min(k, n_features)
    selector = SelectKBest(score_func=f_classif, k=effective_k)
    selector.fit(X, y)

    scores = selector.scores_
    # f_classif yields NaN for constant features, and a plain argsort()[::-1]
    # would rank those NaNs first. Rank them last instead.
    rankable = np.where(np.isnan(scores), -np.inf, scores)
    order = np.argsort(rankable)[::-1][:top_n]
    top_scores = scores[order]
    top_features = [feature_names[i] for i in order]

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        ax.barh(top_features, top_scores, color='#3498db', height=0.7)
        ax.set_xlabel('F-Score', fontsize=12, weight='bold')
        ax.set_ylabel('Features', fontsize=12, weight='bold')
        # Report the number of bars actually drawn (20 for the default call
        # on data with at least 20 features).
        ax.set_title(f'Top {len(order)} Features by ANOVA F-Score', fontsize=14, weight='bold')
        ax.invert_yaxis()  # highest score at the top
        ax.grid(axis='x', alpha=0.3)
        fig.savefig('./outputs/figures/feature_importance.png', dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even if plotting or saving failed.
        plt.close(fig)
    print("Generated ANOVA feature importance")

    # Save selector for later use
    joblib.dump(selector, './outputs/feature_selector.pkl')

def plot_xgb_feature_importance(model, feature_names):
    """Save a horizontal bar chart of the model's 15 largest feature importances.

    Reads ``model.feature_importances_``, labels each bar via ``feature_names``
    (indexed by feature column) and writes
    ``./outputs/figures/xgboost_feature_importance.png``.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    weights = model.feature_importances_
    ranking = weights.argsort()[::-1]  # feature indices, most important first
    top_weights = weights[ranking][:15]
    top_labels = [feature_names[col] for col in ranking][:15]

    ax.barh(top_labels, top_weights, color='#e74c3c', height=0.7)
    ax.set_xlabel('Importance Score', fontsize=12, weight='bold')
    ax.set_ylabel('Features', fontsize=12, weight='bold')
    ax.set_title('XGBoost Feature Importance', fontsize=14, weight='bold')
    ax.invert_yaxis()  # most important feature at the top
    ax.grid(axis='x', alpha=0.3)

    fig.savefig('./outputs/figures/xgboost_feature_importance.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("Generated XGBoost feature importance")

In [8]:
def main():
    """Generate the ANOVA, XGBoost-importance and ROC figures under ``./outputs/figures``.

    Steps:
        1. Load (or rebuild) the test split.
        2. Plot ANOVA F-scores for all features.
        3. Load the cached XGBoost model, or train one if none is cached.
        4. Plot the model's feature importances and its ROC curve.
    """
    # Load or create data
    X_test, y_test = load_or_create_data()
    # Derive the names from the data instead of hard-coding 28 columns, so the
    # plots cannot index past the end of the list.
    feature_names = [f'feature_{i}' for i in range(1, X_test.shape[1] + 1)]

    # Generate feature importance plot
    plot_anova_feature_importance(X_test, y_test, feature_names)

    # Only the test split is available here. Carve a deterministic hold-out
    # from it so the ROC curve is not drawn on the rows the model was fitted
    # on, which would inflate the AUC.
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X_test, y_test, test_size=0.3, random_state=42, stratify=y_test
    )

    # Train a simple XGBoost model if needed
    model_file = './outputs/models/XGBoost.pkl'
    if not Path(model_file).exists():
        print("Training XGBoost model...")
        xgb_model = XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss'
        )
        xgb_model.fit(X_fit, y_fit)
        # Do not rely on another cell having created the models directory.
        Path(model_file).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(xgb_model, model_file)
    else:
        # NOTE(review): joblib.load unpickles arbitrary objects; only load
        # model files produced by a trusted run. A pickle written by an older
        # run that was fitted on the full test split will still look
        # optimistic on the hold-out; delete the file to retrain.
        xgb_model = joblib.load(model_file)

    # Generate XGBoost feature importance
    plot_xgb_feature_importance(xgb_model, feature_names)

    # Create model dictionary for ROC comparison
    models = {
        'XGBoost': xgb_model
    }

    # Generate ROC comparison on the held-out rows only
    generate_roc_comparison(models, X_eval, y_eval)

    print("\nAll visualizations generated successfully!")
    print("Check the outputs/figures directory for your plots.")

if __name__ == "__main__":
    main()

Loaded existing test data
Generated ANOVA feature importance
Generated XGBoost feature importance
Generated ROC comparison

All visualizations generated successfully!
Check the outputs/figures directory for your plots.
