{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import joblib\n",
    "from pathlib import Path\n",
    "from sklearn.feature_selection import SelectKBest, f_classif\n",
    "from sklearn.model_selection import train_test_split\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.metrics import roc_curve, auc\n",
    "\n",
    "# Set up paths\n",
    "for output_dir in ('./outputs/figures', './outputs/models', './outputs/data'):\n",
    "    # parents/exist_ok make this safe to re-run and to run on a fresh checkout\n",
    "    Path(output_dir).mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load or recreate data\n",
    "def load_or_create_data():\n",
    "    \"\"\"Return the test split, loading cached arrays or rebuilding them.\n",
    "\n",
    "    The cache is read from ./outputs/data/X_test.npy and y_test.npy. When it\n",
    "    cannot be read, the split is rebuilt from ./data/higgs_sample.csv (column 0\n",
    "    is used as the label, the remaining columns as features): features are\n",
    "    min-max scaled, split 80/20 with stratification and a fixed seed, and the\n",
    "    test part is written back to the cache.\n",
    "\n",
    "    Returns:\n",
    "        (X_test, y_test) as NumPy arrays.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        X = np.load('./outputs/data/X_test.npy')\n",
    "        y = np.load('./outputs/data/y_test.npy')\n",
    "    except (OSError, ValueError, EOFError) as exc:\n",
    "        # Only a missing, truncated or unreadable cache triggers a rebuild.\n",
    "        # Anything else (KeyboardInterrupt, MemoryError, ...) now propagates\n",
    "        # instead of being hidden behind a silent regeneration.\n",
    "        print(\"Creating new test data\")\n",
    "        print(f\"  (cache unavailable: {type(exc).__name__}: {exc})\")\n",
    "    else:\n",
    "        print(\"Loaded existing test data\")\n",
    "        return X, y\n",
    "\n",
    "    # Local import: only the rebuild path needs the scaler\n",
    "    from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "    # Load original data: first column is the label, the rest are features\n",
    "    df = pd.read_csv('./data/higgs_sample.csv', header=None)\n",
    "    X = df.iloc[:, 1:].values\n",
    "    y = df.iloc[:, 0].values\n",
    "\n",
    "    # NOTE(review): the scaler is fitted on all rows before the split, so the\n",
    "    # test rows influence the scaling. Kept as-is so the recreated arrays match\n",
    "    # what earlier runs produced - confirm whether that is intended.\n",
    "    X = MinMaxScaler().fit_transform(X)\n",
    "\n",
    "    # Only the test part is needed here; the training part is discarded\n",
    "    _, X_test, _, y_test = train_test_split(\n",
    "        X, y, test_size=0.2, random_state=42, stratify=y\n",
    "    )\n",
    "\n",
    "    # Create the cache directory here so the function does not depend on the\n",
    "    # setup cell having been run first\n",
    "    Path('./outputs/data').mkdir(parents=True, exist_ok=True)\n",
    "    np.save('./outputs/data/X_test.npy', X_test)\n",
    "    np.save('./outputs/data/y_test.npy', y_test)\n",
    "    return X_test, y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualization functions\n",
    "def generate_roc_comparison(models, X_test, y_test):\n",
    "    \"\"\"Draw one ROC curve per model on shared axes and save the figure.\n",
    "\n",
    "    Args:\n",
    "        models: mapping of display name -> fitted classifier exposing either\n",
    "            predict_proba or decision_function.\n",
    "        X_test: feature matrix passed to every model.\n",
    "        y_test: true binary labels for X_test.\n",
    "\n",
    "    The plot is written to ./outputs/figures/roc_comparison.png and the figure\n",
    "    is closed afterwards, even if scoring or saving fails.\n",
    "    \"\"\"\n",
    "    fig, ax = plt.subplots(figsize=(10, 8))\n",
    "    try:\n",
    "        for name, model in models.items():\n",
    "            if hasattr(model, \"predict_proba\"):\n",
    "                # Second column of predict_proba is used as the positive-class score\n",
    "                y_score = model.predict_proba(X_test)[:, 1]\n",
    "            else:  # Handle SVM\n",
    "                # A ROC curve depends only on the ranking of the scores, so the\n",
    "                # raw margins are used directly. The previous min-max rescaling\n",
    "                # divided by zero whenever every margin had the same value.\n",
    "                y_score = model.decision_function(X_test)\n",
    "\n",
    "            fpr, tpr, _ = roc_curve(y_test, y_score)\n",
    "            roc_auc = auc(fpr, tpr)\n",
    "            ax.plot(fpr, tpr, lw=2.5, label=f'{name} (AUC = {roc_auc:.3f})')\n",
    "\n",
    "        # Diagonal reference line for a random classifier\n",
    "        ax.plot([0, 1], [0, 1], 'k--', lw=2)\n",
    "        ax.set_xlabel('False Positive Rate', fontsize=12, weight='bold')\n",
    "        ax.set_ylabel('True Positive Rate', fontsize=12, weight='bold')\n",
    "        ax.set_title('ROC Curve Comparison', fontsize=14, weight='bold')\n",
    "        ax.legend(loc=\"lower right\", fontsize=10)\n",
    "        ax.grid(True, alpha=0.3)\n",
    "        fig.savefig('./outputs/figures/roc_comparison.png', dpi=300, bbox_inches='tight')\n",
    "    finally:\n",
    "        # Close this specific figure so a failure cannot leak it\n",
    "        plt.close(fig)\n",
    "    print(\"Generated ROC comparison\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_anova_feature_importance(X, y, feature_names):\n",
    "    \"\"\"Calculate ANOVA F-scores, plot the highest ones and save the selector.\n",
    "\n",
    "    Args:\n",
    "        X: 2-D feature matrix.\n",
    "        y: class labels for the rows of X.\n",
    "        feature_names: sequence of labels indexed by column position.\n",
    "\n",
    "    Up to 20 features are drawn, best first, in\n",
    "    ./outputs/figures/feature_importance.png. The fitted SelectKBest is dumped\n",
    "    to ./outputs/feature_selector.pkl.\n",
    "    \"\"\"\n",
    "    # Cap k at the column count so narrow inputs do not ask for more\n",
    "    # features than exist\n",
    "    n_features = np.shape(X)[1]\n",
    "    selector = SelectKBest(score_func=f_classif, k=min(15, n_features))\n",
    "    selector.fit(X, y)\n",
    "\n",
    "    # Constant columns get a NaN F-score; argsort puts NaN last, so after the\n",
    "    # reversal they would be ranked first. Treat them as score 0 for ranking\n",
    "    # and plotting (the saved selector keeps its original scores_).\n",
    "    raw_scores = selector.scores_\n",
    "    scores = np.where(np.isnan(raw_scores), 0.0, raw_scores)\n",
    "\n",
    "    top_idx = scores.argsort()[::-1][:20]\n",
    "    sorted_scores = scores[top_idx]\n",
    "    sorted_features = [feature_names[i] for i in top_idx]\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(12, 8))\n",
    "    try:\n",
    "        ax.barh(sorted_features, sorted_scores, color='#3498db', height=0.7)\n",
    "        ax.set_xlabel('F-Score', fontsize=12, weight='bold')\n",
    "        ax.set_ylabel('Features', fontsize=12, weight='bold')\n",
    "        # Title reports how many bars are actually shown (20 when available)\n",
    "        ax.set_title(f'Top {len(top_idx)} Features by ANOVA F-Score', fontsize=14, weight='bold')\n",
    "        # Highest score at the top of the chart\n",
    "        ax.invert_yaxis()\n",
    "        ax.grid(axis='x', alpha=0.3)\n",
    "        fig.savefig('./outputs/figures/feature_importance.png', dpi=300, bbox_inches='tight')\n",
    "    finally:\n",
    "        plt.close(fig)\n",
    "    print(\"Generated ANOVA feature importance\")\n",
    "\n",
    "    # Save selector for later use\n",
    "    joblib.dump(selector, './outputs/feature_selector.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_xgb_feature_importance(model, feature_names):\n",
    "    \"\"\"Save a horizontal bar chart of the 15 largest model importances.\n",
    "\n",
    "    Args:\n",
    "        model: fitted estimator exposing feature_importances_.\n",
    "        feature_names: sequence of labels indexed by feature position.\n",
    "\n",
    "    The chart is written to ./outputs/figures/xgboost_feature_importance.png.\n",
    "    \"\"\"\n",
    "    fig, ax = plt.subplots(figsize=(12, 8))\n",
    "\n",
    "    weights = model.feature_importances_\n",
    "    # Feature positions ordered from most to least important\n",
    "    order = weights.argsort()[::-1]\n",
    "    ranked_labels = [feature_names[pos] for pos in order]\n",
    "    top_weights = weights[order][:15]\n",
    "    top_labels = ranked_labels[:15]\n",
    "\n",
    "    ax.barh(top_labels, top_weights, color='#e74c3c', height=0.7)\n",
    "    ax.set_xlabel('Importance Score', fontsize=12, weight='bold')\n",
    "    ax.set_ylabel('Features', fontsize=12, weight='bold')\n",
    "    ax.set_title('XGBoost Feature Importance', fontsize=14, weight='bold')\n",
    "    # Flip the axis so the most important feature sits at the top\n",
    "    ax.invert_yaxis()\n",
    "    ax.grid(axis='x', alpha=0.3)\n",
    "    fig.savefig('./outputs/figures/xgboost_feature_importance.png', dpi=300, bbox_inches='tight')\n",
    "    plt.close(fig)\n",
    "    print('Generated XGBoost feature importance')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Main execution\n",
    "def main():\n",
    "    \"\"\"Generate the ANOVA, XGBoost-importance and ROC figures.\n",
    "\n",
    "    Loads (or rebuilds) the test split, plots ANOVA F-scores, then loads the\n",
    "    model stored at ./outputs/models/XGBoost.pkl or trains and stores one when\n",
    "    the file is missing, and finally plots its feature importances and ROC\n",
    "    curve.\n",
    "    \"\"\"\n",
    "    # Load or create data\n",
    "    X_test, y_test = load_or_create_data()\n",
    "    # Name the features from the actual column count (feature_1..feature_N)\n",
    "    # rather than assuming exactly 28 columns\n",
    "    feature_names = [f'feature_{i}' for i in range(1, X_test.shape[1] + 1)]\n",
    "\n",
    "    # Generate feature importance plot\n",
    "    plot_anova_feature_importance(X_test, y_test, feature_names)\n",
    "\n",
    "    model_path = './outputs/models/XGBoost.pkl'\n",
    "    if Path(model_path).exists():\n",
    "        xgb_model = joblib.load(model_path)\n",
    "        # NOTE(review): a stored model is presumed to have been fitted on other\n",
    "        # rows, so it is scored on the whole test split - confirm.\n",
    "        X_eval, y_eval = X_test, y_test\n",
    "    else:\n",
    "        print(\"Training XGBoost model...\")\n",
    "        # No training split is available here, so one is carved out of the test\n",
    "        # data. The remaining rows stay unseen, so the ROC curve is not\n",
    "        # computed on the rows the model was fitted on.\n",
    "        X_fit, X_eval, y_fit, y_eval = train_test_split(\n",
    "            X_test, y_test, test_size=0.25, random_state=42, stratify=y_test\n",
    "        )\n",
    "        xgb_model = XGBClassifier(\n",
    "            n_estimators=100,\n",
    "            max_depth=5,\n",
    "            learning_rate=0.1,\n",
    "            random_state=42,\n",
    "            use_label_encoder=False,\n",
    "            eval_metric='logloss'\n",
    "        )\n",
    "        xgb_model.fit(X_fit, y_fit)\n",
    "        joblib.dump(xgb_model, model_path)\n",
    "\n",
    "    # Generate XGBoost feature importance\n",
    "    plot_xgb_feature_importance(xgb_model, feature_names)\n",
    "\n",
    "    # Create model dictionary for ROC comparison\n",
    "    models = {\n",
    "        'XGBoost': xgb_model\n",
    "    }\n",
    "\n",
    "    # Generate ROC comparison on the evaluation rows chosen above\n",
    "    generate_roc_comparison(models, X_eval, y_eval)\n",
    "\n",
    "    print('\\nAll visualizations generated successfully!')\n",
    "    print('Check the outputs/figures directory for your plots.')\n",
    "\n",
    "# Run the whole pipeline only when this code executes as the top-level module\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "higgs-env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}