# CatBoost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, 
                           roc_auc_score, roc_curve, precision_recall_curve, 
                           average_precision_score, f1_score)
import catboost as cb  # Using CatBoost
import optuna  # For hyperparameter optimization
import time
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['CUDA_VISIBLE_DEVICES'] = '7'  

# Artifact directories for the CatBoost run (plots, serialized models, metric tables).
plots_dir = '/data/jinming/ee_stable/catboost/plots'
models_dir = '/data/jinming/ee_stable/catboost/models'
results_dir = '/data/jinming/ee_stable/catboost/results'
for directory in (plots_dir, models_dir, results_dir):
    # exist_ok=True keeps re-running this cell harmless.
    os.makedirs(directory, exist_ok=True)

# 1. Load the pre-split train / test / validation tables
print("===== Loading Data =====")
train_df = pd.read_csv('/data/jinming/ee_stable/data/train.csv')
test_df = pd.read_csv('/data/jinming/ee_stable/data/test.csv')
val_df = pd.read_csv('/data/jinming/ee_stable/data/val.csv')

# 2. Data preparation
print("===== Preparing Data =====")
# Columns removed from the model inputs. 'stabf_encoded' is the label used below;
# 'stab' / 'stabf' are presumably other encodings of the same target (verify against
# the dataset description), and p1..p4 are excluded as well.
non_feature_columns = ['stab', 'stabf_encoded', 'stabf', 'p1', 'p2', 'p3', 'p4']
X_train, X_test, X_val = (
    frame.drop(columns=non_feature_columns) for frame in (train_df, test_df, val_df)
)

# Binary label column for each split
y_train, y_test, y_val = (
    frame['stabf_encoded'] for frame in (train_df, test_df, val_df)
)

print(f"Dataset dimensions - Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

# 3. Feature engineering function
def create_features(X_train, X_test, X_val):
    """Build engineered tau/g features on copies of the three data splits.

    Every input frame must provide the columns ``tau1``..``tau4`` and
    ``g1``..``g4``. The inputs themselves are not modified. Each returned frame
    holds the original columns followed by, in this order: the four tau*g
    products, ``tau_ratio``, the four tau/g ratios, ``total_elasticity``,
    ``elasticity_disparity``, the four squared taus, ``tau_g_correlation`` and
    ``system_response_speed``.

    Only exact zeros in a denominator are replaced (by 0.001) before dividing.

    Returns:
        Tuple ``(X_train_new, X_test_new, X_val_new)``.
    """
    eps = 0.001
    tau_cols = ['tau1', 'tau2', 'tau3', 'tau4']
    g_cols = ['g1', 'g2', 'g3', 'g4']
    product_cols = ['tau1_g1', 'tau2_g2', 'tau3_g3', 'tau4_g4']
    ratio_cols = ['tau1_g1_ratio', 'tau2_g2_ratio', 'tau3_g3_ratio', 'tau4_g4_ratio']
    squared_cols = ['tau1_squared', 'tau2_squared', 'tau3_squared', 'tau4_squared']

    def _add_columns(frame):
        out = frame.copy()

        # Per-node delay * elasticity interaction
        for name, tau, g in zip(product_cols, tau_cols, g_cols):
            out[name] = out[tau] * out[g]

        # Largest delay relative to the smallest delay
        tau_block = out[tau_cols]
        out['tau_ratio'] = tau_block.max(axis=1) / tau_block.min(axis=1).replace(0, eps)

        # Per-node delay / elasticity ratio
        for name, tau, g in zip(ratio_cols, tau_cols, g_cols):
            out[name] = out[tau] / out[g].replace(0, eps)

        # Sum of the four elasticities, accumulated left to right
        g_sum = out[g_cols[0]]
        for g in g_cols[1:]:
            g_sum = g_sum + out[g]
        out['total_elasticity'] = g_sum

        # Largest elasticity relative to the smallest elasticity
        g_block = out[g_cols]
        out['elasticity_disparity'] = g_block.max(axis=1) / g_block.min(axis=1).replace(0, eps)

        # Quadratic delay terms
        for name, tau in zip(squared_cols, tau_cols):
            out[name] = out[tau] ** 2

        # Running sums over the four nodes, accumulated left to right:
        #   weighted    -> sum of tau_i * g_i
        #   tau_sum     -> sum of tau_i
        #   inverse_sum -> sum of 1 / tau_i (zeros guarded)
        weighted = out[product_cols[0]]
        tau_sum = out[tau_cols[0]]
        inverse_sum = 1 / out[tau_cols[0]].replace(0, eps)
        for name, tau in zip(product_cols[1:], tau_cols[1:]):
            weighted = weighted + out[name]
            tau_sum = tau_sum + out[tau]
            inverse_sum = inverse_sum + 1 / out[tau].replace(0, eps)

        # Delay-weighted mean elasticity (eps keeps the denominator non-zero)
        out['tau_g_correlation'] = weighted / (tau_sum + eps)
        # Harmonic mean of the four delays
        out['system_response_speed'] = 4 / inverse_sum
        return out

    return _add_columns(X_train), _add_columns(X_test), _add_columns(X_val)

# Manual feature selection function
def select_features_manual(X_train, X_test, X_val):
    """Restrict all three splits to a fixed, hand-picked list of features.

    A requested feature is kept only when it is a column of every split.
    Features missing from any split are reported in a warning and skipped, so
    the three returned frames always share the same columns in the same order.

    Args:
        X_train: Training feature frame.
        X_test: Test feature frame.
        X_val: Validation feature frame.

    Returns:
        Tuple ``(X_train_sel, X_test_sel, X_val_sel, selected_features)`` where
        ``selected_features`` is the list of column names actually used.
    """
    print("\n===== Using Manually Specified Features =====")

    # Specify features to keep
    requested_features = [
        # Original tau features
        'tau1', 'tau2', 'tau3', 'tau4',

        # Original g features
        'g1', 'g2', 'g3', 'g4',

        # tau and g interaction terms
        'tau1_g1', 'tau2_g2', 'tau3_g3', 'tau4_g4',

        # tau ratio features
        'tau_ratio'
    ]

    # Check against every split: indexing a frame with an absent column raises
    # KeyError, so validating X_train alone is not sufficient.
    splits = (X_train, X_test, X_val)
    selected_features = [
        name for name in requested_features
        if all(name in split.columns for split in splits)
    ]
    missing_features = [name for name in requested_features if name not in selected_features]
    if missing_features:
        print(f"Warning: The following specified features do not exist: {', '.join(missing_features)}")

    print(f"Using {len(selected_features)} specified features:")
    print(f"Selected features: {', '.join(selected_features)}")

    return X_train[selected_features], X_test[selected_features], X_val[selected_features], selected_features

# Apply feature engineering to all three splits
print("\n===== Performing Feature Engineering =====")
X_train_featured, X_test_featured, X_val_featured = create_features(X_train, X_test, X_val)
print(f"Number of features after engineering: {X_train_featured.shape[1]}")

# Keep only the hand-picked feature subset
(X_train_final, X_test_final, X_val_final,
 selected_features) = select_features_manual(X_train_featured, X_test_featured, X_val_featured)

print(f"Number of features after selection: {X_train_final.shape[1]}")

# Probe for a usable CatBoost GPU backend by fitting a tiny throw-away model.
# Any failure (missing driver, CPU-only build, ...) is treated as "no GPU", which is why
# the broad except is deliberate here.
print("\n===== Checking GPU availability =====")
try:
    # Synthetic data only: 100 samples, 8 features, random binary labels
    X_test_gpu = np.random.rand(100, 8)
    y_test_gpu = np.random.randint(0, 2, 100)

    # NOTE(review): devices='0' presumably indexes into the devices left visible by
    # CUDA_VISIBLE_DEVICES set at the top of this cell - confirm.
    test_model = cb.CatBoostClassifier(iterations=10, task_type='GPU', devices='0', verbose=False)
    test_model.fit(X_test_gpu, y_test_gpu, verbose=False)
except Exception as e:
    print(f"GPU acceleration is NOT available: {e}")
    print("Falling back to CPU training")
    GPU_AVAILABLE = False
else:
    print("GPU acceleration is available for CatBoost!")
    GPU_AVAILABLE = True

# 5. Optuna hyperparameter optimization for AUC (GPU when available, CPU otherwise)
print("\n===== Starting Optuna GPU hyperparameter tuning process (AUC) =====")

def objective_auc(trial):
    """Optuna objective: fit one CatBoost candidate and return its validation AUC.

    The model is trained on ``X_train_final`` / ``y_train`` with early stopping
    on ``X_val_final`` / ``y_val`` (both read from module scope). The returned
    value is the ROC AUC of the predicted positive-class probabilities on the
    validation split, which the study maximizes.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        Validation ROC AUC as a float.
    """
    params = {
        'loss_function': 'Logloss',  # For binary classification
        'eval_metric': 'AUC',
        'verbose': 0,

        # Core parameters
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        # Upper bound only; early stopping picks the effective number of trees.
        # Not sampled, so it does NOT appear in study.best_params.
        'iterations': 2000,
        'depth': trial.suggest_int('depth', 4, 10),

        # Regularization parameters
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10.0),

        # Other parameters
        # NOTE(review): CatBoost documents min_data_in_leaf for the Depthwise/Lossguide
        # grow policies; confirm it has the intended effect with the default policy.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 10),

        'random_seed': 42
    }

    if GPU_AVAILABLE:
        params['task_type'] = 'GPU'
        params['devices'] = '0'
    else:
        # Column sample ratio. CatBoost only supports rsm on GPU for pairwise losses and
        # rejects it for Logloss at fit time, so it is sampled on CPU runs only.
        params['rsm'] = trial.suggest_float('rsm', 0.1, 1.0)

    model = cb.CatBoostClassifier(**params)

    # Train on the training split; the validation split drives early stopping
    model.fit(
        X_train_final, y_train,
        eval_set=[(X_val_final, y_val)],
        early_stopping_rounds=50,
        verbose=False
    )

    # Score the positive-class probabilities on the validation split
    y_val_prob = model.predict_proba(X_val_final)[:, 1]
    auc_score = roc_auc_score(y_val, y_val_prob)

    print(f"Trial {trial.number}: AUC = {auc_score:.4f}")

    return auc_score

# Create Optuna study object with direction to maximize AUC
study_auc = optuna.create_study(direction='maximize', study_name='catboost_gpu_auc_optimization')

# Run optimization
n_trials = 100  # Can adjust based on computational resources and time
print(f"Starting {n_trials} GPU-accelerated hyperparameter tuning trials...")
start_time = time.time()
study_auc.optimize(objective_auc, n_trials=n_trials)
end_time = time.time()
print(f"Tuning completed! Duration: {end_time - start_time:.2f} seconds")

# Print best parameters and results
print("\n===== Best Parameters (AUC) =====")
print(f"Best AUC score: {study_auc.best_value:.4f}")
print("Best parameter combination:")
for key, value in study_auc.best_params.items():
    print(f"    {key}: {value}")

# Visualize optimization process.
# Optuna's matplotlib helpers create their own figure and return its Axes, so sizing,
# titling, saving and closing are all done through that Axes. Calling plt.figure() first
# would only leave an extra empty figure open and the requested size would be ignored.
history_ax = optuna.visualization.matplotlib.plot_optimization_history(study_auc)
history_fig = history_ax.figure
history_fig.set_size_inches(12, 8)
history_ax.set_title('Optimization History - AUC')
history_fig.tight_layout()
history_fig.savefig(f'{plots_dir}/optuna_catboost_auc_history.png')
plt.close(history_fig)

# Visualize hyperparameter importance (same Axes-based handling as above)
importance_ax = optuna.visualization.matplotlib.plot_param_importances(study_auc)
importance_fig = importance_ax.figure
importance_fig.set_size_inches(12, 8)
importance_ax.set_title('Hyperparameter Importance')
importance_fig.tight_layout()
importance_fig.savefig(f'{plots_dir}/optuna_catboost_auc_param_importances.png')
plt.close(importance_fig)

# 6. Train final model with best parameters
print("\n===== Training Final Model with Best Parameters on GPU =====")
# study.best_params only contains values drawn through trial.suggest_*; every setting the
# objective hard-coded has to be restored here so the final model matches the tuned one.
best_params = study_auc.best_params.copy()
best_params['loss_function'] = 'Logloss'
best_params['eval_metric'] = 'AUC'
best_params['random_seed'] = 42
# Same iteration budget as during tuning; without this line CatBoost falls back to its
# default iteration count and low-learning-rate configurations can stop short.
best_params['iterations'] = 2000

# Add GPU parameters if available
if GPU_AVAILABLE:
    best_params['task_type'] = 'GPU'
    best_params['devices'] = '0'
    print("Using GPU acceleration for final model training")
else:
    print("Using CPU for final model training")

# Create final model
final_model = cb.CatBoostClassifier(**best_params)

# Train final model; the validation split drives early stopping
start_time = time.time()
final_model.fit(
    X_train_final, y_train,
    eval_set=[(X_val_final, y_val)],
    early_stopping_rounds=100,
    verbose=50  # Show progress every 50 iterations
)
end_time = time.time()
print(f"Final model training completed! Duration: {end_time - start_time:.2f} seconds")

# 7. Evaluate final model on validation set
# Positive-class probabilities feed the AUC; hard labels from predict() feed
# accuracy, F1, the confusion matrix and the classification report.
y_val_prob = final_model.predict_proba(X_val_final)[:, 1]
y_val_pred = final_model.predict(X_val_final)

# Calculate evaluation metrics
val_auc = roc_auc_score(y_val, y_val_prob)
val_acc = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)  # f1_score with its default settings
val_f1_weighted = f1_score(y_val, y_val_pred, average='weighted')

print("\n===== Final Model Performance on Validation Set =====")
print(f"AUC: {val_auc:.4f}")  # AUC is the metric the study optimized
print(f"Weighted F1 Score: {val_f1_weighted:.4f}")
print(f"F1 Score: {val_f1:.4f}")
print(f"Accuracy: {val_acc:.4f}")

print("\nValidation Set Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nValidation Set Classification Report:")
print(classification_report(y_val, y_val_pred, digits=4))

# 8. Evaluate final model on test set
# Same metrics as above. The validation split was also the early-stopping set and the
# tuning target, so the test numbers are the ones not used during model selection.
y_test_prob = final_model.predict_proba(X_test_final)[:, 1]
y_test_pred = final_model.predict(X_test_final)

# Calculate evaluation metrics
test_auc = roc_auc_score(y_test, y_test_prob)
test_acc = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')

print("\n===== Final Model Performance on Test Set =====")
print(f"AUC: {test_auc:.4f}")  # AUC is the metric the study optimized
print(f"Weighted F1 Score: {test_f1_weighted:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"Accuracy: {test_acc:.4f}")

print("\nTest Set Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("\nTest Set Classification Report:")
print(classification_report(y_test, y_test_pred, digits=4))

# 9. Plot ROC and PR curves
# Each figure overlays the validation and test curves and is written to plots_dir.
# The calls below rely on pyplot's "current figure", so their order matters.
# ROC curve
plt.figure(figsize=(12, 10))
# Validation set
fpr_val, tpr_val, _ = roc_curve(y_val, y_val_prob)
plt.plot(fpr_val, tpr_val, label=f'Validation Set (AUC = {val_auc:.4f})')
# Test set
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)
plt.plot(fpr_test, tpr_test, label=f'Test Set (AUC = {test_auc:.4f})')
# Diagonal reference line (TPR == FPR)
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison (CatBoost GPU Model)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'{plots_dir}/catboost_gpu_roc_curves.png')
plt.close()  # release the figure once it is on disk

# PR curve
plt.figure(figsize=(12, 10))
# Validation set: curve plus average precision (AP) for the legend
prec_val, rec_val, _ = precision_recall_curve(y_val, y_val_prob)
avg_prec_val = average_precision_score(y_val, y_val_prob)
plt.plot(rec_val, prec_val, label=f'Validation Set (AP = {avg_prec_val:.4f})')
# Test set
prec_test, rec_test, _ = precision_recall_curve(y_test, y_test_prob)
avg_prec_test = average_precision_score(y_test, y_test_prob)
plt.plot(rec_test, prec_test, label=f'Test Set (AP = {avg_prec_test:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('PR Curve Comparison (CatBoost GPU Model)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'{plots_dir}/catboost_gpu_pr_curves.png')
plt.close()  # release the figure once it is on disk

# 10. Feature importance visualization
# get_feature_importance() is called without arguments, i.e. with CatBoost's default
# importance type; the values are paired with the training column names.
feature_importance = final_model.get_feature_importance()
feature_names = X_train_final.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values('Importance', ascending=False)

# Horizontal bar chart of the (at most) 20 most important features
plt.figure(figsize=(14, 10))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
plt.title("CatBoost Feature Importance (GPU Model)")
plt.tight_layout()
plt.savefig(f'{plots_dir}/catboost_gpu_feature_importance.png')
plt.close()

# 11. Save best model (file name uses CatBoost's .cbm extension)
model_path = f'{models_dir}/catboost_gpu_auc.cbm'
final_model.save_model(model_path)
print(f"\nBest model saved to: {model_path}")

# 12. Performance summary: one row per metric, one column per split
print("\n===== Model Performance Summary =====")
results_df = pd.DataFrame({
    'Metric': ['AUC', 'Accuracy', 'F1 Score', 'Weighted F1 Score'],
    'Validation Set': [val_auc, val_acc, val_f1, val_f1_weighted],
    'Test Set': [test_auc, test_acc, test_f1, test_f1_weighted]
})
print(results_df)

# Save results
results_df.to_csv(f'{results_dir}/catboost_gpu_auc_performance.csv', index=False)
print(f"Model performance saved to: {results_dir}/catboost_gpu_auc_performance.csv")

# 13. Save feature list for future use (one feature name per line)
with open(f'{models_dir}/catboost_gpu_selected_features.txt', 'w') as f:
    f.write('\n'.join(selected_features))
print(f"Feature list saved to: {models_dir}/catboost_gpu_selected_features.txt")

print("\nAnalysis completed! All results and visualizations have been saved to the specified directories.")

# LightGBM

In [None]:
import lightgbm as lgb
import optuna  # For hyperparameter optimization
import time
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, 
                           roc_auc_score, roc_curve, precision_recall_curve, 
                           average_precision_score, f1_score)
warnings.filterwarnings('ignore')
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  

# Output layout: <base_dir>/{plots,models,results}, keyed by the model name
model_name = "lightgbm"  # Can be changed to different model names
base_dir = f'/data/jinming/ee_stable/{model_name}'
plots_dir = f'{base_dir}/plots'
models_dir = f'{base_dir}/models'
results_dir = f'{base_dir}/results'

# Create the directories up front; exist_ok=True keeps re-runs harmless
for directory in (plots_dir, models_dir, results_dir):
    os.makedirs(directory, exist_ok=True)

# 1. Load the pre-split train / test / validation tables
print("===== Loading Data =====")
train_df = pd.read_csv('/data/jinming/ee_stable/data/train.csv')
test_df = pd.read_csv('/data/jinming/ee_stable/data/test.csv')
val_df = pd.read_csv('/data/jinming/ee_stable/data/val.csv')

# 2. Data preparation
print("===== Preparing Data =====")
# Columns removed from the model inputs. 'stabf_encoded' is the label used below;
# 'stab' / 'stabf' are presumably other encodings of the same target (verify against
# the dataset description), and p1..p4 are excluded as well.
non_feature_columns = ['stab', 'stabf_encoded', 'stabf', 'p1', 'p2', 'p3', 'p4']
X_train, X_test, X_val = (
    frame.drop(columns=non_feature_columns) for frame in (train_df, test_df, val_df)
)

# Binary label column for each split
y_train, y_test, y_val = (
    frame['stabf_encoded'] for frame in (train_df, test_df, val_df)
)

print(f"Dataset dimensions - Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

# 3. Feature engineering function
def create_features(X_train, X_test, X_val):
    """Build engineered tau/g features on copies of the three data splits.

    Every input frame must provide the columns ``tau1``..``tau4`` and
    ``g1``..``g4``. The inputs themselves are not modified. Each returned frame
    holds the original columns followed by, in this order: the four tau*g
    products, ``tau_ratio``, the four tau/g ratios, ``total_elasticity``,
    ``elasticity_disparity``, the four squared taus, ``tau_g_correlation`` and
    ``system_response_speed``.

    Only exact zeros in a denominator are replaced (by 0.001) before dividing.

    Returns:
        Tuple ``(X_train_new, X_test_new, X_val_new)``.
    """
    eps = 0.001
    tau_cols = ['tau1', 'tau2', 'tau3', 'tau4']
    g_cols = ['g1', 'g2', 'g3', 'g4']
    product_cols = ['tau1_g1', 'tau2_g2', 'tau3_g3', 'tau4_g4']
    ratio_cols = ['tau1_g1_ratio', 'tau2_g2_ratio', 'tau3_g3_ratio', 'tau4_g4_ratio']
    squared_cols = ['tau1_squared', 'tau2_squared', 'tau3_squared', 'tau4_squared']

    def _add_columns(frame):
        out = frame.copy()

        # Per-node delay * elasticity interaction
        for name, tau, g in zip(product_cols, tau_cols, g_cols):
            out[name] = out[tau] * out[g]

        # Largest delay relative to the smallest delay
        tau_block = out[tau_cols]
        out['tau_ratio'] = tau_block.max(axis=1) / tau_block.min(axis=1).replace(0, eps)

        # Per-node delay / elasticity ratio
        for name, tau, g in zip(ratio_cols, tau_cols, g_cols):
            out[name] = out[tau] / out[g].replace(0, eps)

        # Sum of the four elasticities, accumulated left to right
        g_sum = out[g_cols[0]]
        for g in g_cols[1:]:
            g_sum = g_sum + out[g]
        out['total_elasticity'] = g_sum

        # Largest elasticity relative to the smallest elasticity
        g_block = out[g_cols]
        out['elasticity_disparity'] = g_block.max(axis=1) / g_block.min(axis=1).replace(0, eps)

        # Quadratic delay terms
        for name, tau in zip(squared_cols, tau_cols):
            out[name] = out[tau] ** 2

        # Running sums over the four nodes, accumulated left to right:
        #   weighted    -> sum of tau_i * g_i
        #   tau_sum     -> sum of tau_i
        #   inverse_sum -> sum of 1 / tau_i (zeros guarded)
        weighted = out[product_cols[0]]
        tau_sum = out[tau_cols[0]]
        inverse_sum = 1 / out[tau_cols[0]].replace(0, eps)
        for name, tau in zip(product_cols[1:], tau_cols[1:]):
            weighted = weighted + out[name]
            tau_sum = tau_sum + out[tau]
            inverse_sum = inverse_sum + 1 / out[tau].replace(0, eps)

        # Delay-weighted mean elasticity (eps keeps the denominator non-zero)
        out['tau_g_correlation'] = weighted / (tau_sum + eps)
        # Harmonic mean of the four delays
        out['system_response_speed'] = 4 / inverse_sum
        return out

    return _add_columns(X_train), _add_columns(X_test), _add_columns(X_val)

# 4. Feature selection functions
def select_features(X_train, X_test, X_val, y_train):
    """
    Three-stage automatic feature selection fitted on the training split only.

    1. Rank features by absolute Pearson correlation with the target.
    2. For every feature pair whose absolute correlation exceeds 0.7, drop the
       member that is less correlated with the target.
    3. Fit a small LightGBM classifier on the remaining features and keep those
       whose importance exceeds half of the mean importance (falling back to
       the 10 most important ones when fewer than 10 pass).

    Diagnostic plots are written to the module-level ``plots_dir``.

    Returns:
        Tuple ``(X_train_sel, X_test_sel, X_val_sel, selected_features)``.
    """
    print("\n===== Starting Feature Selection =====")

    # ---- Step 1: correlation of every feature with the target ----
    print("Step 1: Calculating feature-target correlations")

    feature_target_corr = {
        name: abs(np.corrcoef(X_train[name], y_train)[0, 1])
        for name in X_train.columns
    }

    feature_corr_df = pd.DataFrame({
        'Feature': list(feature_target_corr.keys()),
        'Target_Correlation': list(feature_target_corr.values())
    })
    feature_corr_df = feature_corr_df.sort_values('Target_Correlation', ascending=False)

    # Bar chart of the 20 features most correlated with the target
    plt.figure(figsize=(12, 10))
    sns.barplot(x='Target_Correlation', y='Feature', data=feature_corr_df.head(20))
    plt.title('Feature Correlation with Target Variable')
    plt.tight_layout()
    plt.savefig(f'{plots_dir}/target_correlation.png')
    plt.close()

    print("Top 10 features with highest target correlation:")
    print(feature_corr_df.head(10))

    # ---- Step 2: prune mutually redundant features ----
    print("\nStep 2: Removing redundant features")

    corr_matrix = X_train.corr().abs()
    # Keep only the strict upper triangle so that each pair is inspected once
    strict_upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(strict_upper)

    # Heatmap of the absolute correlation matrix (upper triangle masked out)
    plt.figure(figsize=(16, 14))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, cmap='coolwarm', center=0, mask=mask,
                square=True, linewidths=.5, annot=False, fmt='.2f')
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig(f'{plots_dir}/feature_correlation.png')
    plt.close()

    correlation_threshold = 0.7
    to_drop = set()

    for i, row_name in enumerate(upper.index):
        for col_name in upper.columns[i:]:
            # Masked (NaN) cells compare False and are skipped as well
            if not (upper.loc[row_name, col_name] > correlation_threshold):
                continue
            # Of a redundant pair, discard the one less related to the target
            row_wins = feature_target_corr[row_name] > feature_target_corr[col_name]
            to_drop.add(col_name if row_wins else row_name)

    print(f"Removing {len(to_drop)} highly correlated redundant features:")
    print(", ".join(to_drop))

    redundant = list(to_drop)
    X_train_filtered = X_train.drop(columns=redundant)
    X_test_filtered = X_test.drop(columns=redundant)
    X_val_filtered = X_val.drop(columns=redundant)

    # ---- Step 3: model-based importance ----
    print("\nStep 3: Feature selection based on model importance")

    # Small fixed-configuration LightGBM model, used only to score features
    feature_selector = lgb.LGBMClassifier(
        objective='binary',
        metric='auc',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.1,
        n_estimators=100,
        random_state=42
    )
    feature_selector.fit(X_train_filtered, y_train)

    feature_importance_df = pd.DataFrame({
        'Feature': X_train_filtered.columns,
        'Importance': feature_selector.feature_importances_
    }).sort_values('Importance', ascending=False)

    # Bar chart of the 20 most important features
    plt.figure(figsize=(12, 10))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
    plt.title('LightGBM Feature Importance')
    plt.tight_layout()
    plt.savefig(f'{plots_dir}/feature_importance.png')
    plt.close()

    # Threshold scales with the data: half of the mean importance
    mean_importance = feature_importance_df['Importance'].mean()
    importance_threshold = mean_importance * 0.5
    print(f"Dynamic threshold: {importance_threshold:.2f} (50% of mean importance)")
    passes_threshold = feature_importance_df['Importance'] > importance_threshold
    selected_features = feature_importance_df.loc[passes_threshold, 'Feature'].tolist()

    # Never return fewer than the 10 top-ranked features
    if len(selected_features) < 10:
        selected_features = feature_importance_df.head(10)['Feature'].tolist()

    print(f"\nFinally selected {len(selected_features)}/{X_train.shape[1]} features")
    print(f"Selected features: {', '.join(selected_features)}")

    return X_train[selected_features], X_test[selected_features], X_val[selected_features], selected_features

# Manual feature selection function
def select_features_manual(X_train, X_test, X_val):
    """Restrict all three splits to a fixed, hand-picked list of features.

    A requested feature is kept only when it is a column of every split.
    Features missing from any split are reported in a warning and skipped, so
    the three returned frames always share the same columns in the same order.

    Args:
        X_train: Training feature frame.
        X_test: Test feature frame.
        X_val: Validation feature frame.

    Returns:
        Tuple ``(X_train_sel, X_test_sel, X_val_sel, selected_features)`` where
        ``selected_features`` is the list of column names actually used.
    """
    print("\n===== Using Manually Specified Features =====")

    # Specify features to keep
    requested_features = [
        # Original tau features
        'tau1', 'tau2', 'tau3', 'tau4',

        # Original g features
        'g1', 'g2', 'g3', 'g4',

        # tau and g interaction terms
        'tau1_g1', 'tau2_g2', 'tau3_g3', 'tau4_g4',

        # tau ratio features
        'tau_ratio'
    ]

    # Check against every split: indexing a frame with an absent column raises
    # KeyError, so validating X_train alone is not sufficient.
    splits = (X_train, X_test, X_val)
    selected_features = [
        name for name in requested_features
        if all(name in split.columns for split in splits)
    ]
    missing_features = [name for name in requested_features if name not in selected_features]
    if missing_features:
        print(f"Warning: The following specified features do not exist: {', '.join(missing_features)}")

    print(f"Using {len(selected_features)} specified features:")
    print(f"Selected features: {', '.join(selected_features)}")

    return X_train[selected_features], X_test[selected_features], X_val[selected_features], selected_features

# Apply feature engineering to all three splits
print("\n===== Performing Feature Engineering =====")
X_train_featured, X_test_featured, X_val_featured = create_features(X_train, X_test, X_val)
print(f"Number of features after engineering: {X_train_featured.shape[1]}")

# Feature selection: the manual list is used here; select_features() is the
# automatic alternative defined above.
(X_train_selected, X_test_selected, X_val_selected,
 selected_features) = select_features_manual(X_train_featured, X_test_featured, X_val_featured)

print(f"Number of features after selection: {X_train_selected.shape[1]}")

# Names used by the tuning, training and evaluation steps below
X_train_final, X_test_final, X_val_final = X_train_selected, X_test_selected, X_val_selected

# 5. Optuna hyperparameter optimization for AUC
print("\n===== Starting Optuna Hyperparameter Tuning Process (AUC) =====")


def objective_auc(trial):
    """Optuna objective: fit one LightGBM candidate and return its validation AUC.

    The model is trained on ``X_train_final`` / ``y_train`` with early stopping
    on ``X_val_final`` / ``y_val`` (both read from module scope). The returned
    value is the ROC AUC of the predicted positive-class probabilities on the
    validation split, which the study maximizes.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        Validation ROC AUC as a float.
    """
    # Define LightGBM parameter search space
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'verbosity': -1,

        # Core parameters
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 30, 1000),
        'num_leaves': trial.suggest_int('num_leaves', 10, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 20),

        # Regularization parameters
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # Row subsampling is only active when subsample_freq is non-zero (it defaults to
        # 0, i.e. disabled), so it is sampled too. Being a suggested value it also ends up
        # in study.best_params and is therefore carried over to the final model.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),

        # Other parameters
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 0.5),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 10.0, log=True),

        # GPU acceleration parameters (hard-coded: there is no CPU fallback in this cell)
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'num_gpu': 1,
        'n_jobs': 16,

        'random_state': 42
    }

    # Create LightGBM model
    model = lgb.LGBMClassifier(**params)

    # Train on the training split; the validation split drives early stopping
    model.fit(
        X_train_final, y_train,
        eval_set=[(X_val_final, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    # Score the positive-class probabilities on the validation split
    y_val_prob = model.predict_proba(X_val_final)[:, 1]
    auc_score = roc_auc_score(y_val, y_val_prob)

    # Print current trial results
    print(f"Trial {trial.number}: AUC = {auc_score:.4f}")

    return auc_score  # Return AUC as optimization target

# Create Optuna study object - optimization direction is to maximize AUC
study_auc = optuna.create_study(direction='maximize', study_name='lightgbm_auc_optimization')

# Run optimization
n_trials = 10  # Can be adjusted based on computational resources and time
print(f"Starting {n_trials} hyperparameter tuning trials...")
start_time = time.time()
study_auc.optimize(objective_auc, n_trials=n_trials)
end_time = time.time()
print(f"Tuning completed! Duration: {end_time - start_time:.2f} seconds")

# Print best parameters and results
print("\n===== Best Parameters (AUC) =====")
print(f"Best AUC score: {study_auc.best_value:.4f}")
print("Best parameter combination:")
for key, value in study_auc.best_params.items():
    print(f"    {key}: {value}")

# Visualize optimization process.
# Optuna's matplotlib helpers create their own figure and return its Axes, so sizing,
# titling, saving and closing are all done through that Axes. Calling plt.figure() first
# would only leave an extra empty figure open and the requested size would be ignored.
history_ax = optuna.visualization.matplotlib.plot_optimization_history(study_auc)
history_fig = history_ax.figure
history_fig.set_size_inches(12, 8)
history_ax.set_title('Optimization History - AUC')
history_fig.tight_layout()
history_fig.savefig(f'{plots_dir}/optuna_auc_history.png')
plt.close(history_fig)

# Visualize hyperparameter importance (same Axes-based handling; Optuna's own title is kept)
importance_ax = optuna.visualization.matplotlib.plot_param_importances(study_auc)
importance_fig = importance_ax.figure
importance_fig.set_size_inches(12, 8)
importance_fig.tight_layout()
importance_fig.savefig(f'{plots_dir}/optuna_auc_param_importances.png')
plt.close(importance_fig)

# 6. Train final model with the best parameters found by the AUC study
print("\n===== Training Final Model with Best Parameters =====")
# study.best_params only holds the sampled values; the fixed training and GPU settings
# are layered on top of a copy of it.
best_params_auc = study_auc.best_params.copy()
best_params_auc.update({
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'random_state': 42,
    'verbosity': -1,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'num_gpu': 1,
    'n_jobs': 2,
})

# Drop 'num_threads' if present, leaving 'n_jobs' as the only thread-count setting
best_params_auc.pop('num_threads', None)

final_model_auc = lgb.LGBMClassifier(**best_params_auc)

# Fit with early stopping on the validation split and report wall-clock time
start_time = time.time()
final_model_auc.fit(
    X_train_final, y_train,
    eval_set=[(X_val_final, y_val)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)]
)
end_time = time.time()
print(f"Final model training completed! Duration: {end_time - start_time:.2f} seconds")

# 7. Evaluate final model on validation set
# Positive-class probabilities feed the AUC; hard labels from predict() feed
# accuracy, F1, the confusion matrix and the classification report.
y_val_prob = final_model_auc.predict_proba(X_val_final)[:, 1]
y_val_pred = final_model_auc.predict(X_val_final)

# Calculate evaluation metrics
val_auc = roc_auc_score(y_val, y_val_prob)
val_acc = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)  # f1_score with its default settings
val_f1_weighted = f1_score(y_val, y_val_pred, average='weighted')

print("\n===== Final Model Performance on Validation Set =====")
print(f"AUC: {val_auc:.4f}")  # AUC is the metric the study optimized
print(f"Weighted F1 Score: {val_f1_weighted:.4f}")
print(f"F1 Score: {val_f1:.4f}")
print(f"Accuracy: {val_acc:.4f}")

print("\nValidation Set Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nValidation Set Classification Report:")
print(classification_report(y_val, y_val_pred, digits=4))

# 8. Evaluate final model on test set
# Same metrics as above. The validation split was also the early-stopping set and the
# tuning target, so the test numbers are the ones not used during model selection.
y_test_prob = final_model_auc.predict_proba(X_test_final)[:, 1]
y_test_pred = final_model_auc.predict(X_test_final)

# Calculate evaluation metrics
test_auc = roc_auc_score(y_test, y_test_prob)
test_acc = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')

print("\n===== Final Model Performance on Test Set =====")
print(f"AUC: {test_auc:.4f}")  # AUC is the metric the study optimized
print(f"Weighted F1 Score: {test_f1_weighted:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"Accuracy: {test_acc:.4f}")

print("\nTest Set Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("\nTest Set Classification Report:")
print(classification_report(y_test, y_test_pred, digits=4))

# 9. Plot ROC and PR curves
# Each figure overlays the validation and test curves and is written to plots_dir.
# The calls below rely on pyplot's "current figure", so their order matters.
# ROC curve
plt.figure(figsize=(12, 10))
# Validation set
fpr_val, tpr_val, _ = roc_curve(y_val, y_val_prob)
plt.plot(fpr_val, tpr_val, label=f'Validation Set (AUC = {val_auc:.4f})')
# Test set
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)
plt.plot(fpr_test, tpr_test, label=f'Test Set (AUC = {test_auc:.4f})')
# Diagonal reference line (TPR == FPR)
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'{plots_dir}/final_model_roc_curves.png')
plt.close()  # release the figure once it is on disk

# PR curve
plt.figure(figsize=(12, 10))
# Validation set: curve plus average precision (AP) for the legend
prec_val, rec_val, _ = precision_recall_curve(y_val, y_val_prob)
avg_prec_val = average_precision_score(y_val, y_val_prob)
plt.plot(rec_val, prec_val, label=f'Validation Set (AP = {avg_prec_val:.4f})')
# Test set
prec_test, rec_test, _ = precision_recall_curve(y_test, y_test_prob)
avg_prec_test = average_precision_score(y_test, y_test_prob)
plt.plot(rec_test, prec_test, label=f'Test Set (AP = {avg_prec_test:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('PR Curve Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'{plots_dir}/final_model_pr_curves.png')
plt.close()  # release the figure once it is on disk

# 10. Feature importance visualization
# lgb.plot_importance draws on a figure of its own and returns the Axes, so the size is
# passed to it directly and the figure is saved/closed through that Axes. A preceding
# plt.figure() call would only leave an unused empty figure open.
importance_ax = lgb.plot_importance(
    final_model_auc, max_num_features=20, importance_type='gain', figsize=(14, 10)
)
importance_ax.set_title("LightGBM Feature Importance (Gain)")
importance_ax.figure.tight_layout()
importance_ax.figure.savefig(f'{plots_dir}/final_model_feature_importance.png')
plt.close(importance_ax.figure)

# 11. Save best model (pickled scikit-learn wrapper)
import joblib
model_path = f'{models_dir}/lgbm_optuna_auc.pkl'
joblib.dump(final_model_auc, model_path)
print(f"\nBest model saved to: {model_path}")

# 12. Performance summary: one row per metric, one column per split
print("\n===== Model Performance Summary =====")
results_df = pd.DataFrame({
    'Metric': ['AUC', 'Accuracy', 'F1 Score', 'Weighted F1 Score'],
    'Validation Set': [val_auc, val_acc, val_f1, val_f1_weighted],
    'Test Set': [test_auc, test_acc, test_f1, test_f1_weighted]
})
print(results_df)

# Save results; the path is built once so the log line always names the file written
performance_path = f'{results_dir}/model_performance.csv'
results_df.to_csv(performance_path, index=False)
print(f"Model performance saved to: {performance_path}")

# 13. Save feature list for future use (one feature name per line)
with open(f'{models_dir}/selected_features.txt', 'w') as f:
    f.write('\n'.join(selected_features))
print(f"Feature list saved to: {models_dir}/selected_features.txt")

print("\nAnalysis completed! All results and visualizations have been saved to the specified directories.")

# XGBoost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, 
                           roc_auc_score, roc_curve, precision_recall_curve, 
                           average_precision_score, f1_score)
import xgboost as xgb  # Using XGBoost instead of LightGBM
import optuna  # For hyperparameter optimization
import time
import warnings
import os
# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# Ensure output directories exist
plots_dir = '/data/jinming/ee_stable/xgboost/plots'
models_dir = '/data/jinming/ee_stable/xgboost/models'
results_dir = '/data/jinming/ee_stable/xgboost/results'
for directory in (plots_dir, models_dir, results_dir):
    os.makedirs(directory, exist_ok=True)

# 1. Load data
# The train/validation/test splits are read from three separate CSV files.
print("===== Loading Data =====")
train_df = pd.read_csv('/data/jinming/ee_stable/data/train.csv')
test_df = pd.read_csv('/data/jinming/ee_stable/data/test.csv')
val_df = pd.read_csv('/data/jinming/ee_stable/data/val.csv')

# 2. Data preparation
print("===== Preparing Data =====")
# Columns removed from the model inputs: the label column itself plus
# 'stab'/'stabf' (presumably other encodings of the target - confirm) and
# p1..p4, which are not used as features here.
non_feature_columns = ['stab', 'stabf_encoded', 'stabf', 'p1', 'p2', 'p3', 'p4']
X_train, X_test, X_val = (
    split.drop(columns=non_feature_columns)
    for split in (train_df, test_df, val_df)
)

# The label for every split is the 'stabf_encoded' column.
y_train, y_test, y_val = (
    split['stabf_encoded'] for split in (train_df, test_df, val_df)
)

print(f"Dataset dimensions - Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

# 3. Feature engineering function
def create_features(X_train, X_test, X_val):
    """Return copies of the three splits extended with derived tau/g features.

    Each input frame must contain the columns tau1..tau4 and g1..g4. The
    inputs are not modified; every split is copied and the same set of
    columns is appended to each copy, in the same order.

    Args:
        X_train: Training feature frame.
        X_test: Test feature frame.
        X_val: Validation feature frame.

    Returns:
        Tuple ``(X_train_new, X_test_new, X_val_new)`` of augmented copies.
    """
    zero_substitute = 0.001  # stands in for an exact 0 in a denominator
    tau_cols = ['tau1', 'tau2', 'tau3', 'tau4']
    g_cols = ['g1', 'g2', 'g3', 'g4']
    node_pairs = list(zip(tau_cols, g_cols))

    def _add_left_to_right(terms):
        # Accumulate strictly in list order so float results are reproducible.
        running = terms[0]
        for term in terms[1:]:
            running = running + term
        return running

    def _augment(frame):
        out = frame.copy()

        # Per-node interaction: tau_i * g_i
        for tau, g in node_pairs:
            out[f'{tau}_{g}'] = out[tau] * out[g]

        # Spread of the delays: largest tau over smallest tau
        taus = out[tau_cols]
        out['tau_ratio'] = taus.max(axis=1) / taus.min(axis=1).replace(0, zero_substitute)

        # Per-node delay-to-elasticity ratio: tau_i / g_i
        for tau, g in node_pairs:
            out[f'{tau}_{g}_ratio'] = out[tau] / out[g].replace(0, zero_substitute)

        # Sum of the four g values
        out['total_elasticity'] = _add_left_to_right([out[g] for g in g_cols])

        # Spread of the elasticities: largest g over smallest g
        gs = out[g_cols]
        out['elasticity_disparity'] = gs.max(axis=1) / gs.min(axis=1).replace(0, zero_substitute)

        # Quadratic delay terms
        for tau in tau_cols:
            out[f'{tau}_squared'] = out[tau] ** 2

        # tau-weighted mean of g: sum(tau_i * g_i) / (sum(tau_i) + 0.001)
        weighted_g = _add_left_to_right([out[tau] * out[g] for tau, g in node_pairs])
        tau_total = _add_left_to_right([out[tau] for tau in tau_cols])
        out['tau_g_correlation'] = weighted_g / (tau_total + 0.001)

        # Harmonic mean of the four delays: 4 / sum(1 / tau_i)
        inverse_total = _add_left_to_right(
            [1 / out[tau].replace(0, zero_substitute) for tau in tau_cols]
        )
        out['system_response_speed'] = 4 / inverse_total

        return out

    X_train_new = _augment(X_train)
    X_test_new = _augment(X_test)
    X_val_new = _augment(X_val)
    return X_train_new, X_test_new, X_val_new

# Manual feature selection function
def select_features_manual(X_train, X_test, X_val):
    """Restrict all three splits to a fixed, hand-picked list of features.

    Requested names that are absent from ``X_train`` are reported with a
    warning and dropped from the list; the remaining names are then used to
    index every split.

    Args:
        X_train: Training frame; its columns decide which names are kept.
        X_test: Test frame, indexed with the same surviving names.
        X_val: Validation frame, indexed with the same surviving names.

    Returns:
        Tuple ``(X_train_sel, X_test_sel, X_val_sel, selected_features)``
        where ``selected_features`` is the list of names actually used.
    """
    print("\n===== Using Manually Specified Features =====")

    requested = (
        ['tau1', 'tau2', 'tau3', 'tau4']                # raw delays
        + ['g1', 'g2', 'g3', 'g4']                      # raw elasticities
        + ['tau1_g1', 'tau2_g2', 'tau3_g3', 'tau4_g4']  # tau * g interactions
        + ['tau_ratio']                                 # max/min delay ratio
    )

    train_columns = X_train.columns
    absent = [name for name in requested if name not in train_columns]
    if absent:
        print(f"Warning: The following specified features do not exist: {', '.join(absent)}")

    # Keep the requested order, minus anything the training frame lacks.
    selected_features = [name for name in requested if name in train_columns]

    print(f"Using {len(selected_features)} specified features:")
    print(f"Selected features: {', '.join(selected_features)}")

    train_subset = X_train[selected_features]
    test_subset = X_test[selected_features]
    val_subset = X_val[selected_features]
    return train_subset, test_subset, val_subset, selected_features

# Apply feature engineering
# create_features() returns augmented copies in (train, test, val) order.
print("\n===== Performing Feature Engineering =====")
engineered_splits = create_features(X_train, X_test, X_val)
X_train_featured, X_test_featured, X_val_featured = engineered_splits
print(f"Number of features after engineering: {X_train_featured.shape[1]}")

# Apply feature selection using manual method
# The fourth return value is the list of column names that were kept.
selection_result = select_features_manual(
    X_train_featured, X_test_featured, X_val_featured
)
X_train_final, X_test_final, X_val_final, selected_features = selection_result

print(f"Number of features after selection: {X_train_final.shape[1]}")

# 5. Hyperparameter optimization for XGBoost with Optuna (GPU accelerated)
print("\n===== 开始 Optuna GPU加速调参过程 (AUC) =====")

def objective_auc(trial):
    """Optuna objective: train one XGBoost model and return its validation AUC.

    Samples a hyperparameter set from ``trial``, trains with early stopping
    using the native ``xgb.train`` API, and scores the validation split.
    Reads the module-level ``X_train_final``/``y_train`` and
    ``X_val_final``/``y_val``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        ROC AUC on the validation set, computed with the trees up to and
        including the best early-stopping iteration.
    """
    # XGBoost hyperparameter search space
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'verbosity': 0,

        # Core parameters
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),

        # Regularization parameters
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),

        # GPU acceleration.
        # NOTE(review): 'gpu_hist' is deprecated since XGBoost 2.0 in favour of
        # tree_method='hist' together with device='cuda'. Both spellings are
        # kept here, presumably for cross-version compatibility - confirm
        # against the pinned XGBoost version before simplifying.
        'tree_method': 'gpu_hist',
        'device': 'cuda',

        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),
        'random_state': 42
    }

    # Use the native API (xgb.train), which works with DMatrix inputs.
    dtrain = xgb.DMatrix(X_train_final, label=y_train)
    dval = xgb.DMatrix(X_val_final, label=y_val)

    # Evaluation sets; early stopping monitors the last entry ('val').
    evals = [(dtrain, 'train'), (dval, 'val')]
    evals_result = {}

    # Train with early stopping
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=2000,  # upper bound on boosting rounds
        evals=evals,
        early_stopping_rounds=50,
        evals_result=evals_result,
        verbose_eval=False  # no per-round logging; the trial prints one line below
    )

    # The native Booster.predict() uses every trained tree, including the
    # rounds added after the best validation score was reached. Limit
    # prediction to the best iteration so the returned score reflects the
    # model early stopping actually selected.
    best_rounds = (0, model.best_iteration + 1)
    y_val_prob = model.predict(dval, iteration_range=best_rounds)

    # Validation AUC for this trial
    auc_score = roc_auc_score(y_val, y_val_prob)

    # Log the outcome of the current trial
    print(f"Trial {trial.number}: AUC = {auc_score:.4f}")

    return auc_score  # AUC is the optimization target

# Create the Optuna study; the objective returns AUC, so the direction is 'maximize'
study_auc = optuna.create_study(
    study_name='xgboost_gpu_auc_optimization',
    direction='maximize',
)

# Run the optimization
n_trials = 10  # adjust to the available compute and time budget
print(f"开始运行 {n_trials} 次GPU加速调参试验...")
start_time = time.time()
study_auc.optimize(objective_auc, n_trials=n_trials)
end_time = time.time()
print(f"GPU调参完成! 耗时: {end_time - start_time:.2f}秒")

# Report the best score and the parameter set that produced it
print("\n===== 最佳参数 (AUC) =====")
print(f"最佳AUC分数: {study_auc.best_value:.4f}")
print("最佳参数组合:")
best_trial_params = study_auc.best_params
for key, value in best_trial_params.items():
    print(f"    {key}: {value}")


# Visualize optimization process
# Optuna's matplotlib plotters create their own figure and return its Axes.
# Opening a figure with plt.figure(figsize=...) beforehand therefore left an
# empty figure open and did not size the real plot, so the returned figure
# is resized, saved and closed explicitly instead.
history_ax = optuna.visualization.matplotlib.plot_optimization_history(study_auc)
history_fig = history_ax.figure
history_fig.set_size_inches(12, 8)
history_ax.set_title('Optimization History - AUC')
history_fig.tight_layout()
history_fig.savefig(f'{plots_dir}/optuna_auc_history.png')
plt.close(history_fig)

# Visualize hyperparameter importance
importance_ax = optuna.visualization.matplotlib.plot_param_importances(study_auc)
param_importance_fig = importance_ax.figure
param_importance_fig.set_size_inches(12, 8)
param_importance_fig.tight_layout()
param_importance_fig.savefig(f'{plots_dir}/optuna_auc_param_importances.png')
plt.close(param_importance_fig)

# 6. Train the final model on the GPU with the best parameters
print("\n===== 使用GPU训练最终模型 =====")

# Start from the tuned values and add the fixed settings that were not part
# of the search space.
best_params_auc = {
    **study_auc.best_params,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',  # GPU histogram algorithm
    'device': 'cuda',           # run on a CUDA device
    'verbosity': 1,
    'random_state': 42,
}

# Build the data matrices for all three splits
dtrain = xgb.DMatrix(X_train_final, label=y_train)
dval = xgb.DMatrix(X_val_final, label=y_val)
dtest = xgb.DMatrix(X_test_final, label=y_test)

# Evaluation sets reported during training
evals = [(dtrain, 'train'), (dval, 'val')]
evals_result = {}

# Time the training run
start_time = time.time()

# Train the final model with the native API
final_model = xgb.train(
    params=best_params_auc,
    dtrain=dtrain,
    num_boost_round=3000,       # generous cap; early stopping ends training sooner
    evals=evals,
    early_stopping_rounds=50,
    evals_result=evals_result,
    verbose_eval=50,            # log progress every 50 rounds
)

# Stop the timer and report the elapsed time
end_time = time.time()
print(f"GPU训练完成！耗时: {end_time - start_time:.2f}秒")

# 7. Evaluate the final model on the validation set
# The native Booster.predict() scores with every trained tree, including the
# rounds added after the best validation score was reached. Restrict
# prediction to the best early-stopping iteration so the reported metrics
# describe the model that early stopping selected.
best_iteration_range = (0, final_model.best_iteration + 1)

y_val_prob = final_model.predict(dval, iteration_range=best_iteration_range)
y_val_pred = (y_val_prob > 0.5).astype(int)  # threshold probabilities into class labels

# Validation metrics
val_auc = roc_auc_score(y_val, y_val_prob)
val_acc = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_f1_weighted = f1_score(y_val, y_val_pred, average='weighted')

print("\n===== 最终模型在验证集上的性能 =====")
print(f"AUC: {val_auc:.4f}")
print(f"Weighted F1 Score: {val_f1_weighted:.4f}")
print(f"F1 Score: {val_f1:.4f}")
print(f"Accuracy: {val_acc:.4f}")

# Validation confusion matrix and per-class report
print("\n验证集混淆矩阵:")
print(confusion_matrix(y_val, y_val_pred))
print("\n验证集分类报告:")
print(classification_report(y_val, y_val_pred, digits=4))

# 8. Evaluate the final model on the test set (same iteration range and
# the same 0.5 decision threshold as for validation)
y_test_prob = final_model.predict(dtest, iteration_range=best_iteration_range)
y_test_pred = (y_test_prob > 0.5).astype(int)

# Test metrics
test_auc = roc_auc_score(y_test, y_test_prob)
test_acc = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')

print("\n===== 最终模型在测试集上的性能 =====")
print(f"AUC: {test_auc:.4f}")
print(f"Weighted F1 Score: {test_f1_weighted:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"Accuracy: {test_acc:.4f}")

# 9. Plot ROC and PR curves
# Both figures overlay the validation and test curves so the two splits can
# be compared directly; each figure is saved under plots_dir and then closed.

# --- ROC curve ---
roc_fig, roc_ax = plt.subplots(figsize=(12, 10))
fpr_val, tpr_val, _ = roc_curve(y_val, y_val_prob)
roc_ax.plot(fpr_val, tpr_val, label=f'Validation Set (AUC = {val_auc:.4f})')
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)
roc_ax.plot(fpr_test, tpr_test, label=f'Test Set (AUC = {test_auc:.4f})')
# Diagonal = performance of a classifier that guesses at random
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve Comparison (XGBoost Model)')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)
roc_fig.tight_layout()
roc_fig.savefig(f'{plots_dir}/final_model_roc_curves.png')
plt.close(roc_fig)

# --- Precision-recall curve ---
pr_fig, pr_ax = plt.subplots(figsize=(12, 10))
prec_val, rec_val, _ = precision_recall_curve(y_val, y_val_prob)
avg_prec_val = average_precision_score(y_val, y_val_prob)
pr_ax.plot(rec_val, prec_val, label=f'Validation Set (AP = {avg_prec_val:.4f})')
prec_test, rec_test, _ = precision_recall_curve(y_test, y_test_prob)
avg_prec_test = average_precision_score(y_test, y_test_prob)
pr_ax.plot(rec_test, prec_test, label=f'Test Set (AP = {avg_prec_test:.4f})')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('PR Curve Comparison (XGBoost Model)')
pr_ax.legend()
pr_ax.grid(True, alpha=0.3)
pr_fig.tight_layout()
pr_fig.savefig(f'{plots_dir}/final_model_pr_curves.png')
plt.close(pr_fig)

# 10. Feature importance visualization
# The trained booster in this section is `final_model`; the previous name
# `final_model_auc` is never assigned here, so it either raised NameError or
# picked up a stale model left over from an earlier notebook cell.
# xgb.plot_importance() also opens its own figure unless it is given an Axes,
# so the Axes is created here to make the 14x10 size apply and to ensure the
# figure that is saved is the one that gets closed.
importance_fig, importance_ax = plt.subplots(figsize=(14, 10))
xgb.plot_importance(
    final_model,
    ax=importance_ax,
    max_num_features=20,       # show only the 20 highest-ranked features
    importance_type='gain',
)
importance_ax.set_title("XGBoost Feature Importance (Gain)")
importance_fig.tight_layout()
importance_fig.savefig(f'{plots_dir}/final_model_feature_importance.png')
plt.close(importance_fig)

# 11. Save best model
# Persist `final_model`, the booster trained in this section. The previous
# reference to `final_model_auc` is not defined here, so it either failed
# with NameError or silently saved a model from an earlier notebook cell.
import joblib
model_path = f'{models_dir}/xgb_optuna_auc.pkl'
joblib.dump(final_model, model_path)
print(f"\nBest model saved to: {model_path}")

# 12. Performance summary
# One row per metric, with the validation and test values side by side.
print("\n===== Model Performance Summary =====")
summary_columns = {
    'Metric': ['AUC', 'Accuracy', 'F1 Score', 'Weighted F1 Score'],
    'Validation Set': [val_auc, val_acc, val_f1, val_f1_weighted],
    'Test Set': [test_auc, test_acc, test_f1, test_f1_weighted],
}
results_df = pd.DataFrame(summary_columns)
print(results_df)

# Save results
performance_csv = f'{results_dir}/model_performance_auc.csv'
results_df.to_csv(performance_csv, index=False)
print(f"Model performance saved to: {performance_csv}")

# 13. Save feature list for future use
# One feature name per line, in the order the model was trained with.
feature_list_path = f'{models_dir}/selected_features.txt'
feature_list_text = '\n'.join(selected_features)
with open(feature_list_path, 'w') as f:
    f.write(feature_list_text)
print(f"Feature list saved to: {feature_list_path}")

print("\nAnalysis completed! All results and visualizations have been saved to the specified directories.")