# XGBoost, LightBGM, CatBoost (normal + advanced)

### 1: Install Required Libraries

In [70]:
# Install required libraries (run this first)
%pip install optuna xgboost lightgbm catboost scikit-learn imbalanced-learn shap plotly matplotlib seaborn nbformat>=4.2.0

zsh:1: 4.2.0 not found
Note: you may need to restart the kernel to use updated packages.


In [71]:

# Import all necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import KFold, StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, matthews_corrcoef, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

# Imbalanced Learning
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, BalancedBaggingClassifier

# Gradient Boosting
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Hyperparameter Optimization
import optuna
from optuna.samplers import TPESampler

# Feature Selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFECV

# Visualization
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Interpretation
import shap

print("All libraries imported successfully!")

All libraries imported successfully!


### Step 2: Enhanced Data Preprocessing with Advanced Techniques

In [72]:
# Load Training Dataset
data = pd.read_csv('DIA_trainingset_RDKit_descriptors.csv')

# extract features and target variable
X_train = data.iloc[:, 2:]
Y_train = data.iloc[:, 0]

# Load Test Dataset
test_data = pd.read_csv('DIA_testset_RDKit_descriptors.csv')
X_test = test_data.iloc[:, 2:]
Y_test = test_data.iloc[:, 0]

In [73]:
X_train.head()

Unnamed: 0,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1.821,1266.407,22.121,16.781,16.781,14.901,9.203,9.203,6.668,6.668,...,0,0,0,0,0,0,0,0,0,0
1,2.363,490.434,11.707,8.752,9.569,7.592,4.854,5.67,3.545,4.661,...,0,0,0,0,0,0,0,1,0,1
2,3.551,93.092,6.784,5.471,5.471,3.417,2.42,2.42,2.82,2.82,...,0,0,0,0,0,0,0,0,0,0
3,2.076,1053.003,21.836,16.995,16.995,14.274,9.926,9.926,7.662,7.662,...,0,0,0,0,0,0,0,0,0,0
4,2.888,549.823,14.629,9.746,9.746,8.752,5.04,5.04,3.601,3.601,...,0,0,0,0,0,0,0,0,0,0


In [74]:
X_test.head()

Unnamed: 0,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1.484,743.207,21.466,18.764,18.764,14.292,12.106,12.106,10.736,10.736,...,0,0,0,0,0,0,0,0,0,0
1,1.472,868.947,21.14,16.736,17.553,14.453,10.268,11.084,7.662,8.746,...,0,0,0,0,0,0,0,0,0,0
2,0.837,1409.004,39.189,32.904,32.904,26.011,20.941,20.941,18.816,18.816,...,0,0,0,0,0,0,0,0,0,0
3,2.406,621.298,13.828,10.297,10.297,9.092,5.847,5.847,4.217,4.217,...,0,0,0,0,0,0,0,0,0,0
4,1.32,2127.996,37.955,30.849,31.666,25.91,18.066,19.115,14.93,16.06,...,1,0,0,0,0,0,0,0,0,0


In [75]:
def advanced_preprocessing(X_train, X_test, y_train, method='robust'):
    """
    Advanced preprocessing with multiple scaling options and outlier handling
    
    Parameters:
    - method: 'standard', 'robust', 'minmax', 'quantile'
    """
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, QuantileTransformer
    
    scalers = {
        'standard': StandardScaler(),
        'robust': RobustScaler(),
        'minmax': MinMaxScaler(),
        'quantile': QuantileTransformer(output_distribution='normal')
    }
    
    scaler = scalers[method]
    
    # Apply scaling
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )
    
    print(f"Applied {method} scaling")
    return X_train_scaled, X_test_scaled, scaler

def remove_highly_correlated_features(X, threshold=0.95):
    """
    Remove highly correlated features using advanced correlation analysis
    """
    corr_matrix = X.corr().abs()
    upper_triangle = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )
    
    # Find features to drop
    to_drop = [column for column in upper_triangle.columns 
               if any(upper_triangle[column] > threshold)]
    
    X_reduced = X.drop(columns=to_drop)
    print(f"Removed {len(to_drop)} highly correlated features (threshold: {threshold})")
    print(f"Features reduced from {X.shape[1]} to {X_reduced.shape[1]}")
    
    return X_reduced, to_drop

# Apply advanced preprocessing
X_train_enhanced, X_test_enhanced, scaler = advanced_preprocessing(
    X_train, X_test, Y_train, method='robust'
)

# Remove highly correlated features
X_train_final, dropped_features = remove_highly_correlated_features(
    X_train_enhanced, threshold=0.95
)
X_test_final = X_test_enhanced.drop(columns=dropped_features)

print(f"Final dataset shape: Training {X_train_final.shape}, Test {X_test_final.shape}")

Applied robust scaling
Removed 31 highly correlated features (threshold: 0.95)
Features reduced from 196 to 165
Final dataset shape: Training (477, 165), Test (120, 165)


### Step 3: Advanced Feature Selection with Multiple Methods

In [76]:
class AdvancedFeatureSelector:
    """
    Comprehensive feature selection using multiple methods
    """
    
    def __init__(self, random_state=42):
        self.random_state = random_state
        self.selected_features = {}
        
    def mutual_information_selection(self, X, y, k='auto'):
        """Enhanced mutual information selection"""
        if k == 'auto':
            k = min(50, X.shape[1] // 2)  # Adaptive k selection
            
        mi_scores = mutual_info_classif(X, y, random_state=self.random_state)
        feature_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
        
        # Select top k features
        selected_features = feature_scores.head(k).index.tolist()
        self.selected_features['mutual_info'] = selected_features
        
        return selected_features, feature_scores
    
    def recursive_feature_elimination(self, X, y, estimator=None):
        """RFECV with cross-validation"""
        if estimator is None:
            estimator = RandomForestClassifier(n_estimators=100, random_state=self.random_state)
            
        rfecv = RFECV(
            estimator=estimator,
            step=1,
            cv=5,
            scoring='roc_auc',
            n_jobs=-1
        )
        
        rfecv.fit(X, y)
        selected_features = X.columns[rfecv.support_].tolist()
        self.selected_features['rfecv'] = selected_features
        
        return selected_features, rfecv
    
    def variance_threshold_selection(self, X, threshold=0.01):
        """Remove low variance features"""
        from sklearn.feature_selection import VarianceThreshold
        
        selector = VarianceThreshold(threshold=threshold)
        selector.fit(X)
        selected_features = X.columns[selector.get_support()].tolist()
        self.selected_features['variance'] = selected_features
        
        return selected_features, selector
    
    def statistical_selection(self, X, y, method='f_classif', k=50):
        """Statistical feature selection"""
        from sklearn.feature_selection import f_classif, chi2
        
        if method == 'f_classif':
            selector = SelectKBest(f_classif, k=k)
        elif method == 'chi2':
            # Ensure non-negative values for chi2
            X_positive = X - X.min() + 1e-5
            selector = SelectKBest(chi2, k=k)
            X = X_positive
            
        selector.fit(X, y)
        selected_features = X.columns[selector.get_support()].tolist()
        self.selected_features['statistical'] = selected_features
        
        return selected_features, selector
    
    def ensemble_selection(self, X, y, methods=['mutual_info', 'rfecv', 'statistical']):
        """Combine multiple selection methods"""
        all_selected = []
        
        if 'mutual_info' in methods:
            features, _ = self.mutual_information_selection(X, y)
            all_selected.extend(features)
            
        if 'rfecv' in methods:
            features, _ = self.recursive_feature_elimination(X, y)
            all_selected.extend(features)
            
        if 'statistical' in methods:
            features, _ = self.statistical_selection(X, y)
            all_selected.extend(features)
            
        # Count feature frequency
        feature_counts = pd.Series(all_selected).value_counts()
        
        # Select features that appear in at least 2 methods
        ensemble_features = feature_counts[feature_counts >= 2].index.tolist()
        self.selected_features['ensemble'] = ensemble_features
        
        return ensemble_features, feature_counts

# Apply advanced feature selection
feature_selector = AdvancedFeatureSelector(random_state=42)

# Run ensemble selection
ensemble_features, feature_counts = feature_selector.ensemble_selection(
    X_train_final, Y_train, methods=['mutual_info', 'rfecv', 'statistical']
)

print(f"Ensemble selection chose {len(ensemble_features)} features")
print(f"Top 10 most selected features:")
print(feature_counts.head(10))

# Create final feature set
X_train_selected = X_train_final[ensemble_features]
X_test_selected = X_test_final[ensemble_features]

print(f"Final feature set shape: {X_train_selected.shape}")

Ensemble selection chose 70 features
Top 10 most selected features:
fr_Ar_NH                   3
fr_NH2                     3
fr_piperdine               3
SlogP_VSA10                3
fr_amide                   3
EState_VSA4                3
NumSaturatedCarbocycles    3
HallKierAlpha              3
SMR_VSA9                   3
EState_VSA10               3
Name: count, dtype: int64
Final feature set shape: (477, 70)


### Step 4: Advanced Model Development with Optuna Optimization

In [77]:
class OptunaModelOptimizer:
    """
    Advanced hyperparameter optimization using Optuna
    """
    
    def __init__(self, n_trials=100, cv_folds=5, random_state=42):
        self.n_trials = n_trials
        self.cv_folds = cv_folds
        self.random_state = random_state
        self.best_params = {}
        self.study_results = {}
    
    def objective_function(self, trial, model_type, X, y):
        """Unified objective function for all models"""
        
        if model_type == 'random_forest':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                'max_depth': trial.suggest_int('max_depth', 5, 30),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.5, 0.7]),
                'random_state': self.random_state,
                'n_jobs': -1
            }
            model = RandomForestClassifier(**params)
            
        elif model_type == 'xgboost':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
                'random_state': self.random_state,
                'eval_metric': 'logloss',
                'verbosity': 0
            }
            model = xgb.XGBClassifier(**params)
            
        elif model_type == 'lightgbm':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'num_leaves': trial.suggest_int('num_leaves', 10, 300),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
                'random_state': self.random_state,
                'verbosity': -1,
                'force_col_wise': True
            }
            model = lgb.LGBMClassifier(**params)
            
        elif model_type == 'catboost':
            params = {
                'iterations': trial.suggest_int('iterations', 100, 1000),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'depth': trial.suggest_int('depth', 3, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
                'random_state': self.random_state,
                'verbose': False
            }
            model = cb.CatBoostClassifier(**params)
            
        elif model_type == 'balanced_rf':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                'max_depth': trial.suggest_int('max_depth', 5, 30),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.5, 0.7]),
                'random_state': self.random_state,
                'n_jobs': -1
            }
            model = BalancedRandomForestClassifier(**params)
        
        # Cross-validation
        cv = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)
        scores = cross_validate(
            model, X, y, 
            cv=cv, 
            scoring=['roc_auc', 'accuracy', 'f1'],
            n_jobs=-1
        )
        
        # Return weighted score (prioritize AUC and F1)
        return 0.5 * scores['test_roc_auc'].mean() + 0.3 * scores['test_f1'].mean() + 0.2 * scores['test_accuracy'].mean()
    
    def optimize_model(self, model_type, X, y):
        """Optimize a specific model type"""
        print(f"Optimizing {model_type}...")

        # Create or load Optuna study
        study_name = f"{model_type}_optimization"
        storage_name = f'optuna_{model_type}.db'
        study = optuna.create_study(
                direction='maximize',
                sampler=TPESampler(seed=self.random_state),
                study_name=study_name,
                storage=f'sqlite:///{storage_name}',
                load_if_exists=True
            )
        
        study.optimize(
            lambda trial: self.objective_function(trial, model_type, X, y),
            n_trials=self.n_trials,
            show_progress_bar=True
        )
        
        self.best_params[model_type] = study.best_params
        self.study_results[model_type] = study
        
        print(f"Best {model_type} score: {study.best_value:.4f}")
        print(f"Best {model_type} params: {study.best_params}")
        
        return study.best_params, study.best_value
    
    def optimize_all_models(self, X, y, models=['balanced_rf', 'xgboost', 'lightgbm', 'catboost']):
        """Optimize all specified models"""
        results = {}
        
        for model_type in models:
            params, score = self.optimize_model(model_type, X, y)
            results[model_type] = {'params': params, 'score': score}
            
        return results

    # Add this method to your OptunaModelOptimizer class
    def create_optimized_models(self, optimization_results):
        """Create models with optimized parameters"""
        models = {}
        
        for model_type, result in optimization_results.items():
            params = result['params']
            params['random_state'] = self.random_state
            
            if model_type == 'balanced_rf':
                params['n_jobs'] = -1
                models[model_type] = BalancedRandomForestClassifier(**params)
            elif model_type == 'xgboost':
                params['eval_metric'] = 'logloss'
                params['verbosity'] = 0
                models[model_type] = xgb.XGBClassifier(**params)
            elif model_type == 'lightgbm':
                params['verbosity'] = -1
                params['force_col_wise'] = True
                models[model_type] = lgb.LGBMClassifier(**params)
            elif model_type == 'catboost':
                params['verbose'] = False
                models[model_type] = cb.CatBoostClassifier(**params)
                
        return models

# Initialize optimizer
optimizer = OptunaModelOptimizer(n_trials=20, cv_folds=5)  # Reduce trials for demo

# Optimize models (this will take some time)
optimization_results = optimizer.optimize_all_models(
    X_train_selected, Y_train, 
    models=['balanced_rf', 'xgboost', 'lightgbm']  # Start with these three
)

print("\nOptimization completed!")
for model, result in optimization_results.items():
    print(f"{model}: Score = {result['score']:.4f}")

[I 2025-08-16 20:48:43,686] Using an existing study with name 'balanced_rf_optimization' instead of creating a new one.


Optimizing balanced_rf...


Best trial: 32. Best value: 0.776476:   5%|▌         | 1/20 [00:01<00:21,  1.12s/it]

[I 2025-08-16 20:48:44,811] Trial 40 finished with value: 0.7561016587002256 and parameters: {'n_estimators': 365, 'max_depth': 23, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  10%|█         | 2/20 [00:02<00:21,  1.20s/it]

[I 2025-08-16 20:48:46,067] Trial 41 finished with value: 0.7706543646074315 and parameters: {'n_estimators': 423, 'max_depth': 11, 'min_samples_split': 12, 'min_samples_leaf': 3, 'max_features': 0.7}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  15%|█▌        | 3/20 [00:03<00:20,  1.18s/it]

[I 2025-08-16 20:48:47,227] Trial 42 finished with value: 0.7761980104344234 and parameters: {'n_estimators': 418, 'max_depth': 11, 'min_samples_split': 13, 'min_samples_leaf': 3, 'max_features': 0.7}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  20%|██        | 4/20 [00:04<00:16,  1.02s/it]

[I 2025-08-16 20:48:47,986] Trial 43 finished with value: 0.7601862025572432 and parameters: {'n_estimators': 449, 'max_depth': 14, 'min_samples_split': 14, 'min_samples_leaf': 4, 'max_features': 0.7}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  25%|██▌       | 5/20 [00:04<00:13,  1.13it/s]

[I 2025-08-16 20:48:48,644] Trial 44 finished with value: 0.764828361065712 and parameters: {'n_estimators': 403, 'max_depth': 9, 'min_samples_split': 16, 'min_samples_leaf': 2, 'max_features': 0.7}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  30%|███       | 6/20 [00:05<00:10,  1.31it/s]

[I 2025-08-16 20:48:49,164] Trial 45 finished with value: 0.7683612083937699 and parameters: {'n_estimators': 314, 'max_depth': 7, 'min_samples_split': 13, 'min_samples_leaf': 3, 'max_features': 0.7}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  35%|███▌      | 7/20 [00:05<00:08,  1.53it/s]

[I 2025-08-16 20:48:49,597] Trial 46 finished with value: 0.7342674307521253 and parameters: {'n_estimators': 348, 'max_depth': 16, 'min_samples_split': 6, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  40%|████      | 8/20 [00:06<00:06,  1.85it/s]

[I 2025-08-16 20:48:49,896] Trial 47 finished with value: 0.7757295242659242 and parameters: {'n_estimators': 163, 'max_depth': 13, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 0.7}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  45%|████▌     | 9/20 [00:06<00:05,  2.13it/s]

[I 2025-08-16 20:48:50,206] Trial 48 finished with value: 0.7653505376666666 and parameters: {'n_estimators': 177, 'max_depth': 15, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  50%|█████     | 10/20 [00:06<00:04,  2.28it/s]

[I 2025-08-16 20:48:50,578] Trial 49 finished with value: 0.7751345202646509 and parameters: {'n_estimators': 200, 'max_depth': 18, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 0.7}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 32. Best value: 0.776476:  55%|█████▌    | 11/20 [00:07<00:03,  2.65it/s]

[I 2025-08-16 20:48:50,817] Trial 50 finished with value: 0.7604589275546857 and parameters: {'n_estimators': 168, 'max_depth': 18, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 32 with value: 0.7764759149028794.


Best trial: 51. Best value: 0.782974:  60%|██████    | 12/20 [00:07<00:02,  2.70it/s]

[I 2025-08-16 20:48:51,172] Trial 51 finished with value: 0.7829742420445939 and parameters: {'n_estimators': 187, 'max_depth': 19, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 0.7}. Best is trial 51 with value: 0.7829742420445939.


Best trial: 51. Best value: 0.782974:  65%|██████▌   | 13/20 [00:07<00:02,  2.62it/s]

[I 2025-08-16 20:48:51,577] Trial 52 finished with value: 0.7790575294294797 and parameters: {'n_estimators': 219, 'max_depth': 21, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 0.7}. Best is trial 51 with value: 0.7829742420445939.


Best trial: 51. Best value: 0.782974:  70%|███████   | 14/20 [00:08<00:02,  2.63it/s]

[I 2025-08-16 20:48:51,953] Trial 53 finished with value: 0.7770926811039597 and parameters: {'n_estimators': 206, 'max_depth': 22, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 0.7}. Best is trial 51 with value: 0.7829742420445939.


Best trial: 51. Best value: 0.782974:  75%|███████▌  | 15/20 [00:08<00:01,  2.95it/s]

[I 2025-08-16 20:48:52,198] Trial 54 finished with value: 0.7703187241095982 and parameters: {'n_estimators': 116, 'max_depth': 23, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 0.7}. Best is trial 51 with value: 0.7829742420445939.


Best trial: 51. Best value: 0.782974:  80%|████████  | 16/20 [00:08<00:01,  2.74it/s]

[I 2025-08-16 20:48:52,621] Trial 55 finished with value: 0.7713599052124769 and parameters: {'n_estimators': 237, 'max_depth': 25, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 0.7}. Best is trial 51 with value: 0.7829742420445939.


Best trial: 51. Best value: 0.782974:  85%|████████▌ | 17/20 [00:09<00:01,  2.85it/s]

[I 2025-08-16 20:48:52,940] Trial 56 finished with value: 0.7747666559997808 and parameters: {'n_estimators': 191, 'max_depth': 22, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 51 with value: 0.7829742420445939.


Best trial: 51. Best value: 0.782974:  90%|█████████ | 18/20 [00:09<00:00,  3.00it/s]

[I 2025-08-16 20:48:53,232] Trial 57 finished with value: 0.7747560252252101 and parameters: {'n_estimators': 151, 'max_depth': 21, 'min_samples_split': 11, 'min_samples_leaf': 2, 'max_features': 0.7}. Best is trial 51 with value: 0.7829742420445939.


Best trial: 51. Best value: 0.782974:  95%|█████████▌| 19/20 [00:10<00:00,  2.69it/s]

[I 2025-08-16 20:48:53,693] Trial 58 finished with value: 0.7675995611281432 and parameters: {'n_estimators': 254, 'max_depth': 26, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.7}. Best is trial 51 with value: 0.7829742420445939.


Best trial: 51. Best value: 0.782974: 100%|██████████| 20/20 [00:10<00:00,  1.96it/s]
[I 2025-08-16 20:48:53,938] Using an existing study with name 'xgboost_optimization' instead of creating a new one.


[I 2025-08-16 20:48:53,918] Trial 59 finished with value: 0.7481427916227212 and parameters: {'n_estimators': 143, 'max_depth': 19, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 51 with value: 0.7829742420445939.
Best balanced_rf score: 0.7830
Best balanced_rf params: {'n_estimators': 187, 'max_depth': 19, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 0.7}
Optimizing xgboost...


Best trial: 28. Best value: 0.767612:   5%|▌         | 1/20 [00:00<00:04,  4.63it/s]

[I 2025-08-16 20:48:54,153] Trial 40 finished with value: 0.6698805525615902 and parameters: {'n_estimators': 764, 'learning_rate': 0.10335034035291735, 'max_depth': 4, 'subsample': 0.7346033694891198, 'colsample_bytree': 0.9453637722522475, 'reg_alpha': 9.076647952825178, 'reg_lambda': 7.255286348540881}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  10%|█         | 2/20 [00:00<00:04,  4.10it/s]

[I 2025-08-16 20:48:54,416] Trial 41 finished with value: 0.7519497240202109 and parameters: {'n_estimators': 940, 'learning_rate': 0.13021064098726698, 'max_depth': 3, 'subsample': 0.8860717823102173, 'colsample_bytree': 0.684407032876449, 'reg_alpha': 0.4868924843565608, 'reg_lambda': 2.5914417721343153}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  15%|█▌        | 3/20 [00:00<00:03,  4.27it/s]

[I 2025-08-16 20:48:54,639] Trial 42 finished with value: 0.7554441278579156 and parameters: {'n_estimators': 881, 'learning_rate': 0.15335089415737546, 'max_depth': 3, 'subsample': 0.8747051944758344, 'colsample_bytree': 0.6228115785815763, 'reg_alpha': 0.36916190316031794, 'reg_lambda': 1.5012432938674918}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  25%|██▌       | 5/20 [00:01<00:03,  4.36it/s]

[I 2025-08-16 20:48:54,927] Trial 43 finished with value: 0.7559854827328237 and parameters: {'n_estimators': 932, 'learning_rate': 0.13590476348186503, 'max_depth': 4, 'subsample': 0.8563356293120413, 'colsample_bytree': 0.7423117444775065, 'reg_alpha': 0.7914114206438176, 'reg_lambda': 2.949401859386044}. Best is trial 28 with value: 0.7676124729674524.
[I 2025-08-16 20:48:55,111] Trial 44 finished with value: 0.7481907819597473 and parameters: {'n_estimators': 686, 'learning_rate': 0.16018820677074277, 'max_depth': 5, 'subsample': 0.9183737392792987, 'colsample_bytree': 0.6931856444214527, 'reg_alpha': 2.159866405584694, 'reg_lambda': 2.0542819729404003}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  35%|███▌      | 7/20 [00:01<00:02,  4.87it/s]

[I 2025-08-16 20:48:55,369] Trial 45 finished with value: 0.7541725480352262 and parameters: {'n_estimators': 379, 'learning_rate': 0.11154755425405063, 'max_depth': 11, 'subsample': 0.8976932191374282, 'colsample_bytree': 0.7821773401702524, 'reg_alpha': 1.1031144520498692, 'reg_lambda': 4.599502548887239}. Best is trial 28 with value: 0.7676124729674524.
[I 2025-08-16 20:48:55,504] Trial 46 finished with value: 0.7512914151343584 and parameters: {'n_estimators': 305, 'learning_rate': 0.18033943700313404, 'max_depth': 3, 'subsample': 0.8781705026244634, 'colsample_bytree': 0.7337835976606891, 'reg_alpha': 0.27537669656919983, 'reg_lambda': 0.39705393024338964}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  40%|████      | 8/20 [00:01<00:02,  4.51it/s]

[I 2025-08-16 20:48:55,760] Trial 47 finished with value: 0.7430869495115308 and parameters: {'n_estimators': 851, 'learning_rate': 0.08913303467488073, 'max_depth': 4, 'subsample': 0.6062041527976908, 'colsample_bytree': 0.7643816740445362, 'reg_alpha': 1.5108640676543241, 'reg_lambda': 0.9559190607363379}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  50%|█████     | 10/20 [00:02<00:02,  4.68it/s]

[I 2025-08-16 20:48:55,986] Trial 48 finished with value: 0.7654326527260927 and parameters: {'n_estimators': 719, 'learning_rate': 0.20052038277428774, 'max_depth': 13, 'subsample': 0.9068413111625275, 'colsample_bytree': 0.6739035838784846, 'reg_alpha': 0.9857379047459555, 'reg_lambda': 3.764109032797897}. Best is trial 28 with value: 0.7676124729674524.
[I 2025-08-16 20:48:56,179] Trial 49 finished with value: 0.7468179738531429 and parameters: {'n_estimators': 696, 'learning_rate': 0.20275405347809844, 'max_depth': 13, 'subsample': 0.8146161175070487, 'colsample_bytree': 0.6107096782175453, 'reg_alpha': 3.050810835635949, 'reg_lambda': 3.8172800236183533}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  55%|█████▌    | 11/20 [00:02<00:01,  5.10it/s]

[I 2025-08-16 20:48:56,336] Trial 50 finished with value: 0.728745571853629 and parameters: {'n_estimators': 636, 'learning_rate': 0.25889789248441736, 'max_depth': 14, 'subsample': 0.9119265046237728, 'colsample_bytree': 0.6428067760397875, 'reg_alpha': 4.327456111804915, 'reg_lambda': 0.0718441916695658}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  60%|██████    | 12/20 [00:02<00:01,  4.74it/s]

[I 2025-08-16 20:48:56,581] Trial 51 finished with value: 0.7483330280031225 and parameters: {'n_estimators': 733, 'learning_rate': 0.22040746024504226, 'max_depth': 13, 'subsample': 0.9367551847248284, 'colsample_bytree': 0.6728989675065828, 'reg_alpha': 0.73430220388892, 'reg_lambda': 5.2725350288792505}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  70%|███████   | 14/20 [00:03<00:01,  5.03it/s]

[I 2025-08-16 20:48:56,840] Trial 52 finished with value: 0.7553295338505654 and parameters: {'n_estimators': 774, 'learning_rate': 0.1951297226600607, 'max_depth': 11, 'subsample': 0.8561533616437544, 'colsample_bytree': 0.7029797910015944, 'reg_alpha': 0.2501497473256008, 'reg_lambda': 3.2881057064065975}. Best is trial 28 with value: 0.7676124729674524.
[I 2025-08-16 20:48:56,977] Trial 53 finished with value: 0.7280581997378547 and parameters: {'n_estimators': 448, 'learning_rate': 0.13872509816398101, 'max_depth': 12, 'subsample': 0.9705740975282133, 'colsample_bytree': 0.6673455931658673, 'reg_alpha': 5.626660389773196, 'reg_lambda': 2.780157976968472}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  75%|███████▌  | 15/20 [00:03<00:01,  4.76it/s]

[I 2025-08-16 20:48:57,213] Trial 54 finished with value: 0.7579035184496942 and parameters: {'n_estimators': 817, 'learning_rate': 0.16823410756019894, 'max_depth': 9, 'subsample': 0.8366015776523555, 'colsample_bytree': 0.7246721199044196, 'reg_alpha': 0.003116133063574722, 'reg_lambda': 1.7350449557212335}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  80%|████████  | 16/20 [00:03<00:00,  4.74it/s]

[I 2025-08-16 20:48:57,427] Trial 55 finished with value: 0.7462675861301679 and parameters: {'n_estimators': 722, 'learning_rate': 0.14998216146914423, 'max_depth': 14, 'subsample': 0.8981109836886564, 'colsample_bytree': 0.6903645714874682, 'reg_alpha': 2.5116568211396997, 'reg_lambda': 2.2764574450358497}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  90%|█████████ | 18/20 [00:03<00:00,  4.61it/s]

[I 2025-08-16 20:48:57,717] Trial 56 finished with value: 0.7515077878765898 and parameters: {'n_estimators': 896, 'learning_rate': 0.17656616480397774, 'max_depth': 12, 'subsample': 0.9334478792447326, 'colsample_bytree': 0.8584044264924123, 'reg_alpha': 0.8886002546111245, 'reg_lambda': 3.8536358305186096}. Best is trial 28 with value: 0.7676124729674524.
[I 2025-08-16 20:48:57,891] Trial 57 finished with value: 0.7512462782887152 and parameters: {'n_estimators': 579, 'learning_rate': 0.2997951083542941, 'max_depth': 15, 'subsample': 0.9090637280765512, 'colsample_bytree': 0.8121091507313352, 'reg_alpha': 1.5684117707469425, 'reg_lambda': 1.2269581787466592}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612:  95%|█████████▌| 19/20 [00:04<00:00,  5.06it/s]

[I 2025-08-16 20:48:58,045] Trial 58 finished with value: 0.6738478759612545 and parameters: {'n_estimators': 516, 'learning_rate': 0.1163799868511694, 'max_depth': 10, 'subsample': 0.9622401767167231, 'colsample_bytree': 0.618208052207102, 'reg_alpha': 7.162330641997612, 'reg_lambda': 3.4422966596961517}. Best is trial 28 with value: 0.7676124729674524.


Best trial: 28. Best value: 0.767612: 100%|██████████| 20/20 [00:04<00:00,  4.54it/s]
[I 2025-08-16 20:48:58,362] Using an existing study with name 'lightgbm_optimization' instead of creating a new one.


[I 2025-08-16 20:48:58,331] Trial 59 finished with value: 0.7561211450969815 and parameters: {'n_estimators': 960, 'learning_rate': 0.1991362016003196, 'max_depth': 5, 'subsample': 0.8800194152972963, 'colsample_bytree': 0.7113788246053597, 'reg_alpha': 0.43731931318666895, 'reg_lambda': 1.9219661633136145}. Best is trial 28 with value: 0.7676124729674524.
Best xgboost score: 0.7676
Best xgboost params: {'n_estimators': 764, 'learning_rate': 0.20524881932175143, 'max_depth': 4, 'subsample': 0.8209923900556683, 'colsample_bytree': 0.6004182558703042, 'reg_alpha': 0.969628322595059, 'reg_lambda': 1.1229386647770523}
Optimizing lightgbm...


Best trial: 14. Best value: 0.756083:   5%|▌         | 1/20 [00:00<00:17,  1.07it/s]

[I 2025-08-16 20:48:59,294] Trial 40 finished with value: 0.7456990712590448 and parameters: {'n_estimators': 260, 'learning_rate': 0.13878813111399568, 'max_depth': 3, 'num_leaves': 276, 'subsample': 0.8458151991085348, 'colsample_bytree': 0.9161388535754149, 'reg_alpha': 0.5646964943916417, 'reg_lambda': 5.4098726161578945}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  10%|█         | 2/20 [00:04<00:45,  2.51s/it]

[I 2025-08-16 20:49:02,905] Trial 41 finished with value: 0.750744854194259 and parameters: {'n_estimators': 635, 'learning_rate': 0.26010320653990265, 'max_depth': 8, 'num_leaves': 156, 'subsample': 0.7556694052313638, 'colsample_bytree': 0.7352863617818275, 'reg_alpha': 0.1002867746535977, 'reg_lambda': 6.972256737555671}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  15%|█▌        | 3/20 [00:05<00:29,  1.76s/it]

[I 2025-08-16 20:49:03,772] Trial 42 finished with value: 0.7438267252923237 and parameters: {'n_estimators': 610, 'learning_rate': 0.2859870370365867, 'max_depth': 8, 'num_leaves': 157, 'subsample': 0.7629557845329318, 'colsample_bytree': 0.6963231750003641, 'reg_alpha': 1.499389092437149, 'reg_lambda': 6.3055478604538155}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  20%|██        | 4/20 [00:06<00:24,  1.50s/it]

[I 2025-08-16 20:49:04,877] Trial 43 finished with value: 0.7500287892092812 and parameters: {'n_estimators': 472, 'learning_rate': 0.2352617533394202, 'max_depth': 10, 'num_leaves': 144, 'subsample': 0.7305878493795969, 'colsample_bytree': 0.7182950906479509, 'reg_alpha': 0.6394977045644507, 'reg_lambda': 4.332298039454928}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  25%|██▌       | 5/20 [00:07<00:19,  1.31s/it]

[I 2025-08-16 20:49:05,842] Trial 44 finished with value: 0.750836711548096 and parameters: {'n_estimators': 467, 'learning_rate': 0.23784258301424455, 'max_depth': 14, 'num_leaves': 105, 'subsample': 0.7210744556369695, 'colsample_bytree': 0.7137137987680793, 'reg_alpha': 0.8135079271900929, 'reg_lambda': 3.306267945408572}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  30%|███       | 6/20 [00:08<00:16,  1.21s/it]

[I 2025-08-16 20:49:06,867] Trial 45 finished with value: 0.7438076535369256 and parameters: {'n_estimators': 352, 'learning_rate': 0.22224458440918035, 'max_depth': 14, 'num_leaves': 109, 'subsample': 0.7351653481290746, 'colsample_bytree': 0.6533351464007665, 'reg_alpha': 0.5712706169759799, 'reg_lambda': 3.1149393320271646}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  35%|███▌      | 7/20 [00:09<00:13,  1.04s/it]

[I 2025-08-16 20:49:07,560] Trial 46 finished with value: 0.7255055348743923 and parameters: {'n_estimators': 449, 'learning_rate': 0.2121201038675031, 'max_depth': 14, 'num_leaves': 96, 'subsample': 0.6863666924850793, 'colsample_bytree': 0.7125462637686515, 'reg_alpha': 2.1711247882282647, 'reg_lambda': 2.4529803687228573}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  40%|████      | 8/20 [00:09<00:10,  1.15it/s]

[I 2025-08-16 20:49:08,070] Trial 47 finished with value: 0.5990028331258523 and parameters: {'n_estimators': 526, 'learning_rate': 0.2536411658011435, 'max_depth': 12, 'num_leaves': 140, 'subsample': 0.7058935866634473, 'colsample_bytree': 0.6822557323723427, 'reg_alpha': 6.66535639307719, 'reg_lambda': 4.330244311775689}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  45%|████▌     | 9/20 [00:10<00:10,  1.08it/s]

[I 2025-08-16 20:49:09,110] Trial 48 finished with value: 0.7413674848395133 and parameters: {'n_estimators': 384, 'learning_rate': 0.18197187485608668, 'max_depth': 13, 'num_leaves': 127, 'subsample': 0.7450472137648715, 'colsample_bytree': 0.7679974185763062, 'reg_alpha': 0.8787968619946274, 'reg_lambda': 3.8728359887405115}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  50%|█████     | 10/20 [00:11<00:07,  1.29it/s]

[I 2025-08-16 20:49:09,541] Trial 49 finished with value: 0.5421347536315443 and parameters: {'n_estimators': 458, 'learning_rate': 0.20365981037938688, 'max_depth': 11, 'num_leaves': 59, 'subsample': 0.7147356707313783, 'colsample_bytree': 0.6376806021556689, 'reg_alpha': 9.268429296792046, 'reg_lambda': 3.1797219296746757}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  55%|█████▌    | 11/20 [00:12<00:08,  1.03it/s]

[I 2025-08-16 20:49:10,951] Trial 50 finished with value: 0.7478799053410677 and parameters: {'n_estimators': 305, 'learning_rate': 0.2369094562162869, 'max_depth': 10, 'num_leaves': 155, 'subsample': 0.973788080244512, 'colsample_bytree': 0.7297060116751697, 'reg_alpha': 0.3787623919021263, 'reg_lambda': 4.725489553975338}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  60%|██████    | 12/20 [00:13<00:08,  1.09s/it]

[I 2025-08-16 20:49:12,306] Trial 51 finished with value: 0.7527141126346979 and parameters: {'n_estimators': 299, 'learning_rate': 0.2299736378763973, 'max_depth': 10, 'num_leaves': 157, 'subsample': 0.9856055562679132, 'colsample_bytree': 0.7213870281341387, 'reg_alpha': 0.4045034860124143, 'reg_lambda': 3.9765063153055866}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  65%|██████▌   | 13/20 [00:14<00:06,  1.09it/s]

[I 2025-08-16 20:49:12,842] Trial 52 finished with value: 0.7508802945847617 and parameters: {'n_estimators': 291, 'learning_rate': 0.23267744756139294, 'max_depth': 11, 'num_leaves': 111, 'subsample': 0.9532180332970788, 'colsample_bytree': 0.7046314487643761, 'reg_alpha': 1.0910319444072822, 'reg_lambda': 0.0885459843374603}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  70%|███████   | 14/20 [00:15<00:05,  1.14it/s]

[I 2025-08-16 20:49:13,632] Trial 53 finished with value: 0.7450271041986826 and parameters: {'n_estimators': 229, 'learning_rate': 0.15636714006760855, 'max_depth': 11, 'num_leaves': 114, 'subsample': 0.9619031655368444, 'colsample_bytree': 0.7041113781124764, 'reg_alpha': 0.9115166155994587, 'reg_lambda': 1.5577762819989274}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  75%|███████▌  | 15/20 [00:15<00:03,  1.31it/s]

[I 2025-08-16 20:49:14,116] Trial 54 finished with value: 0.7524721214075707 and parameters: {'n_estimators': 149, 'learning_rate': 0.23041436905563145, 'max_depth': 12, 'num_leaves': 85, 'subsample': 0.9337931187958682, 'colsample_bytree': 0.6665705168165104, 'reg_alpha': 1.796277912130651, 'reg_lambda': 2.7357227878644856}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  80%|████████  | 16/20 [00:16<00:02,  1.58it/s]

[I 2025-08-16 20:49:14,453] Trial 55 finished with value: 0.712593581255494 and parameters: {'n_estimators': 104, 'learning_rate': 0.2535200342098315, 'max_depth': 12, 'num_leaves': 81, 'subsample': 0.9259308962023542, 'colsample_bytree': 0.6658720865333969, 'reg_alpha': 1.9037732466434845, 'reg_lambda': 0.19515034516453067}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  85%|████████▌ | 17/20 [00:16<00:01,  1.84it/s]

[I 2025-08-16 20:49:14,787] Trial 56 finished with value: 0.7165728455507788 and parameters: {'n_estimators': 148, 'learning_rate': 0.22472995532306614, 'max_depth': 13, 'num_leaves': 38, 'subsample': 0.9507639311760173, 'colsample_bytree': 0.6802266634603089, 'reg_alpha': 3.107626668229673, 'reg_lambda': 0.5615435941524733}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 14. Best value: 0.756083:  90%|█████████ | 18/20 [00:17<00:01,  1.65it/s]

[I 2025-08-16 20:49:15,542] Trial 57 finished with value: 0.7489123613418824 and parameters: {'n_estimators': 282, 'learning_rate': 0.2061580358335472, 'max_depth': 13, 'num_leaves': 97, 'subsample': 0.9935396777182091, 'colsample_bytree': 0.613312123328212, 'reg_alpha': 1.1758934690391927, 'reg_lambda': 2.3995869599416686}. Best is trial 14 with value: 0.7560832698154473.


Best trial: 58. Best value: 0.763431:  95%|█████████▌| 19/20 [00:18<00:00,  1.18it/s]

[I 2025-08-16 20:49:16,959] Trial 58 finished with value: 0.7634312692747843 and parameters: {'n_estimators': 207, 'learning_rate': 0.10361220634219925, 'max_depth': 15, 'num_leaves': 71, 'subsample': 0.9433302713240929, 'colsample_bytree': 0.7527652842516583, 'reg_alpha': 0.015561250865734233, 'reg_lambda': 1.004541398592119}. Best is trial 58 with value: 0.7634312692747843.


Best trial: 58. Best value: 0.763431: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s]

[I 2025-08-16 20:49:18,019] Trial 59 finished with value: 0.7340378005172781 and parameters: {'n_estimators': 176, 'learning_rate': 0.0629522221633018, 'max_depth': 14, 'num_leaves': 42, 'subsample': 0.9007639910679948, 'colsample_bytree': 0.6236573207739401, 'reg_alpha': 1.738359251731334, 'reg_lambda': 1.1919871953369754}. Best is trial 58 with value: 0.7634312692747843.
Best lightgbm score: 0.7634
Best lightgbm params: {'n_estimators': 207, 'learning_rate': 0.10361220634219925, 'max_depth': 15, 'num_leaves': 71, 'subsample': 0.9433302713240929, 'colsample_bytree': 0.7527652842516583, 'reg_alpha': 0.015561250865734233, 'reg_lambda': 1.004541398592119}

Optimization completed!
balanced_rf: Score = 0.7830
xgboost: Score = 0.7676
lightgbm: Score = 0.7634





In [78]:
"""
Best balanced_rf score: 0.7872
Best balanced_rf params: {'n_estimators': 145, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.7}

Best xgboost score: 0.7713
Best xgboost params: {'n_estimators': 203, 'learning_rate': 0.18756324941006502, 'max_depth': 5, 'subsample': 0.8852212478925905, 'colsample_bytree': 0.7917396403781124, 'reg_alpha': 0.03686255734918342, 'reg_lambda': 4.805664902112656}

Best lightgbm score: 0.7634
Best lightgbm params: {'n_estimators': 207, 'learning_rate': 0.10361220634219925, 'max_depth': 15, 'num_leaves': 71, 'subsample': 0.9433302713240929, 'colsample_bytree': 0.7527652842516583, 'reg_alpha': 0.015561250865734233, 'reg_lambda': 1.004541398592119}
"""

"\nBest balanced_rf score: 0.7872\nBest balanced_rf params: {'n_estimators': 145, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.7}\n\nBest xgboost score: 0.7713\nBest xgboost params: {'n_estimators': 203, 'learning_rate': 0.18756324941006502, 'max_depth': 5, 'subsample': 0.8852212478925905, 'colsample_bytree': 0.7917396403781124, 'reg_alpha': 0.03686255734918342, 'reg_lambda': 4.805664902112656}\n\nBest lightgbm score: 0.7613\nBest lightgbm params: {'n_estimators': 948, 'learning_rate': 0.29885405261154424, 'max_depth': 6, 'num_leaves': 171, 'subsample': 0.9537021727515936, 'colsample_bytree': 0.6550935656686305, 'reg_alpha': 0.9623887328034461, 'reg_lambda': 6.22836178165546}\n"

### Step 5: Advanced Model Ensemble and Stacking

In [79]:
class AdvancedEnsemble:
    """
    Advanced ensemble methods including stacking and blending
    """
    
    def __init__(self, base_models, meta_model=None, cv_folds=5, random_state=42):
        self.base_models = base_models
        self.meta_model = meta_model or lgb.LGBMClassifier(random_state=random_state, verbosity=-1)
        self.cv_folds = cv_folds
        self.random_state = random_state
        self.trained_models = {}
        
    def create_optimized_models(self, optimization_results):
        """Create models with optimized parameters"""
        models = {}
        
        for model_type, result in optimization_results.items():
            params = result['params']
            params['random_state'] = self.random_state
            
            if model_type == 'balanced_rf':
                params['n_jobs'] = -1
                models[model_type] = BalancedRandomForestClassifier(**params)
            elif model_type == 'xgboost':
                params['eval_metric'] = 'logloss'
                params['verbosity'] = 0
                models[model_type] = xgb.XGBClassifier(**params)
            elif model_type == 'lightgbm':
                params['verbosity'] = -1
                params['force_col_wise'] = True
                models[model_type] = lgb.LGBMClassifier(**params)
            elif model_type == 'catboost':
                params['verbose'] = False
                models[model_type] = cb.CatBoostClassifier(**params)
                
        return models
    
    def stacking_cv(self, X, y):
        """Generate meta-features using cross-validation"""
        cv = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)
        meta_features = np.zeros((X.shape[0], len(self.base_models)))
        
        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
            X_train_fold = X.iloc[train_idx]
            y_train_fold = y.iloc[train_idx]
            X_val_fold = X.iloc[val_idx]
            
            for i, (name, model) in enumerate(self.base_models.items()):
                # Clone and train model
                model_clone = clone(model)
                model_clone.fit(X_train_fold, y_train_fold)
                
                # Predict on validation set
                pred_proba = model_clone.predict_proba(X_val_fold)[:, 1]
                meta_features[val_idx, i] = pred_proba
                
        return meta_features
    
    def fit_stacking(self, X, y):
        """Fit stacking ensemble"""
        print("Generating meta-features...")
        meta_features = self.stacking_cv(X, y)
        
        print("Training meta-model...")
        self.meta_model.fit(meta_features, y)
        
        # Train base models on full data
        print("Training base models on full data...")
        for name, model in self.base_models.items():
            model.fit(X, y)
            self.trained_models[name] = model
            
        return self
    
    def predict_stacking(self, X):
        """Predict using stacking ensemble"""
        meta_features = np.zeros((X.shape[0], len(self.base_models)))
        
        for i, (name, model) in enumerate(self.trained_models.items()):
            pred_proba = model.predict_proba(X)[:, 1]
            meta_features[:, i] = pred_proba
            
        return self.meta_model.predict(meta_features)
    
    def predict_proba_stacking(self, X):
        """Predict probabilities using stacking ensemble"""
        meta_features = np.zeros((X.shape[0], len(self.base_models)))
        
        for i, (name, model) in enumerate(self.trained_models.items()):
            pred_proba = model.predict_proba(X)[:, 1]
            meta_features[:, i] = pred_proba
            
        return self.meta_model.predict_proba(meta_features)
    
    def weighted_average_ensemble(self, X, weights=None):
        """Simple weighted average ensemble"""
        if weights is None:
            weights = np.ones(len(self.trained_models)) / len(self.trained_models)
            
        predictions = np.zeros(X.shape[0])
        
        for i, (name, model) in enumerate(self.trained_models.items()):
            pred_proba = model.predict_proba(X)[:, 1]
            predictions += weights[i] * pred_proba
            
        return (predictions > 0.5).astype(int), predictions

# Create optimized models
optimized_models = optimizer.create_optimized_models(optimization_results)

# Create ensemble
ensemble = AdvancedEnsemble(optimized_models)

# Fit stacking ensemble
ensemble.fit_stacking(X_train_selected, Y_train)

print("Ensemble training completed!")

Generating meta-features...
Training meta-model...
Training base models on full data...
Ensemble training completed!


### Step 6: Comprehensive Model Evaluation

In [80]:
class ComprehensiveEvaluator:
    """
    Comprehensive model evaluation with multiple metrics and visualizations
    """
    
    def __init__(self):
        self.results = {}
        
    def calculate_metrics(self, y_true, y_pred, y_pred_proba=None):
        """Calculate comprehensive metrics"""
        from sklearn.metrics import (
            accuracy_score, precision_score, recall_score, f1_score,
            roc_auc_score, matthews_corrcoef, confusion_matrix,
            classification_report
        )
        
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
            'mcc': matthews_corrcoef(y_true, y_pred)
        }
        
        if y_pred_proba is not None:
            metrics['auc'] = roc_auc_score(y_true, y_pred_proba)
            
        # Confusion matrix components
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        metrics.update({
            'sensitivity': tp / (tp + fn),
            'specificity': tn / (tn + fp),
            'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn
        })
        
        return metrics
    
    def cross_validation_evaluation(self, model, X, y, cv_folds=5):
        """Comprehensive cross-validation evaluation"""
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        
        cv_results = {
            'accuracy': [], 'precision': [], 'recall': [], 'f1': [],
            'auc': [], 'mcc': [], 'sensitivity': [], 'specificity': []
        }
        
        for train_idx, val_idx in cv.split(X, y):
            X_train_fold = X.iloc[train_idx]
            y_train_fold = y.iloc[train_idx]
            X_val_fold = X.iloc[val_idx]
            y_val_fold = y.iloc[val_idx]
            
            # Train and predict
            model_clone = clone(model)
            model_clone.fit(X_train_fold, y_train_fold)
            y_pred = model_clone.predict(X_val_fold)
            y_pred_proba = model_clone.predict_proba(X_val_fold)[:, 1]
            
            # Calculate metrics
            fold_metrics = self.calculate_metrics(y_val_fold, y_pred, y_pred_proba)
            
            for metric in cv_results:
                if metric in fold_metrics:
                    cv_results[metric].append(fold_metrics[metric])
        
        # Calculate means and stds
        cv_summary = {}
        for metric, values in cv_results.items():
            cv_summary[f'{metric}_mean'] = np.mean(values)
            cv_summary[f'{metric}_std'] = np.std(values)
            
        return cv_summary
    
    def evaluate_model(self, model, X_train, y_train, X_test, y_test, model_name):
        """Complete model evaluation"""
        print(f"Evaluating {model_name}...")
        
        # Cross-validation results
        cv_results = self.cross_validation_evaluation(model, X_train, y_train)
        
        # Train on full training set and test
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)
        y_pred_proba_test = model.predict_proba(X_test)[:, 1]
        
        # Test set metrics
        test_metrics = self.calculate_metrics(y_test, y_pred_test, y_pred_proba_test)
        
        # Store results
        self.results[model_name] = {
            'cv_results': cv_results,
            'test_metrics': test_metrics,
            'predictions': {
                'y_pred': y_pred_test,
                'y_pred_proba': y_pred_proba_test
            }
        }
        
        return cv_results, test_metrics
    
    def create_results_dataframe(self):
        """Create comprehensive results DataFrame"""
        data = []
        
        for model_name, result in self.results.items():
            row = {'Model': model_name}
            
            # Add CV results (only if they exist)
            if 'cv_results' in result:
                for metric, value in result['cv_results'].items():
                    row[f'CV_{metric}'] = value
            else:
                # Fill with NaN for missing CV results
                cv_metrics = ['accuracy_mean', 'accuracy_std', 'precision_mean', 'precision_std', 
                            'recall_mean', 'recall_std', 'f1_mean', 'f1_std', 'auc_mean', 'auc_std',
                            'mcc_mean', 'mcc_std', 'sensitivity_mean', 'sensitivity_std', 
                            'specificity_mean', 'specificity_std']
                for metric in cv_metrics:
                    row[f'CV_{metric}'] = np.nan
                
            # Add test results
            for metric, value in result['test_metrics'].items():
                if metric not in ['tp', 'tn', 'fp', 'fn']:
                    row[f'Test_{metric}'] = value
                    
            data.append(row)
            
        return pd.DataFrame(data)
    
    def plot_roc_curves(self, X_test, y_test):
        """Plot ROC curves for all models"""
        fig = go.Figure()
        
        for model_name, results in self.results.items():
            y_pred_proba = results['predictions']['y_pred_proba']
            
            from sklearn.metrics import roc_curve
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
            auc_score = results['test_metrics']['auc']
            
            fig.add_trace(go.Scatter(
                x=fpr, y=tpr,
                mode='lines',
                name=f'{model_name} (AUC = {auc_score:.3f})',
                line=dict(width=3)
            ))
        
        # Add diagonal line
        fig.add_trace(go.Scatter(
            x=[0, 1], y=[0, 1],
            mode='lines',
            name='Random (AUC = 0.5)',
            line=dict(dash='dash', color='gray')
        ))
        
        fig.update_layout(
            title='ROC Curves Comparison',
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            width=800, height=600
        )
        
        fig.show()
        
    def plot_feature_importance(self, model, feature_names, model_name, top_n=20):
        """Plot feature importance"""
        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        elif hasattr(model, 'coef_'):
            importances = np.abs(model.coef_[0])
        else:
            print(f"Cannot extract feature importance for {model_name}")
            return
            
        feature_imp = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)
        
        top_features = feature_imp.head(top_n)
        
        fig = px.bar(
            top_features.iloc[::-1],  # Reverse for better visualization
            x='importance',
            y='feature',
            orientation='h',
            title=f'Top {top_n} Feature Importances - {model_name}'
        )
        
        fig.update_layout(height=600)
        fig.show()

# Evaluate individual models
evaluator = ComprehensiveEvaluator()

# Evaluate optimized models
for model_name, model in optimized_models.items():
    evaluator.evaluate_model(
        model, X_train_selected, Y_train, X_test_selected, Y_test, model_name
    )

# Evaluate ensemble
ensemble_pred = ensemble.predict_stacking(X_test_selected)
ensemble_pred_proba = ensemble.predict_proba_stacking(X_test_selected)[:, 1]

# Add ensemble results manually
ensemble_test_metrics = evaluator.calculate_metrics(Y_test, ensemble_pred, ensemble_pred_proba)

# # cross-validate ensemble
# cv_results = evaluator.cross_validation_evaluation(ensemble, X_train_selected, Y_train)

evaluator.results['Stacking_Ensemble'] = {
    # 'cv_results': cv_results,
    'test_metrics': ensemble_test_metrics,
    'predictions': {
        'y_pred': ensemble_pred,
        'y_pred_proba': ensemble_pred_proba
    }
}

# Create results summary
results_df = evaluator.create_results_dataframe()
print("\nModel Comparison Results:")
print(results_df.round(4))

# Plot ROC curves
evaluator.plot_roc_curves(X_test_selected, Y_test)

Evaluating balanced_rf...
Evaluating xgboost...
Evaluating lightgbm...

Model Comparison Results:
               Model  CV_accuracy_mean  CV_accuracy_std  CV_precision_mean  \
0        balanced_rf            0.8240           0.0341             0.6459   
1            xgboost            0.8365           0.0183             0.7685   
2           lightgbm            0.8302           0.0165             0.7326   
3  Stacking_Ensemble               NaN              NaN                NaN   

   CV_precision_std  CV_recall_mean  CV_recall_std  CV_f1_mean  CV_f1_std  \
0            0.0817          0.6699         0.0649      0.6546     0.0553   
1            0.0324          0.4833         0.0691      0.5914     0.0601   
2            0.0627          0.5091         0.0830      0.5940     0.0542   
3               NaN             NaN            NaN         NaN        NaN   

   CV_auc_mean  ...  CV_specificity_mean  CV_specificity_std  Test_accuracy  \
0       0.8436  ...               0.8746      

# Advancement for Higher Performance

### Step 7: Advanced Techniques for Higher Performance

In [81]:
### Advanced Feature Engineering and Selection

# 1. Polynomial Features for Selected Important Features
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel

def create_polynomial_features(X_train, X_test, top_features, degree=2):
    """Create polynomial features for most important features"""
    # Select top features from best model
    best_model_name = results_df.loc[results_df['Test_auc'].idxmax(), 'Model']
    best_model = optimized_models[best_model_name]
    
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        top_idx = np.argsort(importances)[-top_features:]
        selected_cols = X_train.columns[top_idx]
    else:
        # Use top features from mutual information
        selected_cols = X_train.columns[:top_features]
    
    print(f"Creating polynomial features for top {len(selected_cols)} features")
    
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree, interaction_only=True, include_bias=False)
    
    X_train_poly = poly.fit_transform(X_train[selected_cols])
    X_test_poly = poly.transform(X_test[selected_cols])
    
    # Get feature names
    poly_names = poly.get_feature_names_out(selected_cols)
    
    # Convert back to DataFrame
    X_train_poly_df = pd.DataFrame(X_train_poly, columns=poly_names, index=X_train.index)
    X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly_names, index=X_test.index)
    
    # Combine with original features
    X_train_enhanced = pd.concat([X_train, X_train_poly_df], axis=1)
    X_test_enhanced = pd.concat([X_test, X_test_poly_df], axis=1)
    
    print(f"Enhanced features: {X_train_enhanced.shape[1]} (added {X_train_poly_df.shape[1]} polynomial features)")
    
    return X_train_enhanced, X_test_enhanced

# Create polynomial features
X_train_poly, X_test_poly = create_polynomial_features(
    X_train_selected, X_test_selected, top_features=15, degree=2
)

Creating polynomial features for top 15 features
Enhanced features: 190 (added 120 polynomial features)


In [82]:
# 2. Advanced Feature Selection with Recursive Feature Elimination
from sklearn.feature_selection import RFECV

def advanced_feature_selection_v2(X_train, X_test, y_train):
    """More aggressive feature selection"""
    
    # Remove low variance features more aggressively
    from sklearn.feature_selection import VarianceThreshold
    var_selector = VarianceThreshold(threshold=0.05)  # More aggressive
    X_train_var = var_selector.fit_transform(X_train)
    X_test_var = var_selector.transform(X_test)
    
    var_features = X_train.columns[var_selector.get_support()]
    X_train_var_df = pd.DataFrame(X_train_var, columns=var_features, index=X_train.index)
    X_test_var_df = pd.DataFrame(X_test_var, columns=var_features, index=X_test.index)
    
    print(f"After variance filtering: {X_train_var_df.shape[1]} features")
    
    # Use XGBoost for feature selection (often better than RF)
    xgb_selector = xgb.XGBClassifier(
        n_estimators=100, 
        random_state=42, 
        eval_metric='logloss',
        verbosity=0
    )
    
    # RFECV with XGBoost
    rfecv = RFECV(
        estimator=xgb_selector,
        step=5,  # Remove 5 features at a time
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        min_features_to_select=30  # Minimum features to keep
    )
    
    rfecv.fit(X_train_var_df, y_train)
    
    selected_features = var_features[rfecv.support_]
    X_train_selected = X_train_var_df[selected_features]
    X_test_selected = X_test_var_df[selected_features]
    
    print(f"RFECV selected {len(selected_features)} features")
    print(f"Optimal number of features: {rfecv.n_features_}")
    
    return X_train_selected, X_test_selected, selected_features

# Apply advanced feature selection
X_train_final_v2, X_test_final_v2, final_features = advanced_feature_selection_v2(
    X_train_poly, X_test_poly, Y_train
)

After variance filtering: 166 features
RFECV selected 46 features
Optimal number of features: 46


In [83]:
# 3. Advanced Hyperparameter Optimization with More Trials and Better Objectives
class AdvancedOptunaOptimizer:
    """Enhanced Optuna optimizer with better objective functions"""
    
    def __init__(self, n_trials=200, cv_folds=5, random_state=42):
        self.n_trials = n_trials
        self.cv_folds = cv_folds
        self.random_state = random_state
        self.best_params = {}
        self.study_results = {}
    
    def objective_function_v2(self, trial, model_type, X, y):
        """Enhanced objective function focusing on AUC"""
        
        if model_type == 'xgboost_v2':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
                'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.7, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
                'reg_lambda': trial.suggest_float('reg_lambda', 1, 10),
                'gamma': trial.suggest_float('gamma', 0, 5),
                'random_state': self.random_state,
                'eval_metric': 'auc',
                'verbosity': 0,
                'tree_method': 'hist'
            }
            model = xgb.XGBClassifier(**params)
            
        elif model_type == 'lightgbm_v2':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'num_leaves': trial.suggest_int('num_leaves', 31, 500),
                'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
                'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
                'reg_lambda': trial.suggest_float('reg_lambda', 1, 10),
                'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 1),
                'random_state': self.random_state,
                'verbosity': -1,
                'force_col_wise': True,
                'objective': 'binary',
                'metric': 'auc'
            }
            model = lgb.LGBMClassifier(**params)
            
        elif model_type == 'catboost_v2':
            params = {
                'iterations': trial.suggest_int('iterations', 500, 2000),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
                'depth': trial.suggest_int('depth', 4, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
                'border_count': trial.suggest_int('border_count', 32, 255),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10),
                'random_strength': trial.suggest_float('random_strength', 0, 10),
                'random_state': self.random_state,
                'verbose': False,
                'eval_metric': 'AUC',
                'task_type': 'CPU'
            }
            model = cb.CatBoostClassifier(**params)
        
        # Enhanced cross-validation with stratification
        from sklearn.model_selection import StratifiedKFold
        cv = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)
        
        # Convert to numpy arrays to avoid DataFrame issues
        X_array = X.values if hasattr(X, 'values') else X
        y_array = y.values if hasattr(y, 'values') else y
        
        auc_scores = []
        for train_idx, val_idx in cv.split(X_array, y_array):
            X_train_fold = X_array[train_idx]
            y_train_fold = y_array[train_idx]
            X_val_fold = X_array[val_idx]
            y_val_fold = y_array[val_idx]
            
            try:
                model_clone = clone(model)
                model_clone.fit(X_train_fold, y_train_fold)
                y_pred_proba = model_clone.predict_proba(X_val_fold)[:, 1]
                auc = roc_auc_score(y_val_fold, y_pred_proba)
                auc_scores.append(auc)
            except Exception as e:
                print(f"Error in fold: {e}")
                return 0.5  # Return baseline score if error occurs
        
        return np.mean(auc_scores) if auc_scores else 0.5
    
    def optimize_model_v2(self, model_type, X, y):
        """Optimize with focus on AUC"""
        print(f"Optimizing {model_type} with enhanced parameters...")
        
        # Reset DataFrame indices to avoid issues
        if hasattr(X, 'reset_index'):
            X = X.reset_index(drop=True)
        if hasattr(y, 'reset_index'):
            y = y.reset_index(drop=True)

        # Create or load Optuna study
        study_name = f"{model_type}_optimization"
        storage_name = f'optuna_{model_type}.db'
        if os.path.exists(storage_name):
            print(f"Found existing study: {study_name}")
            study = optuna.load_study(
                study_name=study_name,
                storage=f'sqlite:///{storage_name}'
            )
        else:
            study = optuna.create_study(
                direction='maximize',
                sampler=TPESampler(seed=self.random_state, n_startup_trials=50),
                study_name=study_name,
                storage=f'sqlite:///{storage_name}',
                load_if_exists=True
            )

        try:
            study.optimize(
                lambda trial: self.objective_function_v2(trial, model_type, X, y),
                n_trials=self.n_trials,
                show_progress_bar=True
            )
            
            self.best_params[model_type] = study.best_params
            self.study_results[model_type] = study
            
            print(f"Best {model_type} AUC: {study.best_value:.4f}")
            print(f"Best {model_type} params: {study.best_params}")
            
            return study.best_params, study.best_value
            
        except Exception as e:
            print(f"Error optimizing {model_type}: {e}")
            # Return default parameters if optimization fails
            default_params = self._get_default_params(model_type)
            return default_params, 0.5
    
    def _get_default_params(self, model_type):
        """Get default parameters for models"""
        defaults = {
            'xgboost_v2': {
                'n_estimators': 1000,
                'learning_rate': 0.1,
                'max_depth': 6,
                'min_child_weight': 1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'reg_alpha': 0,
                'reg_lambda': 1,
                'gamma': 0
            },
            'lightgbm_v2': {
                'n_estimators': 1000,
                'learning_rate': 0.1,
                'max_depth': 6,
                'num_leaves': 31,
                'min_child_samples': 20,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'reg_alpha': 0,
                'reg_lambda': 0,
                'min_gain_to_split': 0
            },
            'catboost_v2': {
                'iterations': 1000,
                'learning_rate': 0.1,
                'depth': 6,
                'l2_leaf_reg': 3,
                'border_count': 128,
                'bagging_temperature': 1,
                'random_strength': 1
            }
        }
        return defaults.get(model_type, {})

# Reset indices of the data before optimization
print("Resetting DataFrame indices...")
X_train_final_v2 = X_train_final_v2.reset_index(drop=True)
X_test_final_v2 = X_test_final_v2.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)

print(f"Data shapes after reset: X_train: {X_train_final_v2.shape}, X_test: {X_test_final_v2.shape}, Y_train: {len(Y_train)}")

# Initialize enhanced optimizer
advanced_optimizer = AdvancedOptunaOptimizer(n_trials=20, cv_folds=5)  # Reduce trials for testing

# Optimize advanced models
advanced_models = ['xgboost_v2', 'lightgbm_v2', 'catboost_v2']
advanced_results = {}

for model_type in advanced_models:
    params, score = advanced_optimizer.optimize_model_v2(model_type, X_train_final_v2, Y_train)
    advanced_results[model_type] = {'params': params, 'score': score}

print("\nAdvanced Optimization Results:")
for model, result in advanced_results.items():
    print(f"{model}: AUC = {result['score']:.4f}")

# current study results
for model_type, study in advanced_optimizer.study_results.items():
    print(f"\n{model_type} Study Results:")
    print(f"Best AUC: {study.best_value:.4f}")
    print(f"Best Params: {study.best_params}")


Resetting DataFrame indices...
Data shapes after reset: X_train: (477, 49), X_test: (120, 49), Y_train: 477
Optimizing xgboost_v2 with enhanced parameters...
Found existing study: xgboost_v2_optimization


Best trial: 7. Best value: 0.841819:   5%|▌         | 1/20 [00:03<01:12,  3.81s/it]

[I 2025-08-16 20:49:38,217] Trial 40 finished with value: 0.7978597531242204 and parameters: {'n_estimators': 1598, 'learning_rate': 0.009624344711169207, 'max_depth': 3, 'min_child_weight': 6, 'subsample': 0.8486960438146444, 'colsample_bytree': 0.9071139263432786, 'colsample_bylevel': 0.7860789779281987, 'reg_alpha': 4.340125228657158, 'reg_lambda': 1.80490434363234, 'gamma': 0.44679319787014754}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  10%|█         | 2/20 [00:08<01:12,  4.05s/it]

[I 2025-08-16 20:49:42,441] Trial 41 finished with value: 0.8240199247011862 and parameters: {'n_estimators': 1820, 'learning_rate': 0.01607494476000965, 'max_depth': 11, 'min_child_weight': 5, 'subsample': 0.9366799116757838, 'colsample_bytree': 0.9510091249042728, 'colsample_bylevel': 0.7474270990995993, 'reg_alpha': 1.4989131734406675, 'reg_lambda': 4.9411488478510295, 'gamma': 0.8199138640738364}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  15%|█▌        | 3/20 [00:11<01:06,  3.92s/it]

[I 2025-08-16 20:49:46,214] Trial 42 finished with value: 0.8173303508652559 and parameters: {'n_estimators': 1684, 'learning_rate': 0.016272231991282946, 'max_depth': 11, 'min_child_weight': 5, 'subsample': 0.9301471344073848, 'colsample_bytree': 0.9525759308665354, 'colsample_bylevel': 0.7531148082934542, 'reg_alpha': 2.0168708473329096, 'reg_lambda': 4.548169169044528, 'gamma': 0.8369353695062623}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  20%|██        | 4/20 [00:19<01:24,  5.29s/it]

[I 2025-08-16 20:49:53,588] Trial 43 finished with value: 0.8309025651493502 and parameters: {'n_estimators': 1908, 'learning_rate': 0.009838856491004924, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.9711825404980207, 'colsample_bytree': 0.9362814173803287, 'colsample_bylevel': 0.7786967026102976, 'reg_alpha': 1.4943898050714701, 'reg_lambda': 5.296406619611679, 'gamma': 0.24357860175418555}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  25%|██▌       | 5/20 [00:25<01:26,  5.74s/it]

[I 2025-08-16 20:50:00,133] Trial 44 finished with value: 0.8201279739629405 and parameters: {'n_estimators': 1899, 'learning_rate': 0.009772168365195682, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.9793104306067941, 'colsample_bytree': 0.8769889365774267, 'colsample_bylevel': 0.8459256559147132, 'reg_alpha': 2.9784091409552738, 'reg_lambda': 5.514554298601058, 'gamma': 0.13006531236813623}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  30%|███       | 6/20 [00:31<01:22,  5.86s/it]

[I 2025-08-16 20:50:06,226] Trial 45 finished with value: 0.8254572928715611 and parameters: {'n_estimators': 1736, 'learning_rate': 0.0059408295713598205, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.9660544450766388, 'colsample_bytree': 0.9326983993787773, 'colsample_bylevel': 0.7750829368199087, 'reg_alpha': 2.2768263469156476, 'reg_lambda': 6.062726006657568, 'gamma': 0.2743271293481951}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  35%|███▌      | 7/20 [00:36<01:10,  5.40s/it]

[I 2025-08-16 20:50:10,680] Trial 46 finished with value: 0.8184446655780091 and parameters: {'n_estimators': 1331, 'learning_rate': 0.006545493207998033, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.9970578706532582, 'colsample_bytree': 0.9357344674906678, 'colsample_bylevel': 0.7857952775068974, 'reg_alpha': 2.337424666119528, 'reg_lambda': 5.935965790572935, 'gamma': 0.29201820417351854}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  40%|████      | 8/20 [00:41<01:05,  5.46s/it]

[I 2025-08-16 20:50:16,265] Trial 47 finished with value: 0.834088447755778 and parameters: {'n_estimators': 1738, 'learning_rate': 0.005773325584768018, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.9636383332470989, 'colsample_bytree': 0.9299579436672746, 'colsample_bylevel': 0.8098123211455714, 'reg_alpha': 0.704271072813108, 'reg_lambda': 6.016226903626196, 'gamma': 0.5533702048246028}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  45%|████▌     | 9/20 [00:46<00:56,  5.14s/it]

[I 2025-08-16 20:50:20,692] Trial 48 finished with value: 0.835982187407861 and parameters: {'n_estimators': 1457, 'learning_rate': 0.00814751656105649, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.9600816875177612, 'colsample_bytree': 0.8796669512301063, 'colsample_bylevel': 0.8133612386944844, 'reg_alpha': 1.082855500680737, 'reg_lambda': 5.230171268925546, 'gamma': 0.5244462340938914}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  50%|█████     | 10/20 [00:49<00:46,  4.67s/it]

[I 2025-08-16 20:50:24,319] Trial 49 finished with value: 0.8296515303576694 and parameters: {'n_estimators': 1450, 'learning_rate': 0.008226464440765559, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.9797706628466141, 'colsample_bytree': 0.9113545320136074, 'colsample_bylevel': 0.8250269147976185, 'reg_alpha': 0.6738868388172811, 'reg_lambda': 5.130652606787932, 'gamma': 1.1534927722771329}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  55%|█████▌    | 11/20 [00:54<00:41,  4.60s/it]

[I 2025-08-16 20:50:28,748] Trial 50 finished with value: 0.8377045485473225 and parameters: {'n_estimators': 1418, 'learning_rate': 0.008012519694607995, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.983333453699027, 'colsample_bytree': 0.9072047563746894, 'colsample_bylevel': 0.7986332538647494, 'reg_alpha': 0.7488849401898565, 'reg_lambda': 5.328409675563343, 'gamma': 0.5208597145273124}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  60%|██████    | 12/20 [00:58<00:36,  4.56s/it]

[I 2025-08-16 20:50:33,241] Trial 51 finished with value: 0.838653438343426 and parameters: {'n_estimators': 1426, 'learning_rate': 0.008587062090861476, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.985194797317823, 'colsample_bytree': 0.9082270124163133, 'colsample_bylevel': 0.8014725529732711, 'reg_alpha': 0.6375776526465202, 'reg_lambda': 5.326072782798781, 'gamma': 0.4849439808622817}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  65%|██████▌   | 13/20 [01:02<00:29,  4.25s/it]

[I 2025-08-16 20:50:36,761] Trial 52 finished with value: 0.8337763092241046 and parameters: {'n_estimators': 1330, 'learning_rate': 0.010452692266862835, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.988078170077593, 'colsample_bytree': 0.8764154760144002, 'colsample_bylevel': 0.8054092810373709, 'reg_alpha': 1.0490776257328882, 'reg_lambda': 5.284816323148881, 'gamma': 0.5485942791033092}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  70%|███████   | 14/20 [01:05<00:23,  3.99s/it]

[I 2025-08-16 20:50:40,170] Trial 53 finished with value: 0.8380095994647434 and parameters: {'n_estimators': 1198, 'learning_rate': 0.011435276862603832, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.9878890120907502, 'colsample_bytree': 0.8801666149918443, 'colsample_bylevel': 0.7967535983509877, 'reg_alpha': 0.48207937841742, 'reg_lambda': 4.3384711030347365, 'gamma': 0.5869975644588814}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  75%|███████▌  | 15/20 [01:09<00:19,  3.96s/it]

[I 2025-08-16 20:50:44,064] Trial 54 finished with value: 0.8399650722369645 and parameters: {'n_estimators': 1197, 'learning_rate': 0.00822370919700981, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.9612279782689885, 'colsample_bytree': 0.9028568821729006, 'colsample_bylevel': 0.7977299415816208, 'reg_alpha': 0.5108633751367904, 'reg_lambda': 4.436692699242173, 'gamma': 0.00234362745284189}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 7. Best value: 0.841819:  80%|████████  | 16/20 [01:11<00:13,  3.34s/it]

[I 2025-08-16 20:50:45,965] Trial 55 finished with value: 0.7858059184414052 and parameters: {'n_estimators': 1177, 'learning_rate': 0.012742486890386723, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.9969628422957763, 'colsample_bytree': 0.8853708608959154, 'colsample_bylevel': 0.7975386653131105, 'reg_alpha': 0.5515311068059998, 'reg_lambda': 4.030807268046513, 'gamma': 4.185348710619435}. Best is trial 7 with value: 0.8418189794969495.


Best trial: 56. Best value: 0.841929:  85%|████████▌ | 17/20 [01:16<00:11,  3.84s/it]

[I 2025-08-16 20:50:50,953] Trial 56 finished with value: 0.8419291210224309 and parameters: {'n_estimators': 1191, 'learning_rate': 0.008376253330168735, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.984163770722859, 'colsample_bytree': 0.9048538044078746, 'colsample_bylevel': 0.8400115749735785, 'reg_alpha': 0.2686668245446797, 'reg_lambda': 4.5802037425196485, 'gamma': 0.02287654854844914}. Best is trial 56 with value: 0.8419291210224309.


Best trial: 56. Best value: 0.841929:  90%|█████████ | 18/20 [01:21<00:08,  4.19s/it]

[I 2025-08-16 20:50:55,954] Trial 57 finished with value: 0.8416011062348325 and parameters: {'n_estimators': 1198, 'learning_rate': 0.010773701576166975, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.9782490583040195, 'colsample_bytree': 0.9017424299207322, 'colsample_bylevel': 0.8393160319192592, 'reg_alpha': 0.31688954690418525, 'reg_lambda': 4.483235933734258, 'gamma': 0.11320704321709674}. Best is trial 56 with value: 0.8419291210224309.


Best trial: 58. Best value: 0.843243:  95%|█████████▌| 19/20 [01:26<00:04,  4.41s/it]

[I 2025-08-16 20:51:00,898] Trial 58 finished with value: 0.8432433773332427 and parameters: {'n_estimators': 1189, 'learning_rate': 0.014204944704666236, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.9765285466327456, 'colsample_bytree': 0.8960690353489539, 'colsample_bylevel': 0.8630461236701165, 'reg_alpha': 0.029034135693057267, 'reg_lambda': 4.272454247302952, 'gamma': 0.0069277025975850864}. Best is trial 58 with value: 0.8432433773332427.


Best trial: 58. Best value: 0.843243: 100%|██████████| 20/20 [01:31<00:00,  4.55s/it]


[I 2025-08-16 20:51:05,417] Trial 59 finished with value: 0.8419336570955525 and parameters: {'n_estimators': 1091, 'learning_rate': 0.01430330602770791, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.9766994315796015, 'colsample_bytree': 0.8629966977280794, 'colsample_bylevel': 0.8742905504713555, 'reg_alpha': 0.2883774386815118, 'reg_lambda': 4.604805337249147, 'gamma': 0.013329221063163822}. Best is trial 58 with value: 0.8432433773332427.
Best xgboost_v2 AUC: 0.8432
Best xgboost_v2 params: {'n_estimators': 1189, 'learning_rate': 0.014204944704666236, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.9765285466327456, 'colsample_bytree': 0.8960690353489539, 'colsample_bylevel': 0.8630461236701165, 'reg_alpha': 0.029034135693057267, 'reg_lambda': 4.272454247302952, 'gamma': 0.0069277025975850864}
Optimizing lightgbm_v2 with enhanced parameters...
Found existing study: lightgbm_v2_optimization


Best trial: 7. Best value: 0.842287:   5%|▌         | 1/20 [00:01<00:20,  1.10s/it]

[I 2025-08-16 20:51:06,535] Trial 40 finished with value: 0.7849147218253159 and parameters: {'n_estimators': 1330, 'learning_rate': 0.12310440477686978, 'max_depth': 3, 'num_leaves': 455, 'min_child_samples': 44, 'subsample': 0.8385091811463585, 'colsample_bytree': 0.8110393998200629, 'reg_alpha': 2.209848888015869, 'reg_lambda': 3.5675951074893035, 'min_gain_to_split': 0.4570278491242318}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  10%|█         | 2/20 [00:01<00:16,  1.06it/s]

[I 2025-08-16 20:51:07,370] Trial 41 finished with value: 0.8240838549817424 and parameters: {'n_estimators': 851, 'learning_rate': 0.1745802377655717, 'max_depth': 4, 'num_leaves': 494, 'min_child_samples': 25, 'subsample': 0.8255414295628644, 'colsample_bytree': 0.7372162746497257, 'reg_alpha': 0.5086861284828973, 'reg_lambda': 2.70221777354265, 'min_gain_to_split': 0.5528688778932098}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  15%|█▌        | 3/20 [00:02<00:16,  1.03it/s]

[I 2025-08-16 20:51:08,371] Trial 42 finished with value: 0.8164280975709328 and parameters: {'n_estimators': 1159, 'learning_rate': 0.18512692773046396, 'max_depth': 5, 'num_leaves': 475, 'min_child_samples': 29, 'subsample': 0.8596377181638907, 'colsample_bytree': 0.776639442878702, 'reg_alpha': 1.5201875255081816, 'reg_lambda': 1.7609099311176089, 'min_gain_to_split': 0.6007350341314401}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  20%|██        | 4/20 [00:04<00:16,  1.02s/it]

[I 2025-08-16 20:51:09,464] Trial 43 finished with value: 0.8073675750153093 and parameters: {'n_estimators': 1214, 'learning_rate': 0.1992024359766368, 'max_depth': 3, 'num_leaves': 495, 'min_child_samples': 51, 'subsample': 0.8398311300660398, 'colsample_bytree': 0.7196518140990905, 'reg_alpha': 0.4156716378117423, 'reg_lambda': 2.817566423896279, 'min_gain_to_split': 0.5024623259039676}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  25%|██▌       | 5/20 [00:04<00:14,  1.05it/s]

[I 2025-08-16 20:51:10,301] Trial 44 finished with value: 0.8146953885146629 and parameters: {'n_estimators': 829, 'learning_rate': 0.14291857574936728, 'max_depth': 5, 'num_leaves': 441, 'min_child_samples': 28, 'subsample': 0.813322359007422, 'colsample_bytree': 0.7580454226042543, 'reg_alpha': 0.5435687573840998, 'reg_lambda': 3.518099750805303, 'min_gain_to_split': 0.6886045126568741}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  30%|███       | 6/20 [00:05<00:12,  1.09it/s]

[I 2025-08-16 20:51:11,138] Trial 45 finished with value: 0.7320497663922343 and parameters: {'n_estimators': 806, 'learning_rate': 0.11256042152858702, 'max_depth': 4, 'num_leaves': 466, 'min_child_samples': 89, 'subsample': 0.8604366234505285, 'colsample_bytree': 0.7386291865546555, 'reg_alpha': 2.2402927835602533, 'reg_lambda': 1.8629086040997918, 'min_gain_to_split': 0.5389662510671419}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  35%|███▌      | 7/20 [00:06<00:11,  1.11it/s]

[I 2025-08-16 20:51:12,007] Trial 46 finished with value: 0.7813008890703319 and parameters: {'n_estimators': 967, 'learning_rate': 0.15186866305874255, 'max_depth': 3, 'num_leaves': 447, 'min_child_samples': 72, 'subsample': 0.892994754351547, 'colsample_bytree': 0.8769577947761467, 'reg_alpha': 1.3402768493360615, 'reg_lambda': 2.786461841716532, 'min_gain_to_split': 0.48076658877177797}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  40%|████      | 8/20 [00:07<00:12,  1.03s/it]

[I 2025-08-16 20:51:13,322] Trial 47 finished with value: 0.8154755930915606 and parameters: {'n_estimators': 1229, 'learning_rate': 0.07745816069871785, 'max_depth': 5, 'num_leaves': 499, 'min_child_samples': 39, 'subsample': 0.8515413969698934, 'colsample_bytree': 0.8405340130834232, 'reg_alpha': 0.5435976410339475, 'reg_lambda': 4.1778738498390915, 'min_gain_to_split': 0.6182004467204003}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  45%|████▌     | 9/20 [00:09<00:13,  1.21s/it]

[I 2025-08-16 20:51:14,929] Trial 48 finished with value: 0.8127020678823342 and parameters: {'n_estimators': 1462, 'learning_rate': 0.06157925862420463, 'max_depth': 4, 'num_leaves': 473, 'min_child_samples': 32, 'subsample': 0.824397832455759, 'colsample_bytree': 0.7171330772225282, 'reg_alpha': 1.6809239496192312, 'reg_lambda': 4.481805039557403, 'min_gain_to_split': 0.40630948970839376}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  50%|█████     | 10/20 [00:13<00:19,  2.00s/it]

[I 2025-08-16 20:51:18,688] Trial 49 finished with value: 0.7635758998435056 and parameters: {'n_estimators': 1400, 'learning_rate': 0.005191851369996218, 'max_depth': 3, 'num_leaves': 423, 'min_child_samples': 58, 'subsample': 0.7844678406820722, 'colsample_bytree': 0.7971547565994891, 'reg_alpha': 3.326546827874679, 'reg_lambda': 3.3775550906604472, 'min_gain_to_split': 0.3368848114524312}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  55%|█████▌    | 11/20 [00:13<00:14,  1.56s/it]

[I 2025-08-16 20:51:19,257] Trial 50 finished with value: 0.8088459095960626 and parameters: {'n_estimators': 506, 'learning_rate': 0.15758085190945054, 'max_depth': 6, 'num_leaves': 389, 'min_child_samples': 23, 'subsample': 0.8855875600644756, 'colsample_bytree': 0.7444374900586055, 'reg_alpha': 2.1201877676233205, 'reg_lambda': 5.340195401147394, 'min_gain_to_split': 0.5444946527619422}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 7. Best value: 0.842287:  60%|██████    | 12/20 [00:14<00:11,  1.38s/it]

[I 2025-08-16 20:51:20,240] Trial 51 finished with value: 0.8255340517339139 and parameters: {'n_estimators': 858, 'learning_rate': 0.1188919754644089, 'max_depth': 4, 'num_leaves': 485, 'min_child_samples': 26, 'subsample': 0.8336743203373794, 'colsample_bytree': 0.7379709017341635, 'reg_alpha': 0.5004488731192829, 'reg_lambda': 2.7764597405392344, 'min_gain_to_split': 0.5507858535284547}. Best is trial 7 with value: 0.8422871164183168.


Best trial: 52. Best value: 0.84495:  65%|██████▌   | 13/20 [00:15<00:08,  1.23s/it]

[I 2025-08-16 20:51:21,109] Trial 52 finished with value: 0.8449502165974916 and parameters: {'n_estimators': 766, 'learning_rate': 0.11671917327235631, 'max_depth': 4, 'num_leaves': 437, 'min_child_samples': 18, 'subsample': 0.8684394851165458, 'colsample_bytree': 0.7322297303008563, 'reg_alpha': 0.09720361799465871, 'reg_lambda': 2.6670786533019557, 'min_gain_to_split': 0.7132178862944913}. Best is trial 52 with value: 0.8449502165974916.


Best trial: 53. Best value: 0.846182:  70%|███████   | 14/20 [00:16<00:06,  1.05s/it]

[I 2025-08-16 20:51:21,756] Trial 53 finished with value: 0.8461823274591186 and parameters: {'n_estimators': 626, 'learning_rate': 0.1955679189112378, 'max_depth': 3, 'num_leaves': 438, 'min_child_samples': 17, 'subsample': 0.8688515233892402, 'colsample_bytree': 0.7701740801549516, 'reg_alpha': 0.0784834992072776, 'reg_lambda': 1.5993930984695741, 'min_gain_to_split': 0.7233957634609105}. Best is trial 53 with value: 0.8461823274591186.


Best trial: 53. Best value: 0.846182:  75%|███████▌  | 15/20 [00:17<00:04,  1.03it/s]

[I 2025-08-16 20:51:22,540] Trial 54 finished with value: 0.8369242022181398 and parameters: {'n_estimators': 713, 'learning_rate': 0.129830098992241, 'max_depth': 3, 'num_leaves': 440, 'min_child_samples': 20, 'subsample': 0.8654300853033664, 'colsample_bytree': 0.7147626964506908, 'reg_alpha': 0.23695372775894857, 'reg_lambda': 1.506429443507985, 'min_gain_to_split': 0.7244532311966527}. Best is trial 53 with value: 0.8461823274591186.


Best trial: 53. Best value: 0.846182:  80%|████████  | 16/20 [00:17<00:03,  1.10it/s]

[I 2025-08-16 20:51:23,316] Trial 55 finished with value: 0.8425043517951509 and parameters: {'n_estimators': 637, 'learning_rate': 0.10213203251155425, 'max_depth': 3, 'num_leaves': 430, 'min_child_samples': 18, 'subsample': 0.9453522879711864, 'colsample_bytree': 0.7846994329333423, 'reg_alpha': 0.00024156159878160888, 'reg_lambda': 1.398117815529629, 'min_gain_to_split': 0.7282390540115609}. Best is trial 53 with value: 0.8461823274591186.


Best trial: 56. Best value: 0.851109:  85%|████████▌ | 17/20 [00:18<00:02,  1.16it/s]

[I 2025-08-16 20:51:24,053] Trial 56 finished with value: 0.8511093533827765 and parameters: {'n_estimators': 596, 'learning_rate': 0.10616766366828562, 'max_depth': 3, 'num_leaves': 435, 'min_child_samples': 18, 'subsample': 0.9408099751164188, 'colsample_bytree': 0.7708730295926163, 'reg_alpha': 0.0518034735166127, 'reg_lambda': 1.068493188492761, 'min_gain_to_split': 0.7238500545020455}. Best is trial 56 with value: 0.8511093533827765.


Best trial: 56. Best value: 0.851109:  90%|█████████ | 18/20 [00:20<00:02,  1.24s/it]

[I 2025-08-16 20:51:26,192] Trial 57 finished with value: 0.8331212010387608 and parameters: {'n_estimators': 596, 'learning_rate': 0.006814631902571205, 'max_depth': 3, 'num_leaves': 435, 'min_child_samples': 18, 'subsample': 0.9478881286180544, 'colsample_bytree': 0.7869039005024427, 'reg_alpha': 0.04512567923441016, 'reg_lambda': 1.0748412784272352, 'min_gain_to_split': 0.7386373121276765}. Best is trial 56 with value: 0.8511093533827765.


Best trial: 56. Best value: 0.851109:  95%|█████████▌| 19/20 [00:21<00:01,  1.09s/it]

[I 2025-08-16 20:51:26,920] Trial 58 finished with value: 0.825326880769318 and parameters: {'n_estimators': 750, 'learning_rate': 0.13284756400617254, 'max_depth': 4, 'num_leaves': 361, 'min_child_samples': 18, 'subsample': 0.937759880056302, 'colsample_bytree': 0.7698291892314361, 'reg_alpha': 1.2703294827507472, 'reg_lambda': 1.4426141953806084, 'min_gain_to_split': 0.9293699165626232}. Best is trial 56 with value: 0.8511093533827765.


Best trial: 56. Best value: 0.851109: 100%|██████████| 20/20 [00:22<00:00,  1.11s/it]


[I 2025-08-16 20:51:27,546] Trial 59 finished with value: 0.7347912201469688 and parameters: {'n_estimators': 720, 'learning_rate': 0.10487639129380145, 'max_depth': 3, 'num_leaves': 417, 'min_child_samples': 21, 'subsample': 0.8703946848928927, 'colsample_bytree': 0.8079898726262869, 'reg_alpha': 5.934406535283339, 'reg_lambda': 1.5665105358041624, 'min_gain_to_split': 0.6986746517209014}. Best is trial 56 with value: 0.8511093533827765.
Best lightgbm_v2 AUC: 0.8511
Best lightgbm_v2 params: {'n_estimators': 596, 'learning_rate': 0.10616766366828562, 'max_depth': 3, 'num_leaves': 435, 'min_child_samples': 18, 'subsample': 0.9408099751164188, 'colsample_bytree': 0.7708730295926163, 'reg_alpha': 0.0518034735166127, 'reg_lambda': 1.068493188492761, 'min_gain_to_split': 0.7238500545020455}
Optimizing catboost_v2 with enhanced parameters...
Found existing study: catboost_v2_optimization


Best trial: 1. Best value: 0.881262:   5%|▌         | 1/20 [00:11<03:34, 11.27s/it]

[I 2025-08-16 20:51:38,833] Trial 40 finished with value: 0.859641210791318 and parameters: {'iterations': 1637, 'learning_rate': 0.016887744439027567, 'depth': 8, 'l2_leaf_reg': 1.9096612476255737, 'border_count': 107, 'bagging_temperature': 4.346032826557857, 'random_strength': 6.561570595735214}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  10%|█         | 2/20 [00:15<02:09,  7.22s/it]

[I 2025-08-16 20:51:43,216] Trial 41 finished with value: 0.8713272691705789 and parameters: {'iterations': 1094, 'learning_rate': 0.012124001159703239, 'depth': 6, 'l2_leaf_reg': 1.6230776654477503, 'border_count': 241, 'bagging_temperature': 7.965682991325959, 'random_strength': 8.46328201090728}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  15%|█▌        | 3/20 [00:20<01:44,  6.13s/it]

[I 2025-08-16 20:51:48,040] Trial 42 finished with value: 0.8718774097888458 and parameters: {'iterations': 1263, 'learning_rate': 0.013612863755213077, 'depth': 6, 'l2_leaf_reg': 2.3668794050022495, 'border_count': 215, 'bagging_temperature': 7.883804899404762, 'random_strength': 8.540608945201877}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  20%|██        | 4/20 [00:24<01:23,  5.24s/it]

[I 2025-08-16 20:51:51,920] Trial 43 finished with value: 0.8668786572089543 and parameters: {'iterations': 1453, 'learning_rate': 0.006579462894300343, 'depth': 5, 'l2_leaf_reg': 1.3721929365176542, 'border_count': 227, 'bagging_temperature': 7.00287287126839, 'random_strength': 7.861229583441085}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  25%|██▌       | 5/20 [00:27<01:06,  4.42s/it]

[I 2025-08-16 20:51:54,896] Trial 44 finished with value: 0.8655344911659977 and parameters: {'iterations': 1275, 'learning_rate': 0.022921053511179092, 'depth': 5, 'l2_leaf_reg': 2.7367504774786724, 'border_count': 139, 'bagging_temperature': 8.735315668540906, 'random_strength': 9.100010150911068}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  30%|███       | 6/20 [00:38<01:34,  6.72s/it]

[I 2025-08-16 20:52:06,071] Trial 45 finished with value: 0.8637931352203398 and parameters: {'iterations': 1814, 'learning_rate': 0.008712816087374396, 'depth': 7, 'l2_leaf_reg': 5.131964162829724, 'border_count': 242, 'bagging_temperature': 6.696686434561948, 'random_strength': 4.732385095371651}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  35%|███▌      | 7/20 [00:53<02:00,  9.28s/it]

[I 2025-08-16 20:52:20,620] Trial 46 finished with value: 0.8628730920142432 and parameters: {'iterations': 812, 'learning_rate': 0.015045543456679508, 'depth': 9, 'l2_leaf_reg': 3.7213403573615684, 'border_count': 199, 'bagging_temperature': 7.5130422581528356, 'random_strength': 9.984619142446288}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  40%|████      | 8/20 [00:57<01:34,  7.88s/it]

[I 2025-08-16 20:52:25,504] Trial 47 finished with value: 0.8679376176543967 and parameters: {'iterations': 1364, 'learning_rate': 0.017932246553079716, 'depth': 6, 'l2_leaf_reg': 3.344439227798677, 'border_count': 181, 'bagging_temperature': 5.733343462101902, 'random_strength': 8.824494575935823}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  45%|████▌     | 9/20 [01:03<01:19,  7.19s/it]

[I 2025-08-16 20:52:31,193] Trial 48 finished with value: 0.8504005210813996 and parameters: {'iterations': 946, 'learning_rate': 0.1232927556464615, 'depth': 7, 'l2_leaf_reg': 9.205428001761968, 'border_count': 221, 'bagging_temperature': 9.319773871033904, 'random_strength': 3.900858051166901}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  50%|█████     | 10/20 [01:10<01:09,  6.98s/it]

[I 2025-08-16 20:52:37,686] Trial 49 finished with value: 0.8638535216937697 and parameters: {'iterations': 1216, 'learning_rate': 0.006341589177241183, 'depth': 8, 'l2_leaf_reg': 7.632178544349049, 'border_count': 68, 'bagging_temperature': 8.347663695772438, 'random_strength': 7.487196280845241}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  55%|█████▌    | 11/20 [01:43<02:14, 14.92s/it]

[I 2025-08-16 20:53:10,617] Trial 50 finished with value: 0.8637541533419519 and parameters: {'iterations': 1646, 'learning_rate': 0.010317151405115582, 'depth': 9, 'l2_leaf_reg': 1.6962887492288712, 'border_count': 255, 'bagging_temperature': 3.6821228168488966, 'random_strength': 8.094417978927211}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  60%|██████    | 12/20 [01:47<01:32, 11.61s/it]

[I 2025-08-16 20:53:14,650] Trial 51 finished with value: 0.8614219738722187 and parameters: {'iterations': 1822, 'learning_rate': 0.01452021684181027, 'depth': 5, 'l2_leaf_reg': 8.176118954992962, 'border_count': 117, 'bagging_temperature': 0.7369440939548063, 'random_strength': 5.013976872562152}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  65%|██████▌   | 13/20 [01:49<01:01,  8.75s/it]

[I 2025-08-16 20:53:16,808] Trial 52 finished with value: 0.8651339700845979 and parameters: {'iterations': 1538, 'learning_rate': 0.02036304693916969, 'depth': 4, 'l2_leaf_reg': 6.03362399135431, 'border_count': 38, 'bagging_temperature': 1.3716923494333075, 'random_strength': 2.280566809876396}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  70%|███████   | 14/20 [02:48<02:23, 23.87s/it]

[I 2025-08-16 20:54:15,630] Trial 53 finished with value: 0.8471875637885283 and parameters: {'iterations': 1729, 'learning_rate': 0.009397790457933639, 'depth': 10, 'l2_leaf_reg': 7.257217552369398, 'border_count': 233, 'bagging_temperature': 0.12471914003981743, 'random_strength': 3.542713416833271}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  75%|███████▌  | 15/20 [03:03<01:46, 21.22s/it]

[I 2025-08-16 20:54:30,712] Trial 54 finished with value: 0.8460225017577283 and parameters: {'iterations': 1912, 'learning_rate': 0.03364829367999718, 'depth': 8, 'l2_leaf_reg': 8.506619086493663, 'border_count': 133, 'bagging_temperature': 1.799779303581525, 'random_strength': 9.546632843810116}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  80%|████████  | 16/20 [03:05<01:01, 15.50s/it]

[I 2025-08-16 20:54:32,923] Trial 55 finished with value: 0.8661993093828674 and parameters: {'iterations': 570, 'learning_rate': 0.027331548776359592, 'depth': 7, 'l2_leaf_reg': 9.038960076755773, 'border_count': 75, 'bagging_temperature': 2.6143999613123143, 'random_strength': 4.490614505007368}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  85%|████████▌ | 17/20 [03:07<00:34, 11.44s/it]

[I 2025-08-16 20:54:34,935] Trial 56 finished with value: 0.8692970929441384 and parameters: {'iterations': 1327, 'learning_rate': 0.04086307472815495, 'depth': 4, 'l2_leaf_reg': 2.1559947435864344, 'border_count': 60, 'bagging_temperature': 1.1808570148136, 'random_strength': 5.563149429913079}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  90%|█████████ | 18/20 [03:10<00:18,  9.04s/it]

[I 2025-08-16 20:54:38,380] Trial 57 finished with value: 0.8490810199360412 and parameters: {'iterations': 1053, 'learning_rate': 0.0978850751486101, 'depth': 6, 'l2_leaf_reg': 1.1407659579379261, 'border_count': 148, 'bagging_temperature': 4.853352056420288, 'random_strength': 9.043365644213294}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262:  95%|█████████▌| 19/20 [03:26<00:10, 10.89s/it]

[I 2025-08-16 20:54:53,583] Trial 58 finished with value: 0.8523563482343336 and parameters: {'iterations': 1436, 'learning_rate': 0.024179855106239965, 'depth': 8, 'l2_leaf_reg': 6.596364474884341, 'border_count': 246, 'bagging_temperature': 7.7548751221839725, 'random_strength': 0.8479913580282092}. Best is trial 1 with value: 0.8812619780680866.


Best trial: 1. Best value: 0.881262: 100%|██████████| 20/20 [03:42<00:00, 11.11s/it]

[I 2025-08-16 20:55:09,827] Trial 59 finished with value: 0.8502042650427526 and parameters: {'iterations': 505, 'learning_rate': 0.012620518589636446, 'depth': 10, 'l2_leaf_reg': 9.43726580052309, 'border_count': 217, 'bagging_temperature': 3.244865489306816, 'random_strength': 8.637763137817862}. Best is trial 1 with value: 0.8812619780680866.
Best catboost_v2 AUC: 0.8813
Best catboost_v2 params: {'iterations': 1800, 'learning_rate': 0.045918988705873284, 'depth': 8, 'l2_leaf_reg': 1.185260448662222, 'border_count': 249, 'bagging_temperature': 8.324426408004218, 'random_strength': 2.1233911067827616}

Advanced Optimization Results:
xgboost_v2: AUC = 0.8432
lightgbm_v2: AUC = 0.8511
catboost_v2: AUC = 0.8813

xgboost_v2 Study Results:
Best AUC: 0.8432
Best Params: {'n_estimators': 1189, 'learning_rate': 0.014204944704666236, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.9765285466327456, 'colsample_bytree': 0.8960690353489539, 'colsample_bylevel': 0.8630461236701165, 'reg_alph




In [84]:
"""
xgboost_v2 Study Results:
Best AUC: 0.8432
Best Params: {'n_estimators': 1189, 'learning_rate': 0.014204944704666236, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.9765285466327456, 'colsample_bytree': 0.8960690353489539, 'colsample_bylevel': 0.8630461236701165, 'reg_alpha': 0.029034135693057267, 'reg_lambda': 4.272454247302952, 'gamma': 0.0069277025975850864}

lightgbm_v2 Study Results:
Best AUC: 0.8511
Best Params: {'n_estimators': 596, 'learning_rate': 0.10616766366828562, 'max_depth': 3, 'num_leaves': 435, 'min_child_samples': 18, 'subsample': 0.9408099751164188, 'colsample_bytree': 0.7708730295926163, 'reg_alpha': 0.0518034735166127, 'reg_lambda': 1.068493188492761, 'min_gain_to_split': 0.7238500545020455}

catboost_v2 Study Results:
Best catboost_v2 AUC: 0.8911
Best catboost_v2 params: {'iterations': 1389, 'learning_rate': 0.005934530307791973, 'depth': 8, 'l2_leaf_reg': 2.5347171131856236, 'border_count': 46, 'bagging_temperature': 9.488855372533333, 'random_strength': 9.656320330745594}
"""

"\nxgboost_v2 Study Results:\nBest AUC: 0.8418\nBest Params: {'n_estimators': 1659, 'learning_rate': 0.01040697346842838, 'max_depth': 3, 'min_child_weight': 9, 'subsample': 0.9120572031542851, 'colsample_bytree': 0.9187021504122962, 'colsample_bylevel': 0.9313811040057837, 'reg_alpha': 0.7404465173409036, 'reg_lambda': 4.226191556898454, 'gamma': 0.5793452976256486}\n\nlightgbm_v2 Study Results:\nBest AUC: 0.8423\nBest Params: {'n_estimators': 1659, 'learning_rate': 0.01040697346842838, 'max_depth': 3, 'num_leaves': 414, 'min_child_samples': 74, 'subsample': 0.9187021504122962, 'colsample_bytree': 0.9313811040057837, 'reg_alpha': 0.7404465173409036, 'reg_lambda': 4.226191556898454, 'min_gain_to_split': 0.11586905952512971}\n\ncatboost_v2 Study Results:\nBest catboost_v2 AUC: 0.8911\nBest catboost_v2 params: {'iterations': 1389, 'learning_rate': 0.005934530307791973, 'depth': 8, 'l2_leaf_reg': 2.5347171131856236, 'border_count': 46, 'bagging_temperature': 9.488855372533333, 'random_str

In [85]:
# 4. Advanced Ensemble with Weighted Voting and Stacking - FIXED VERSION
class UltraAdvancedEnsemble:
    """Ultra-advanced ensemble with multiple techniques"""
    
    def __init__(self, random_state=42):
        self.random_state = random_state
        self.models = {}
        self.weights = {}
        self.meta_model = None
        
    def create_advanced_models(self, optimization_results):
        """Create models with advanced parameters"""
        models = {}
        
        for model_type, result in optimization_results.items():
            params = result['params'].copy()
            params['random_state'] = self.random_state
            
            if model_type == 'xgboost_v2':
                params['eval_metric'] = 'auc'
                params['verbosity'] = 0
                models[model_type] = xgb.XGBClassifier(**params)
            elif model_type == 'lightgbm_v2':
                params['verbosity'] = -1
                params['force_col_wise'] = True
                models[model_type] = lgb.LGBMClassifier(**params)
            elif model_type == 'catboost_v2':
                params['verbose'] = False
                models[model_type] = cb.CatBoostClassifier(**params)
                
        return models
    
    def fit_weighted_ensemble(self, X, y, models):
        """Fit ensemble with optimal weights"""
        from scipy.optimize import minimize
        
        # CRITICAL FIX: Convert to clean numpy arrays immediately
        print("Converting data to numpy arrays...")
        X_array = X.values if hasattr(X, 'values') else X
        y_array = y.values if hasattr(y, 'values') else y
        
        # Ensure clean indices for any DataFrame operations
        if hasattr(X, 'reset_index'):
            X = X.reset_index(drop=True)
        if hasattr(y, 'reset_index'):
            y = y.reset_index(drop=True)
        
        # Train all models
        trained_models = {}
        cv_scores = {}
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
        
        for name, model in models.items():
            print(f"Training {name}...")
            # Use numpy arrays for training to avoid DataFrame issues
            model.fit(X_array, y_array)
            trained_models[name] = model
            
            # Get CV scores for weighting using numpy arrays
            scores = []
            for train_idx, val_idx in cv.split(X_array, y_array):
                X_train_fold = X_array[train_idx]
                y_train_fold = y_array[train_idx]
                X_val_fold = X_array[val_idx]
                y_val_fold = y_array[val_idx]
                
                try:
                    model_clone = clone(model)
                    model_clone.fit(X_train_fold, y_train_fold)
                    y_pred_proba = model_clone.predict_proba(X_val_fold)[:, 1]
                    score = roc_auc_score(y_val_fold, y_pred_proba)
                    scores.append(score)
                except Exception as e:
                    print(f"Error in CV for {name}: {e}")
                    scores.append(0.5)  # Default score
            
            cv_scores[name] = np.mean(scores)
            print(f"{name} CV AUC: {cv_scores[name]:.4f}")
        
        # Optimize weights based on CV performance
        def objective(weights):
            weights = weights / np.sum(weights)  # Normalize
            ensemble_pred = np.zeros(len(y_array))
            
            try:
                for fold, (train_idx, val_idx) in enumerate(cv.split(X_array, y_array)):
                    X_train_fold = X_array[train_idx]
                    y_train_fold = y_array[train_idx]
                    X_val_fold = X_array[val_idx]
                    y_val_fold = y_array[val_idx]
                    
                    fold_preds = []
                    for i, (name, model) in enumerate(models.items()):
                        try:
                            model_clone = clone(model)
                            model_clone.fit(X_train_fold, y_train_fold)
                            pred_proba = model_clone.predict_proba(X_val_fold)[:, 1]
                            fold_preds.append(pred_proba)
                        except Exception as e:
                            print(f"Error in weight optimization for {name}: {e}")
                            fold_preds.append(np.full(len(y_val_fold), 0.5))
                    
                    if fold_preds:
                        weighted_pred = np.average(fold_preds, axis=0, weights=weights)
                        ensemble_pred[val_idx] = weighted_pred
                
                return -roc_auc_score(y_array, ensemble_pred)  # Minimize negative AUC
            except Exception as e:
                print(f"Error in objective function: {e}")
                return 1.0  # High value to minimize
        
        # Initial weights based on CV scores
        if cv_scores:
            initial_weights = np.array([cv_scores[name] for name in models.keys()])
            initial_weights = initial_weights / np.sum(initial_weights)
            
            # Optimize weights with error handling
            try:
                result = minimize(
                    objective, 
                    initial_weights, 
                    method='SLSQP',
                    bounds=[(0.1, 1.0) for _ in range(len(models))],
                    constraints={'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
                )
                optimal_weights = result.x / np.sum(result.x)
            except Exception as e:
                print(f"Weight optimization failed: {e}")
                print("Using equal weights...")
                optimal_weights = np.ones(len(models)) / len(models)
        else:
            print("No CV scores available, using equal weights...")
            optimal_weights = np.ones(len(models)) / len(models)
        
        self.models = trained_models
        self.weights = dict(zip(models.keys(), optimal_weights))
        
        print("\nOptimal weights:")
        for name, weight in self.weights.items():
            print(f"{name}: {weight:.3f}")
        
        return self
    
    def predict_proba_weighted(self, X):
        """Predict with weighted ensemble"""
        # Convert to numpy array to avoid DataFrame issues
        X_array = X.values if hasattr(X, 'values') else X
        
        predictions = []
        
        for name, model in self.models.items():
            try:
                pred_proba = model.predict_proba(X_array)[:, 1]
                predictions.append(pred_proba * self.weights[name])
            except Exception as e:
                print(f"Error predicting with {name}: {e}")
                predictions.append(np.full(X_array.shape[0], 0.5) * self.weights[name])
        
        return np.sum(predictions, axis=0)
    
    def predict_weighted(self, X):
        """Predict classes with weighted ensemble"""
        proba = self.predict_proba_weighted(X)
        return (proba > 0.5).astype(int)

# Create and train ultra-advanced ensemble with error handling
print("Creating ultra-advanced ensemble...")
ultra_ensemble = UltraAdvancedEnsemble(random_state=42)
advanced_models_dict = ultra_ensemble.create_advanced_models(advanced_results)

print("Fitting weighted ensemble...")
ultra_ensemble.fit_weighted_ensemble(X_train_final_v2, Y_train, advanced_models_dict)

print("Making predictions...")
# Make predictions
ultra_pred_proba = ultra_ensemble.predict_proba_weighted(X_test_final_v2)
ultra_pred = ultra_ensemble.predict_weighted(X_test_final_v2)

# Evaluate ultra ensemble
ultra_metrics = evaluator.calculate_metrics(Y_test, ultra_pred, ultra_pred_proba)
print(f"\nUltra Ensemble Results:")
print(f"AUC: {ultra_metrics['auc']:.4f}")
print(f"Accuracy: {ultra_metrics['accuracy']:.4f}")
print(f"F1: {ultra_metrics['f1']:.4f}")
print(f"MCC: {ultra_metrics['mcc']:.4f}")

Creating ultra-advanced ensemble...
Fitting weighted ensemble...
Converting data to numpy arrays...
Training xgboost_v2...
xgboost_v2 CV AUC: 0.8432
Training lightgbm_v2...
lightgbm_v2 CV AUC: 0.8511
Training catboost_v2...
catboost_v2 CV AUC: 0.8531

Optimal weights:
xgboost_v2: 0.331
lightgbm_v2: 0.334
catboost_v2: 0.335
Making predictions...

Ultra Ensemble Results:
AUC: 0.8363
Accuracy: 0.7833
F1: 0.4091
MCC: 0.3297


In [86]:
# 5. Additional Techniques: Pseudo-labeling and Data Augmentation - FIXED VERSION
def pseudo_labeling_enhancement(X_train, y_train, X_test, best_model, confidence_threshold=0.9):
    """Add high-confidence predictions as pseudo-labels"""
    
    # CRITICAL FIX: Convert to numpy arrays to avoid column name issues
    X_train_array = X_train.values if hasattr(X_train, 'values') else X_train
    y_train_array = y_train.values if hasattr(y_train, 'values') else y_train
    X_test_array = X_test.values if hasattr(X_test, 'values') else X_test
    
    # Train model on original data using numpy arrays
    best_model.fit(X_train_array, y_train_array)
    
    # Get predictions on test set
    test_proba = best_model.predict_proba(X_test_array)
    
    # Select high-confidence predictions
    max_proba = np.max(test_proba, axis=1)
    high_conf_mask = max_proba >= confidence_threshold
    
    if np.sum(high_conf_mask) > 0:
        # Add pseudo-labels using numpy arrays
        X_pseudo = X_test_array[high_conf_mask]
        y_pseudo = np.argmax(test_proba[high_conf_mask], axis=1)
        
        # Combine with training data (numpy arrays)
        X_enhanced = np.vstack([X_train_array, X_pseudo])
        y_enhanced = np.concatenate([y_train_array, y_pseudo])
        
        print(f"Added {len(y_pseudo)} pseudo-labels")
        return X_enhanced, y_enhanced
    else:
        print("No high-confidence predictions found")
        return X_train_array, y_train_array

# Also, let's check for and fix duplicate column names in our data
def fix_duplicate_columns(df):
    """Fix duplicate column names by adding suffixes"""
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values[1:]] = [dup + '_' + str(i) for i in range(1, sum(cols == dup))]
    df.columns = cols
    return df

# Fix duplicate columns in our datasets
print("Checking for duplicate columns...")
print(f"X_train_final_v2 duplicate columns: {X_train_final_v2.columns.duplicated().sum()}")
print(f"X_test_final_v2 duplicate columns: {X_test_final_v2.columns.duplicated().sum()}")

if X_train_final_v2.columns.duplicated().sum() > 0:
    print("Fixing duplicate columns...")
    X_train_final_v2 = fix_duplicate_columns(X_train_final_v2)
    X_test_final_v2 = fix_duplicate_columns(X_test_final_v2)
    print("Duplicate columns fixed!")

# Apply pseudo-labeling with best model
best_model_name = max(advanced_results.keys(), key=lambda x: advanced_results[x]['score'])
best_model = advanced_models_dict[best_model_name]

print(f"Using {best_model_name} for pseudo-labeling...")

X_train_pseudo, Y_train_pseudo = pseudo_labeling_enhancement(
    X_train_final_v2, Y_train, X_test_final_v2, best_model, confidence_threshold=0.95
)

# Retrain ultra ensemble with pseudo-labels if any were added
if len(Y_train_pseudo) > len(Y_train):
    print("Retraining with pseudo-labels...")
    
    # Create new ensemble for pseudo-labeled data
    ultra_ensemble_v2 = UltraAdvancedEnsemble(random_state=42)
    
    # For the ensemble, we need to use the original models but retrain them
    # Create fresh models to avoid any state issues
    advanced_models_dict_v2 = ultra_ensemble_v2.create_advanced_models(advanced_results)
    
    # Fit with pseudo-labeled data (using numpy arrays)
    ultra_ensemble_v2.fit_weighted_ensemble(X_train_pseudo, Y_train_pseudo, advanced_models_dict_v2)
    
    # Final predictions
    final_pred_proba = ultra_ensemble_v2.predict_proba_weighted(X_test_final_v2)
    final_pred = ultra_ensemble_v2.predict_weighted(X_test_final_v2)
    
    final_metrics = evaluator.calculate_metrics(Y_test, final_pred, final_pred_proba)
    print(f"\nFinal Enhanced Results:")
    print(f"AUC: {final_metrics['auc']:.4f}")
    print(f"Accuracy: {final_metrics['accuracy']:.4f}")
    print(f"F1: {final_metrics['f1']:.4f}")
    print(f"MCC: {final_metrics['mcc']:.4f}")
else:
    final_pred_proba = ultra_pred_proba
    final_pred = ultra_pred
    final_metrics = ultra_metrics
    print("No pseudo-labels added, using original ensemble results.")

Checking for duplicate columns...
X_train_final_v2 duplicate columns: 3
X_test_final_v2 duplicate columns: 3
Fixing duplicate columns...
Duplicate columns fixed!
Using catboost_v2 for pseudo-labeling...
Added 91 pseudo-labels
Retraining with pseudo-labels...
Converting data to numpy arrays...
Training xgboost_v2...
xgboost_v2 CV AUC: 0.8941
Training lightgbm_v2...
lightgbm_v2 CV AUC: 0.8902
Training catboost_v2...
catboost_v2 CV AUC: 0.8947

Optimal weights:
xgboost_v2: 0.334
lightgbm_v2: 0.332
catboost_v2: 0.334

Final Enhanced Results:
AUC: 0.8248
Accuracy: 0.7833
F1: 0.4091
MCC: 0.3297


In [87]:
# 6. Model Calibration for Better Probability Estimates
from sklearn.calibration import CalibratedClassifierCV

def calibrate_model_predictions(model, X_train, y_train, X_test, method='isotonic'):
    """Calibrate model predictions for better probability estimates"""
    
    # Create calibrated classifier
    calibrated_model = CalibratedClassifierCV(model, method=method, cv=5)
    calibrated_model.fit(X_train, y_train)
    
    # Get calibrated predictions
    calibrated_proba = calibrated_model.predict_proba(X_test)[:, 1]
    calibrated_pred = calibrated_model.predict(X_test)
    
    return calibrated_pred, calibrated_proba

# Calibrate the best individual model
best_model_calibrated = clone(best_model)
cal_pred, cal_pred_proba = calibrate_model_predictions(
    best_model_calibrated, X_train_final_v2, Y_train, X_test_final_v2
)

cal_metrics = evaluator.calculate_metrics(Y_test, cal_pred, cal_pred_proba)
print(f"\nCalibrated Model Results:")
print(f"AUC: {cal_metrics['auc']:.4f}")
print(f"Accuracy: {cal_metrics['accuracy']:.4f}")
print(f"F1: {cal_metrics['f1']:.4f}")
print(f"MCC: {cal_metrics['mcc']:.4f}")


Calibrated Model Results:
AUC: 0.8539
Accuracy: 0.8083
F1: 0.5306
MCC: 0.4349


In [88]:
# 7. Final Comparison and Selection
print("\n" + "="*50)
print("FINAL RESULTS COMPARISON")
print("="*50)

all_results = {
    'Original Best': results_df.loc[results_df['Test_auc'].idxmax(), 'Test_auc'],
    'Ultra Ensemble': ultra_metrics['auc'],
    'Calibrated Model': cal_metrics['auc'],
}

if len(Y_train_pseudo) > len(Y_train):
    all_results['Enhanced with Pseudo-labels'] = final_metrics['auc']

for method, auc in sorted(all_results.items(), key=lambda x: x[1], reverse=True):
    print(f"{method:25}: AUC = {auc:.4f}")

# Select best method
best_method = max(all_results.keys(), key=lambda x: all_results[x])
best_auc = all_results[best_method]

print(f"\nBest method: {best_method} with AUC = {best_auc:.4f}")

if best_auc >= 0.9:
    print("🎉 Target AUC of 0.9+ achieved!")
else:
    print(f"Current best AUC: {best_auc:.4f}")
    print("Consider running with more trials or trying additional techniques.")


FINAL RESULTS COMPARISON
Original Best            : AUC = 0.8926
Calibrated Model         : AUC = 0.8539
Ultra Ensemble           : AUC = 0.8363
Enhanced with Pseudo-labels: AUC = 0.8248

Best method: Original Best with AUC = 0.8926
Current best AUC: 0.8926
Consider running with more trials or trying additional techniques.


In [89]:
"""
==================================================
FINAL RESULTS COMPARISON
==================================================
Original Best            : AUC = 0.8926
Calibrated Model         : AUC = 0.8539
Enhanced with Pseudo-labels: AUC = 0.8456
Ultra Ensemble           : AUC = 0.8430

Best method: Original Best with AUC = 0.8926
"""



In [98]:
for model_name, result in optimizer.study_results.items():
    print("\nStudy Results:")
    print(f"Model: {model_name}")
    completed = [t for t in result.trials if t.state == optuna.trial.TrialState.COMPLETE]
    print(f"Number of completed trials: {len(completed)}")
    print(f"Best AUC: {result.best_value:.4f}")    


Study Results:
Model: balanced_rf
Number of completed trials: 60
Best AUC: 0.7830

Study Results:
Model: xgboost
Number of completed trials: 60
Best AUC: 0.7676

Study Results:
Model: lightgbm
Number of completed trials: 60
Best AUC: 0.7634


In [99]:
for model_name, result in advanced_optimizer.study_results.items():
    print("\nAdvanced Study Results:")
    print(f"Model: {model_name}")
    completed = [t for t in result.trials if t.state == optuna.trial.TrialState.COMPLETE]
    print(f"Number of completed trials: {len(completed)}")
    print(f"Best AUC: {result.best_value:.4f}")


Advanced Study Results:
Model: xgboost_v2
Number of completed trials: 60
Best AUC: 0.8432

Advanced Study Results:
Model: lightgbm_v2
Number of completed trials: 60
Best AUC: 0.8511

Advanced Study Results:
Model: catboost_v2
Number of completed trials: 60
Best AUC: 0.8813
