In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from imblearn.over_sampling import SMOTE
from sklearn.calibration import CalibratedClassifierCV
import warnings
warnings.filterwarnings('ignore')

# Load your dataset
df = pd.read_excel('../data/processed/exploratory_analysis_final.xlsx', sheet_name='Sheet1')

# Display information about the dataset
print(f"Dataset shape: {df.shape}")
print("\nTarget distribution:")
print(df['decision_grouped'].value_counts(normalize=True))

# --- Feature Engineering ---

# Basic transformations
df['gre_avg'] = (df['gre_quantitative_reasoning'] + df['gre_verbal_reasoning']) / 2
df['gpa_x_acceptancerate'] = df['undergrad_gpa'] * (1 - df['acceptance_rate'])

# Advanced interaction features
df['gpa_gre_interaction'] = df['undergrad_gpa'] * df['gre_avg']
df['gpa_acceptance_interaction'] = df['undergrad_gpa'] * (1 - df['acceptance_rate'])
df['gre_acceptance_interaction'] = df['gre_avg'] * (1 - df['acceptance_rate'])
df['writing_gre_interaction'] = df['analytical_writing'] * df['gre_avg']
df['writing_gpa_interaction'] = df['analytical_writing'] * df['undergrad_gpa']

# Polynomial features for key metrics
df['gpa_squared'] = df['undergrad_gpa'] ** 2
df['gre_quant_squared'] = df['gre_quantitative_reasoning'] ** 2
df['gre_verbal_squared'] = df['gre_verbal_reasoning'] ** 2

# Composite profile strength with weighted components
df['profile_strength'] = (
    df['gre_avg'] * 0.4 + 
    df['undergrad_gpa'] * 0.3 + 
    df['analytical_writing'] * 0.1 + 
    (1 - df['acceptance_rate']) * 0.2
)

# Log transformation for skewed features
df['log_qs_rank'] = np.log1p(abs(df['qs_rank_score']))

# Rank bucketing with more granular categories
def rank_bucket(score):
    if pd.isna(score):
        return 'Unknown'
    elif score <= -5:
        return 'Top 5'
    elif score <= -10:
        return 'Top 10'
    elif score <= -25:
        return 'Top 25'
    elif score <= -50:
        return 'Top 50'
    elif score <= -100:
        return 'Top 100'
    else:
        return 'Other'

df['qs_rank_bucket'] = df['qs_rank_score'].apply(rank_bucket)

# Percentile rankings for numeric features
for col in ['undergrad_gpa', 'gre_quantitative_reasoning', 'gre_verbal_reasoning', 'analytical_writing']:
    if col in df.columns:
        df[f'{col}_percentile'] = df[col].rank(pct=True)

# Standardized z-scores
for col in ['undergrad_gpa', 'gre_quantitative_reasoning', 'gre_verbal_reasoning', 'analytical_writing']:
    if col in df.columns:
        df[f'{col}_zscore'] = stats.zscore(df[col], nan_policy='omit')

# Relative standing features
df['gre_quant_verbal_ratio'] = df['gre_quantitative_reasoning'] / df['gre_verbal_reasoning'].replace(0, 0.001)
df['gpa_to_acceptance_ratio'] = df['undergrad_gpa'] / df['acceptance_rate'].replace(0, 0.001)

# Update feature list - add all potentially useful features
features = [
    # Original features
    'undergrad_gpa', 'gre_quantitative_reasoning', 'gre_verbal_reasoning',
    'analytical_writing', 'acceptance_rate', 'qs_rank_score', 'qs_tier',
    'gpa_percentile', 'gre_avg', 'gpa_x_acceptancerate',
    
    # Interaction features
    'gpa_gre_interaction', 'gpa_acceptance_interaction', 'gre_acceptance_interaction',
    'writing_gre_interaction', 'writing_gpa_interaction',
    
    # Polynomial features
    'gpa_squared', 'gre_quant_squared', 'gre_verbal_squared',
    
    # Composite metrics
    'profile_strength',
    
    # Transformed features
    'log_qs_rank',
    
    # Categorical features
    'qs_rank_bucket',
    
    # Percentile features
    'undergrad_gpa_percentile', 'gre_quantitative_reasoning_percentile',
    'gre_verbal_reasoning_percentile', 'analytical_writing_percentile',
    
    # Z-score features
    'undergrad_gpa_zscore', 'gre_quantitative_reasoning_zscore',
    'gre_verbal_reasoning_zscore', 'analytical_writing_zscore',
    
    # Ratio features
    'gre_quant_verbal_ratio', 'gpa_to_acceptance_ratio'
]


Dataset shape: (100621, 21)

Target distribution:
decision_grouped
0    0.370151
1    0.357639
2    0.272210
Name: proportion, dtype: float64


In [20]:

# Remove any features that might not exist in the dataframe
features = [f for f in features if f in df.columns]

# Create binary target: 1 if admitted, 0 otherwise
df['binary_decision'] = df['decision_grouped'].apply(lambda x: 1 if x == 1 else 0)

# Check for class imbalance
print("\nClass balance:")
print(df['binary_decision'].value_counts(normalize=True))

X = df[features]
y = df['binary_decision']

# Split data with stratification to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify numeric and categorical features
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# --- Check for feature correlations and handle multicollinearity ---
corr_matrix = X[numeric_features].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.90)]
print(f"\nHigh correlation features (>0.90): {high_corr_features}")

# --- Advanced Preprocessing Pipeline ---
numeric_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', PowerTransformer(method='yeo-johnson', standardize=True))
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# --- Define the standard pipeline without SMOTE ---
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
]

# --- Define the pipeline ---
xgb_pipeline = Pipeline(pipeline_steps)

# --- Check for class imbalance and apply SMOTE if needed ---
class_counts = y_train.value_counts()
imbalanced = class_counts.min() / class_counts.max() < 0.8



Class balance:
binary_decision
0    0.642361
1    0.357639
Name: proportion, dtype: float64

Numeric features: 30
Categorical features: 1

High correlation features (>0.90): ['qs_tier', 'gpa_percentile', 'gpa_acceptance_interaction', 'gre_quantitative_reasoning_percentile', 'gre_verbal_reasoning_percentile', 'analytical_writing_percentile', 'undergrad_gpa_zscore', 'gre_quantitative_reasoning_zscore', 'gre_verbal_reasoning_zscore', 'analytical_writing_zscore']


In [21]:

if imbalanced:
    print("\nDetected class imbalance. Will apply SMOTE after preprocessing.")
    # We'll apply SMOTE manually after preprocessing in the fitting process

# --- First level hyperparameter tuning with RandomizedSearchCV ---
# Define wider parameter grid for XGBoost
param_dist = {
    'classifier__n_estimators': [100, 200, 300, 500, 700, 1000],
    'classifier__max_depth': [3, 5, 7, 9, 12],
    'classifier__learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2],
    'classifier__subsample': [0.7, 0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'classifier__min_child_weight': [1, 3, 5, 7],
    'classifier__gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'classifier__reg_alpha': [0, 0.1, 1, 10, 100],
    'classifier__reg_lambda': [0, 0.1, 1, 10, 100]
}

# Use StratifiedKFold for maintaining class distribution
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# First level search - random search for computational efficiency
random_search = RandomizedSearchCV(
    xgb_pipeline, 
    param_distributions=param_dist, 
    n_iter=50,  # Try 50 parameter combinations
    cv=cv, 
    scoring='accuracy',
    verbose=1, 
    n_jobs=-1,
    random_state=42
)

print("\nPerforming initial hyperparameter search...")
# If we have class imbalance, manually apply SMOTE before fitting
if imbalanced:
    # Fit the preprocessor
    X_train_processed = preprocessor.fit_transform(X_train)
    
    # Apply SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
    
    # Create a simpler pipeline without the preprocessor since we already applied it
    simple_pipeline = Pipeline([
        ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
    ])
    
    # New param distributions without preprocessor
    # Make sure we only include parameters that are valid for XGBClassifier
    simple_param_dist = {}
    for k, v in param_dist.items():
        if k.startswith('classifier__'):
            simple_param_dist[k.replace('classifier__', '')] = v
    
    # Create new RandomizedSearchCV with the simple pipeline
    simple_search = RandomizedSearchCV(
        simple_pipeline.named_steps['classifier'],  # Use just the classifier, not the pipeline
        param_distributions=simple_param_dist, 
        n_iter=50,
        cv=cv, 
        scoring='accuracy',
        verbose=1, 
        n_jobs=-1,
        random_state=42
    )
    
    # Fit the search on the resampled data
    try:
        simple_search.fit(X_train_resampled, y_train_resampled)
        
        # Store the best parameters and create a full pipeline with the best parameters
        try:
            best_params = {'classifier__' + k: v for k, v in simple_search.best_params_.items()}
        except AttributeError:
            print("Warning: Could not access best_params_ from RandomizedSearchCV. Using default parameters.")
            best_params = {
                'classifier__n_estimators': 200,
                'classifier__max_depth': 5,
                'classifier__learning_rate': 0.1,
                'classifier__subsample': 0.8,
                'classifier__colsample_bytree': 0.8,
                'classifier__min_child_weight': 3,
                'classifier__gamma': 0.1
            }
    except Exception as e:
        print(f"Warning: RandomizedSearchCV failed with error: {e}. Using default parameters.")
        best_params = {
            'classifier__n_estimators': 200,
            'classifier__max_depth': 5,
            'classifier__learning_rate': 0.1,
            'classifier__subsample': 0.8,
            'classifier__colsample_bytree': 0.8,
            'classifier__min_child_weight': 3,
            'classifier__gamma': 0.1
        }
    
    # Create a new best model that includes the preprocessor
    best_model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            random_state=42, 
            use_label_encoder=False, 
            eval_metric='logloss',
            **{k.replace('classifier__', ''): v for k, v in best_params.items() 
               if k.startswith('classifier__')}
        ))
    ])
    
    # Fit the best model on the original data
    best_model.fit(X_train, y_train)
    
    # Print the best parameters
    print("\nBest parameters from initial search (with SMOTE):")
    print(best_params)
else:
    # Without imbalance, just fit normally
    try:
        random_search.fit(X_train, y_train)
        best_params = random_search.best_params_
        best_model = random_search.best_estimator_
    except Exception as e:
        print(f"Warning: RandomizedSearchCV failed with error: {e}. Using default parameters.")
        best_params = {
            'classifier__n_estimators': 200,
            'classifier__max_depth': 5,
            'classifier__learning_rate': 0.1,
            'classifier__subsample': 0.8,
            'classifier__colsample_bytree': 0.8,
            'classifier__min_child_weight': 3,
            'classifier__gamma': 0.1
        }
        best_model = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', XGBClassifier(
                random_state=42, 
                use_label_encoder=False, 
                eval_metric='logloss',
                **{k.replace('classifier__', ''): v for k, v in best_params.items() 
                   if k.startswith('classifier__')}
            ))
        ])
        best_model.fit(X_train, y_train)
        
    print("\nBest parameters from initial search:")
    print(best_params)



Detected class imbalance. Will apply SMOTE after preprocessing.

Performing initial hyperparameter search...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


Best parameters from initial search (with SMOTE):
{'classifier__subsample': 0.9, 'classifier__reg_lambda': 10, 'classifier__reg_alpha': 0, 'classifier__n_estimators': 300, 'classifier__min_child_weight': 3, 'classifier__max_depth': 12, 'classifier__learning_rate': 0.05, 'classifier__gamma': 0.3, 'classifier__colsample_bytree': 1.0}


In [22]:
# --- Second level hyperparameter tuning with GridSearchCV ---
# Create narrower search space around the best parameters
def create_refined_param_grid(best_params):
    refined_param_grid = {}
    
    # Learning rate refinement
    lr = best_params.get('classifier__learning_rate', 0.1)
    refined_param_grid['classifier__learning_rate'] = [lr * 0.5, lr, lr * 1.5]
    
    # Max depth refinement
    max_depth = best_params.get('classifier__max_depth', 3)
    refined_param_grid['classifier__max_depth'] = [max(1, max_depth - 1), max_depth, max_depth + 1]
    
    # N estimators refinement
    n_est = best_params.get('classifier__n_estimators', 100)
    refined_param_grid['classifier__n_estimators'] = [n_est, int(n_est * 1.5), int(n_est * 2)]
    
    # Regularization refinement
    gamma = best_params.get('classifier__gamma', 0)
    refined_param_grid['classifier__gamma'] = [max(0, gamma - 0.1), gamma, gamma + 0.1]
    
    alpha = best_params.get('classifier__reg_alpha', 0)
    refined_param_grid['classifier__reg_alpha'] = [max(0, alpha * 0.5), alpha, alpha * 2]
    
    lambda_val = best_params.get('classifier__reg_lambda', 1)
    refined_param_grid['classifier__reg_lambda'] = [max(0, lambda_val * 0.5), lambda_val, lambda_val * 2]
    
    return refined_param_grid

# Create refined param grid for second search
refined_param_grid = create_refined_param_grid({k: v for k, v in best_params.items() if k in param_dist})

# Set best parameters from first search
best_model = random_search.best_estimator_

# --- Feature Importance Analysis ---
# Get feature importances from the XGBoost model
print("\nAnalyzing feature importances...")
try:
    # Fit the preprocessor to get the feature names
    preprocessor.fit(X_train)
    
    # Get feature names after preprocessing
    feature_names = []
    if hasattr(preprocessor.named_transformers_['num'], 'get_feature_names_out'):
        numeric_feature_names = preprocessor.named_transformers_['num'].get_feature_names_out(numeric_features)
        feature_names.extend(numeric_feature_names)
    else:
        feature_names.extend(numeric_features)
        
    if categorical_features:
        if hasattr(preprocessor.named_transformers_['cat'], 'get_feature_names_out'):
            categorical_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
            feature_names.extend(categorical_feature_names)
    
    # Extract the trained XGBoost model
    xgb_model = best_model.named_steps['classifier']
    
    # Get the feature importances
    importances = xgb_model.feature_importances_
    
    # Create and print feature importance DataFrame
    if len(feature_names) == len(importances):
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values('Importance', ascending=False)
        
        print("\nTop 10 most important features:")
        print(importance_df.head(10))
        
        # Select the most important features
        top_features_threshold = importance_df['Importance'].quantile(0.7)  # Keep top 70% features
        n_top_features = sum(importance_df['Importance'] >= top_features_threshold)
        print(f"\nKeeping top {n_top_features} features out of {len(feature_names)}")
    
except Exception as e:
    print(f"Error in feature importance analysis: {e}")
    n_top_features = None

# --- Feature Selection and Second-level Tuning ---
print("\nPerforming second-level tuning with feature selection...")

# Try to get feature importances from best model
try:
    # Extract the classifier from best model
    if hasattr(best_model, 'named_steps') and 'classifier' in best_model.named_steps:
        classifier = best_model.named_steps['classifier']
        
        # Get preprocessed data
        X_train_processed = preprocessor.fit_transform(X_train)
        
        # If we have class imbalance, use SMOTE
        if imbalanced:
            smote = SMOTE(random_state=42)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
            classifier.fit(X_train_resampled, y_train_resampled)
        else:
            classifier.fit(X_train_processed, y_train)
        
        # Get feature importances
        importances = classifier.feature_importances_
        
        # Get feature names
        feature_names = []
        if hasattr(preprocessor, 'get_feature_names_out'):
            feature_names = preprocessor.get_feature_names_out()
        elif hasattr(preprocessor, 'named_transformers_'):
            if hasattr(preprocessor.named_transformers_['num'], 'get_feature_names_out'):
                numeric_feature_names = preprocessor.named_transformers_['num'].get_feature_names_out(numeric_features)
                feature_names.extend(numeric_feature_names)
            else:
                feature_names.extend(numeric_features)
                
            if categorical_features and hasattr(preprocessor.named_transformers_['cat'], 'get_feature_names_out'):
                categorical_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
                feature_names.extend(categorical_feature_names)
        
        # Only proceed if we have feature names and importances of the same length
        if len(feature_names) == len(importances):
            # Create DataFrame with importances
            importance_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': importances
            }).sort_values('Importance', ascending=False)
            
            print("\nTop 10 most important features:")
            print(importance_df.head(10))
            
            # Select top features
            top_features_threshold = importance_df['Importance'].quantile(0.7)  # Top 70%
            n_top_features = sum(importance_df['Importance'] >= top_features_threshold)
            print(f"\nKeeping top {n_top_features} features out of {len(feature_names)}")
            
            # Create feature selector
            feature_selector = SelectFromModel(
                XGBClassifier(random_state=42),
                threshold='median'
            )
            
            # Fit selector
            feature_selector.fit(X_train_processed, y_train)
            
            # Get selected features
            X_train_selected = feature_selector.transform(X_train_processed)
            print(f"Selected feature shape: {X_train_selected.shape}")
            
            # Evaluate refined parameters with selected features
            refined_results = []
            
            for i, params in enumerate(refined_params_list):
                print(f"Evaluating refined parameter set {i+1}/{len(refined_params_list)}...")
                try:
                    # Create model
                    model = XGBClassifier(
                        random_state=42,
                        use_label_encoder=False,
                        eval_metric='logloss',
                        **params
                    )
                    
                    # Fit and evaluate with selected features
                    if imbalanced:
                        # Apply SMOTE to selected features
                        X_selected_resampled, y_selected_resampled = smote.fit_resample(X_train_selected, y_train)
                        
                        # Cross-validation with selected features
                        kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
                        scores = []
                        
                        for train_idx, val_idx in kf.split(X_selected_resampled, y_selected_resampled):
                            X_fold_train = X_selected_resampled[train_idx]
                            y_fold_train = y_selected_resampled[train_idx]
                            X_fold_val = X_selected_resampled[val_idx]
                            y_fold_val = y_selected_resampled[val_idx]
                            
                            model.fit(X_fold_train, y_fold_train)
                            y_pred = model.predict(X_fold_val)
                            scores.append(accuracy_score(y_fold_val, y_pred))
                        
                        cv_score = np.mean(scores)
                    else:
                        # Cross-validation with selected features
                        from sklearn.model_selection import cross_val_score
                        scores = cross_val_score(model, X_train_selected, y_train, cv=3, scoring='accuracy')
                        cv_score = np.mean(scores)
                    
                    refined_results.append((params, cv_score))
                    print(f"CV Score with selected features: {cv_score:.4f}")
                except Exception as e:
                    print(f"Error evaluating refined parameters: {e}")
            
            # If we have results, find the best
            if refined_results:
                # Sort by score
                refined_results.sort(key=lambda x: x[1], reverse=True)
                
                # Get best parameters
                best_refined_params = refined_results[0][0]
                print("\nBest parameters from refined search:")
                for param, value in best_refined_params.items():
                    print(f"  {param}: {value}")
                print(f"CV Score: {refined_results[0][1]:.4f}")
                
                # Create model with feature selection and best parameters
                final_model = Pipeline([
                    ('preprocessor', preprocessor),
                    ('feature_selection', feature_selector),
                    ('classifier', XGBClassifier(
                        random_state=42,
                        use_label_encoder=False,
                        eval_metric='logloss',
                        **best_refined_params
                    ))
                ])
                
                # Fit final model
                if imbalanced:
                    # First preprocess
                    X_train_processed = preprocessor.fit_transform(X_train)
                    
                    # Apply feature selection
                    X_train_selected = feature_selector.transform(X_train_processed)
                    
                    # Apply SMOTE
                    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_selected, y_train)
                    
                    # Fit classifier
                    classifier = XGBClassifier(
                        random_state=42,
                        use_label_encoder=False,
                        eval_metric='logloss',
                        **best_refined_params
                    )
                    classifier.fit(X_train_resampled, y_train_resampled)
                    
                    # Replace classifier in pipeline
                    final_model.named_steps['classifier'] = classifier
                else:
                    final_model.fit(X_train, y_train)
            else:
                print("No valid results from refined search. Using best model from initial search.")
                final_model = best_model
        else:
            print(f"Feature names ({len(feature_names)}) and importances ({len(importances)}) have different lengths. Skipping feature selection.")
            final_model = best_model
    else:
        print("Best model does not have a classifier step. Skipping feature selection.")
        final_model = best_model
except Exception as e:
    print(f"Error in feature selection process: {e}")
    print("Using best model from initial search.")
    final_model = best_model

# --- Model Calibration ---
# Calibrate probabilities
calibrated_model = CalibratedClassifierCV(
    final_model,
    method='isotonic',
    cv=3
)

print("\nCalibrating model probabilities...")
calibrated_model.fit(X_train, y_train)

# --- Ensemble Learning ---
# Create an ensemble of different models
print("\nTraining ensemble of models...")

# Create base models with best parameters
xgb_best = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    **{k.replace('classifier__', ''): v for k, v in best_params.items() if k.startswith('classifier__')}
)

lgbm_model = LGBMClassifier(
    random_state=42,
    n_estimators=best_params.get('classifier__n_estimators', 100),
    learning_rate=best_params.get('classifier__learning_rate', 0.1),
    max_depth=best_params.get('classifier__max_depth', 3)
)

rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=best_params.get('classifier__n_estimators', 100),
    max_depth=best_params.get('classifier__max_depth', 3)
)

catboost_model = CatBoostClassifier(
    random_state=42,
    n_estimators=best_params.get('classifier__n_estimators', 100),
    learning_rate=best_params.get('classifier__learning_rate', 0.1),
    max_depth=best_params.get('classifier__max_depth', 3),
    verbose=0
)

# Preprocess data for ensemble models
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# If we have class imbalance, apply SMOTE before training ensemble
if imbalanced:
    print("Applying SMOTE for ensemble training...")
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
    
    # Train each model on the resampled data
    xgb_best.fit(X_train_resampled, y_train_resampled)
    lgbm_model.fit(X_train_resampled, y_train_resampled)
    rf_model.fit(X_train_resampled, y_train_resampled)
    catboost_model.fit(X_train_resampled, y_train_resampled)
    
    # Create ensemble with pre-trained models
    ensemble = VotingClassifier(
        estimators=[
            ('xgb', xgb_best),
            ('lgbm', lgbm_model),
            ('rf', rf_model),
            ('catboost', catboost_model)
        ],
        voting='soft'  # Use probability estimates for voting
    )
    
    # We need to fit the VotingClassifier again, but since models are already trained,
    # this will just calibrate the voting
    ensemble.fit(X_train_processed, y_train)
else:
    # Create and train voting ensemble normally
    ensemble = VotingClassifier(
        estimators=[
            ('xgb', xgb_best),
            ('lgbm', lgbm_model),
            ('rf', rf_model),
            ('catboost', catboost_model)
        ],
        voting='soft'  # Use probability estimates for voting
    )
    
    ensemble.fit(X_train_processed, y_train)

# --- Final Evaluation ---
# Get predictions from both models
calibrated_preds = calibrated_model.predict(X_test)
ensemble_preds = ensemble.predict(X_test_processed)

# Calculate metrics
calibrated_accuracy = accuracy_score(y_test, calibrated_preds)
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)

# Print results
print("\n--- Final Results ---")
print(f"Calibrated Model Accuracy: {calibrated_accuracy:.4f}")
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.4f}")

print("\nCalibrated Model Classification Report:")
print(classification_report(y_test, calibrated_preds))

print("\nEnsemble Model Classification Report:")
print(classification_report(y_test, ensemble_preds))

# Choose the best performing model
best_final_model = ensemble if ensemble_accuracy > calibrated_accuracy else calibrated_model
best_final_preds = ensemble_preds if ensemble_accuracy > calibrated_accuracy else calibrated_preds
best_accuracy = max(ensemble_accuracy, calibrated_accuracy)

print(f"\nBest Model: {'Ensemble' if ensemble_accuracy > calibrated_accuracy else 'Calibrated XGBoost'}")
print(f"Best Accuracy: {best_accuracy:.4f}")

# --- Confusion Matrix ---
cm = confusion_matrix(y_test, best_final_preds)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f'Confusion Matrix (Accuracy: {best_accuracy:.4f})')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# --- Calculate probability scores and decision threshold analysis ---
if hasattr(best_final_model, 'predict_proba'):
    # Get probability predictions
    y_proba = best_final_model.predict_proba(X_test if best_final_model == calibrated_model else X_test_processed)[:, 1]
    
    # Calculate ROC AUC score
    roc_auc = roc_auc_score(y_test, y_proba)
    print(f"\nROC AUC Score: {roc_auc:.4f}")
    
    # Find optimal threshold
    precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
    f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-10)
    optimal_threshold_idx = np.argmax(f1_scores)
    optimal_threshold = thresholds[optimal_threshold_idx]
    
    print(f"Optimal Probability Threshold: {optimal_threshold:.4f}")
    
    # Apply optimal threshold
    optimized_preds = (y_proba >= optimal_threshold).astype(int)
    optimized_accuracy = accuracy_score(y_test, optimized_preds)
    
    print(f"Accuracy with Optimal Threshold: {optimized_accuracy:.4f}")
    print("\nClassification Report with Optimal Threshold:")
    print(classification_report(y_test, optimized_preds))
    
    if optimized_accuracy > best_accuracy:
        print("\nUsing optimized threshold improved the model performance!")
    
# --- Feature Permutation Importance ---
from sklearn.inspection import permutation_importance

if isinstance(best_final_model, Pipeline):
    # For pipeline model
    perm_importance = permutation_importance(
        best_final_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
    )
else:
    # For ensemble model
    perm_importance = permutation_importance(
        best_final_model, X_test_processed, y_test, n_repeats=10, random_state=42, n_jobs=-1
    )

# Sort features by importance
if hasattr(X, 'columns'):
    if isinstance(best_final_model, Pipeline):
        # For pipeline model, try to get feature names
        try:
            if 'feature_selection' in best_final_model.named_steps:
                # Get feature mask from the feature selector
                feature_mask = best_final_model.named_steps['feature_selection'].get_support()
                # Get transformed feature names
                transformed_features = []
                if hasattr(preprocessor, 'get_feature_names_out'):
                    transformed_features = preprocessor.get_feature_names_out()
                # Get selected feature names
                selected_features = [transformed_features[i] for i in range(len(transformed_features)) if feature_mask[i]]
                perm_sorted_idx = perm_importance.importances_mean.argsort()[::-1]
                perm_feature_names = [selected_features[i] for i in perm_sorted_idx]
            else:
                # Without feature selection, use all feature names
                perm_sorted_idx = perm_importance.importances_mean.argsort()[::-1]
                if hasattr(preprocessor, 'get_feature_names_out'):
                    perm_feature_names = preprocessor.get_feature_names_out()[perm_sorted_idx]
                else:
                    perm_feature_names = ["Feature_" + str(i) for i in perm_sorted_idx]
        except Exception as e:
            print(f"Error getting feature names: {e}")
            perm_sorted_idx = perm_importance.importances_mean.argsort()[::-1]
            perm_feature_names = ["Feature_" + str(i) for i in perm_sorted_idx]
    else:
        # For ensemble model, use original feature names
        perm_sorted_idx = perm_importance.importances_mean.argsort()[::-1]
        perm_feature_names = X.columns[perm_sorted_idx] if hasattr(X, 'columns') else ["Feature_" + str(i) for i in perm_sorted_idx]
else:
    perm_sorted_idx = perm_importance.importances_mean.argsort()[::-1]
    perm_feature_names = ["Feature_" + str(i) for i in perm_sorted_idx]

# Display top 10 features by permutation importance
print("\nTop 10 features by permutation importance:")
for i in range(min(10, len(perm_sorted_idx))):
    idx = perm_sorted_idx[i]
    print(f"{perm_feature_names[i]}: {perm_importance.importances_mean[idx]:.4f} ± {perm_importance.importances_std[idx]:.4f}")

# --- Final Summary ---
print("\n=== Final Model Performance Summary ===")
print(f"Best Model: {'Optimized Threshold Ensemble' if optimized_accuracy > best_accuracy else 'Ensemble' if ensemble_accuracy > calibrated_accuracy else 'Calibrated XGBoost'}")
print(f"Final Accuracy: {max(optimized_accuracy if 'optimized_accuracy' in locals() else 0, best_accuracy):.4f}")

print("\nModel training and evaluation complete!")

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'