In [6]:
import pandas as pd
import numpy as np
from typing import Tuple, List, Optional, Dict
import matplotlib.pyplot as plt
import seaborn as sns
from dataclasses import dataclass

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

@dataclass
class FeatureImportance:
    """Class to store feature importance information."""
    feature_names: List[str]
    importance_values: Dict[str, np.ndarray]
    importance_types: Dict[str, str]

class DataLoader:
    """Class to handle data loading and initial preprocessing."""
    
    @staticmethod
    def load_data() -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.DataFrame]:
        """Load and combine the training and test datasets."""
        train = pd.read_csv('train.csv', index_col='id')
        test = pd.read_csv('test.csv', index_col='id')
        submission = pd.read_csv('sample_submission.csv', index_col='id')
        original_data = pd.read_csv('credit_risk_dataset.csv')
        
        train_df = pd.concat([train, original_data])
        X_train = train_df.drop(['loan_status'], axis=1)
        y_train = train_df['loan_status']
        
        return X_train, test, y_train, submission

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Custom preprocessor that handles both numerical and categorical features."""
    
    def __init__(self, categorical_features: Optional[List[str]] = None):
        self.categorical_features = categorical_features
        self.numerical_features = None
        self.preprocessor = None
        self.feature_names = None
    
    def get_feature_names(self) -> List[str]:
        """Return list of feature names after transformation."""
        return self.feature_names
    
    def fit(self, X: pd.DataFrame, y=None):
        """Fit the preprocessor on the training data."""
        if self.categorical_features is None:
            self.categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
        
        self.numerical_features = X.select_dtypes(include=['number']).columns.tolist()
        
        # Create preprocessing steps for numerical and categorical features
        numeric_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])
        
        categorical_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
        
        # Combine transformers
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ])
        
        self.preprocessor.fit(X)
        
        # Store feature names
        self.feature_names = self.numerical_features + self.categorical_features
        return self
    
    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """Transform the data."""
        return self.preprocessor.transform(X)

class FeatureImportanceAnalyzer:
    """Class to analyze and visualize feature importances."""
    
    @staticmethod
    def plot_feature_importance(importance: FeatureImportance, top_n: int = 20):
        """Plot feature importance for each model."""
        n_models = len(importance.importance_values)
        fig, axes = plt.subplots(1, n_models, figsize=(20, 8))
        
        for idx, (model_name, importance_vals) in enumerate(importance.importance_values.items()):
            # Sort features by importance
            sorted_idx = np.argsort(importance_vals)
            pos = np.arange(sorted_idx[-top_n:].shape[0]) + .5
            
            # Plot
            ax = axes[idx] if n_models > 1 else axes
            ax.barh(pos, importance_vals[sorted_idx[-top_n:]])
            ax.set_yticks(pos)
            ax.set_yticklabels(np.array(importance.feature_names)[sorted_idx[-top_n:]])
            ax.set_title(f'{model_name}\n({importance.importance_types[model_name]})')
            
        plt.tight_layout()
        return fig
    
    @staticmethod
    def get_feature_importance_summary(importance: FeatureImportance) -> pd.DataFrame:
        """Create a summary DataFrame of feature importances across all models."""
        summary_dict = {}
        
        for model_name, imp_values in importance.importance_values.items():
            # Normalize importance values
            normalized_imp = imp_values / np.sum(imp_values)
            summary_dict[f"{model_name}_importance"] = normalized_imp
        
        summary_df = pd.DataFrame(summary_dict, index=importance.feature_names)
        
        # Add mean importance across all models
        summary_df['mean_importance'] = summary_df.mean(axis=1)
        summary_df = summary_df.sort_values('mean_importance', ascending=False)
        
        return summary_df

class LoanApprovalModel:
    """Main model class that handles training and prediction."""
    
    def __init__(self, n_splits: int = 10, random_state: int = 42):
        self.n_splits = n_splits
        self.random_state = random_state
        self.preprocessor = CustomPreprocessor()
        self.models = {}
        self.fold_scores = {}
        self.feature_importance = None
    
    def _init_models(self):
        """Initialize the models with default parameters."""
        self.models = {
            'catboost': CatBoostClassifier(
                loss_function='Logloss',
                eval_metric='AUC',
                iterations=5000,
                early_stopping_rounds=200,
                task_type='GPU',
                random_seed=self.random_state,
                verbose=False
            ),
            'xgboost': XGBClassifier(
                objective='binary:logistic',
                eval_metric='auc',
                use_label_encoder=False,
                enable_categorical=True,
                n_estimators=5000,
                early_stopping_rounds=200,
                tree_method='hist',
                random_state=self.random_state
            ),
            'lightgbm': LGBMClassifier(
                objective='binary',
                metric='auc',
                n_estimators=5000,
                early_stopping_rounds=200,
                random_state=self.random_state,
                verbose=-1
            )
        }
    
    def _get_feature_importance(self, model_name: str, model) -> np.ndarray:
        """Extract feature importance from a model."""
        if model_name == 'catboost':
            return model.get_feature_importance()
        elif model_name == 'xgboost':
            return model.feature_importances_
        elif model_name == 'lightgbm':
            return model.feature_importances_
        return np.zeros(len(self.preprocessor.get_feature_names()))
    
    def train_and_evaluate(self, X: pd.DataFrame, y: pd.Series) -> dict:
        """Train models using cross-validation and return predictions."""
        self._init_models()
        X_processed = self.preprocessor.fit_transform(X)
        
        predictions = {}
        feature_importances = {model_name: np.zeros(len(self.preprocessor.get_feature_names())) 
                             for model_name in self.models.keys()}
        importance_types = {
            'catboost': 'Feature Importance',
            'xgboost': 'Gain',
            'lightgbm': 'Split'
        }
        
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        
        for model_name, model in self.models.items():
            print(f"\nTraining {model_name}")
            fold_predictions = pd.DataFrame()
            fold_scores = []
            
            for fold, (train_idx, val_idx) in enumerate(skf.split(X_processed, y)):
                print(f"Fold {fold + 1}/{self.n_splits}")
                
                X_train, X_val = X_processed[train_idx], X_processed[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
                
                # Train model
                fold_model = clone(model)
                fold_model.fit(X_train, y_train)
                
                # Validate
                val_pred = fold_model.predict_proba(X_val)[:, 1]
                fold_score = roc_auc_score(y_val, val_pred)
                fold_scores.append(fold_score)
                print(f"AUC score: {fold_score:.6f}")
                
                # Store predictions and feature importance
                fold_predictions[fold] = fold_model.predict_proba(X_processed)[:, 1]
                feature_importances[model_name] += self._get_feature_importance(model_name, fold_model)
            
            # Average feature importances across folds
            feature_importances[model_name] /= self.n_splits
            
            self.fold_scores[model_name] = fold_scores
            predictions[model_name] = fold_predictions
            
            print(f"\n{model_name} Results:")
            print(f"Mean AUC: {np.mean(fold_scores):.6f} (±{np.std(fold_scores):.6f})")
        
        # Store feature importance information
        self.feature_importance = FeatureImportance(
            feature_names=self.preprocessor.get_feature_names(),
            importance_values=feature_importances,
            importance_types=importance_types
        )
        
        return predictions
    
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Generate predictions for test data."""
        X_processed = self.preprocessor.transform(X)
        
        final_predictions = []
        for model_name, model in self.models.items():
            model_preds = model.predict_proba(X_processed)[:, 1]
            final_predictions.append(model_preds)
        
        # Average predictions from all models
        return np.mean(final_predictions, axis=0)

def main():
    # Initialize data loader and model
    data_loader = DataLoader()
    X_train, X_test, y_train, submission = data_loader.load_data()
    
    # Train model
    model = LoanApprovalModel(n_splits=3, random_state=42)
    predictions = model.train_and_evaluate(X_train, y_train)
    
    # Analyze feature importance
    analyzer = FeatureImportanceAnalyzer()
    importance_summary = analyzer.get_feature_importance_summary(model.feature_importance)
    print("\nTop 10 Most Important Features:")
    print(importance_summary.head(10))
    
    # Plot feature importance
    fig = analyzer.plot_feature_importance(model.feature_importance)
    plt.savefig('feature_importance.png', bbox_inches='tight', dpi=300)
    plt.close()
    
    # Generate test predictions
    test_predictions = model.predict(X_test)
    
    # Create submission
    submission['loan_status'] = test_predictions
    submission.to_csv('ensemble_submission.csv')
    print("\nSubmission file created: ensemble_submission.csv")
    return submission, importance_summary

if __name__ == "__main__":
    submission, importance_summary = main()


Training catboost
Fold 1/3


Default metric period is 5 because AUC is/are not implemented for GPU


AUC score: 0.954227
Fold 2/3


Default metric period is 5 because AUC is/are not implemented for GPU


AUC score: 0.948810
Fold 3/3


Default metric period is 5 because AUC is/are not implemented for GPU


AUC score: 0.949394

catboost Results:
Mean AUC: 0.950810 (±0.002428)

Training xgboost
Fold 1/3


Parameters: { "use_label_encoder" } are not used.



ValueError: Must have at least 1 validation dataset for early stopping.