In [None]:
# Read the raw CSV file into the `data` DataFrame used by the cells below
import pandas as pd

RAW_DATA_PATH = '../data/raw/credit.csv'
data = pd.read_csv(RAW_DATA_PATH)

## Feature engineering: train/test split and scaling

In [8]:
# Split the data into features (X) and target (y), then into train/test sets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = data.drop('Class', axis=1)
y = data['Class']
# 70/30 split with a fixed seed so the partition is reproducible.
# NOTE(review): if 'Class' is imbalanced, consider passing stratify=y — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the data.
# The scaler is fitted on the training rows only, so no statistics from the
# held-out test rows leak into preprocessing; the same fitted transform is
# then applied to the test rows. X, X_train and X_test themselves are left
# unscaled, and the scaled copies get their own names.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model development: training and evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import pandas as pd

# Define models (all seeded with random_state=42 for reproducibility)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42, verbose=-1)
}

# Evaluate models
# results[name] holds the metrics for each model; fitted_pipelines[name] keeps
# the fitted pipeline so the selected model can be reused without refitting.
results = {}
fitted_pipelines = {}

for name, model in models.items():
    # Scaling is part of the pipeline, so it is fitted on X_train only and
    # re-applied to X_test at prediction time.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Second column of predict_proba is used as the score for ROC AUC.
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    results[name] = {
        'Classification Report': classification_report(y_test, y_pred, output_dict=True),
        'ROC AUC': roc_auc_score(y_test, y_proba),
        # Scalar accuracy, stored so it can be used as a selection criterion.
        'Accuracy': accuracy_score(y_test, y_pred)
    }
    fitted_pipelines[name] = pipeline

# Display results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print("ROC AUC:", metrics['ROC AUC'])
    # Label and table go on separate lines so the table header stays aligned.
    print("Classification Report:", pd.DataFrame(metrics['Classification Report']).T, sep="\n")

In [None]:
def select_best_model(results, criterion="ROC AUC"):
    """
    Selects the best model based on a specified evaluation criterion.

    The best model is the one with the largest value for `criterion`. When
    several models share the largest value, the first one in the dictionary's
    iteration order is returned.

    Parameters:
    results (dict): A dictionary containing model names as keys and their evaluation metrics as values.
                    Example format:
                    {
                        'Model 1': {'ROC AUC': 0.95, 'Accuracy': 0.90},
                        'Model 2': {'ROC AUC': 0.96, 'Accuracy': 0.92}
                    }
    criterion (str): The evaluation metric to use for selecting the best model. Default is 'ROC AUC'.
                     Its values must be comparable with each other (e.g. numbers).

    Returns:
    str: The name of the best model.
    dict: The evaluation metrics of the best model.

    Raises:
    ValueError: If `results` is empty, if `criterion` is not among the first
                model's metrics, or if any model is missing `criterion`.
    """
    # Guard against empty input: without it, next(iter(...)) below would
    # raise a bare StopIteration.
    if not results:
        raise ValueError("No results provided: 'results' must contain at least one model.")

    # Validate criterion against the first model's metrics
    available = list(next(iter(results.values())).keys())
    if criterion not in available:
        raise ValueError(f"Invalid criterion '{criterion}'. Available options: {available}")

    # Every model must report the criterion; otherwise max() would fail with
    # a KeyError from inside the key function.
    missing = [name for name, metrics in results.items() if criterion not in metrics]
    if missing:
        raise ValueError(f"Criterion '{criterion}' is missing for model(s): {missing}")

    # Find the best model
    best_name, best_metrics = max(results.items(), key=lambda item: item[1][criterion])
    return best_name, best_metrics

# Example usage with the provided results.
# These values are hard-coded. They are stored under their own name so that
# the `results` dict computed by the model-evaluation loop is not overwritten.
example_results = {
    'Logistic Regression': {'ROC AUC': 0.9998606385147815, 'Accuracy': 0.998499},
    'Random Forest': {'ROC AUC': 0.9999930099536247, 'Accuracy': 0.999818},
    'XGBoost': {'ROC AUC': 0.9999860812119913, 'Accuracy': 0.999801},
    'LightGBM': {'ROC AUC': 0.9999028494410965, 'Accuracy': 0.999760}
}

# Choose the best model based on ROC AUC
best_model_name, best_model_metrics = select_best_model(example_results, criterion="ROC AUC")
print(f"Best Model: {best_model_name}")
print(f"Metrics: {best_model_metrics}")