In [None]:
# Read the raw credit CSV into a DataFrame; the path is relative to the notebook's directory.
import pandas as pd
raw_csv_path = '../data/raw/credit.csv'
data = pd.read_csv(raw_csv_path)

## Feature engineering

In [8]:
# Split the data into features (X) and target (y), holding out 30% of the rows for testing.
# random_state is fixed so the split is reproducible across runs.
from sklearn.model_selection import train_test_split
X = data.drop('Class', axis=1)
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Normalize the data
# The scaler is fitted on the training rows only, so the mean/variance it learns
# never see the held-out rows (fitting on all of X would leak test-set statistics).
# The scaled arrays get their own names; X itself is left as the original DataFrame
# so this cell does not silently change the type or contents of an earlier variable.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model development

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd

# Candidate classifiers, keyed by the display name used when reporting results.
# All estimators share one seed so repeated runs are comparable.
MODEL_RANDOM_STATE = 42

models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=MODEL_RANDOM_STATE)
models['Random Forest'] = RandomForestClassifier(random_state=MODEL_RANDOM_STATE)
models['XGBoost'] = XGBClassifier(eval_metric='logloss', random_state=MODEL_RANDOM_STATE)
models['LightGBM'] = LGBMClassifier(random_state=MODEL_RANDOM_STATE, verbose=-1)

# Fit every candidate inside a scaling pipeline and score it on the held-out split.
# results maps model name -> {'Classification Report': dict, 'ROC AUC': float}.
results = {}

for name, model in models.items():
    # Scaling is a pipeline step, so it is fitted on X_train during pipeline.fit
    # and only applied (not refitted) to X_test at prediction time.
    pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('classifier', model)])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    # Second column of predict_proba: the score passed to roc_auc_score below.
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    report_dict = classification_report(y_test, y_pred, output_dict=True)
    auc_value = roc_auc_score(y_test, y_proba)
    results[name] = {'Classification Report': report_dict, 'ROC AUC': auc_value}

# Display results: one summary per model (name, ROC AUC, per-class report table).
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print("ROC AUC:", metrics['ROC AUC'])
    # Print the label on its own line. Passing the label and the DataFrame to one
    # print() puts the table's header row after the label, shifting it right so the
    # column names no longer line up with the values below.
    print("Classification Report:")
    # Transposed so each class / average is a row and each metric is a column.
    print(pd.DataFrame(metrics['Classification Report']).T)

Model: Logistic Regression
ROC AUC: 0.9998606385147815
Classification Report:               precision    recall  f1-score        support
0              0.998087  0.998908  0.998497   85149.000000
1              0.998911  0.998092  0.998501   85440.000000
accuracy       0.998499  0.998499  0.998499       0.998499
macro avg      0.998499  0.998500  0.998499  170589.000000
weighted avg   0.998500  0.998499  0.998499  170589.000000
Model: Random Forest
ROC AUC: 0.9999930099536247
Classification Report:               precision    recall  f1-score        support
0              0.999718  0.999918  0.999818   85149.000000
1              0.999918  0.999719  0.999819   85440.000000
accuracy       0.999818  0.999818  0.999818       0.999818
macro avg      0.999818  0.999818  0.999818  170589.000000
weighted avg   0.999818  0.999818  0.999818  170589.000000
Model: XGBoost
ROC AUC: 0.9999860812119913
Classification Report:               precision    recall  f1-score        support
0              0.

### Pick the best model

In [None]:
# Select the model with the highest ROC AUC among the evaluated candidates.
# The sentinel -1 is below any valid ROC AUC, so the first real score replaces it.
best_model, best_roc_auc = None, -1

for model_name, metrics in results.items():
    roc_auc = metrics["ROC AUC"]
    # Strictly-greater comparison: on a tie the earlier model is kept.
    best_model, best_roc_auc = (
        (model_name, roc_auc) if roc_auc > best_roc_auc else (best_model, best_roc_auc)
    )

# Report the winner and its score
print(f"Best Model: {best_model}")
print(f"ROC AUC: {best_roc_auc}")

Best Model: Random Forest
ROC AUC: 0.9999930099536247
