## Credit Risk Modelling (Task 4)


This notebook walks through the process of model selection, training, and evaluation for the credit scoring model.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the pre-processed dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer="processed_data.csv")

In [None]:
# Target: the 'Label' column
y = df['Label']

# Features: every remaining column except the customer identifier and the
# transaction timestamp
X = df.drop(columns=['CustomerId', 'TransactionStartTime', 'Label'])

# 1. Model Selection and Training

In [None]:
# a. Split the Data
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features: the scaler is fitted on the training rows only
# and the same fitted scaler is then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### b. Choose Models
We'll compare two classifiers: Logistic Regression and Random Forest.

In [None]:
# b. Choose Models
# We'll use Logistic Regression and Random Forest

In [None]:
# c. Train the Models
# Baseline Logistic Regression: only the seed is set, everything else is left
# at the library defaults
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Baseline Random Forest: only the seed is set, everything else is left at the
# library defaults
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# d. Hyperparameter Tuning
# Logistic Regression: search regularisation strength and penalty type,
# selecting by 5-fold cross-validated ROC-AUC.
lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
# The default 'lbfgs' solver does not support the L1 penalty, so every 'l1'
# candidate would fail to fit and be scored NaN. 'liblinear' supports both
# penalties in the grid, so all 12 candidates are actually evaluated.
lr_grid_search = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=42),
    lr_param_grid,
    cv=5,
    scoring='roc_auc',
)
lr_grid_search.fit(X_train_scaled, y_train)

print("Best parameters for Logistic Regression:", lr_grid_search.best_params_)
# best_estimator_ is the winning configuration refitted on the full training set
lr_best_model = lr_grid_search.best_estimator_

In [None]:
# Random Forest: exhaustive search over forest size and tree-shape limits,
# selecting by 5-fold cross-validated ROC-AUC
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='roc_auc',
    cv=5,
)
rf_grid_search.fit(X_train_scaled, y_train)

# Keep the winning configuration, then report which settings won
rf_best_model = rf_grid_search.best_estimator_
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

In [None]:
# 2. Model Evaluation
def evaluate_model(model, X, y, model_name, pos_label='Good'):
    """Print classification metrics for a fitted model and return its positive-class scores.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict``, ``predict_proba`` and ``classes_``.
    X : array-like
        Feature matrix to score.
    y : array-like
        True labels aligned with the rows of ``X``.
    model_name : str
        Name shown in the printed report header.
    pos_label : label, default 'Good'
        Class treated as the positive outcome for precision, recall, F1 and
        ROC-AUC, and whose predicted probability is returned.

    Returns
    -------
    numpy.ndarray
        Predicted probability of ``pos_label`` for each row of ``X``.

    Raises
    ------
    ValueError
        If ``pos_label`` is not one of ``model.classes_``.
    """
    y_pred = model.predict(X)

    # predict_proba columns are ordered like model.classes_, so look the
    # positive class up instead of assuming it sits in column 1.
    class_labels = list(model.classes_)
    if pos_label not in class_labels:
        raise ValueError(
            f"pos_label={pos_label!r} is not one of the model classes {class_labels}"
        )
    y_pred_proba = model.predict_proba(X)[:, class_labels.index(pos_label)]

    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, pos_label=pos_label)
    recall = recall_score(y, y_pred, pos_label=pos_label)
    f1 = f1_score(y, y_pred, pos_label=pos_label)
    # Binarise the truth against pos_label so ROC-AUC ranks the same class the
    # probabilities refer to, independent of how the labels happen to sort.
    y_is_positive = (np.asarray(y) == pos_label).astype(int)
    roc_auc = roc_auc_score(y_is_positive, y_pred_proba)

    print(f"\nEvaluation Metrics for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")

    return y_pred_proba

In [None]:
# Report test-set metrics for the tuned Logistic Regression and keep its
# predicted probabilities for later use
lr_proba = evaluate_model(
    model=lr_best_model,
    X=X_test_scaled,
    y=y_test,
    model_name="Logistic Regression",
)

In [None]:
# Report test-set metrics for the tuned Random Forest and keep its predicted
# probabilities for later use
rf_proba = evaluate_model(
    model=rf_best_model,
    X=X_test_scaled,
    y=y_test,
    model_name="Random Forest",
)

In [None]:
# Plot ROC curves: compute the curve coordinates for both tuned models.
# The labels are strings (the metrics above use pos_label='Good'), and
# roc_curve refuses to guess the positive class for non-0/1 labels, so it is
# named explicitly. Assumes lr_proba / rf_proba hold the 'Good' probability.
# No figure is opened in this cell: with the inline backend a figure created
# here would be rendered empty as soon as the cell finishes.
fpr_lr, tpr_lr, _ = roc_curve(y_test, lr_proba, pos_label='Good')
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_proba, pos_label='Good')

In [None]:
# Create the figure in the same cell that draws on it, so the requested size
# applies to the plot that is actually rendered.
plt.figure(figsize=(10, 6))
plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {roc_auc_score(y_test, lr_proba):.2f})')
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {roc_auc_score(y_test, rf_proba):.2f})')
# Diagonal reference line: the expected curve of a classifier with no skill
plt.plot([0, 1], [0, 1], linestyle='--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

In [None]:
# Feature Importance for Random Forest
# Pair each feature name with the tuned forest's importance score and rank
# the pairs from most to least important
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': rf_best_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the 20 highest-ranked features
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance.head(20), x='importance', y='feature')
plt.title('Top 20 Feature Importances (Random Forest)')
plt.show()

Key Points:

- **Data Preparation:** The script assumes that you have already processed your data and saved it as "processed_data.csv". Make sure this file contains all the engineered features from the previous tasks.
- **Model Performance:** The evaluation metrics will give you an idea of how well each model is performing. In general:
  - Higher accuracy, precision, recall, F1 score, and ROC-AUC indicate better performance.
  - A ROC-AUC of 0.5 suggests no discrimination, 0.7-0.8 is acceptable, 0.8-0.9 is excellent, and >0.9 is outstanding.
- **ROC Curve:** The plot allows you to visually compare the performance of both models. The model whose curve is closer to the top-left corner performs better.
- **Feature Importance:** For the Random Forest model, we plot the top 20 most important features. This can provide insights into which factors are most crucial for credit scoring in your model.

In [None]:
import joblib

# Persist the tuned Random Forest (this assumes it outperformed Logistic Regression)
joblib.dump(value=rf_best_model, filename='credit_scoring_model.joblib')

# Persist the fitted scaler alongside the model
joblib.dump(value=scaler, filename='scaler.joblib')