
# Credit Risk Prediction using German Credit Dataset

This notebook develops a machine learning model that classifies loan applicants as **good** or **bad** credit risks using the German Credit dataset.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
import shap
import warnings
warnings.filterwarnings("ignore")


In [None]:

# Read the German Credit CSV (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="german_credit_data.csv")
df.head(5)


In [None]:

# Print the frame's structural summary, then show descriptive statistics
# for every column (numeric and non-numeric alike).
df.info()
df.describe(include="all")


In [None]:

# Count missing values per column.
# NOTE: this cell only inspects the data; nothing is imputed or dropped here.
df.isna().sum()


In [None]:

# Target distribution: how many rows fall into each 'Risk' class
fig_target, ax_target = plt.subplots()
sns.countplot(x='Risk', data=df, ax=ax_target)
ax_target.set_title("Credit Risk Distribution")
plt.show()

# Pairwise correlations, restricted to the numeric columns of the frame
numeric_corr = df.select_dtypes(include=[np.number]).corr()
fig_corr, ax_corr = plt.subplots(figsize=(10, 6))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax_corr)
ax_corr.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:

# Preprocessing: encode -> separate features/target -> split -> scale.
#
# The work is done on a copy (`df_encoded`) so the raw `df` shown in the
# earlier cells is left untouched and this cell can be re-run safely.
# (Mapping an already-mapped 'Risk' column a second time would turn every
# value into NaN.)
df_encoded = df.copy()

# Encode categorical (object-dtype) feature columns as integer codes.
# The target column is handled separately below so its coding is explicit.
categorical_cols = df_encoded.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col != 'Risk':
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Encode target: good -> 1, bad -> 0.
# `.map` silently yields NaN for any label missing from the dict, so fail
# loudly here instead of letting NaNs reach the estimators.
risk_codes = df_encoded['Risk'].map({'good': 1, 'bad': 0})
if risk_codes.isna().any():
    raise ValueError(
        "Column 'Risk' contains values other than 'good'/'bad'; cannot encode the target."
    )
df_encoded['Risk'] = risk_codes.astype(int)

# Feature and target separation
X = df_encoded.drop('Risk', axis=1)
y = df_encoded['Risk']

# Train-test split happens BEFORE scaling so that no test-set statistics
# leak into the scaler. `stratify=y` keeps the class proportions the same
# in the train and test partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features: fit on the training rows only, then apply the same
# transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix transformed with the train-fitted scaler; kept so the
# name `X_scaled` remains available to any code that expects it.
X_scaled = scaler.transform(X)


In [None]:

# Train several classifiers on the same split and print hold-out metrics.
# The stochastic estimators are seeded so the reported numbers are
# reproducible across Restart & Run All.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # The three scores below use scikit-learn's default positive label (1),
    # which is the 'good' class under the target encoding used in this notebook.
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    # Rows/columns are pinned to the order [0, 1], i.e. [bad, good].
    print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
    # Per-class breakdown, so performance on the 'bad' class (0) is visible
    # and not only the positive-class scores printed above.
    print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['bad', 'good']))
    print("-" * 40)


In [None]:

# Interpret model using SHAP (example with Random Forest)
rf_model = models["Random Forest"]
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_train)

# Position of class label 1 ('good' under this notebook's target encoding)
# among the classes the forest was fitted on.
good_idx = list(rf_model.classes_).index(1)

# The return layout of `shap_values` depends on the installed SHAP version:
# older releases give a list with one 2-D array per class, newer releases
# give a single array with the classes on the last axis. Plain `[1]` on the
# newer layout would select the second sample rather than class 1, so both
# layouts are handled explicitly.
if isinstance(shap_values, list):
    shap_values_good = shap_values[good_idx]
elif np.ndim(shap_values) == 3:
    shap_values_good = shap_values[..., good_idx]
else:
    # Already a single 2-D (samples x features) array.
    shap_values_good = shap_values

shap.summary_plot(shap_values_good, X_train, feature_names=list(X.columns))



## Key Insights and Recommendations

- **Top influential features** identified by SHAP can guide credit policy reviews.
- **Feature scaling and encoding** significantly impacted model accuracy and interpretability.
- **Model performance**: Random Forest and XGBoost performed best on this dataset.
- Consider **enhancing the dataset** with more recent or real-time financial indicators.
- Automate preprocessing and scoring for real-time credit risk assessment.
