# Project : German Credit Risk Analysis

In [1]:
# Install ucimlrepo if it is not already installed
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: C:\Users\liuja\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


### Import required libraries

In [2]:
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

### Gather and Structure the Data

In [3]:
# Load dataset (German Credit Data from UCI)
statlog_data = fetch_ucirepo(id=144)
X = statlog_data.data.features
y = statlog_data.data.targets

In [4]:
# Rename columns based on UCI documentation
X.columns = [
    "Status_checking_account", "Duration_month", "Credit_history", "Purpose",
    "Credit_amount", "Savings_account", "Employment_since", "Installment_rate",
    "Personal_status_sex", "Other_debtors_guarantors", "Residence_since",
    "Property", "Age_years", "Other_installment_plans", "Housing",
    "Existing_credits", "Job", "Num_liable", "Telephone", "Foreign_worker"
]

# Convert target: 1 = good credit, 2 = bad credit
y = y.replace({1: "Good", 2: "Bad"})
if isinstance(y, pd.DataFrame):
    y = y.rename(columns={y.columns[0]: "Credit_risk"})
else:
    y = y.rename("Credit_risk")

# Combine features and target
df = pd.concat([X, y], axis=1)

### 1. Descriptive Analysis

In [5]:
print("Dataset overview")
print(df.head(), "\n")
print("Shape:", df.shape, "\n")
print("Data types:\n", df.dtypes, "\n")
print("Target distribution:\n", df["Credit_risk"].value_counts(), "\n")
print("Numeric summary:\n", df.describe(), "\n")

Dataset overview
  Status_checking_account  Duration_month Credit_history Purpose  \
0                     A11               6            A34     A43   
1                     A12              48            A32     A43   
2                     A14              12            A34     A46   
3                     A11              42            A32     A42   
4                     A11              24            A33     A40   

   Credit_amount Savings_account Employment_since  Installment_rate  \
0           1169             A65              A75                 4   
1           5951             A61              A73                 2   
2           2096             A61              A74                 2   
3           7882             A61              A74                 2   
4           4870             A61              A73                 3   

  Personal_status_sex Other_debtors_guarantors  ...  Property Age_years  \
0                 A93                     A101  ...      A121        67 

#### Descriptive Analysis Summary:

The dataset contains 1,000 observations and 21 columns (20 features + 1 target).

The target variable Credit_risk has two classes: “Good” (70%) and “Bad” (30%), showing a class imbalance.

There are no missing values, meaning the dataset is clean and ready for preprocessing.
Most categorical columns are encoded with codes such as A11, A12, etc., corresponding to the original UCI documentation.

Numerical columns such as Duration_month, Credit_amount, and Age_years have a wide range of values, suggesting different scales.

Feature scaling will therefore be necessary before model training.

### 2. Preprocessing

In [6]:
# Separate features and target
X = df.drop("Credit_risk", axis=1)
y = df["Credit_risk"]

# Encode categorical columns using LabelEncoder
label_encoder = LabelEncoder()
for col in X.select_dtypes(include=["object"]).columns:
    X[col] = label_encoder.fit_transform(X[col])

# Encode target labels (Good = 1, Bad = 0)
y = y.map({"Good": 1, "Bad": 0})

# Standardize numeric features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split dataset into train/test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

### 3. Problem formalization

The objective of this project is to predict the credit risk of a client based on their financial and personal information.

Each observation represents a customer with 20 input features such as:<br>
account status,<br>
credit history,<br>
loan purpose,<br>
employment status,<br>
age, housing, and others.<br>

The target variable (Credit_risk) takes two possible values:

"Good" → low risk (the customer is likely to repay the credit)<br>
"Bad" → high risk (the customer may default on payments)

Therefore, this is a supervised learning problem, more precisely a binary classification task.

Inputs (features): 20 independent variables (mix of categorical and numerical).<br>
Output (target): Binary label (Good / Bad).

Evaluation metric: Since the classes are slightly imbalanced, we will consider:<br>
Accuracy for overall performance.<br>
Precision, Recall, and F1-score to better capture class imbalance effects.

In [7]:
# Define input features (X) and target (y)
X = df.drop("Credit_risk", axis=1)
y = df["Credit_risk"]

print("Number of input features:", X.shape[1])
print("Target variable:", y.name)
print("Unique target classes:", y.unique())

Number of input features: 20
Target variable: Credit_risk
Unique target classes: ['Good' 'Bad']


### 4. Selection of a Baseline Model and Implementation

At this stage, we aim to establish a baseline model.

Given that our problem is a binary classification (Good / Bad credit risk) and that we have both numerical and categorical features, the chosen baseline model is a Logistic Regression classifier.

Reasons for this choice:<br>
It is simple and fast to train.<br>
It provides interpretable coefficients that indicate feature influence.<br>
It performs reasonably well on linearly separable data.

The model will be trained on the preprocessed dataset using an 80/20 train-test split.
The evaluation will include Accuracy, Precision, Recall, and F1-score to account for the class imbalance.

In [8]:
# Baseline model: Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [9]:
# Evaluation
print("Baseline Model Evaluation")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

Baseline Model Evaluation
Accuracy: 0.78

Classification report:
               precision    recall  f1-score   support

           0       0.67      0.49      0.57        59
           1       0.81      0.90      0.85       141

    accuracy                           0.78       200
   macro avg       0.74      0.70      0.71       200
weighted avg       0.77      0.78      0.77       200



In [10]:
# Save preprocessed dataset
df.to_csv("german_credit_preprocessed.csv", index=False, sep=';', encoding='utf-8-sig')
print("Preprocessed dataset saved as 'german_credit_preprocessed.csv'")

Preprocessed dataset saved as 'german_credit_preprocessed.csv'


### Conclusion

The baseline Logistic Regression model achieved a acceptable overall accuracy of around 78%, indicating that the model can reasonably distinguish between good and bad credit risks.<br>
However, a closer look at the classification report reveals that the model performs significantly better on the “Good” class than on the “Bad” one.<br>
This imbalance suggests that the dataset is not perfectly balanced, which causes the model to favor the majority class (“Good”).

To improve performance, especially for detecting high-risk clients, the next steps should include:<br>
Applying class balancing techniques, such as SMOTE or class-weight adjustments during training.<br>
Exploring more robust models (Random Forest, Gradient Boosting, or XGBoost) that can better capture nonlinear relationships.<br>
Feature engineering and selection, to identify the most influential predictors of credit risk.

These improvements will allow the model to achieve better generalization and higher reliability for real-world credit risk assessment.

### 5. RandomOverSampler testing for improvements

In [11]:
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
print("Class distribution before ROS:")
print(y_train.value_counts().rename({1: 'Good', 0: 'Bad'}))

#ROS application
print("\nRandomOverSampler application ...")
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

print("\nClass distribution after ROS:")
print(y_train_ros.value_counts().rename({1: 'Good', 0: 'Bad'}))

# ROS model training
model_ros = LogisticRegression(max_iter=1000, random_state=42)
model_ros.fit(X_train_ros, y_train_ros)
y_pred_ros = model_ros.predict(X_test)

accuracy_ros = accuracy_score(y_test, y_pred_ros)
f1_ros = f1_score(y_test, y_pred_ros, pos_label=0)

print(f"\nRandomOverSampler evaluation :")
print(f"Accuracy: {accuracy_ros:.4f}")
print(f"F1-Score (Bad Risk): {f1_ros:.4f}\n")
print(classification_report(y_test, y_pred_ros))

Class distribution before ROS:
Credit_risk
Good    559
Bad     241
Name: count, dtype: int64

RandomOverSampler application ...

Class distribution after ROS:
Credit_risk
Bad     559
Good    559
Name: count, dtype: int64

RandomOverSampler evaluation :
Accuracy: 0.6950
F1-Score (Bad Risk): 0.5960

              precision    recall  f1-score   support

           0       0.49      0.76      0.60        59
           1       0.87      0.67      0.76       141

    accuracy                           0.69       200
   macro avg       0.68      0.71      0.68       200
weighted avg       0.76      0.69      0.71       200



Adding a RandomOverSampler to our data improves massively our minority class recall score but it degrades the overall accuracy by almost 10%

### SMOTE

In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import pandas as pd

# 1) Charger le dataset encodé
df_encoded = pd.read_csv("german_credit_preprocessed_encoded.csv")

# (si besoin)
# df_encoded["Credit_risk"] = df_encoded["Credit_risk"].map({"Bad": 0, "Good": 1})

# 2) Définir X et y
X = df_encoded.drop(columns=["Credit_risk"])
y = df_encoded["Credit_risk"]

print("Taille totale du dataset :", X.shape)
print("Répartition des classes avant split :", Counter(y))

# 3) Split train / test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

print("Taille X_train :", X_train.shape)
print("Taille X_test  :", X_test.shape)
print("Répartition des classes dans y_train AVANT SMOTE :", Counter(y_train))
print("Répartition des classes dans y_test  :", Counter(y_test))

# 4) SMOTE sur le TRAIN uniquement
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Taille X_train_res :", X_train_res.shape)
print("Répartition des classes dans y_train APRÈS SMOTE :", Counter(y_train_res))

# 5) Modèle Logistic Regression
model_smote = LogisticRegression(max_iter=1000, random_state=42)

model_smote.fit(X_train_res, y_train_res)

# 6) Évaluation sur le TEST original
y_pred_smote = model_smote.predict(X_test)

print("SMOTE Model Evaluation")
print("Accuracy:", accuracy_score(y_test, y_pred_smote))
print("\nClassification report:\n", classification_report(y_test, y_pred_smote))


KeyError: "['Credit_risk'] not found in axis"

## Optimisation with GridSearch

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


# Modèle de base
log_reg = LogisticRegression(max_iter=1000)

# Grille d'hyperparamètres
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  
}


# Grid Search avec cross-validation
grid = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='f1',   # ou 'accuracy', 'recall', selon ton projet
    cv=5,
    n_jobs=-1
)

grid.fit(X_train, y_train)


print("Meilleurs hyperparamètres trouvés :", grid.best_params_)
print("Meilleur score CV :", grid.best_score_)


best_log_reg = grid.best_estimator_
best_log_reg.fit(X_train, y_train)

y_pred = best_log_reg.predict(X_test)


from sklearn.metrics import accuracy_score, f1_score, classification_report

print("Accuracy :", accuracy_score(y_test, y_pred))
print("F1-score :", f1_score(y_test, y_pred))
print("\nClassification report :\n", classification_report(y_test, y_pred))

