# Credit risk modelling using the German Credit Card Dataset

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the preprocessed dataset produced by the preprocessing step.
# NOTE(review): the folder name 'prepocessing' looks like a typo of
# 'preprocessing' — confirm it matches the directory on disk.
data_path = '../prepocessing/preprocessed_data.csv'
df = pd.read_csv(data_path)

In [3]:
# Peek at the first five rows to check the columns that were read in.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,age,job,credit_amt,Duration,sex,housing,sav_acc,purpose,risk
0,1,22,2,5951,48,1,0,0.0,0,1
1,2,49,1,2096,12,0,0,0.0,1,0
2,3,45,2,7882,42,0,3,0.0,2,0
3,4,53,2,4870,24,0,3,0.0,3,1
4,6,53,2,2835,24,0,0,3.0,2,0


In [5]:
# 'Unnamed: 0' is the name pandas gives a header-less CSV column (presumably an
# index saved during preprocessing — verify); it is not a model feature.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Confirm the 'Unnamed: 0' column is no longer present.
df.head(n=5)

Unnamed: 0,age,job,credit_amt,Duration,sex,housing,sav_acc,purpose,risk
0,22,2,5951,48,1,0,0.0,0,1
1,49,1,2096,12,0,0,0.0,1,0
2,45,2,7882,42,0,3,0.0,2,0
3,53,2,4870,24,0,3,0.0,3,1
4,53,2,2835,24,0,0,3.0,2,0


### Feature Engineering

We can define a new feature, the credit amount per month of the loan:

`credit_per_month = credit_amt / Duration`

This could be a good estimator of credit risk.

In [8]:
# Engineered feature: credit amount divided by the loan duration.
df = df.assign(credit_per_month=df['credit_amt'].div(df['Duration']))

In [9]:
# Pairwise Pearson correlations between all columns, including the new feature.
df.corr(method='pearson')

Unnamed: 0,age,job,credit_amt,Duration,sex,housing,sav_acc,purpose,risk,credit_per_month
age,1.0,-0.015036,0.026014,-0.045577,-0.150733,0.187405,0.037279,0.090909,-0.077953,0.134201
job,-0.015036,1.0,0.307173,0.220413,-0.070399,0.133259,-0.037338,0.02521,0.036742,0.189432
credit_amt,0.026014,0.307173,1.0,0.631132,-0.071275,0.160001,-0.065134,0.221189,0.194125,0.520386
Duration,-0.045577,0.220413,0.631132,1.0,-0.065814,0.150137,-0.02905,0.103108,0.272686,-0.125515
sex,-0.150733,-0.070399,-0.071275,-0.065814,1.0,0.00544,-0.017152,-0.06485,0.058062,-0.070419
housing,0.187405,0.133259,0.160001,0.150137,0.00544,1.0,-0.028549,0.081081,0.135166,0.069602
sav_acc,0.037279,-0.037338,-0.065134,-0.02905,-0.017152,-0.028549,1.0,-0.003972,-0.138183,-0.063351
purpose,0.090909,0.02521,0.221189,0.103108,-0.06485,0.081081,-0.003972,1.0,0.082411,0.122927
risk,-0.077953,0.036742,0.194125,0.272686,0.058062,0.135166,-0.138183,0.082411,1.0,0.001807
credit_per_month,0.134201,0.189432,0.520386,-0.125515,-0.070419,0.069602,-0.063351,0.122927,0.001807,1.0


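Even with the new feature, the correlation with `risk` is not high: `credit_per_month` has a correlation of only about 0.002 with `risk`, lower than `Duration` (0.27) or `credit_amt` (0.19).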

### Model Building

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score
from sklearn.preprocessing import StandardScaler

In [12]:
# List the available columns before choosing the features and the target.
df.columns

Index(['age', 'job', 'credit_amt', 'Duration', 'sex', 'housing', 'sav_acc',
       'purpose', 'risk', 'credit_per_month'],
      dtype='object')

In [14]:
# Predictors: every column except the target, listed explicitly.
feature_cols = [
    'age', 'job', 'credit_amt', 'Duration', 'sex',
    'housing', 'sav_acc', 'purpose', 'credit_per_month',
]
X = df.loc[:, feature_cols]

# Target: the binary risk label.
y = df.loc[:, 'risk']

In [43]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits, so no test-set
# statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# Baseline classifier: logistic regression with a raised iteration cap.
model = LogisticRegression(
    max_iter=1000,
)
model.fit(X_train, y_train)

In [20]:
# Score the held-out split: probability of class 1 (second column of
# predict_proba) for ROC AUC, and hard class labels for the other metrics.
test_probabilities = model.predict_proba(X_test)
y_pred_probab = test_probabilities[:, 1]
y_pred = model.predict(X_test)

In [30]:
# Evaluation on the test split. The confusion matrix is normalised over the
# true labels; ROC AUC is computed from the class-1 probabilities rather than
# the hard labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred, normalize='true')
class_report = classification_report(y_true=y_test, y_pred=y_pred)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_probab)

In [31]:
# Share of test-set predictions that match the true labels.
print(f"Accuracy: {accuracy}")

Accuracy: 0.774390243902439


In [32]:
# The matrix was built with normalize='true', so the values are fractions of
# each true class rather than raw counts.
print(f"Confusion Matrix:\n{conf_matrix}")

Confusion Matrix:
[[0.92307692 0.07692308]
 [0.59574468 0.40425532]]


In [25]:
# Text report built by classification_report from y_test and y_pred.
print(f"Classification Report:\n{class_report}")

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       117
           1       0.68      0.40      0.51        47

    accuracy                           0.77       164
   macro avg       0.74      0.66      0.68       164
weighted avg       0.76      0.77      0.75       164



In [26]:
# ROC AUC was computed from the predicted class-1 probabilities, not the hard labels.
print(f"ROC AUC Score: {roc_auc}")

ROC AUC Score: 0.7166757592289508


- High Recall for Non-Defaulters (0.92): The model accurately identifies most non-defaulters, ensuring that it correctly recognizes customers who are low risk.

- Low Recall for Defaulters (0.4): The model misses a significant number of defaulters, indicating that it might need improvements to better catch high-risk customers.

The model is strong in identifying non-defaulters but needs enhancements to better detect defaulters to reduce potential financial risks.

In [33]:
# Count the rows in each risk class to check for imbalance.
df['risk'].value_counts()

risk
0    549
1    268
Name: count, dtype: int64

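We can see that class `1` is the minority class (268 rows versus 549 for class `0`). We can use a resampling technique to increase the number of `1` instances; the resampling is applied to the training split only, so the test set stays untouched.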

In [39]:
# Oversample the training split with SMOTE. Only X_train / y_train are
# resampled; the test split is not passed in.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled