# German Credit Risk

# Required Libraries and Data:

In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")


In [32]:
df = pd.read_csv("german_credit_data.csv", index_col=0)
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB


In [34]:
df.shape

(1000, 10)

In [35]:
df.isnull().sum().sort_values(ascending=False)

Checking account    394
Saving accounts     183
Age                   0
Sex                   0
Job                   0
Housing               0
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [36]:
df.duplicated().sum()

0

In [37]:
columns = df.columns.to_list()
for col in columns:
    unique_values = df[col].unique()
    numeric = pd.api.types.is_numeric_dtype(df[col])
    print(f"{col} ({df[col].dtype})")
    
    if numeric:
        unique_values = np.sort(unique_values)
        
    if numeric and len(unique_values) > 20:
        print(unique_values[:10], " ... ", unique_values[-10:])
    else:
        print(unique_values)
        
    print("\n")

Age (int64)
[19 20 21 22 23 24 25 26 27 28]  ...  [62 63 64 65 66 67 68 70 74 75]


Sex (object)
['male' 'female']


Job (int64)
[0 1 2 3]


Housing (object)
['own' 'free' 'rent']


Saving accounts (object)
[nan 'little' 'quite rich' 'rich' 'moderate']


Checking account (object)
['little' 'moderate' nan 'rich']


Credit amount (int64)
[250 276 338 339 343 362 368 385 392 409]  ...  [14318 14421 14555 14782 14896 15653 15672 15857 15945 18424]


Duration (int64)
[ 4  5  6  7  8  9 10 11 12 13]  ...  [36 39 40 42 45 47 48 54 60 72]


Purpose (object)
['radio/TV' 'education' 'furniture/equipment' 'car' 'business'
 'domestic appliances' 'repairs' 'vacation/others']


Risk (object)
['good' 'bad']




In [38]:
df["Saving accounts"] = df["Saving accounts"].fillna("none")
df["Checking account"] = df["Checking account"].fillna("none")

In [39]:
df["Job"] = df["Job"].map({
    0: "unskilled and non-resident",
    1: "unskilled and resident",
    2: "skilled",
    3: "highly skilled"
})

In [40]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,skilled,own,none,little,1169,6,radio/TV,good
1,22,female,skilled,own,little,moderate,5951,48,radio/TV,bad
2,49,male,unskilled and resident,own,little,none,2096,12,education,good
3,45,male,skilled,free,little,little,7882,42,furniture/equipment,good
4,53,male,skilled,free,little,little,4870,24,car,bad


In [41]:
numeric_features = df.select_dtypes(include=[int, float]).columns.to_list()
categorical_features = list(set(df.drop("Risk", axis=1).columns) - set(numeric_features))

# Data Preprocessing and Feature Engineering

In [42]:
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,skilled,own,none,little,1169,6,radio/TV,good
1,22,female,skilled,own,little,moderate,5951,48,radio/TV,bad
2,49,male,unskilled and resident,own,little,none,2096,12,education,good
3,45,male,skilled,free,little,little,7882,42,furniture/equipment,good
4,53,male,skilled,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,unskilled and resident,own,little,none,1736,12,furniture/equipment,good
996,40,male,highly skilled,own,little,little,3857,30,car,good
997,38,male,skilled,own,little,none,804,12,radio/TV,good
998,23,male,skilled,free,little,little,1845,45,radio/TV,bad


In [50]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder

# Initialize encoders
ordinal_encoder = OrdinalEncoder()
onehot_encoder = OneHotEncoder(drop='first', sparse=False)  # Use drop='first' to avoid multicollinearity
J_encoder = LabelEncoder()
SA_encoder = LabelEncoder()
CA_encoder = LabelEncoder()
H_encoder = LabelEncoder()
P_encoder = LabelEncoder()
S_encoder = LabelEncoder()
label_encoder = LabelEncoder()
# Example of using encoders
# Assuming df is your DataFrame

# Ordinal encoding for 'Job' column
df['Job'] = J_encoder.fit_transform(df[['Job']])



# Label encoding for 'Risk' column
df['Risk'] = label_encoder.fit_transform(df['Risk'])

# Example of encoding other columns
# Assuming df is your DataFrame

# Ordinal encoding for 'Saving accounts' column
df['Saving accounts'] = SA_encoder.fit_transform(df[['Saving accounts']])

df['Checking account'] = CA_encoder.fit_transform(df[['Checking account']])
df['Housing'] = H_encoder.fit_transform(df[['Housing']])
df['Purpose'] = P_encoder.fit_transform(df[['Purpose']])

# Label encoding for 'Sex' column
df['Sex'] = S_encoder.fit_transform(df['Sex'])



In [51]:
columns = df.columns.to_list()
for col in columns:
    unique_values = df[col].unique()
    numeric = pd.api.types.is_numeric_dtype(df[col])
    print(f"{col} ({df[col].dtype})")
    
    if numeric:
        unique_values = np.sort(unique_values)
        
    if numeric and len(unique_values) > 20:
        print(unique_values[:10], " ... ", unique_values[-10:])
    else:
        print(unique_values)
        
    print("\n")

Age (int64)
[19 20 21 22 23 24 25 26 27 28]  ...  [62 63 64 65 66 67 68 70 74 75]


Sex (int32)
[0 1]


Job (int64)
[0 1 2 3]


Housing (int32)
[0 1 2]


Saving accounts (int32)
[0 1 2 3 4]


Checking account (int32)
[0 1 2 3]


Credit amount (int64)
[250 276 338 339 343 362 368 385 392 409]  ...  [14318 14421 14555 14782 14896 15653 15672 15857 15945 18424]


Duration (int64)
[ 4  5  6  7  8  9 10 11 12 13]  ...  [36 39 40 42 45 47 48 54 60 72]


Purpose (int32)
[0 1 2 3 4 5 6 7]


Risk (int32)
[0 1]




In [52]:
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,1,1,2,0,1169,6,5,1
1,22,0,1,1,0,1,5951,48,5,0
2,49,1,3,1,0,2,2096,12,3,1
3,45,1,1,0,0,0,7882,42,4,1
4,53,1,1,0,0,0,4870,24,1,0
...,...,...,...,...,...,...,...,...,...,...
995,31,0,3,1,0,2,1736,12,4,1
996,40,1,0,1,0,0,3857,30,1,1
997,38,1,1,1,0,2,804,12,5,1
998,23,1,1,0,0,0,1845,45,5,0


In [53]:
from sklearn.model_selection import train_test_split

X = df.drop("Risk", axis=1)
y = df["Risk"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}")

X_train: (800, 9)
X_test: (200, 9)


In [54]:
y

0      1
1      0
2      1
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: Risk, Length: 1000, dtype: int32

## Modeling

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
random_prediction = random_forest.predict(X_test)
random_acc = round((accuracy_score(y_test, random_prediction) * 100), 2)
random_f1 = round((f1_score(y_test, random_prediction)), 4)
print(f'The Random Forest classifier has an accuracy of {random_acc}% and f1 score of {random_f1}')

The Random Forest classifier has an accuracy of 76.0% and f1 score of 0.8431


In [56]:
from matplotlib import pyplot
importance = random_forest.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.18567
Feature: 1, Score: 0.03003
Feature: 2, Score: 0.05410
Feature: 3, Score: 0.04478
Feature: 4, Score: 0.07489
Feature: 5, Score: 0.12404
Feature: 6, Score: 0.24297
Feature: 7, Score: 0.14924
Feature: 8, Score: 0.09428


In [58]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
gradient_boost = GradientBoostingClassifier()
gradient_boost.fit(X_train, y_train)
gradient_pred = gradient_boost.predict(X_test)
gradient_acc = round((accuracy_score(y_test, gradient_pred)*100), 2)
gradient_f1 = round((f1_score(y_test, gradient_pred)), 4)

print(f'gradient boost has an accuracy of {gradient_acc}% and f1 score of {gradient_f1}')

from matplotlib import pyplot
importance = random_forest.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

gradient boost has an accuracy of 75.0% and f1 score of 0.8366
Feature: 0, Score: 0.18567
Feature: 1, Score: 0.03003
Feature: 2, Score: 0.05410
Feature: 3, Score: 0.04478
Feature: 4, Score: 0.07489
Feature: 5, Score: 0.12404
Feature: 6, Score: 0.24297
Feature: 7, Score: 0.14924
Feature: 8, Score: 0.09428


In [59]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_prediction = svm_model.predict(X_test)
svm_acc = round((accuracy_score(y_test, svm_prediction)* 100), 2)
svm_f1 = round((f1_score(y_test, svm_prediction)), 4)
print(f'The Random Forest classifier has an accuracy of {random_acc}% and f1 score of {random_f1}')
from matplotlib import pyplot
importance = random_forest.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

The Random Forest classifier has an accuracy of 76.0% and f1 score of 0.8431
Feature: 0, Score: 0.18567
Feature: 1, Score: 0.03003
Feature: 2, Score: 0.05410
Feature: 3, Score: 0.04478
Feature: 4, Score: 0.07489
Feature: 5, Score: 0.12404
Feature: 6, Score: 0.24297
Feature: 7, Score: 0.14924
Feature: 8, Score: 0.09428


In [60]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
naive_model = GaussianNB()
naive_model.fit(X_train, y_train)
naive_prediction = naive_model.predict(X_test)
naive_acc = round((accuracy_score(y_test, naive_prediction)*100), 2)
naive_f1 = round((f1_score(y_test, naive_prediction)), 4)
print('accuracy score: ', naive_acc)
print('f1_score: ', naive_f1)
from matplotlib import pyplot
importance = random_forest.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

accuracy score:  72.0
f1_score:  0.8133
Feature: 0, Score: 0.18567
Feature: 1, Score: 0.03003
Feature: 2, Score: 0.05410
Feature: 3, Score: 0.04478
Feature: 4, Score: 0.07489
Feature: 5, Score: 0.12404
Feature: 6, Score: 0.24297
Feature: 7, Score: 0.14924
Feature: 8, Score: 0.09428


In [61]:
X_test.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Credit amount', 'Duration', 'Purpose'],
      dtype='object')

In [None]:
X

In [64]:
X=np.array([['56', 'male', '0', 'rent', 'little', 'moderate',
       '5951', '48', 'radio/TV']])
X

array([['56', 'male', '0', 'rent', 'little', 'moderate', '5951', '48',
        'radio/TV']], dtype='<U8')

In [63]:
X[:, 1] = S_encoder.transform(X[:, 1])
X[:, 2] = J_encoder.transform(X[:, 2])
X[:, 3] = H_encoder.transform(X[:, 3])  # Reshape for compatibility
X[:, 4] = SA_encoder.transform(X[:, 4])  # Reshape for compatibility
X[:, 5] = CA_encoder.transform(X[:, 5])  # Reshape for compatibility
X[:, 8] = P_encoder.transform(X[:, 8])  # Reshape for compatibility 
X=X.astype(float)
X

ValueError: invalid literal for int() with base 10: 'skilled'

In [None]:
pred=svm_model.predict(X)
pred

In [None]:
data={"model": svm_model, "S_encoder": S_encoder, "J_encoder": J_encoder, "H_encoder": H_encoder, "SA_encoder": SA_encoder, "CA_encoder": CA_encoder, "P_encoder": P_encoder}

with open('credit_predict3.sav', 'wb') as file:
    pickle.dump(data, file)

In [None]:
with open ('credit_predict3.sav', 'rb') as file:
    data = pickle.load(file)

svm_loaded=data["model"]
S_encoder= data["S_encoder"]
J_encoder=data["J_encoder"]
H_encoder=data["H_encoder"]
SA_encoder=data["SA_encoder"]
CA_encoder=data["CA_encoder"]
P_encoder=data["P_encoder"]

In [None]:
predd = svm_model.predict(X)
predd