# Credit Risk Modeling Project

In [1]:
# importing libraries
import numpy as np
import pandas as pd

In [2]:
# Load the German credit dataset from the repo-relative data directory.
# The CSV carries a saved index column, which shows up as "Unnamed: 0".
data = pd.read_csv("../data/german_credit_data.csv")
# Show the raw frame as the cell output.
data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car
...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment
996,996,40,male,3,own,little,little,3857,30,car
997,997,38,male,2,own,little,,804,12,radio/TV
998,998,23,male,2,free,little,little,1845,45,radio/TV


# Data Cleaning and EDA

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,35.546,1.904,3271.258,20.903
std,288.819436,11.375469,0.653614,2822.736876,12.058814
min,0.0,19.0,0.0,250.0,4.0
25%,249.75,27.0,2.0,1365.5,12.0
50%,499.5,33.0,2.0,2319.5,18.0
75%,749.25,42.0,2.0,3972.25,24.0
max,999.0,75.0,3.0,18424.0,72.0


In [4]:
# Count missing values per column to see which features need imputation.
data.isna().sum()

Unnamed: 0            0
Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
dtype: int64

In [5]:
# Distinct levels (including NaN) present in the savings-account column.
data["Saving accounts"].unique()

array([nan, 'little', 'quite rich', 'rich', 'moderate'], dtype=object)

In [6]:
# Distinct levels (including NaN) present in the checking-account column.
data["Checking account"].unique()

array(['little', 'moderate', nan, 'rich'], dtype=object)

In [7]:
# Replace NaN in the savings/checking account columns with an explicit "unknown"
# level so the one-hot encoder treats "no information" as its own category.
# The fill is restricted to these two columns on purpose: a frame-wide fillna would
# also turn any missing numeric value into the string "unknown" and silently change
# that column's dtype to object.
account_cols = ["Saving accounts", "Checking account"]
data[account_cols] = data[account_cols].fillna("unknown")
# Confirm that no missing values remain.
data.isna().sum()

Unnamed: 0          0
Age                 0
Sex                 0
Job                 0
Housing             0
Saving accounts     0
Checking account    0
Credit amount       0
Duration            0
Purpose             0
dtype: int64

In [8]:
# Drop the index column that was saved into the CSV ("Unnamed: 0"); it only repeats
# the row number. errors="ignore" keeps the cell re-runnable: executing it a second
# time on the already-cleaned frame is a no-op instead of a KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,male,2,own,unknown,little,1169,6,radio/TV
1,22,female,2,own,little,moderate,5951,48,radio/TV
2,49,male,1,own,little,unknown,2096,12,education
3,45,male,2,free,little,little,7882,42,furniture/equipment
4,53,male,2,free,little,little,4870,24,car
...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,unknown,1736,12,furniture/equipment
996,40,male,3,own,little,little,3857,30,car
997,38,male,2,own,little,unknown,804,12,radio/TV
998,23,male,2,free,little,little,1845,45,radio/TV


# Feature Engineering

In [9]:
# Preprocessing helpers: one-hot encoding for the categorical columns and
# standardization for the numeric ones.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# sparse_output=True makes the encoder return a sparse matrix; it is densified
# with .toarray() where the encoder is applied.
encoder = OneHotEncoder(sparse_output=True)
scaler = StandardScaler()

In [10]:
# One-hot encode the categorical columns (Job is an integer code, but is treated
# as a category here).
encoding_cols = ["Job", "Sex", "Housing", "Saving accounts", "Checking account", "Purpose"]
encoded_cols = encoder.fit_transform(data[encoding_cols]).toarray()
# Carry over data's index so the encoded block stays row-aligned with the other
# feature frames when they are concatenated column-wise; a fresh RangeIndex would
# misalign (and introduce NaN rows) if rows of `data` are ever filtered upstream.
encoded_df = pd.DataFrame(
    encoded_cols,
    columns=encoder.get_feature_names_out(encoding_cols),
    index=data.index,
)
encoded_df

Unnamed: 0,Job_0,Job_1,Job_2,Job_3,Sex_female,Sex_male,Housing_free,Housing_own,Housing_rent,Saving accounts_little,...,Checking account_rich,Checking account_unknown,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
996,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
998,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
# Standardize the continuous columns to zero mean and unit variance.
scaling_cols = ["Age", "Credit amount", "Duration"]
scaled_cols = scaler.fit_transform(data[scaling_cols])
# Carry over data's index so the scaled block stays row-aligned with the other
# feature frames when they are concatenated column-wise; a fresh RangeIndex would
# misalign (and introduce NaN rows) if rows of `data` are ever filtered upstream.
scaled_df = pd.DataFrame(scaled_cols, columns=scaling_cols, index=data.index)
scaled_df

Unnamed: 0,Age,Credit amount,Duration
0,2.766456,-0.745131,-1.236478
1,-1.191404,0.949817,2.248194
2,1.183312,-0.416562,-0.738668
3,0.831502,1.634247,1.750384
4,1.535122,0.566664,0.256953
...,...,...,...
995,-0.399832,-0.544162,-0.738668
996,0.391740,0.207612,0.754763
997,0.215835,-0.874503,-0.738668
998,-1.103451,-0.505528,1.999289


In [12]:
# Engineered feature: credit amount per unit of loan duration.
# NOTE(review): this ratio is computed from the raw columns and is not standardized,
# so it sits on a much larger scale than the scaled features it is later joined with.
credit_prop = data["Credit amount"].div(data["Duration"])
credit_df = credit_prop.to_frame(name="Credit Proportion")
credit_df

Unnamed: 0,Credit Proportion
0,194.833333
1,123.979167
2,174.666667
3,187.666667
4,202.916667
...,...
995,144.666667
996,128.566667
997,67.000000
998,41.000000


In [13]:
# Build the final feature matrix by placing the one-hot block, the standardized
# block and the engineered ratio side by side.
df = pd.concat((encoded_df, scaled_df, credit_df), axis="columns")
df

Unnamed: 0,Job_0,Job_1,Job_2,Job_3,Sex_female,Sex_male,Housing_free,Housing_own,Housing_rent,Saving accounts_little,...,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Age,Credit amount,Duration,Credit Proportion
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,2.766456,-0.745131,-1.236478,194.833333
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.191404,0.949817,2.248194,123.979167
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.183312,-0.416562,-0.738668,174.666667
3,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.831502,1.634247,1.750384,187.666667
4,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.535122,0.566664,0.256953,202.916667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.399832,-0.544162,-0.738668,144.666667
996,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.391740,0.207612,0.754763,128.566667
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.215835,-0.874503,-0.738668,67.000000
998,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.103451,-0.505528,1.999289,41.000000


# Model Training

In [14]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import networkx as nx
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

In [15]:
def create_similarity_graph(data, threshold=0.5):
    """Build an undirected graph linking rows that lie close together.

    Every row of ``data`` becomes a node labelled with its positional index
    (0 .. n-1). An edge (i, j) is added when the Euclidean distance between
    row i and row j is strictly less than ``threshold``.

    Args:
        data: 2-D numeric table with one row per node; a pandas DataFrame or
            anything ``np.asarray`` can convert to a float array.
        threshold: exclusive upper bound on the distance for two rows to be
            connected.

    Returns:
        networkx.Graph with one node per row and one edge per close pair.
    """
    # Work on a plain float array: per-pair DataFrame.iloc lookups build a
    # Series for every comparison and dominate the run time.
    values = np.asarray(data, dtype=float)
    n_rows = values.shape[0]

    G = nx.Graph()
    # Add every node up front so isolated rows are still part of the graph.
    G.add_nodes_from(range(n_rows))

    for i in range(n_rows - 1):
        # Distances from row i to all later rows in one vectorized call; only
        # j > i is examined because the graph is undirected.
        dists = np.linalg.norm(values[i + 1:] - values[i], axis=1)
        neighbours = np.flatnonzero(dists < threshold) + (i + 1)
        # .tolist() yields plain Python ints so node labels keep a single type.
        G.add_edges_from((i, j) for j in neighbours.tolist())
    return G

# Link customers whose feature vectors are closer than the distance threshold.
graph = create_similarity_graph(df, threshold=2.0)

print(f"Number of nodes: {graph.number_of_nodes()}")
print(f"Number of edges: {graph.number_of_edges()}")

# Wrap the graph as a PyTorch Geometric data object and attach the feature
# matrix as the node features.
graph_data = from_networkx(graph)
graph_data.x = torch.tensor(df.to_numpy(), dtype=torch.float32)

Number of nodes: 1000
Number of edges: 138


In [16]:
class GCN(nn.Module):
    """Two-layer graph convolutional network that maps node features to embeddings.

    Layout: GCNConv(input_dim -> hidden_dim), ReLU, GCNConv(hidden_dim -> output_dim).
    No activation is applied after the second layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return the second layer's output for a graph exposing ``x`` and ``edge_index``."""
        edge_index = data.edge_index
        hidden = self.conv1(data.x, edge_index).relu()
        return self.conv2(hidden, edge_index)

# Initialize the GCN model
# Seed torch before the layers are created: their weights are drawn at random, so
# without a seed the embeddings (and the clusters derived from them) change on
# every run. 42 matches the random_state used for K-Means.
torch.manual_seed(42)

input_dim = df.shape[1]  # one input unit per feature column
hidden_dim = 16
output_dim = 2  # We use 2-dimensional output for visualization purposes

model = GCN(input_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)


In [17]:
def train_gnn(model, data, optimizer, epochs=100):
    """Train ``model`` so that connected nodes get similar embeddings.

    The objective is a link-reconstruction loss. The dot product of two node
    embeddings is used as the logit for "these nodes are connected". Pairs
    listed in ``data.edge_index`` are positives and uniformly sampled node
    pairs are negatives, scored with binary cross-entropy. Unlike taking the
    plain mean of the outputs, this loss is bounded below by zero and depends
    on the graph structure.

    Negatives are re-sampled every epoch and are not filtered against the true
    edges, so a sampled pair may occasionally coincide with a real edge.

    Args:
        model: module called as ``model(data)`` that returns one embedding row
            per node.
        data: graph object exposing ``edge_index``, a 2 x E integer tensor of
            node indices, plus whatever ``model`` reads from it.
        optimizer: optimizer constructed over ``model.parameters()``.
        epochs: number of full-graph optimization steps.

    Returns:
        None. The loss is printed every 10th epoch.
    """
    model.train()
    edge_index = data.edge_index
    num_pos = edge_index.size(1)
    for epoch in range(epochs):
        optimizer.zero_grad()
        z = model(data)
        num_nodes = z.size(0)

        # One negative per positive; if the graph has no edges at all, fall back
        # to one negative per node so the loss is still defined.
        num_neg = num_pos if num_pos > 0 else num_nodes
        neg_index = torch.randint(0, num_nodes, (2, num_neg), device=z.device)

        pos_logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=1)
        neg_logits = (z[neg_index[0]] * z[neg_index[1]]).sum(dim=1)
        logits = torch.cat([pos_logits, neg_logits])
        targets = torch.cat([torch.ones_like(pos_logits), torch.zeros_like(neg_logits)])

        loss = nn.functional.binary_cross_entropy_with_logits(logits, targets)
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')

# Train the GCN on the similarity graph, using train_gnn's default number of epochs.
train_gnn(model, graph_data, optimizer)


In [None]:
# Compute the node embeddings with the model in eval mode and gradient tracking disabled.
model.eval()
with torch.no_grad():
    embeddings = model(graph_data)

# Group the embeddings into three clusters; the fixed random_state makes the
# K-Means initialization repeatable.
kmeans = KMeans(n_clusters=3, random_state=42).fit(embeddings)

# Store each node's cluster id on the graph object.
graph_data.y = torch.tensor(kmeans.labels_, dtype=torch.int64)

In [None]:
# Project the embeddings onto their first two principal components for plotting.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings)

# Tabulate the projection together with each node's cluster label.
pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2']).assign(cluster=kmeans.labels_)

# Scatter plot of the projected embeddings, colored by cluster.
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['cluster'], cmap='viridis')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('GNN Clustering Results')
fig.colorbar(points, ax=ax)
plt.show()