### Základy neurónových sietí STU FIIT
## Credit Risk Assessment


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wandb
import yaml
import torch
import torch.nn as nn
from scipy.stats import mstats
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.feature_selection import SelectKBest, chi2, f_classif



df = pd.read_csv("data/dataset.csv")
pd.set_option('display.max_columns', 50)



In [2]:
wandb.login()

wandb: Currently logged in as: fajermichal48 (fajermichal48-none) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

### Data Analysis

In [None]:
print("=====SHAPE=====")
df.shape

In [None]:
print("=====HEAD=====")
df.head()


In [None]:
print("=====TAIL=====")
df.tail()

In [None]:
print("=====DTYPE=====")
df.dtypes

In [None]:
df.describe()

In [None]:
df.isna().sum()

In [None]:
df.loc[df.duplicated()]

In [None]:
num_cols = df.select_dtypes(include=['number']).columns
cat_cols = df.select_dtypes(exclude=['number']).columns

print("Numeric columns:\n", list(num_cols))
print("\nCategorical columns:\n", list(cat_cols))

In [None]:
df[num_cols].hist(figsize=(10,8))
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(df[num_cols].corr(), cmap="coolwarm", annot=True, fmt=".2f",
            vmin=-1, vmax=1, center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title("Correlation Matrix", fontsize=16, pad=20)
plt.tight_layout()
plt.show()

In [None]:
cat_cols = df.select_dtypes(exclude=['number']).columns

for c in cat_cols:
    ax = sns.countplot(x=c, data=df)
    plt.title(c)
    plt.xticks(rotation=45)

    for container in ax.containers:
        ax.bar_label(container)

    plt.show()



In [3]:
df_temp = df.copy()
df_temp['class_numeric'] = df_temp['class'].map({'good': 0, 'bad': 1})

numerical_features = df_temp.select_dtypes(include=['int64', 'float64']).columns
correlations = df_temp[numerical_features].corr()['class_numeric'].drop('class_numeric')
print("Numerical feature correlations:")
print(correlations.sort_values(ascending=False))

Numerical feature correlations:
duration                  0.214927
credit_amount             0.154739
installment_commitment    0.072404
residence_since           0.002967
num_dependents           -0.003015
existing_credits         -0.045732
age                      -0.091127
Name: class_numeric, dtype: float64


In [4]:
X_categorical = df.drop(columns=['class']).select_dtypes(include=['object'])
y_numeric = df['class'].map({'good': 0, 'bad': 1})

chi2_scores, p_values = chi2(pd.get_dummies(X_categorical), y_numeric)
feature_scores = pd.DataFrame({
    'feature': pd.get_dummies(X_categorical).columns,
    'chi2_score': chi2_scores,
    'p_value': p_values
})
print("Categorical feature importance (Chi-square):")
print(feature_scores.sort_values('chi2_score', ascending=False))

Categorical feature importance (Chi-square):
                                            feature  chi2_score       p_value
3                     checking_status_'no checking'   63.002659  2.064277e-15
1                              checking_status_'<0'   48.450469  3.387373e-12
5   credit_history_'critical/other existing credit'   23.344872  1.354040e-06
8              credit_history_'no credits/all paid'   20.119048  7.276835e-06
4                         credit_history_'all paid'   17.190476  3.381273e-05
23                savings_status_'no known savings'   13.645850  2.207287e-04
37           property_magnitude_'no known property'   13.377860  2.546120e-04
0                        checking_status_'0<=X<200'   10.453001  1.224504e-03
21                            savings_status_'<100'   10.291479  1.336459e-03
38                 property_magnitude_'real estate'   10.218845  1.390129e-03
26                                  employment_'<1'    9.373200  2.201808e-03
11                 

In [5]:


X_encoded = pd.get_dummies(df.drop(columns=['class']), drop_first=True)
y_encoded = df['class'].map({'good': 0, 'bad': 1})

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_encoded, y_encoded)

feature_importance = pd.DataFrame({'feature': X_encoded.columns,'importance': rf.feature_importances_}).sort_values('importance', ascending=False)

print("Feature Importance from Random Forest:")
print(feature_importance.head(15))

Feature Importance from Random Forest:
                                            feature  importance
1                                     credit_amount    0.118910
0                                          duration    0.091799
4                                               age    0.088728
9                     checking_status_'no checking'    0.060065
7                              checking_status_'<0'    0.039443
2                            installment_commitment    0.038383
3                                   residence_since    0.038086
24                            savings_status_'<100'    0.024877
10  credit_history_'critical/other existing credit'    0.024813
39                         other_payment_plans_none    0.021346
5                                  existing_credits    0.020797
14                                purpose_'new car'    0.019922
33                    personal_status_'male single'    0.019841
45                                      job_skilled    0.018680
4

# Data preprocessing and normalization 

In [6]:


data = df.copy()

TARGET = "class"
y_temp = df[TARGET]
data = df.drop(columns=[TARGET]).copy()

cat_cols = data.select_dtypes(include=["object"]).columns
data = pd.get_dummies(data, columns=cat_cols, drop_first=True, dtype=int)

#outlier handle
num_cols = data.select_dtypes(include=["int64", "float64"]).columns
for col in num_cols:
    data[col] = mstats.winsorize(data[col], limits=[0.05, 0.05])  

y = (y_temp == 'bad').astype(int)

print("Final shape:", data.shape)
print("Any missing values?", data.isnull().sum().sum())
data.head()

Final shape: (1000, 48)
Any missing values? 0


Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status_'<0',checking_status_'>=200',checking_status_'no checking',credit_history_'critical/other existing credit',credit_history_'delayed previously',credit_history_'existing paid',credit_history_'no credits/all paid',purpose_'new car',purpose_'used car',purpose_business,purpose_education,purpose_furniture/equipment,purpose_other,purpose_radio/tv,purpose_repairs,purpose_retraining,savings_status_'500<=X<1000',savings_status_'<100',savings_status_'>=1000',savings_status_'no known savings',employment_'4<=X<7',employment_'<1',employment_'>=7',employment_unemployed,personal_status_'male div/sep',personal_status_'male mar/wid',personal_status_'male single',other_parties_guarantor,other_parties_none,property_magnitude_'no known property',property_magnitude_'real estate',property_magnitude_car,other_payment_plans_none,other_payment_plans_stores,housing_own,housing_rent,job_'unemp/unskilled non res',job_'unskilled resident',job_skilled,own_telephone_yes,foreign_worker_yes
0,6,1169,4,4,60,2,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,1,1,1
1,48,5951,2,2,22,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,1,0,1
2,12,2096,2,3,49,1,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,1
3,42,7882,2,4,45,1,2,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,1
4,24,4870,3,4,53,2,2,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,1,0,1


# Data Split

In [7]:

idx_train, idx_temp, y_train, y_temp = train_test_split(
    df.index, y, test_size=0.3, random_state=11, stratify=y
)
idx_val, idx_test, y_val, y_test = train_test_split(
    idx_temp, y_temp, test_size=0.5, random_state=11, stratify=y_temp
)

X_train = data.loc[idx_train].reset_index(drop=True)
X_val = data.loc[idx_val].reset_index(drop=True)
X_test = data.loc[idx_test].reset_index(drop=True)

import pandas as pd
y_train = pd.Series(y_train).reset_index(drop=True)
y_val = pd.Series(y_val).reset_index(drop=True)
y_test = pd.Series(y_test).reset_index(drop=True)

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:", X_val.shape, "y_val:", y_val.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)


X_train: (700, 48) y_train: (700,)
X_val: (150, 48) y_val: (150,)
X_test: (150, 48) y_test: (150,)


# Normalization

In [8]:
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

scaler = MinMaxScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

X_val[num_cols] = scaler.transform(X_val[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


# Resample

In [9]:

n_class_0 = (y_train == 0).sum() 
n_class_1 = (y_train == 1).sum()  
pos_weight = n_class_0 / n_class_1  

print(f"Class counts - Good(0): {n_class_0}, Bad(1): {n_class_1}, pos_weight: {pos_weight:.3f}")



Class counts - Good(0): 490, Bad(1): 210, pos_weight: 2.333


# Configuration 

In [89]:
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)

model_config = config['model']
train_config = config['training']

print("Using config:", config)

#convert data to PyTorch tensors
X_train_tensor = torch.from_numpy(X_train.values).float()
y_train_tensor = torch.from_numpy(y_train.values.astype(np.float32))
X_val_tensor = torch.from_numpy(X_val.values).float()
y_val_tensor = torch.from_numpy(y_val.values.astype(np.float32))
X_test_tensor = torch.from_numpy(X_test.values).float()
y_test_tensor = torch.from_numpy(y_test.values.astype(np.float32))

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=train_config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=train_config['batch_size'], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=train_config['batch_size'], shuffle=False)

Using config: {'model': {'input_dim': 48, 'output_dim': 1, 'hidden_layers': [64, 32], 'dropout': 0.5, 'use_bn': True}, 'training': {'batch_size': 32, 'lr': 0.0005, 'epochs': 200, 'wandb_project': 'MLP-project', 'early_stop_patience': 10, 'weight_decay': 0.001}}


# MLP model

In [90]:
class MLP(nn.Module):
    def __init__(self, config):
        super(MLP, self).__init__()
        input_dim = config['input_dim']
        hidden_layers = config['hidden_layers']
        output_dim = 1
        dropout = config['dropout']
        use_bn = config['use_bn']
        
        layers = []
        prev_dim = input_dim
        for i, hidden_dim in enumerate(hidden_layers):
            layers.append(nn.Linear(prev_dim, hidden_dim))
            if use_bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.ReLU())
            if dropout > 0 and i > 0:
                layers.append(nn.Dropout(dropout))
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        self.net = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.net(x).squeeze(1)

# Train model

In [91]:
#initialize model, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MLP(model_config).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=train_config['lr'], weight_decay=train_config.get('weight_decay', 0))
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=8, min_lr=1e-6)
pos_weight = torch.tensor([np.sqrt(n_class_0 / n_class_1)]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

wandb.init(project=train_config['wandb_project'], config=config)

#early stopping setup
patience = train_config.get('early_stop_patience', 15)
best_val_loss = float('inf')
counter = 0
best_model_state = None

for epoch in range(train_config['epochs']):
    model.train()
    train_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device).float() 
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    train_loss /= len(train_loader)
    
    
    #validation
    model.eval()
    val_loss = 0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device).float()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
            preds = (torch.sigmoid(outputs) > 0.5).long().cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(batch_y.long().cpu().numpy())  
    
    val_loss /= len(val_loader)
    val_acc = accuracy_score(val_labels, val_preds)
    val_report = classification_report(val_labels, val_preds, output_dict=True, zero_division=0)
    
    val_table = wandb.Table(columns=["", "precision", "recall", "f1-score", "support"])
    for cls in ['0', '1']:
        if cls in val_report:
            val_table.add_data(cls, val_report[cls]['precision'], val_report[cls]['recall'], val_report[cls]['f1-score'], val_report[cls]['support'])
    val_table.add_data("accuracy", None, None, val_report['accuracy'], val_report['weighted avg']['support'])
    val_table.add_data("macro avg", val_report['macro avg']['precision'], val_report['macro avg']['recall'], val_report['macro avg']['f1-score'], val_report['macro avg']['support'])
    val_table.add_data("weighted avg", val_report['weighted avg']['precision'], val_report['weighted avg']['recall'], val_report['weighted avg']['f1-score'], val_report['weighted avg']['support'])

    scheduler.step(val_loss)
    if epoch % 5 == 0:
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Current learning rate: {current_lr:.6f}")
    wandb.log({'epoch': epoch, 'train_loss': train_loss, 'val_loss': val_loss, 'val_acc': val_acc, 'val_classification_table': val_table})
    
    print(f"Epoch {epoch+1}/{train_config['epochs']}: Train Loss {train_loss:.4f}, Val Loss {val_loss:.4f}, Val Acc {val_acc:.4f}")
    
    #early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        best_model_state = model.state_dict()
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            model.load_state_dict(best_model_state)
            break

Current learning rate: 0.000500
Epoch 1/200: Train Loss 0.7998, Val Loss 0.7801, Val Acc 0.5933
Epoch 2/200: Train Loss 0.7679, Val Loss 0.7306, Val Acc 0.6600
Epoch 3/200: Train Loss 0.7312, Val Loss 0.7057, Val Acc 0.6800
Epoch 4/200: Train Loss 0.6989, Val Loss 0.6901, Val Acc 0.6933
Epoch 5/200: Train Loss 0.6660, Val Loss 0.6817, Val Acc 0.6667
Current learning rate: 0.000500
Epoch 6/200: Train Loss 0.6665, Val Loss 0.6697, Val Acc 0.6800
Epoch 7/200: Train Loss 0.6341, Val Loss 0.6612, Val Acc 0.6867
Epoch 8/200: Train Loss 0.6198, Val Loss 0.6596, Val Acc 0.7067
Epoch 9/200: Train Loss 0.6005, Val Loss 0.6468, Val Acc 0.6800
Epoch 10/200: Train Loss 0.6162, Val Loss 0.6416, Val Acc 0.7067
Current learning rate: 0.000500
Epoch 11/200: Train Loss 0.5847, Val Loss 0.6349, Val Acc 0.7267
Epoch 12/200: Train Loss 0.5580, Val Loss 0.6300, Val Acc 0.7333
Epoch 13/200: Train Loss 0.5651, Val Loss 0.6281, Val Acc 0.7267
Epoch 14/200: Train Loss 0.5552, Val Loss 0.6287, Val Acc 0.7333
Epo

# Test model

In [92]:
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device).float()
        outputs = model(batch_x)
        preds = (torch.sigmoid(outputs) > 0.5).long().cpu().numpy()
        test_preds.extend(preds)
        test_labels.extend(batch_y.long().cpu().numpy())  

test_acc = accuracy_score(test_labels, test_preds)

test_report = classification_report(test_labels, test_preds, output_dict=True, zero_division=0)

test_table = wandb.Table(columns=["", "precision", "recall", "f1-score", "support"])
for cls in ['0', '1']:
    if cls in test_report:
        test_table.add_data(cls, test_report[cls]['precision'], test_report[cls]['recall'], test_report[cls]['f1-score'], test_report[cls]['support'])
test_table.add_data("accuracy", None, None, test_report['accuracy'], test_report['weighted avg']['support'])
test_table.add_data("macro avg", test_report['macro avg']['precision'], test_report['macro avg']['recall'], test_report['macro avg']['f1-score'], test_report['macro avg']['support'])
test_table.add_data("weighted avg", test_report['weighted avg']['precision'], test_report['weighted avg']['recall'], test_report['weighted avg']['f1-score'], test_report['weighted avg']['support'])

print("\n" + "="*60)
print("FINAL TEST CLASSIFICATION REPORT:")
print(classification_report(test_labels, test_preds, digits=4, zero_division=0))
print("="*60)
print(f"Test Accuracy: {test_acc:.4f}")

wandb.log({'test_acc': test_acc, 'test_classification_table': test_table})
wandb.finish()


FINAL TEST CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0     0.8400    0.8000    0.8195       105
           1     0.5800    0.6444    0.6105        45

    accuracy                         0.7533       150
   macro avg     0.7100    0.7222    0.7150       150
weighted avg     0.7620    0.7533    0.7568       150

Test Accuracy: 0.7533


0,1
epoch,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇██
test_acc,▁
train_loss,█▇▇▆▆▆▅▅▄▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▁
val_acc,▁▃▄▅▄▄▄▅▄▅▆▆▆▆▆▇▇▇▇▇█▇▇▇██▇██
val_loss,█▆▅▄▄▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▂▂

0,1
epoch,28.0
test_acc,0.75333
train_loss,0.40604
val_acc,0.78667
val_loss,0.64625
