### Základy neurónových sietí STU FIIT
## Credit Risk Assessment


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wandb
import yaml
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau



df = pd.read_csv("data/dataset.csv")
pd.set_option('display.max_columns', 50)



In [2]:
wandb.login()

wandb: Currently logged in as: fajermichal48 (fajermichal48-none) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

### Data Analysis

In [None]:
print("=====SHAPE=====")
df.shape

In [None]:
print("=====HEAD=====")
df.head()


In [None]:
print("=====TAIL=====")
df.tail()

In [None]:
print("=====DTYPE=====")
df.dtypes

In [None]:
df.describe()

In [None]:
df.isna().sum()

In [None]:
df.loc[df.duplicated()]

In [None]:
num_cols = df.select_dtypes(include=['number']).columns
cat_cols = df.select_dtypes(exclude=['number']).columns

print("Numeric columns:\n", list(num_cols))
print("\nCategorical columns:\n", list(cat_cols))

In [None]:
df[num_cols].hist(figsize=(10,8))
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(df[num_cols].corr(), cmap="coolwarm", annot=True, fmt=".2f",
            vmin=-1, vmax=1, center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title("Correlation Matrix", fontsize=16, pad=20)
plt.tight_layout()
plt.show()

In [None]:
cat_cols = df.select_dtypes(exclude=['number']).columns

for c in cat_cols:
    ax = sns.countplot(x=c, data=df)
    plt.title(c)
    plt.xticks(rotation=45)

    for container in ax.containers:
        ax.bar_label(container)

    plt.show()



# Data preprocessing and normalization 

In [11]:
data = df.copy()

TARGET = "class"
y_temp = df[TARGET]
data = df.drop(columns=[TARGET, "own_telephone", "foreign_worker"]).copy()

cat_cols = data.select_dtypes(include=["object"]).columns
data = pd.get_dummies(data, columns=cat_cols, drop_first=True, dtype=int)

le = LabelEncoder()
y = le.fit_transform(y_temp)

print("Final shape:", data.shape)
print("Any missing values?", data.isnull().sum().sum())
data.head()

Final shape: (1000, 46)
Any missing values? 0


Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status_'<0',checking_status_'>=200',checking_status_'no checking',credit_history_'critical/other existing credit',credit_history_'delayed previously',credit_history_'existing paid',credit_history_'no credits/all paid',purpose_'new car',purpose_'used car',purpose_business,purpose_education,purpose_furniture/equipment,purpose_other,purpose_radio/tv,purpose_repairs,purpose_retraining,savings_status_'500<=X<1000',savings_status_'<100',savings_status_'>=1000',savings_status_'no known savings',employment_'4<=X<7',employment_'<1',employment_'>=7',employment_unemployed,personal_status_'male div/sep',personal_status_'male mar/wid',personal_status_'male single',other_parties_guarantor,other_parties_none,property_magnitude_'no known property',property_magnitude_'real estate',property_magnitude_car,other_payment_plans_none,other_payment_plans_stores,housing_own,housing_rent,job_'unemp/unskilled non res',job_'unskilled resident',job_skilled
0,6,1169,4,4,67,2,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,1
1,48,5951,2,2,22,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,1
2,12,2096,2,3,49,1,2,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0
3,42,7882,2,4,45,1,2,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1
4,24,4870,3,4,53,2,2,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,1


# Data Split

In [12]:

idx_train, idx_temp, y_train, y_temp = train_test_split(
    df.index, y, test_size=0.4, random_state=11, stratify=y
)

idx_val, idx_test, y_val, y_test = train_test_split(
    idx_temp, y_temp, test_size=0.5, random_state=11, stratify=y_temp
)

X_train = data.loc[idx_train].reset_index(drop=True)
X_val = data.loc[idx_val].reset_index(drop=True)
X_test = data.loc[idx_test].reset_index(drop=True)

import pandas as pd
y_train = pd.Series(y_train).reset_index(drop=True)
y_val = pd.Series(y_val).reset_index(drop=True)
y_test = pd.Series(y_test).reset_index(drop=True)

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:", X_val.shape, "y_val:", y_val.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)


X_train: (600, 46) y_train: (600,)
X_val: (200, 46) y_val: (200,)
X_test: (200, 46) y_test: (200,)


# Normalization

In [13]:
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

scaler = MinMaxScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

X_val[num_cols] = scaler.transform(X_val[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


# Resample

In [20]:
train_df = X_train.copy()
train_df["target"] = y_train.values

df_major = train_df[train_df["target"] == 1]
df_minor = train_df[train_df["target"] == 0]

n_samples = len(df_minor) * 2

df_minor_up = resample(
    df_minor,
    replace=True,
    n_samples=n_samples,
    random_state=42
)

df_balanced = (
    pd.concat([df_major, df_minor_up], axis=0)
      .sample(frac=1.0, random_state=42)
      .reset_index(drop=True)
)

X_train_bal = df_balanced.drop(columns=["target"])
y_train_bal = df_balanced["target"]

print("Class distribution after oversampling:")
print(y_train_bal.value_counts())  


Class distribution after oversampling:
target
0    540
1    420
Name: count, dtype: int64


# Configuration 

In [21]:
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)

model_config = config['model']
train_config = config['training']

print("Using config:", config)

# Convert data to PyTorch tensors
X_train_tensor = torch.from_numpy(X_train_bal.values).float()
y_train_tensor = torch.from_numpy(y_train_bal.values.astype(np.float32)) 
X_val_tensor = torch.from_numpy(X_val.values).float()
y_val_tensor = torch.from_numpy(y_val.values.astype(np.float32))
X_test_tensor = torch.from_numpy(X_test.values).float()
y_test_tensor = torch.from_numpy(y_test.values.astype(np.float32))

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=train_config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=train_config['batch_size'], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=train_config['batch_size'], shuffle=False)

Using config: {'model': {'input_dim': 46, 'output_dim': 1, 'hidden_layers': [64, 32], 'dropout': 0.4, 'use_bn': True}, 'training': {'batch_size': 16, 'lr': 0.0005, 'epochs': 120, 'wandb_project': 'MLP-project', 'early_stop_patience': 25, 'weight_decay': 0.005}}


# MLP model

In [22]:
class MLP(nn.Module):
    def __init__(self, config):
        super(MLP, self).__init__()
        input_dim = config['input_dim']
        hidden_layers = config['hidden_layers']
        output_dim = 1 
        dropout = config['dropout']
        use_bn = config['use_bn']
        
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            if use_bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.ReLU())
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        
        self.net = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.net(x).squeeze(1)  

# Train model

In [23]:
# Initialize model, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MLP(model_config).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=train_config['lr'], weight_decay=train_config.get('weight_decay', 0))
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, min_lr=1e-5)
criterion = nn.BCEWithLogitsLoss()

wandb.init(project=train_config['wandb_project'], config=config)

# Early stopping setup
patience = train_config.get('early_stop_patience', 15)
best_val_loss = float('inf')
counter = 0
best_model_state = None

for epoch in range(train_config['epochs']):
    model.train()
    train_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device).float() 
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    train_loss /= len(train_loader)
    
    # Validation
    model.eval()
    val_loss = 0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device).float()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
            preds = (torch.sigmoid(outputs) > 0.5).long().cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(batch_y.long().cpu().numpy())  
    
    val_loss /= len(val_loader)
    val_acc = accuracy_score(val_labels, val_preds)
    val_report = classification_report(val_labels, val_preds, output_dict=True, zero_division=0)
    
    val_table = wandb.Table(columns=["", "precision", "recall", "f1-score", "support"])
    for cls in ['0', '1']:
        if cls in val_report:
            val_table.add_data(cls, val_report[cls]['precision'], val_report[cls]['recall'], val_report[cls]['f1-score'], val_report[cls]['support'])
    val_table.add_data("accuracy", None, None, val_report['accuracy'], val_report['weighted avg']['support'])
    val_table.add_data("macro avg", val_report['macro avg']['precision'], val_report['macro avg']['recall'], val_report['macro avg']['f1-score'], val_report['macro avg']['support'])
    val_table.add_data("weighted avg", val_report['weighted avg']['precision'], val_report['weighted avg']['recall'], val_report['weighted avg']['f1-score'], val_report['weighted avg']['support'])

    scheduler.step(val_loss)
    wandb.log({'epoch': epoch, 'train_loss': train_loss, 'val_loss': val_loss, 'val_acc': val_acc, 'val_classification_table': val_table})
    
    print(f"Epoch {epoch+1}/{train_config['epochs']}: Train Loss {train_loss:.4f}, Val Loss {val_loss:.4f}, Val Acc {val_acc:.4f}")
    
    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        best_model_state = model.state_dict()
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            model.load_state_dict(best_model_state)
            break

Epoch 1/120: Train Loss 0.7030, Val Loss 0.6991, Val Acc 0.5450
Epoch 2/120: Train Loss 0.6353, Val Loss 0.6764, Val Acc 0.6000
Epoch 3/120: Train Loss 0.6165, Val Loss 0.6540, Val Acc 0.6150
Epoch 4/120: Train Loss 0.5959, Val Loss 0.6445, Val Acc 0.6250
Epoch 5/120: Train Loss 0.5566, Val Loss 0.6327, Val Acc 0.6400
Epoch 6/120: Train Loss 0.5514, Val Loss 0.6349, Val Acc 0.6500
Epoch 7/120: Train Loss 0.5182, Val Loss 0.6521, Val Acc 0.6500
Epoch 8/120: Train Loss 0.5106, Val Loss 0.6202, Val Acc 0.6450
Epoch 9/120: Train Loss 0.4827, Val Loss 0.6378, Val Acc 0.6500
Epoch 10/120: Train Loss 0.4776, Val Loss 0.6385, Val Acc 0.6500
Epoch 11/120: Train Loss 0.4832, Val Loss 0.6501, Val Acc 0.6300
Epoch 12/120: Train Loss 0.4621, Val Loss 0.6334, Val Acc 0.6250
Epoch 13/120: Train Loss 0.4787, Val Loss 0.6456, Val Acc 0.6250
Epoch 14/120: Train Loss 0.4441, Val Loss 0.6468, Val Acc 0.6250
Epoch 15/120: Train Loss 0.4461, Val Loss 0.6798, Val Acc 0.6150
Epoch 16/120: Train Loss 0.4445, V

# Test model

In [24]:
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device).float()
        outputs = model(batch_x)
        preds = (torch.sigmoid(outputs) > 0.5).long().cpu().numpy()
        test_preds.extend(preds)
        test_labels.extend(batch_y.long().cpu().numpy())  

test_acc = accuracy_score(test_labels, test_preds)

test_report = classification_report(test_labels, test_preds, output_dict=True, zero_division=0)

test_table = wandb.Table(columns=["", "precision", "recall", "f1-score", "support"])
for cls in ['0', '1']:
    if cls in test_report:
        test_table.add_data(cls, test_report[cls]['precision'], test_report[cls]['recall'], test_report[cls]['f1-score'], test_report[cls]['support'])
test_table.add_data("accuracy", None, None, test_report['accuracy'], test_report['weighted avg']['support'])
test_table.add_data("macro avg", test_report['macro avg']['precision'], test_report['macro avg']['recall'], test_report['macro avg']['f1-score'], test_report['macro avg']['support'])
test_table.add_data("weighted avg", test_report['weighted avg']['precision'], test_report['weighted avg']['recall'], test_report['weighted avg']['f1-score'], test_report['weighted avg']['support'])

print("\n" + "="*60)
print("FINAL TEST CLASSIFICATION REPORT:")
print(classification_report(test_labels, test_preds, digits=4, zero_division=0))
print("="*60)
print(f"Test Accuracy: {test_acc:.4f}")

wandb.log({'test_acc': test_acc, 'test_classification_table': test_table})
wandb.finish()


FINAL TEST CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0     0.4583    0.7333    0.5641        60
           1     0.8462    0.6286    0.7213       140

    accuracy                         0.6600       200
   macro avg     0.6522    0.6810    0.6427       200
weighted avg     0.7298    0.6600    0.6741       200

Test Accuracy: 0.6600


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
test_acc,▁
train_loss,█▇▆▆▅▅▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁
val_acc,▁▄▅▆▇▇▇▇▇▇▆▆▆▆▅▇▇█▇█▇▇▇▇▇▇▇▇▇▇▇▇▇
val_loss,█▆▄▃▂▂▄▁▃▃▄▂▃▃▆▃▄▅▃▄▅▃▅▄▄▃▅▅▄▂▅▆▅

0,1
epoch,32.0
test_acc,0.66
train_loss,0.36765
val_acc,0.65
val_loss,0.67085
