In [None]:
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
!pip install torch-geometric pandas numpy scikit-learn networkx matplotlib seaborn tqdm -q

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected
from torch_geometric.nn import SAGEConv, GATv2Conv, GCNConv
import psutil

try:
    import pynvml
    pynvml.nvmlInit()
    NVML_AVAILABLE = True
except:
    NVML_AVAILABLE = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
CSV_PATH = '/home/klema/sibnn/gnn_tbank/check_notebooks/data/accepted_2007_to_2018Q4.csv'

Lending club dataset (2007-2018)
При загрузке данных отбираем только релевантные колонки:
1. Числовые: сумма кредита, ставка, доход и др
2. Категориальные: Цель кредита, рейтинг, стаж и др

Целевая переменная - loan_status

In [3]:
print("Загрузка данных")

usecols = [
    'loan_amnt', 'int_rate', 'installment', 'grade', 'emp_length', 
    'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
    'purpose', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
    'pub_rec', 'revol_bal', 'revol_util', 'total_acc'
]

df = pd.read_csv(CSV_PATH, usecols=usecols, low_memory=False)

# Фильтруем только завершенные займы. Fully Paid - успешно погашен, Charged Off - дефолт.
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]
df['target'] = df['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})

numeric_features = ['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
                   'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 
                   'revol_bal', 'revol_util', 'total_acc']
categorical_features = ['grade', 'emp_length', 'home_ownership', 'verification_status', 'purpose']

X_num = df[numeric_features].copy()
X_num.replace([np.inf, -np.inf], np.nan, inplace=True)
X_num.fillna(X_num.median(), inplace=True)

X_cat = df[categorical_features].copy()
X_cat.fillna('Unknown', inplace=True)
X_cat_dummies = pd.get_dummies(X_cat, drop_first=True)

X = pd.concat([X_num, X_cat_dummies], axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = df['target'].values

# Строим разреженный граф симметричных связей на основе признакового сходства методом k-ближайших соседей.
k = 5
adj_matrix = kneighbors_graph(X_scaled, k, mode='connectivity', include_self=False)
edge_index = torch.tensor(np.array(adj_matrix.nonzero()), dtype=torch.long)
edge_index = to_undirected(edge_index)

num_nodes = len(X_scaled)
indices = np.arange(num_nodes)
train_idx, temp_idx = train_test_split(indices, test_size=0.4, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

data = Data(
    x=torch.tensor(X_scaled, dtype=torch.float),
    edge_index=edge_index,
    y=torch.tensor(y, dtype=torch.long),
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask
)

print(f"Загружено {data.x.shape[0]} узлов, {data.x.shape[1]} признаков")
print(f"Train/Val/Test: {train_mask.sum().item()}/{val_mask.sum().item()}/{test_mask.sum().item()}")
print(f"Рёбер: {edge_index.shape[1]}")

class_counts = torch.bincount(data.y).tolist()
class_dist_str = ", ".join(f"Класс {i}: {count}" for i, count in enumerate(class_counts))
print(f"Распределение классов: {class_dist_str}")

Загрузка данных
Загружено 1345310 узлов, 49 признаков
Train/Val/Test: 807186/269062/269062
Рёбер: 9756360
Распределение классов: Класс 0: 1076751, Класс 1: 268559


In [4]:
def accuracy(pred_y, y):
    return ((pred_y == y).sum() / len(y)).item()

def test(model, data):
    model.eval()
    _, out = model(data.x, data.edge_index)
    acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
    return acc

class GraphSAGE(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.sage1 = SAGEConv(dim_in, dim_h)
        self.sage2 = SAGEConv(dim_h, dim_out)
        self.optimizer = torch.optim.Adam(self.parameters(),
                                        lr=0.01,
                                        weight_decay=5e-4)

    def forward(self, x, edge_index):
        h = self.sage1(x, edge_index)
        h = torch.relu(h)
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.sage2(h, edge_index)
        return h, F.log_softmax(h, dim=1)

    def fit(self, data, epochs):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = self.optimizer

        self.train()
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            _, out = self(data.x, data.edge_index)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1),
                          data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
            val_acc = accuracy(out[data.val_mask].argmax(dim=1),
                              data.y[data.val_mask])

            if(epoch % 10 == 0):
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc:'
                      f' {acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                      f'Val Acc: {val_acc*100:.2f}%')

class GAT(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out, heads=4):
        super().__init__()
        self.gat1 = GATv2Conv(dim_in, dim_h, heads=heads, dropout=0.6)
        self.gat2 = GATv2Conv(dim_h*heads, dim_out, heads=heads, dropout=0.6)
        self.optimizer = torch.optim.Adam(self.parameters(),
                                        lr=0.005,
                                        weight_decay=5e-4)

    def forward(self, x, edge_index):
        h = F.dropout(x, p=0.6, training=self.training)
        h = self.gat1(h, edge_index)
        h = F.elu(h)
        h = F.dropout(h, p=0.6, training=self.training)
        h = self.gat2(h, edge_index)
        return h, F.log_softmax(h, dim=1)

    def fit(self, data, epochs):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = self.optimizer

        self.train()
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            _, out = self(data.x, data.edge_index)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1),
                          data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
            val_acc = accuracy(out[data.val_mask].argmax(dim=1),
                              data.y[data.val_mask])

            if(epoch % 10 == 0):
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc:'
                      f' {acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                      f'Val Acc: {val_acc*100:.2f}%')

class GCN(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.gcn1 = GCNConv(dim_in, dim_h)
        self.gcn2 = GCNConv(dim_h, dim_out)
        self.optimizer = torch.optim.Adam(self.parameters(),
                                        lr=0.01,
                                        weight_decay=5e-4)

    def forward(self, x, edge_index):
        h = F.dropout(x, p=0.5, training=self.training)
        h = self.gcn1(h, edge_index)
        h = torch.relu(h)
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.gcn2(h, edge_index)
        return h, F.log_softmax(h, dim=1)

    def fit(self, data, epochs):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = self.optimizer

        self.train()
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            _, out = self(data.x, data.edge_index)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = accuracy(out[data.train_mask].argmax(dim=1),
                          data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
            val_acc = accuracy(out[data.val_mask].argmax(dim=1),
                              data.y[data.val_mask])

            if(epoch % 10 == 0):
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc:'
                      f' {acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                      f'Val Acc: {val_acc*100:.2f}%')

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

def monitor_resources():
    stats = {}
    # CPU & RAM
    stats['ram_mb'] = psutil.virtual_memory().used / (1024 ** 2)
    stats['cpu_percent'] = psutil.cpu_percent()

    # GPU
    if device.type == 'cuda' and NVML_AVAILABLE:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        stats['gpu_mem_mb'] = mem_info.used / (1024 ** 2)
        stats['gpu_util'] = util.gpu
    else:
        stats['gpu_mem_mb'] = None
        stats['gpu_util'] = None
    return stats

def train_with_monitoring(model, data, epochs, model_name):
    print(f"\n{'='*50}\nTraining {model_name} with resource monitoring\n{'='*50}")
    
    if device.type == 'cuda':
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.empty_cache()

    start_time = time.time()

    model.fit(data, epochs)

    final_ram = psutil.virtual_memory().used / (1024 ** 2)
    max_gpu_mem = None
    if device.type == 'cuda':
        max_gpu_mem = torch.cuda.max_memory_allocated() / (1024 ** 2)  # MB

    duration = time.time() - start_time
    test_acc = test(model, data)

    results = {
        'test_acc': test_acc,
        'training_time_sec': duration,
        'final_ram_mb': final_ram,
        'max_gpu_mem_mb': max_gpu_mem,
    }

    print(f"\n{model_name} finished")
    print(f"Test Accuracy: {test_acc*100:.2f}%")
    print(f"Training Time: {duration:.1f} sec")
    if max_gpu_mem:
        print(f"Peak GPU Memory: {max_gpu_mem:.1f} MB")
    print(f"Final RAM Usage: {final_ram:.1f} MB")

    return test_acc, results

In [6]:
%%time

results = {}

# 1. GraphSAGE
graphsage = GraphSAGE(data.x.shape[1], dim_h=64, dim_out=2).to(device)
acc_sage, _ = train_with_monitoring(graphsage, data, epochs=100, model_name="GraphSAGE")
results['GraphSAGE'] = acc_sage


Training GraphSAGE with resource monitoring
Epoch   0 | Train Loss: 0.802 | Train Acc:  41.97% | Val Loss: 0.80 | Val Acc: 41.89%
Epoch  10 | Train Loss: 0.479 | Train Acc:  78.96% | Val Loss: 0.48 | Val Acc: 79.12%
Epoch  20 | Train Loss: 0.467 | Train Acc:  79.97% | Val Loss: 0.46 | Val Acc: 80.14%
Epoch  30 | Train Loss: 0.463 | Train Acc:  80.02% | Val Loss: 0.46 | Val Acc: 80.17%
Epoch  40 | Train Loss: 0.461 | Train Acc:  80.04% | Val Loss: 0.46 | Val Acc: 80.20%
Epoch  50 | Train Loss: 0.459 | Train Acc:  80.09% | Val Loss: 0.46 | Val Acc: 80.24%
Epoch  60 | Train Loss: 0.459 | Train Acc:  80.12% | Val Loss: 0.46 | Val Acc: 80.27%
Epoch  70 | Train Loss: 0.458 | Train Acc:  80.12% | Val Loss: 0.46 | Val Acc: 80.27%
Epoch  80 | Train Loss: 0.458 | Train Acc:  80.14% | Val Loss: 0.46 | Val Acc: 80.29%
Epoch  90 | Train Loss: 0.457 | Train Acc:  80.15% | Val Loss: 0.46 | Val Acc: 80.28%
Epoch 100 | Train Loss: 0.457 | Train Acc:  80.16% | Val Loss: 0.45 | Val Acc: 80.31%

GraphSAG

In [8]:
%%time

# 2. GCN
gcn = GCN(data.x.shape[1], dim_h=64, dim_out=2).to(device)
acc_gcn, _ = train_with_monitoring(gcn, data, epochs=100, model_name="GCN")
results['GCN'] = acc_gcn


Training GCN with resource monitoring
Epoch   0 | Train Loss: 1.616 | Train Acc:  25.12% | Val Loss: 1.62 | Val Acc: 25.08%
Epoch  10 | Train Loss: 0.595 | Train Acc:  79.38% | Val Loss: 0.59 | Val Acc: 79.58%
Epoch  20 | Train Loss: 0.512 | Train Acc:  78.12% | Val Loss: 0.51 | Val Acc: 78.34%
Epoch  30 | Train Loss: 0.486 | Train Acc:  79.23% | Val Loss: 0.48 | Val Acc: 79.43%
Epoch  40 | Train Loss: 0.477 | Train Acc:  79.63% | Val Loss: 0.47 | Val Acc: 79.82%
Epoch  50 | Train Loss: 0.471 | Train Acc:  79.77% | Val Loss: 0.47 | Val Acc: 79.96%
Epoch  60 | Train Loss: 0.468 | Train Acc:  79.87% | Val Loss: 0.47 | Val Acc: 80.03%
Epoch  70 | Train Loss: 0.467 | Train Acc:  79.96% | Val Loss: 0.46 | Val Acc: 80.11%
Epoch  80 | Train Loss: 0.466 | Train Acc:  80.00% | Val Loss: 0.46 | Val Acc: 80.15%
Epoch  90 | Train Loss: 0.465 | Train Acc:  80.02% | Val Loss: 0.46 | Val Acc: 80.19%
Epoch 100 | Train Loss: 0.464 | Train Acc:  80.03% | Val Loss: 0.46 | Val Acc: 80.17%

GCN finished
T

In [None]:
%%time

# 3. GAT
gat = GAT(data.x.shape[1], dim_h=32, dim_out=2, heads=4).to(device)
acc_gat, _ = train_with_monitoring(gat, data, epochs=100, model_name="GAT")
results['GAT'] = acc_gat