In [None]:
# Mount Google Drive at /content/drive; the CSV and graph files read by the
# cells below are loaded from /content/drive/MyDrive/KLTN.
from google.colab import drive
drive.mount('/content/drive')

# Import

In [None]:
!pip install torch_geometric

In [None]:
import pandas as pd

# Read the dataset CSV from Drive and preview its first rows.
df = pd.read_csv(
    "/content/drive/MyDrive/KLTN/FDP_VN_1year_binary_FIN_WEIGHTED_SEN_2010_2022.csv"
)
df.head(5)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
import pandas as pd
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the new graph.
# map_location remaps the stored tensors onto the device chosen above, so a
# graph saved from a GPU session can still be opened on a CPU-only runtime.
# NOTE(review): weights_only=False unpickles arbitrary Python objects; only
# load graph files that you created yourself.
data = torch.load(
    "/content/drive/MyDrive/KLTN/graph_data.pt",
    map_location=device,
    weights_only=False
)
data = data.to(device)

print(data)
print("Num nodes:", data.num_nodes)
print("Num edges:", data.edge_index.shape[1])

In [None]:
from torch_geometric.nn import SAGEConv

class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE network that returns one row of logits per node.

    Args:
        in_dim: number of input features per node.
        hidden_dim: width of the hidden representation.
        out_dim: number of output logits per node.
        dropout: dropout probability applied to the hidden representation
            while the module is in training mode.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, dropout):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, out_dim)
        self.dropout = dropout

    def forward(self, data):
        """Compute logits from ``data.x`` and ``data.edge_index``."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        return self.conv2(hidden, data.edge_index)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# ===== TEMPORAL MASK (REBUILD FROM DF) =====
years = df["Year"].values

# The masks index per-node tensors (model output, data.y), so the table must
# contain exactly one row per graph node.
# NOTE(review): this also assumes row i of df describes node i of the graph;
# confirm against the code that built graph_data.pt.
assert len(years) == data.num_nodes, (
    f"df has {len(years)} rows but the graph has {data.num_nodes} nodes"
)

# Years up to 2021 are used for training, 2022 is held out for testing.
train_mask = torch.tensor(years <= 2021, device=device)
test_mask  = torch.tensor(years == 2022, device=device)

print("Train samples:", train_mask.sum().item())
print("Test samples :", test_mask.sum().item())

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced weights for classes 0 and 1, estimated from the training years
# (<= 2021) only; the tensor is later passed as `weight=` to cross-entropy.
train_labels = df.loc[years <= 2021, "Next_year_binary_distress_label"].values
balanced = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=train_labels,
)

class_weights = torch.tensor(balanced, dtype=torch.float).to(device)
print("Class weights:", class_weights)

# Tuned RF

In [None]:
!pip install imbalanced-learn

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [None]:
# Reload the CSV, normalise the company code (string, stripped, upper-case)
# and order the rows by company and year.
df = (
    pd.read_csv(
        "/content/drive/MyDrive/KLTN/FDP_VN_1year_binary_FIN_WEIGHTED_SEN_2010_2022.csv"
    )
    .assign(Code=lambda frame: frame['Code'].astype(str).str.strip().str.upper())
    .sort_values(['Code', 'Year'])
    .reset_index(drop=True)
)

# Predictors are the columns X1..X19 plus SEN; the target is the next-year label.
feature_cols = [f'X{i}' for i in range(1, 20)] + ['SEN']
X = df[feature_cols].to_numpy()
y = df['Next_year_binary_distress_label'].to_numpy()

In [None]:
# Temporal split for the Random Forest baseline (train: <= 2021, test: 2022).
# RF-specific names are used on purpose: `train_mask` / `test_mask` hold the
# torch node masks that the GraphSAGE / GAT / R-GCN cells further down index
# with, and they must not be replaced by pandas objects.
rf_train_mask = (df['Year'] <= 2021).to_numpy()
rf_test_mask  = (df['Year'] == 2022).to_numpy()

X_train, y_train = X[rf_train_mask], y[rf_train_mask]
X_test,  y_test  = X[rf_test_mask],  y[rf_test_mask]

print("Train:", len(X_train))
print("Test :", len(X_test))

In [None]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
print("Before SMOTE:", np.bincount(y_train))

# Resample the scaled training split with SMOTE; the test split is not touched.
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=5)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_sm, y_train_sm = resampled

print("After SMOTE :", np.bincount(y_train_sm))

In [None]:
# Random Forest hyper-parameters for the tuned baseline.
rf_params = dict(
    random_state=42,
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=False,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_params)

# Train on the SMOTE-resampled, scaled training split.
rf.fit(X_train_sm, y_train_sm)
print("RF trained with SMOTE + scaling")

In [None]:
# Score the held-out 2022 split (scaled with the training-split scaler).
y_pred = rf.predict(X_test_scaled)
report = classification_report(y_test, y_pred, digits=4)

print("\n===== FINAL TEST (2022) – RF + SMOTE + SCALING =====")
print(report)

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
from imblearn.over_sampling import SMOTE

In [None]:
# LOAD DATA
# The frame gets its own name: `data` is the graph object loaded at the top of
# the notebook, and the GraphSAGE / GAT / R-GCN cells below still need it
# (they read data.x, data.edge_index, data.y).
rf_frame = pd.read_csv(
    "/content/drive/MyDrive/KLTN/FDP_VN_1year_binary_FIN_WEIGHTED_SEN_2010_2022.csv"
)
arr = rf_frame.to_numpy()

# FEATURES / LABEL
# Columns 2..21 are the predictors, column 22 is the label.
# NOTE(review): presumably columns 0-1 are Code and Year; confirm against the CSV header.
colxx = 21 + 1  # 19 FIN + SEN
X = arr[:, 2:colxx].astype(float)
Y = arr[:, colxx:colxx+1]

# GLOBAL Z-SCORE (as in the original code)
# The statistics are computed over all rows, i.e. train and test together.
X = stats.zscore(X, axis=0)

# SPLIT BY INDEX
# NOTE(review): assumes the first 11634 rows of the file are the training
# rows; confirm the row order of the CSV.
rowxx = 11634
X_train = X[:rowxx]
X_test  = X[rowxx:]
y_train = np.ravel(Y[:rowxx]).astype(int)
y_test  = np.ravel(Y[rowxx:]).astype(int)

print("Before SMOTE:", np.bincount(y_train))

# SMOTE (training rows only)
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("After SMOTE :", np.bincount(y_train_sm))

# RANDOM FOREST (parameters kept unchanged)
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=False,
    n_jobs=-1
)

rf.fit(X_train_sm, y_train_sm)

# TEST
y_pred = rf.predict(X_test)

print("\n===== RF + ZSCORE (GLOBAL) + SMOTE =====")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


In [None]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# The frame gets its own name: `data` is the graph object loaded at the top of
# the notebook, and the GraphSAGE / GAT / R-GCN cells below still need it.
rf_frame = pd.read_csv(
    "/content/drive/MyDrive/KLTN/FDP_VN_1year_binary_FIN_WEIGHTED_SEN_2010_2022.csv"
)
arr = rf_frame.to_numpy()

colxx = 21 + 1  # 19 FIN + SEN
X = arr[:, 2:colxx].astype(float)
Y = arr[:, colxx:colxx+1]

# Split the freshly loaded matrix inside this cell, so it no longer depends on
# X_train / X_test / y_train / y_test left over from a previously run cell.
# NOTE(review): assumes the first 11634 rows of the file are the training
# rows; confirm the row order of the CSV.
rowxx = 11634
X_train = X[:rowxx]
X_test  = X[rowxx:]
y_train = np.ravel(Y[:rowxx]).astype(int)
y_test  = np.ravel(Y[rowxx:]).astype(int)

# Standardise with statistics taken from the training rows only.
mean = X_train.mean(axis=0)
std  = X_train.std(axis=0)

# avoid division by zero
std[std == 0] = 1.0

X_train = (X_train - mean) / std
X_test  = (X_test  - mean) / std

print("Before SMOTE:", np.bincount(y_train))

sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("After SMOTE :", np.bincount(y_train_sm))

rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=False,
    n_jobs=-1
)

rf.fit(X_train_sm, y_train_sm)

y_pred = rf.predict(X_test)

print("\n===== RF + ZSCORE (TRAIN ONLY) + SMOTE =====")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix


# The frame gets its own name: `data` is the graph object loaded at the top of
# the notebook, and the GraphSAGE / GAT / R-GCN cells below still need it.
xgb_frame = pd.read_csv(
    "/content/drive/MyDrive/KLTN/FDP_VN_1year_binary_FIN_WEIGHTED_SEN_2010_2022.csv"
)

arr = xgb_frame.to_numpy()

# Split predictors and true label
colxx = 21 + 1   # 19 FIN + SEN
X = arr[:, 2:colxx].astype(float)
Y = arr[:, colxx:colxx+1]

# Global z-score: statistics are computed over all rows (train and test).
X = stats.zscore(X, axis=0)

# NOTE(review): assumes the first 11634 rows of the file are the training
# rows; confirm the row order of the CSV.
rowxx = 11634
X_train = X[:rowxx]
X_test  = X[rowxx:]

y_train = np.ravel(Y[:rowxx]).astype(int)
y_test  = np.ravel(Y[rowxx:]).astype(int)

print("Train distribution:", np.bincount(y_train))
print("Test distribution :", np.bincount(y_test))

# Class imbalance is handled through scale_pos_weight (negatives / positives
# in the training rows) instead of SMOTE.
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos
print("scale_pos_weight:", scale_pos_weight)


xgb = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

xgb.fit(X_train, y_train)
print("XGBoost trained.")

y_pred = xgb.predict(X_test)

print("\n===== XGBOOST + GLOBAL ZSCORE (NO SMOTE) =====")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


# SAGE

In [None]:
def tune_graphsage(data, train_mask, epochs=50):
    """Grid-search GraphSAGE hyper-parameters with 5-fold stratified CV.

    Every combination of hidden size, learning rate, dropout, weight decay and
    weighted/unweighted loss is trained for ``epochs`` full-batch steps on each
    fold. The combination with the highest mean validation macro-F1 is kept.

    Relies on the notebook-level names ``GraphSAGE``, ``class_weights``,
    ``StratifiedKFold`` and ``f1_score``.

    Args:
        data: graph object exposing ``x``, ``y``, ``edge_index`` and
            ``num_nodes``; it is passed unchanged to the model.
        train_mask: boolean torch mask over the nodes selecting those that may
            be used for the training/validation folds.
        epochs: number of optimisation steps per fold (default 50).

    Returns:
        Tuple ``(hidden_dim, lr, dropout, weight_decay, weighted)`` describing
        the best configuration.
    """
    from itertools import product

    hidden_dims = [32, 64, 128]
    lrs = [5e-4, 1e-3, 3e-3]
    dropouts = [0.3, 0.5]
    weight_decays = [1e-4, 5e-4]
    use_weighted_loss = [False, True]

    # Build models and masks on the device the graph already lives on.
    node_device = data.x.device

    train_idx = train_mask.nonzero(as_tuple=True)[0].cpu().numpy()
    y_train = data.y[train_mask].cpu().numpy()

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # The folds depend only on the fixed random_state, not on the
    # hyper-parameters, so the node masks are built once and reused.
    fold_masks = []
    for tr, val in skf.split(train_idx, y_train):
        tr_mask = torch.zeros(data.num_nodes, dtype=torch.bool, device=node_device)
        val_mask = torch.zeros(data.num_nodes, dtype=torch.bool, device=node_device)
        tr_mask[train_idx[tr]] = True
        val_mask[train_idx[val]] = True
        fold_masks.append((tr_mask, val_mask))

    # Start below any reachable score so that best_cfg is always set, even if
    # every configuration scores a macro-F1 of 0.
    best_f1 = float("-inf")
    best_cfg = None

    grid = product(hidden_dims, lrs, dropouts, weight_decays, use_weighted_loss)
    for hd, lr, dp, wd, weighted in grid:
        # weight=None is the unweighted cross-entropy.
        loss_weight = class_weights if weighted else None
        fold_f1 = []

        for tr_mask, val_mask in fold_masks:
            model = GraphSAGE(
                in_dim=data.x.shape[1],
                hidden_dim=hd,
                out_dim=2,
                dropout=dp
            ).to(node_device)

            opt = torch.optim.Adam(
                model.parameters(),
                lr=lr,
                weight_decay=wd
            )

            for _ in range(epochs):
                model.train()
                opt.zero_grad()
                out = model(data)
                loss = F.cross_entropy(
                    out[tr_mask],
                    data.y[tr_mask],
                    weight=loss_weight
                )
                loss.backward()
                opt.step()

            model.eval()
            with torch.no_grad():
                preds = model(data)[val_mask].argmax(dim=1)
                fold_f1.append(
                    f1_score(
                        data.y[val_mask].cpu(),
                        preds.cpu(),
                        average="macro"
                    )
                )

        mean_f1 = np.mean(fold_f1)

        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_cfg = (hd, lr, dp, wd, weighted)

        print(
            f"hd={hd}, lr={lr}, dp={dp}, wd={wd}, weighted={weighted} → F1={mean_f1:.4f}"
        )

    print("\n==============================")
    print("BEST GraphSAGE (new graph)")
    print("Macro-F1:", best_f1)
    print("Config:", best_cfg)
    print("==============================")

    return best_cfg

In [None]:
# Grid search over 3 x 3 x 2 x 2 x 2 = 72 configurations, each trained on 5 folds.
# `data` must be the graph object and `train_mask` the torch node mask here.
best_cfg = tune_graphsage(data, train_mask)

In [None]:
def final_test_graphsage(data, cfg, train_mask, test_mask):
    """Train GraphSAGE with one configuration and print the test report.

    Relies on the notebook-level names ``GraphSAGE``, ``device``,
    ``class_weights`` and ``classification_report``.

    Args:
        data: graph object exposing ``x``, ``y`` and ``edge_index``.
        cfg: tuple ``(hidden_dim, lr, dropout, weight_decay, weighted)``.
        train_mask: boolean node mask of the nodes used for the loss.
        test_mask: boolean node mask of the nodes that are evaluated.
    """
    hidden, step_size, drop_p, decay, use_weights = cfg

    net = GraphSAGE(
        in_dim=data.x.shape[1], hidden_dim=hidden, out_dim=2, dropout=drop_p
    ).to(device)
    optimiser = torch.optim.Adam(net.parameters(), lr=step_size, weight_decay=decay)

    # weight=None is the unweighted cross-entropy.
    loss_weight = class_weights if use_weights else None

    # 100 full-batch steps on the training nodes.
    for _ in range(100):
        net.train()
        optimiser.zero_grad()
        scores = net(data)
        step_loss = F.cross_entropy(
            scores[train_mask], data.y[train_mask], weight=loss_weight
        )
        step_loss.backward()
        optimiser.step()

    net.eval()
    with torch.no_grad():
        preds = net(data)[test_mask].argmax(dim=1).cpu().numpy()
        labels = data.y[test_mask].cpu().numpy()

    print("\n===== FINAL TEST (2022) – GraphSAGE (new graph) =====")
    print(classification_report(labels, preds, digits=4))

In [None]:
# Retrain with the best configuration found above and report on the test nodes.
final_test_graphsage(data, best_cfg, train_mask, test_mask)

# GAT

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from sklearn.metrics import classification_report
import numpy as np

## Baseline

In [None]:
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every random number generator.
    """
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic kernels, and no auto-tuner picking algorithms per run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

# The GAT model and its Adam optimizer are created in the cells that follow the
# `GAT` class definition below. Building them here referenced `GAT` before it
# was defined, which raises NameError when the notebook is run top to bottom.
# Baseline setting: hidden_dim=32 (64 could also be tried).

In [None]:
class GAT(torch.nn.Module):
    """Two-layer graph attention network that returns one row of logits per node.

    Args:
        in_dim: number of input features per node.
        hidden_dim: output width of each attention head in the first layer.
        out_dim: number of output logits per node.
        heads: number of attention heads in the first layer.
        dropout: probability used both for feature dropout in ``forward`` and
            as the ``dropout`` argument of the two ``GATConv`` layers.
    """

    def __init__(self, in_dim, hidden_dim=32, out_dim=2, heads=4, dropout=0.5):
        super().__init__()
        self.dropout = dropout

        # First layer: `heads` heads; the second layer consumes
        # hidden_dim * heads features.
        self.gat1 = GATConv(
            in_dim, hidden_dim, heads=heads, dropout=dropout, add_self_loops=True
        )
        # Second layer: a single head with concat=False producing the logits.
        self.gat2 = GATConv(
            hidden_dim * heads,
            out_dim,
            heads=1,
            concat=False,
            dropout=dropout,
            add_self_loops=True,
        )

    def forward(self, x, edge_index):
        """Compute logits from node features ``x`` and ``edge_index``."""
        hidden = F.dropout(x, p=self.dropout, training=self.training)
        hidden = F.elu(self.gat1(hidden, edge_index))
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        return self.gat2(hidden, edge_index)

In [None]:
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
data = data.to(device)

# Baseline GAT with fixed (untuned) hyper-parameters.
gat_config = dict(
    in_dim=data.num_features,
    hidden_dim=32,
    out_dim=2,
    heads=4,
    dropout=0.5,
)
model = GAT(**gat_config).to(device)

In [None]:
# Adam over all parameters of the current `model`, with L2 weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

In [None]:
def train():
    """Run one full-batch optimisation step of the global GAT ``model``.

    Uses the notebook-level ``model``, ``optimizer``, ``data`` and
    ``train_mask``; the loss is an unweighted cross-entropy on the training nodes.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    logits = model(data.x, data.edge_index)
    step_loss = F.cross_entropy(logits[train_mask], data.y[train_mask])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

In [None]:
@torch.no_grad()
def test():
    """Print a classification report of the global GAT ``model`` on the test nodes.

    Uses the notebook-level ``model``, ``data`` and ``test_mask``.
    """
    model.eval()
    predicted = model(data.x, data.edge_index).argmax(dim=1)

    y_pred = predicted[test_mask].cpu().numpy()
    y_true = data.y[test_mask].cpu().numpy()

    print("\n===== FINAL TEST (2022) – GAT (new graph) =====")
    print(classification_report(y_true, y_pred, digits=4))

In [None]:
EPOCHS = 300

# Full-batch training; the loss is reported every 50 epochs.
for epoch in range(1, EPOCHS + 1):
    loss = train()
    if not epoch % 50:
        print(f"Epoch {epoch:03d} | Loss: {loss:.4f}")

# Single evaluation after the last epoch.
test()

Although the proposed sparse, sector-aware graph structure is theoretically well suited to attention-based GNNs, the empirical results show that GAT does not outperform GraphSAGE on this dataset. This suggests that when neighboring nodes have highly similar features and noisy labels, attention mechanisms may fail to provide additional discriminative power and can even amplify the noise. Note that the GAT above is a baseline with fixed hyper-parameters, whereas GraphSAGE was tuned by cross-validated grid search, so the comparison is not strictly like-for-like.

# R-GCN

## Baseline

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv
from sklearn.metrics import classification_report

In [None]:
class RGCN(torch.nn.Module):
    """Two-layer relational GCN that returns one row of logits per node.

    Args:
        in_dim: number of input features per node.
        hidden_dim: width of the hidden representation.
        out_dim: number of output logits per node.
        num_relations: number of edge types passed to both ``RGCNConv`` layers.
        dropout: dropout probability applied to the hidden representation
            while the module is in training mode.
    """

    def __init__(self, in_dim, hidden_dim=32, out_dim=2, num_relations=2, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = RGCNConv(in_dim, hidden_dim, num_relations=num_relations)
        self.conv2 = RGCNConv(hidden_dim, out_dim, num_relations=num_relations)

    def forward(self, x, edge_index, edge_type):
        """Compute logits from node features, ``edge_index`` and ``edge_type``."""
        hidden = F.relu(self.conv1(x, edge_index, edge_type))
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        return self.conv2(hidden, edge_index, edge_type)

In [None]:
def train():
    """Run one full-batch optimisation step of the global R-GCN ``model``.

    Uses the notebook-level ``model``, ``optimizer``, ``data``, ``train_mask``
    and ``class_weights``. This definition replaces the ``train()`` defined
    earlier for the GAT model.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    logits = model(data.x, data.edge_index, data.edge_type)

    # Remove the `weight=` argument to train with an unweighted loss.
    step_loss = F.cross_entropy(
        logits[train_mask], data.y[train_mask], weight=class_weights
    )

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

In [None]:
@torch.no_grad()
def test():
    """Print a classification report of the global R-GCN ``model`` on the test nodes.

    Uses the notebook-level ``model``, ``data`` and ``test_mask``. This
    definition replaces the ``test()`` defined earlier for the GAT model.
    """
    model.eval()
    predicted = model(data.x, data.edge_index, data.edge_type).argmax(dim=1)

    y_pred = predicted[test_mask].cpu().numpy()
    y_true = data.y[test_mask].cpu().numpy()

    print("\n===== FINAL TEST (2022) – R-GCN (new graph) =====")
    print(classification_report(y_true, y_pred, digits=4))

In [None]:
# Relation ids in data.edge_type start at 0, so the count is the largest id + 1.
num_relations = int(data.edge_type.max().item() + 1)

# Baseline R-GCN with fixed hyper-parameters; replaces the global `model`.
model = RGCN(
    in_dim=data.num_features,
    hidden_dim=32,
    out_dim=2,
    num_relations=num_relations,
    dropout=0.5,
).to(device)

In [None]:
# Adam over all parameters of the current `model`, with L2 weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

In [None]:
EPOCHS = 300

# Full-batch training; the loss is reported every 50 epochs.
for epoch in range(1, EPOCHS + 1):
    loss = train()
    if not epoch % 50:
        print(f"Epoch {epoch:03d} | Loss: {loss:.4f}")

# Single evaluation after the last epoch.
test()