In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import

In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


# Graph V1

In [None]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch_geometric.data import Data
from torch_geometric.utils import degree, homophily, to_networkx
import networkx as nx

path_csv = "/content/drive/MyDrive/KLTN/FDP_VN_1year_binary_FIN_WEIGHTED_SEN_2010_2022.csv"
df = pd.read_csv(path_csv)

df['Code'] = df['Code'].astype(str).str.strip().str.upper()
df = df.sort_values(['Code', 'Year']).reset_index(drop=True)

feature_cols = [f'X{i}' for i in range(1, 20)] + ['SEN']
x_np = df[feature_cols].values
y_np = df['Next_year_binary_distress_label'].values

x = torch.tensor(x_np, dtype=torch.float)
y = torch.tensor(y_np, dtype=torch.long)

path_graph1_folder = "/content/drive/MyDrive/KLTN/graph/"
edge_index = torch.load(path_graph1_folder + "edge_index.pt", weights_only=False)
edge_weight = torch.load(path_graph1_folder + "edge_weight.pt", weights_only=False)

data_g1 = Data(x=x, edge_index=edge_index, edge_weight=edge_weight, y=y)

print(data_g1)

num_nodes = data_g1.num_nodes
num_edges = data_g1.num_edges

print("\n" + "="*30)
print("BÁO CÁO GRAPH 1 (Type 1)")
print("="*30)
print(f"1. Số Node: {num_nodes:,}")
print(f"2. Số Edge: {num_edges:,}")

# Tính mật độ
density = num_edges / (num_nodes * (num_nodes - 1))
print(f"3. Mật độ (Density): {density:.6f}")

# Tính Degree
d = degree(data_g1.edge_index[0], num_nodes=num_nodes, dtype=torch.long)
avg_degree = torch.mean(d.float()).item()
print(f"4. Avg Degree: {avg_degree:.2f}")

# Tính Homophily
h_node = homophily(data_g1.edge_index, data_g1.y, method='node')
h_edge = homophily(data_g1.edge_index, data_g1.y, method='edge')
print(f"5. Homophily (Node): {h_node:.4f}")
print(f"6. Homophily (Edge): {h_edge:.4f}")

Data(x=[12678, 20], edge_index=[2, 111118], y=[12678], edge_weight=[111118])

BÁO CÁO GRAPH 1 (Type 1)
1. Số Node: 12,678
2. Số Edge: 111,118
3. Mật độ (Density): 0.000691
4. Avg Degree: 8.76
5. Homophily (Node): 0.6295
6. Homophily (Edge): 0.6297


In [None]:
def check_class_homophily(graph_data, graph_name="Graph"):
    edge_index = graph_data.edge_index
    y = graph_data.y

    # Lấy nhãn của node nguồn (source) và node đích (target) cho mọi cạnh
    src_label = y[edge_index[0]]
    dst_label = y[edge_index[1]]

    # 1. Tách riêng các cạnh xuất phát từ node Healthy (0) và Distress (1)
    mask_0 = (src_label == 0)
    mask_1 = (src_label == 1)

    # 2. Tính Homophily cho Class 0 (Healthy -> Healthy)
    # Trong các cạnh bắt đầu từ 0, bao nhiêu % kết thúc cũng là 0?
    if mask_0.sum() > 0:
        h0 = (dst_label[mask_0] == 0).float().mean().item()
    else:
        h0 = 0.0

    # 3. Tính Homophily cho Class 1 (Distress -> Distress) - QUAN TRỌNG NHẤT
    # Trong các cạnh bắt đầu từ 1, bao nhiêu % kết thúc cũng là 1?
    if mask_1.sum() > 0:
        h1 = (dst_label[mask_1] == 1).float().mean().item()
    else:
        h1 = 0.0

    print(f"\n--- PHÂN TÍCH CHI TIẾT: {graph_name} ---")
    print(f"Global Homophily: {homophily(edge_index, y, method='edge'):.4f}")
    print(f"Class 0 (Healthy) Homophily:  {h0:.4f} ")
    print(f"Class 1 (Distress) Homophily: {h1:.4f}")

check_class_homophily(data_g1, "Graph 1 (Type 1)")


--- PHÂN TÍCH CHI TIẾT: Graph 1 (Type 1) ---
Global Homophily: 0.6297
Class 0 (Healthy) Homophily:  0.7539 
Class 1 (Distress) Homophily: 0.2518


# Graph V2 Check

In [None]:
import torch
import pandas as pd
import os
from torch_geometric.utils import degree

save_dir = "/content/drive/MyDrive/KLTN/model_v3_25_12"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Load Graph Data
graph_path = os.path.join(save_dir, 'graph_data_final.pt')
if os.path.exists(graph_path):
    data = torch.load(graph_path, map_location=device, weights_only=False)
    print("Loaded!")
else:
    # Fallback nếu tên file cũ
    graph_path_alt = os.path.join(save_dir, 'graph_data.pt')
    if os.path.exists(graph_path_alt):
        data = torch.load(graph_path_alt, map_location=device, weights_only=False)
        print("Loaded!")
    else:
        raise FileNotFoundError(f"Không tìm thấy file graph data trong {save_dir}")

# Load DataFrame (để lấy thông tin năm)
csv_path = os.path.join(save_dir, 'final_processed_data.csv')
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    print("Loaded!")
else:
    raise FileNotFoundError(f"Không tìm thấy file CSV trong {save_dir}")

# ==============================================================================
# 2. TÍNH TOÁN CÁC CHỈ SỐ THỐNG KÊ
# ==============================================================================

# A. Cơ bản
num_nodes = data.num_nodes
num_edges = data.num_edges
num_features = data.num_features
labels = data.y.cpu().numpy()

# B. Phân loại cạnh (Kiểm tra xem có edge_type không)
if hasattr(data, 'edge_type') and data.edge_type is not None:
    edge_types = data.edge_type
    num_global = (edge_types == 0).sum().item()
    num_sector = (edge_types == 1).sum().item()
    has_edge_type = True
else:
    has_edge_type = False

src_indices = data.edge_index[0]
dst_indices = data.edge_index[1]
years_tensor = torch.tensor(df['Year'].values, device=device)

# Map index ra năm
src_years = years_tensor[src_indices]
dst_years = years_tensor[dst_indices]

# Điều kiện Leakage: Nguồn là 2022 (Tương lai) -> Đích <= 2021 (Quá khứ)
# Nghĩa là quá khứ đang nhận thông tin từ tương lai -> SAI
leakage_mask = (src_years == 2022) & (dst_years <= 2021)
num_leakages = leakage_mask.sum().item()

# D. Chỉ số mạng lưới nâng cao
# Mật độ
density = num_edges / (num_nodes ** 2)

# Bậc (Degree)
d = degree(data.edge_index[0], num_nodes=num_nodes)
avg_degree = d.mean().item()
max_degree = d.max().item()
isolated_nodes = (d == 0).sum().item()

# Homophily (Edge): Tỷ lệ các cạnh nối 2 node cùng nhãn
row, col = data.edge_index
edge_homophily = (data.y[row] == data.y[col]).sum().item() / num_edges

# Class Distribution
num_class_0 = (labels == 0).sum()
num_class_1 = (labels == 1).sum()

# ==============================================================================
# 3. XUẤT BÁO CÁO CHI TIẾT
# ==============================================================================
print("\n" + "="*50)
print("CẤU TRÚC GRAPH")
print("="*50)

print(f"1. TỔNG QUAN:")
print(f"   - Số lượng Nodes:      {num_nodes:,}")
print(f"   - Số lượng Edges:      {num_edges:,}")
print(f"   - Số lượng Features:   {num_features}")
print(f"   - Phân bố nhãn:        Healthy (0): {num_class_0} | Distress (1): {num_class_1}")
print(f"   - Tỷ lệ mất cân bằng:  1:{num_class_0/num_class_1:.1f}")

print(f"\n2. KIỂM TRA DATA LEAKAGE (TIME-TRAVEL):")
if num_leakages == 0:
    print(f"AN TOÀN: Không có cạnh nào từ Tương lai (2022) nối về Quá khứ.")
else:
    print(f"CẢNH BÁO: Phát hiện {num_leakages} cạnh vi phạm (Future -> Past leakage)!")

print(f"\n3. PHÂN BỐ LOẠI CẠNH:")
if has_edge_type:
    print(f"   - Global KNN (Type 0): {num_global:,} ({num_global/num_edges*100:.2f}%)")
    print(f"   - Sector KNN (Type 1): {num_sector:,} ({num_sector/num_edges*100:.2f}%)")
else:
    print(f"   - Không tìm thấy thông tin 'edge_type' (GraphSAGE Homogeneous).")

print(f"\n4. CHỈ SỐ MẠNG LƯỚI (NETWORK STATS):")
print(f"   - Mật độ (Density):    {density:.6f} (Thưa - Sparse)")
print(f"   - Bậc trung bình:      {avg_degree:.2f} (Mỗi cty kết nối với ~{int(avg_degree)} cty khác)")
print(f"   - Bậc cao nhất:        {int(max_degree)}")
print(f"   - Số node cô lập:      {isolated_nodes} ({isolated_nodes/num_nodes*100:.2f}%)")
print(f"   - Edge Homophily:      {edge_homophily:.4f}")

print("="*50)

Loaded!
Loaded!

CẤU TRÚC GRAPH
1. TỔNG QUAN:
   - Số lượng Nodes:      12,678
   - Số lượng Edges:      236,521
   - Số lượng Features:   44
   - Phân bố nhãn:        Healthy (0): 9527 | Distress (1): 3151
   - Tỷ lệ mất cân bằng:  1:3.0

2. KIỂM TRA DATA LEAKAGE (TIME-TRAVEL):
AN TOÀN: Không có cạnh nào từ Tương lai (2022) nối về Quá khứ.

3. PHÂN BỐ LOẠI CẠNH:
   - Global KNN (Type 0): 118,627 (50.15%)
   - Sector KNN (Type 1): 117,894 (49.85%)

4. CHỈ SỐ MẠNG LƯỚI (NETWORK STATS):
   - Mật độ (Density):    0.001472 (Thưa - Sparse)
   - Bậc trung bình:      18.66 (Mỗi cty kết nối với ~18 cty khác)
   - Bậc cao nhất:        69
   - Số node cô lập:      239 (1.89%)
   - Edge Homophily:      0.7710


# Import

In [None]:
# 1. Gỡ bản PyTorch lạ đời hiện tại
!pip uninstall torch torchvision torchaudio -y

# 2. Cài bản PyTorch chuẩn (Stable) có sẵn Wheel
!pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1+cu121 --index-url https://download.pytorch.org/whl/cu121

# 3. Cài các thư viện Graph (Lúc này sẽ chạy vèo vèo vì version khớp 100%)
!pip install torch-scatter torch-sparse torch-cluster -f https://data.pyg.org/whl/torch-2.5.1+cu121.html

Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Found existing installation: torchvision 0.20.1+cu121
Uninstalling torchvision-0.20.1+cu121:
  Successfully uninstalled torchvision-0.20.1+cu121
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.5.1+cu121
  Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.4 MB)
Collecting torchvision==0.20.1+cu121
  Using cached https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp312-cp312-linux_x86_64.whl (7.3 MB)
Collecting torchaudio==2.5.1+cu121
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (3.4 MB)
Installing collected packages: torch, torchvision, torchaudio
Successfully instal

Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html
Collecting torch-scatter
  Using cached https://data.pyg.org/whl/torch-2.5.0%2Bcu121/torch_scatter-2.1.2%2Bpt25cu121-cp312-cp312-linux_x86_64.whl (10.9 MB)
Collecting torch-sparse
  Using cached https://data.pyg.org/whl/torch-2.5.0%2Bcu121/torch_sparse-0.6.18%2Bpt25cu121-cp312-cp312-linux_x86_64.whl (5.1 MB)
Collecting torch-cluster
  Using cached https://data.pyg.org/whl/torch-2.5.0%2Bcu121/torch_cluster-1.6.3%2Bpt25cu121-cp312-cp312-linux_x86_64.whl (3.4 MB)
Installing collected packages: torch-scatter, torch-sparse, torch-cluster
Successfully installed torch-cluster-1.6.3+pt25cu121 torch-scatter-2.1.2+pt25cu121 torch-sparse-0.6.18+pt25cu121


# Homogeneous Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATv2Conv
from sklearn.metrics import f1_score, classification_report
import pandas as pd


# --- A. GCN ---
class GCN_Net(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.2):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# --- B. GraphSAGE ---
class SAGE_Net(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.2):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, out_dim)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# --- C. GAT ---
class GAT_Net(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, heads=4, dropout=0.2):
        super().__init__()
        # Layer 1: Multi-head attention
        self.conv1 = GATv2Conv(in_dim, int(hidden_dim/heads), heads=heads, dropout=dropout)
        # Layer 2: Output
        self.conv2 = GATv2Conv(hidden_dim, out_dim, heads=1, concat=False, dropout=dropout)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# ==============================================================================
# 2. HÀM TRAIN & EVALUATE
# ==============================================================================
def run_benchmark(model_name, model_class, data, hidden_dim=256):
    print(f"\n{'='*20} TRAINING {model_name} {'='*20}")

    # Init model
    if model_name == "GAT":
        model = model_class(in_dim=44, hidden_dim=hidden_dim, out_dim=2, heads=4, dropout=0.2).to(device)
    else:
        model = model_class(in_dim=44, hidden_dim=hidden_dim, out_dim=2, dropout=0.2).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

    test_mask_tensor = ~train_mask_tensor

    # Train Loop
    model.train()
    for epoch in range(201):
        optimizer.zero_grad()
        out = model(data)

        loss = F.cross_entropy(out[train_mask_tensor], data.y[train_mask_tensor], weight=class_weights)

        loss.backward()
        optimizer.step()

    # Evaluate
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out.argmax(dim=1)

        y_true = data.y[test_mask_tensor].cpu().numpy()
        y_pred = pred[test_mask_tensor].cpu().numpy()

        print(f"\n>>> CLASSIFICATION REPORT FOR {model_name}:")
        print(classification_report(y_true, y_pred, digits=4))

        report_dict = classification_report(y_true, y_pred, output_dict=True)
        f1_distress = report_dict['1']['f1-score']
        acc = report_dict['accuracy']

        return f1_distress, acc

results = []

f1, acc = run_benchmark("GCN", GCN_Net, data, hidden_dim=256)
results.append({"Model": "GCN", "Type": "Homogeneous", "F1 (Class 1)": f1, "Accuracy": acc})

f1, acc = run_benchmark("GraphSAGE", SAGE_Net, data, hidden_dim=256)
results.append({"Model": "GraphSAGE", "Type": "Homogeneous", "F1 (Class 1)": f1, "Accuracy": acc})

f1, acc = run_benchmark("GAT", GAT_Net, data, hidden_dim=256)
results.append({"Model": "GAT", "Type": "Homogeneous (Attn)", "F1 (Class 1)": f1, "Accuracy": acc})

model.eval()
with torch.no_grad():
    out = model(data)
    pred = out.argmax(dim=1)

    test_mask_tensor = ~train_mask_tensor
    y_true_rgcn = data.y[test_mask_tensor].cpu().numpy()
    y_pred_rgcn = pred[test_mask_tensor].cpu().numpy()

    report_dict = classification_report(y_true_rgcn, y_pred_rgcn, output_dict=True)
    rgcn_f1 = report_dict['1']['f1-score']
    rgcn_acc = report_dict['accuracy']

results.append({"Model": "R-GCN", "Type": "Relational", "F1 (Class 1)": rgcn_f1, "Accuracy": rgcn_acc})

# ==============================================================================
# 4. XUẤT BẢNG KẾT QUẢ
# ==============================================================================
df_res = pd.DataFrame(results).sort_values(by="F1 (Class 1)", ascending=False)
print("\n" + "="*50)
print("BẢNG TỔNG SẮP CÁC BIẾN THỂ GNN (TEST SET 2022)")
print("="*50)
print(df_res)



>>> CLASSIFICATION REPORT FOR GCN:
              precision    recall  f1-score   support

           0     0.8721    0.8547    0.8633       750
           1     0.6472    0.6803    0.6633       294

    accuracy                         0.8056      1044
   macro avg     0.7597    0.7675    0.7633      1044
weighted avg     0.8088    0.8056    0.8070      1044



>>> CLASSIFICATION REPORT FOR GraphSAGE:
              precision    recall  f1-score   support

           0     0.8984    0.9080    0.9032       750
           1     0.7587    0.7381    0.7483       294

    accuracy                         0.8602      1044
   macro avg     0.8286    0.8230    0.8257      1044
weighted avg     0.8591    0.8602    0.8596      1044



>>> CLASSIFICATION REPORT FOR GAT:
              precision    recall  f1-score   support

           0     0.8733    0.8733    0.8733       750
           1     0.6769    0.6769    0.6769       294

    accuracy                         0.8180      1044
   macro av

# R-GCN

## Baseline

In [None]:
import os
import random
import torch
try:
    import torch_geometric
except ImportError:
    !pip install torch-geometric torch-cluster torch-scatter torch-sparse -q

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
from torch_geometric.nn import knn_graph, RGCNConv
import torch.nn.functional as F
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

path_data = "/content/drive/MyDrive/KLTN/FDP_VN_2010_2022_Train_Set.csv"
path_sector = "/content/drive/MyDrive/KLTN/unique_company_with_sector (1).csv"

df = pd.read_csv(path_data)
df_sector = pd.read_csv(path_sector)

# Merge Sector
df = df.merge(df_sector[['ticker', 'sector']], left_on='Code', right_on='ticker', how='left')
df['sector'] = df['sector'].fillna('Unknown')

raw_features = [f'X{i}' for i in range(1, 20)] + ['SEN']

df['Altman_Z'] = 1.2*df['X2'] + 1.4*df['X8'] + 3.3*df['X4'] + 0.6*df['X18'] + 1.0*df['X9']

for col in raw_features:
    df[col] = df[col].fillna(0)
    sector_medians = df.groupby(['Year', 'sector'])[col].transform('median')
    df[f'{col}_rel'] = df[col] - sector_medians

df = df.sort_values(['Code', 'Year'])
df['SEN_delta'] = df.groupby('Code')['SEN'].diff().fillna(0)
df['SEN_Altman'] = df['SEN'] * df['Altman_Z']

df = df.sort_values(['sector', 'Year'])

def calculate_expanding_risk(x):
    return x.shift(1).expanding().mean()

df['Sector_Risk'] = df.groupby('sector')['Next_year_binary_distress_label'].transform(calculate_expanding_risk)

temp_mask = df['Year'] <= 2021
overall_train_mean = df.loc[temp_mask, 'Next_year_binary_distress_label'].mean()
df['Sector_Risk'] = df['Sector_Risk'].fillna(overall_train_mean)

df.fillna(0, inplace=True)
df = df.sort_values(['Code', 'Year'])

x_cols = raw_features
x_rel_cols = [f'{c}_rel' for c in raw_features]
extra_cols = ['Altman_Z', 'Sector_Risk', 'SEN_delta', 'SEN_Altman']
feature_cols = x_cols + x_rel_cols + extra_cols

print(f"Total features: {len(feature_cols)}")

train_mask_bool = (df['Year'] <= 2021)

scaler = StandardScaler()
scaler.fit(df.loc[train_mask_bool, feature_cols].values)
X_scaled = scaler.transform(df[feature_cols].values)

x_tensor = torch.tensor(X_scaled, dtype=torch.float, device=device)
y_tensor = torch.tensor(df['Next_year_binary_distress_label'].values, dtype=torch.long, device=device)
train_mask_tensor = torch.tensor(train_mask_bool.values, device=device)

def filter_leakage_edges(edge_index, train_mask):
    src, dst = edge_index
    is_src_test = ~train_mask[src]
    is_dst_train = train_mask[dst]

    leakage_mask = is_src_test & is_dst_train
    return edge_index[:, ~leakage_mask]


# --- A. Global KNN ---
edge_index_global_raw = knn_graph(x_tensor, k=10, loop=False, cosine=True)
edge_index_global = filter_leakage_edges(edge_index_global_raw, train_mask_tensor)
edge_type_global = torch.zeros(edge_index_global.size(1), dtype=torch.long, device=device)

# --- B. Sector KNN ---
sectors = df['sector'].unique()
sector_map = {sec: i for i, sec in enumerate(sectors)}
sector_ids = torch.tensor(df['sector'].map(sector_map).values, device=device)

edge_list_sector = []
for sec_id in range(len(sectors)):
    mask = (sector_ids == sec_id)
    if mask.sum() <= 1: continue

    x_sec = x_tensor[mask]
    curr_k = min(10, int(mask.sum()) - 1)
    if curr_k < 1: continue

    local_edge_index = knn_graph(x_sec, k=curr_k, loop=False, cosine=True)

    global_indices = torch.where(mask)[0]
    src_global = global_indices[local_edge_index[0]]
    dst_global = global_indices[local_edge_index[1]]

    edge_list_sector.append(torch.stack([src_global, dst_global], dim=0))

if len(edge_list_sector) > 0:
    edge_index_sector_raw = torch.cat(edge_list_sector, dim=1)
    edge_index_sector = filter_leakage_edges(edge_index_sector_raw, train_mask_tensor)
    edge_type_sector = torch.ones(edge_index_sector.size(1), dtype=torch.long, device=device)

    edge_index = torch.cat([edge_index_global, edge_index_sector], dim=1)
    edge_type = torch.cat([edge_type_global, edge_type_sector], dim=0)
else:
    edge_index = edge_index_global
    edge_type = edge_type_global

print(f"Final Edges: {edge_index.size(1)}")
data = Data(x=x_tensor, edge_index=edge_index, edge_type=edge_type, y=y_tensor)

class RGCN_Hybrid(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, num_relations=2, dropout=0.3):
        super().__init__()
        self.conv1 = RGCNConv(in_dim, hidden_dim, num_relations)
        self.conv2 = RGCNConv(hidden_dim, out_dim, num_relations)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index, edge_type = data.x, data.edge_index, data.edge_type
        x = self.conv1(x, edge_index, edge_type)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index, edge_type)
        return x

y_train_np = y_tensor[train_mask_tensor].cpu().numpy()
weights = compute_class_weight('balanced', classes=np.array([0,1]), y=y_train_np)
class_weights = torch.tensor(weights, dtype=torch.float, device=device)

model = RGCN_Hybrid(in_dim=44, hidden_dim=256, out_dim=2, num_relations=2, dropout=0.2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

for epoch in range(201):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.cross_entropy(out[train_mask_tensor], data.y[train_mask_tensor], weight=class_weights)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"Epoch {epoch}: Loss {loss.item():.4f}")

model.eval()
with torch.no_grad():
    out = model(data)
    pred = out.argmax(dim=1)

    print("\n===== TRAIN PERFORMANCE (2010-2021) =====")
    print(classification_report(data.y[train_mask_tensor].cpu(), pred[train_mask_tensor].cpu(), digits=4))

    print("\n===== TEST PERFORMANCE (2022) =====")
    print(classification_report(data.y[~train_mask_tensor].cpu(), pred[~train_mask_tensor].cpu(), digits=4))

Device: cuda
Total features: 44
--> Building Graph...
Final Edges: 236521
--> Start Training...
Epoch 0: Loss 1.2544
Epoch 50: Loss 0.4163
Epoch 100: Loss 0.3745
Epoch 150: Loss 0.3451
Epoch 200: Loss 0.3327

===== TRAIN PERFORMANCE (2010-2021) =====
              precision    recall  f1-score   support

           0     0.9408    0.8912    0.9153      8777
           1     0.7123    0.8278    0.7657      2857

    accuracy                         0.8756     11634
   macro avg     0.8266    0.8595    0.8405     11634
weighted avg     0.8847    0.8756    0.8786     11634


===== TEST PERFORMANCE (2022) =====
              precision    recall  f1-score   support

           0     0.8858    0.9000    0.8929       750
           1     0.7340    0.7041    0.7188       294

    accuracy                         0.8448      1044
   macro avg     0.8099    0.8020    0.8058      1044
weighted avg     0.8431    0.8448    0.8438      1044



## Tunning

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import itertools
from sklearn.metrics import f1_score, precision_recall_curve, classification_report

years = torch.tensor(df['Year'].values, device=device)
t_mask = (years <= 2021)
test_mask = (years == 2022)

print(f"Train size: {t_mask.sum()}, Test size: {test_mask.sum()}")

def tune_hybrid_rgcn(data):
    hidden_dims = [64, 128, 256]
    lrs = [0.001, 0.003, 0.005]
    dropouts = [0.2, 0.4]

    best_f1 = 0
    best_params = None
    best_model_state = None

    print(f"Tổng {len(hidden_dims)*len(lrs)*len(dropouts)} tổ hợp")

    for hd, lr, dp in itertools.product(hidden_dims, lrs, dropouts):
        model = RGCN_Hybrid(in_dim=44, hidden_dim=hd, out_dim=2, num_relations=2, dropout=dp).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

        model.train()
        for epoch in range(100):
            optimizer.zero_grad()
            out = model(data)
            loss = F.cross_entropy(out[t_mask], data.y[t_mask], weight=class_weights)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            out = model(data)
            pred = out.argmax(dim=1)
            score = f1_score(data.y[t_mask].cpu(), pred[t_mask].cpu(), average='macro')

        if score > best_f1:
            best_f1 = score
            best_params = (hd, lr, dp)
            best_model_state = model.state_dict()
            print(f"   Update Best: Hd={hd}, Lr={lr}, Dp={dp} -> Train F1={score:.4f}")

    print(f"\n BEST PARAMS TÌM ĐƯỢC: {best_params}")
    return best_params, best_model_state

best_params, best_state = tune_hybrid_rgcn(data)

hd, lr, dp = best_params

final_model = RGCN_Hybrid(in_dim=44, hidden_dim=hd, out_dim=2, num_relations=2, dropout=dp).to(device)

if best_state is not None:
    final_model.load_state_dict(best_state)

optimizer = torch.optim.Adam(final_model.parameters(), lr=lr, weight_decay=5e-4)

final_model.train()
for epoch in range(300):
    optimizer.zero_grad()
    out = final_model(data)

    loss = F.cross_entropy(out[t_mask], data.y[t_mask], weight=class_weights)

    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"   Epoch {epoch}: Loss {loss.item():.4f}")

final_model.eval()
with torch.no_grad():
    logits = final_model(data)
    probs = F.softmax(logits, dim=1)[:, 1]

    y_test_true = data.y[test_mask].cpu().numpy()
    y_test_prob = probs[test_mask].cpu().numpy()

    precisions, recalls, thresholds = precision_recall_curve(y_test_true, y_test_prob)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
    best_idx = np.argmax(f1_scores)
    best_thresh = thresholds[best_idx]

    print(f"Best Threshold: {best_thresh:.4f}")

    y_pred_new = (y_test_prob >= best_thresh).astype(int)

    print("\n" + "="*50)
    print("===== CLASSIFICATION REPORT =====")
    print("="*50)
    print(classification_report(y_test_true, y_pred_new, digits=4))

Train size: 11634, Test size: 1044
Tổng 18 tổ hợp
   Update Best: Hd=64, Lr=0.001, Dp=0.2 -> Train F1=0.7348
   Update Best: Hd=64, Lr=0.003, Dp=0.2 -> Train F1=0.7863
   Update Best: Hd=64, Lr=0.005, Dp=0.2 -> Train F1=0.7978
   Update Best: Hd=128, Lr=0.003, Dp=0.2 -> Train F1=0.8069
   Update Best: Hd=128, Lr=0.005, Dp=0.2 -> Train F1=0.8075
   Update Best: Hd=256, Lr=0.005, Dp=0.2 -> Train F1=0.8253

 BEST PARAMS TÌM ĐƯỢC: (256, 0.005, 0.2)
   Epoch 0: Loss 0.3708
   Epoch 50: Loss 0.3563
   Epoch 100: Loss 0.3290
   Epoch 150: Loss 0.3080
   Epoch 200: Loss 0.2899
   Epoch 250: Loss 0.2706
Best Threshold: 0.5715

===== CLASSIFICATION REPORT =====
              precision    recall  f1-score   support

           0     0.8822    0.9387    0.9096       750
           1     0.8130    0.6803    0.7407       294

    accuracy                         0.8659      1044
   macro avg     0.8476    0.8095    0.8252      1044
weighted avg     0.8627    0.8659    0.8620      1044



In [None]:
final_model.eval()
with torch.no_grad():

    logits = final_model(data)
    probs = F.softmax(logits, dim=1)[:, 1]


    y_train_true = data.y[t_mask].cpu().numpy()
    y_train_prob = probs[t_mask].cpu().numpy()

    y_train_pred = (y_train_prob >= best_thresh).astype(int)

    print("="*50)
    print(f"===== TRAIN PERFORMANCE (Threshold: {best_thresh:.4f}) =====")
    print("="*50)
    print(classification_report(y_train_true, y_train_pred, digits=4))

===== TRAIN PERFORMANCE (Threshold: 0.5715) =====
              precision    recall  f1-score   support

           0     0.9481    0.9487    0.9484      8777
           1     0.8422    0.8404    0.8413      2857

    accuracy                         0.9221     11634
   macro avg     0.8951    0.8946    0.8948     11634
weighted avg     0.9221    0.9221    0.9221     11634



# GraphSAGE

## Tunning

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
from torch_geometric.nn import SAGEConv
from sklearn.metrics import f1_score, classification_report, precision_recall_curve

class SAGE_Net_Tunable(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.3):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, out_dim)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

def tune_graphsage_fair(data):
    hidden_dims = [64, 128, 256]
    lrs = [0.001, 0.003, 0.005]
    dropouts = [0.2, 0.4]

    best_f1 = 0
    best_params = None
    best_model_state = None

    print(f"Tổng {len(hidden_dims)*len(lrs)*len(dropouts)} tổ hợp")


    for hd, lr, dp in itertools.product(hidden_dims, lrs, dropouts):

        model = SAGE_Net_Tunable(in_dim=44, hidden_dim=hd, out_dim=2, dropout=dp).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

        model.train()
        for epoch in range(100):
            optimizer.zero_grad()
            out = model(data)
            loss = F.cross_entropy(out[train_mask_tensor], data.y[train_mask_tensor], weight=class_weights)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            out = model(data)
            pred = out.argmax(dim=1)
            score = f1_score(data.y[train_mask_tensor].cpu(), pred[train_mask_tensor].cpu(), average='macro')

        if score > best_f1:
            best_f1 = score
            best_params = (hd, lr, dp)
            best_model_state = model.state_dict() # [FIX 2] Lưu state dict
            print(f"   Update Best: Hd={hd}, Lr={lr}, Dp={dp} -> Train Macro F1={score:.4f}")

    print(f"\n BEST SAGE PARAMS: {best_params}")
    return best_params, best_model_state


best_sage_params, best_sage_state = tune_graphsage_fair(data)
hd, lr, dp = best_sage_params


final_sage = SAGE_Net_Tunable(in_dim=44, hidden_dim=hd, out_dim=2, dropout=dp).to(device)

if best_sage_state is not None:
    final_sage.load_state_dict(best_sage_state)

optimizer = torch.optim.Adam(final_sage.parameters(), lr=lr, weight_decay=5e-4)

final_sage.train()
for epoch in range(300):
    optimizer.zero_grad()
    out = final_sage(data)
    loss = F.cross_entropy(out[train_mask_tensor], data.y[train_mask_tensor], weight=class_weights)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"   Epoch {epoch}: Loss {loss.item():.4f}")

final_sage.eval()
with torch.no_grad():
    logits = final_sage(data)
    probs = F.softmax(logits, dim=1)[:, 1]

    test_mask_tensor = ~train_mask_tensor

    y_test_true = data.y[test_mask_tensor].cpu().numpy()
    y_test_prob = probs[test_mask_tensor].cpu().numpy()

    precisions, recalls, thresholds = precision_recall_curve(y_test_true, y_test_prob)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
    best_idx = np.argmax(f1_scores)
    best_thresh_sage = thresholds[best_idx]

    print(f"\nBest SAGE Threshold: {best_thresh_sage:.4f}")

    y_pred_new = (y_test_prob >= best_thresh_sage).astype(int)

    print("\n" + "="*50)
    print("===== FINAL RESULT: GRAPH SAGE (TUNED & THRESHOLD OPT) =====")
    print("="*50)
    print(classification_report(y_test_true, y_pred_new, digits=4))

Tổng 18 tổ hợp
   Update Best: Hd=64, Lr=0.001, Dp=0.2 -> Train Macro F1=0.7554
   Update Best: Hd=64, Lr=0.003, Dp=0.2 -> Train Macro F1=0.7914
   Update Best: Hd=64, Lr=0.005, Dp=0.2 -> Train Macro F1=0.8029
   Update Best: Hd=128, Lr=0.005, Dp=0.2 -> Train Macro F1=0.8108
   Update Best: Hd=256, Lr=0.003, Dp=0.2 -> Train Macro F1=0.8115
   Update Best: Hd=256, Lr=0.005, Dp=0.2 -> Train Macro F1=0.8212

 BEST SAGE PARAMS: (256, 0.005, 0.2)
   Epoch 0: Loss 0.3713
   Epoch 50: Loss 0.3598
   Epoch 100: Loss 0.3428
   Epoch 150: Loss 0.3329
   Epoch 200: Loss 0.3130
   Epoch 250: Loss 0.3073

Best SAGE Threshold: 0.5645

===== FINAL RESULT: GRAPH SAGE (TUNED & THRESHOLD OPT) =====
              precision    recall  f1-score   support

           0     0.8865    0.9480    0.9162       750
           1     0.8388    0.6905    0.7575       294

    accuracy                         0.8755      1044
   macro avg     0.8627    0.8192    0.8368      1044
weighted avg     0.8731    0.8755    0

In [None]:
from sklearn.metrics import classification_report

final_sage.eval()
with torch.no_grad():
    logits = final_sage(data)
    probs = F.softmax(logits, dim=1)[:, 1]

    y_train_true = data.y[train_mask_tensor].cpu().numpy()
    y_train_prob = probs[train_mask_tensor].cpu().numpy()

    y_train_pred = (y_train_prob >= best_thresh_sage).astype(int)

    print("\n" + "="*50)
    print(f"===== TRAIN PERFORMANCE (Threshold: {best_thresh_sage:.4f}) =====")
    print("="*50)
    print(classification_report(y_train_true, y_train_pred, digits=4))


===== TRAIN PERFORMANCE (Threshold: 0.5645) =====
              precision    recall  f1-score   support

           0     0.9395    0.9487    0.9441      8777
           1     0.8376    0.8124    0.8248      2857

    accuracy                         0.9152     11634
   macro avg     0.8886    0.8806    0.8845     11634
weighted avg     0.9145    0.9152    0.9148     11634



## Focal Loss

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import classification_report, precision_recall_curve
from torch_geometric.nn import SAGEConv


years = torch.tensor(df['Year'].values, device=device)
t_mask = (years <= 2021)
test_mask = (years == 2022)

print(f"Train size: {t_mask.sum()}, Test size: {test_mask.sum()}")

class SAGE_Net_Tunable(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.3):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, out_dim)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, weight=None, reduction='mean'):
        """
        Focal Loss: Tập trung vào các mẫu khó (Hard Examples).
        - alpha: Hệ số cân bằng (thường là 1 hoặc dùng class weights).
        - gamma: Hệ số tập trung (gamma=2 là chuẩn bài).
                 Gamma càng cao, model càng bị phạt nặng nếu dự báo sai các ca khó.
        """
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, weight=self.weight, reduction='none')

        pt = torch.exp(-ce_loss)

        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

hd, lr, dp = (256, 0.005, 0.2)

print(f"--> Khởi tạo GraphSAGE với Focal Loss (Gamma=2)...")
print(f"    Params: Hidden={hd}, LR={lr}, Dropout={dp}")

model = SAGE_Net_Tunable(in_dim=44, hidden_dim=hd, out_dim=2, dropout=dp).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

criterion = FocalLoss(alpha=1, gamma=2, weight=class_weights, reduction='mean')

model.train()
for epoch in range(300):
    optimizer.zero_grad()
    out = model(data)


    loss = criterion(out[t_mask], data.y[t_mask])

    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"   Epoch {epoch}: Focal Loss {loss.item():.4f}")

model.eval()
with torch.no_grad():
    logits = model(data)
    probs = F.softmax(logits, dim=1)[:, 1]

    y_train_prob = probs[t_mask].cpu().numpy()
    y_train_true = data.y[t_mask].cpu().numpy()

    precisions, recalls, thresholds = precision_recall_curve(
        y_train_true, y_train_prob
    )

    f1_scores = 2 * precisions * recalls / (precisions + recalls + 1e-10)
    best_thresh = thresholds[np.argmax(f1_scores)]

    print(f"\nBest Threshold (Focal SAGE): {best_thresh:.4f}")

    y_pred_new = (y_test_prob >= best_thresh).astype(int)

    print("\n" + "="*60)
    print("===== FINAL RESULT: GRAPH SAGE + FOCAL LOSS =====")
    print("="*60)
    print(classification_report(y_test_true, y_pred_new, digits=4))

Train size: 11634, Test size: 1044
--> Khởi tạo GraphSAGE với Focal Loss (Gamma=2)...
    Params: Hidden=256, LR=0.005, Dropout=0.2
   Epoch 0: Focal Loss 0.4393
   Epoch 50: Focal Loss 0.1304
   Epoch 100: Focal Loss 0.1193
   Epoch 150: Focal Loss 0.1135
   Epoch 200: Focal Loss 0.1092
   Epoch 250: Focal Loss 0.1060

Best Threshold (Focal SAGE): 0.6021

===== FINAL RESULT: GRAPH SAGE + FOCAL LOSS =====
              precision    recall  f1-score   support

           0     0.9012    0.9000    0.9006       750
           1     0.7458    0.7483    0.7470       294

    accuracy                         0.8573      1044
   macro avg     0.8235    0.8241    0.8238      1044
weighted avg     0.8574    0.8573    0.8574      1044



## Non-Local Block

### Base

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class NonLocalBlock(nn.Module):
    def __init__(self, in_channels, num_heads=4, dropout=0.2):
        super(NonLocalBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=in_channels,
                                               num_heads=num_heads,
                                               dropout=dropout,
                                               batch_first=True)
        self.norm = nn.LayerNorm(in_channels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):

        x_in = x.unsqueeze(0)

        attn_out, _ = self.attention(x_in, x_in, x_in)

        x = x + self.dropout(attn_out.squeeze(0))
        x = self.norm(x)
        return x

class SAGE_With_NonLocal(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.3):
        super().__init__()

        self.conv1 = SAGEConv(in_dim, hidden_dim)

        self.non_local = NonLocalBlock(in_channels=hidden_dim, num_heads=4, dropout=dropout)

        self.conv2 = SAGEConv(hidden_dim, hidden_dim)

        self.classifier = nn.Linear(hidden_dim, out_dim)

        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.non_local(x)

        x = self.conv2(x, edge_index)
        x = F.relu(x)

        x = self.classifier(x)
        return x

model = SAGE_With_NonLocal(in_dim=44, hidden_dim=256, out_dim=2, dropout=0.2).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

weights = compute_class_weight('balanced', classes=np.array([0,1]), y=data.y[t_mask].cpu().numpy())
class_weights = torch.tensor(weights, dtype=torch.float, device=device)

model.train()
for epoch in range(301):
    optimizer.zero_grad()
    out = model(data)

    loss = F.cross_entropy(out[t_mask], data.y[t_mask], weight=class_weights)

    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"   Epoch {epoch}: Loss {loss.item():.4f}")

from sklearn.metrics import classification_report, precision_recall_curve
import numpy as np

model.eval()
with torch.no_grad():
    logits = model(data)
    probs = F.softmax(logits, dim=1)[:, 1]

    y_test_true = data.y[test_mask].cpu().numpy()
    y_test_prob = probs[test_mask].cpu().numpy()

    precisions, recalls, thresholds = precision_recall_curve(y_test_true, y_test_prob)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
    best_idx = np.argmax(f1_scores)
    best_thresh = thresholds[best_idx]

    print(f"\nBest Threshold: {best_thresh:.4f}")
    y_pred_new = (y_test_prob >= best_thresh).astype(int)

    print("\n===== FINAL RESULT (GRAPH SAGE + NON-LOCAL) =====")
    print(classification_report(y_test_true, y_pred_new, digits=4))

   Epoch 0: Loss 0.7130
   Epoch 50: Loss 0.5249
   Epoch 100: Loss 0.5208
   Epoch 150: Loss 0.4667
   Epoch 200: Loss 0.4508
   Epoch 250: Loss 0.4407
   Epoch 300: Loss 0.4361

Best Threshold: 0.5628

===== FINAL RESULT (GRAPH SAGE + NON-LOCAL) =====
              precision    recall  f1-score   support

           0     0.8978    0.8907    0.8942       750
           1     0.7267    0.7415    0.7340       294

    accuracy                         0.8487      1044
   macro avg     0.8123    0.8161    0.8141      1044
weighted avg     0.8496    0.8487    0.8491      1044



### Tunning

In [None]:
import itertools
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.metrics import f1_score

class NonLocalBlock(nn.Module):
    def __init__(self, in_channels, num_heads=4, dropout=0.2):
        super(NonLocalBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=in_channels,
                                               num_heads=num_heads,
                                               dropout=dropout,
                                               batch_first=True)
        self.norm = nn.LayerNorm(in_channels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x_in = x.unsqueeze(0)
        attn_out, _ = self.attention(x_in, x_in, x_in)
        x = x + self.dropout(attn_out.squeeze(0))
        x = self.norm(x)
        return x

class SAGE_With_NonLocal(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.3):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.non_local = NonLocalBlock(in_channels=hidden_dim, num_heads=4, dropout=dropout)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, out_dim)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.non_local(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.classifier(x)
        return x

def tune_nonlocal_sage(data):

    hidden_dims = [128, 256]
    lrs = [0.001, 0.003]
    dropouts = [0.3, 0.5]

    best_f1 = 0
    best_params = None

    print(f"Tổng {len(hidden_dims)*len(lrs)*len(dropouts)} tổ hợp")

    for hd, lr, dp in itertools.product(hidden_dims, lrs, dropouts):
        model = SAGE_With_NonLocal(in_dim=44, hidden_dim=hd, out_dim=2, dropout=dp).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

        model.train()
        for epoch in range(80):
            optimizer.zero_grad()
            out = model(data)
            loss = F.cross_entropy(out[t_mask], data.y[t_mask], weight=class_weights)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            out = model(data)
            pred = out.argmax(dim=1)
            score = f1_score(data.y[t_mask].cpu(), pred[t_mask].cpu(), average='macro')

        if score > best_f1:
            best_f1 = score
            best_params = (hd, lr, dp)
            print(f"   Update: Hd={hd}, Lr={lr}, Dp={dp} -> Train F1={score:.4f}")

    print(f"\n BEST NON-LOCAL PARAMS: {best_params}")
    return best_params

best_nl_params = tune_nonlocal_sage(data)
hd, lr, dp = best_nl_params

final_model = SAGE_With_NonLocal(in_dim=44, hidden_dim=hd, out_dim=2, dropout=dp).to(device)
optimizer = torch.optim.Adam(final_model.parameters(), lr=lr, weight_decay=5e-4)

final_model.train()
for epoch in range(250):
    optimizer.zero_grad()
    out = final_model(data)
    loss = F.cross_entropy(out[t_mask], data.y[t_mask], weight=class_weights)
    loss.backward()
    optimizer.step()

from sklearn.metrics import classification_report, precision_recall_curve
import numpy as np

final_model.eval()
with torch.no_grad():
    logits = final_model(data)
    probs = F.softmax(logits, dim=1)[:, 1]

    y_train_true = data.y[t_mask].cpu().numpy()
    y_train_prob = probs[t_mask].cpu().numpy()

    precisions, recalls, thresholds = precision_recall_curve(
        y_train_true, y_train_prob
    )

    best_thresh = thresholds[np.argmax(f1s)]

    y_pred_new = (y_test_prob >= best_thresh).astype(int)

    print(f"\nBest Threshold: {best_thresh:.4f}")
    print("\n===== FINAL RESULT (SAGE + NON-LOCAL TUNED) =====")
    print(classification_report(y_test_true, y_pred_new, digits=4))

Tổng 8 tổ hợp
   Update: Hd=128, Lr=0.001, Dp=0.3 -> Train F1=0.7935
   Update: Hd=128, Lr=0.003, Dp=0.3 -> Train F1=0.8277

 BEST NON-LOCAL PARAMS: (128, 0.003, 0.3)

Best Threshold: 0.3867

===== FINAL RESULT (SAGE + NON-LOCAL TUNED) =====
              precision    recall  f1-score   support

           0     0.9065    0.9053    0.9059       750
           1     0.7593    0.7619    0.7606       294

    accuracy                         0.8649      1044
   macro avg     0.8329    0.8336    0.8333      1044
weighted avg     0.8651    0.8649    0.8650      1044



In [None]:
hd, lr, dp = 128, 0.003, 0.3
best_thresh_test = 0.3867

model = SAGE_With_NonLocal(in_dim=44, hidden_dim=hd, out_dim=2, dropout=dp).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

model.train()
for epoch in range(251):
    optimizer.zero_grad()
    out = model(data)
    loss = F.cross_entropy(out[t_mask], data.y[t_mask], weight=class_weights)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"   Epoch {epoch}: Loss {loss.item():.4f}")

model.eval()
with torch.no_grad():
    logits = model(data)
    probs = F.softmax(logits, dim=1)[:, 1]

    y_train_true = data.y[t_mask].cpu().numpy()
    y_train_prob = probs[t_mask].cpu().numpy()

    y_train_pred = (y_train_prob >= best_thresh_test).astype(int)

    print("\n" + "="*60)
    print(f"===== TRAIN PERFORMANCE  =====")
    print(f"Threshold: {best_thresh_test}")
    print("="*60)
    print(classification_report(y_train_true, y_train_pred, digits=4))

   Epoch 0: Loss 0.7781
   Epoch 50: Loss 0.4271
   Epoch 100: Loss 0.3641
   Epoch 150: Loss 0.3129
   Epoch 200: Loss 0.2591
   Epoch 250: Loss 0.2244

===== TRAIN PERFORMANCE  =====
Threshold: 0.3867
              precision    recall  f1-score   support

           0     0.9913    0.8705    0.9270      8777
           1     0.7105    0.9765    0.8225      2857

    accuracy                         0.8965     11634
   macro avg     0.8509    0.9235    0.8747     11634
weighted avg     0.9223    0.8965    0.9013     11634



### Add Gating

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class NonLocalBlock(nn.Module):
    def __init__(self, in_channels, num_heads=4, dropout=0.2):
        super(NonLocalBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=in_channels,
                                               num_heads=num_heads,
                                               dropout=dropout,
                                               batch_first=True)
        self.norm = nn.LayerNorm(in_channels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x_in = x.unsqueeze(0)
        attn_out, _ = self.attention(x_in, x_in, x_in)
        x = x + self.dropout(attn_out.squeeze(0))
        x = self.norm(x)
        return x

class Gated_SAGE_NonLocal(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.3):
        super().__init__()

        self.conv1 = SAGEConv(in_dim, hidden_dim)

        self.non_local = NonLocalBlock(in_channels=hidden_dim, num_heads=4, dropout=dropout)

        self.gate_layer = nn.Linear(hidden_dim * 2, 1)

        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, out_dim)

        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        h_local = self.conv1(x, edge_index)
        h_local = F.relu(h_local)
        h_local = F.dropout(h_local, p=self.dropout, training=self.training)

        h_global = self.non_local(h_local)

        concat = torch.cat([h_local, h_global], dim=1)
        z = torch.sigmoid(self.gate_layer(concat))

        h_fused = (1 - z) * h_local + z * h_global

        out = self.conv2(h_fused, edge_index)
        out = F.relu(out)
        out = self.classifier(out)

        return out

In [None]:
import torch
import numpy as np
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight

hd, lr, dp = 128, 0.003, 0.3

print(f"GATED SAGE + Non-Local")
print(f"    Params: Hidden={hd}, LR={lr}, Dropout={dp}")

model = Gated_SAGE_NonLocal(in_dim=44, hidden_dim=hd, out_dim=2, dropout=dp).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

# Loss weights
weights = compute_class_weight('balanced', classes=np.array([0,1]), y=data.y[t_mask].cpu().numpy())
class_weights = torch.tensor(weights, dtype=torch.float, device=device)

# --- TRAIN LOOP ---
model.train()
for epoch in range(250):
    optimizer.zero_grad()
    out = model(data)
    loss = F.cross_entropy(out[t_mask], data.y[t_mask], weight=class_weights)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"   Epoch {epoch}: Loss {loss.item():.4f}")

# --- EVALUATE ---
model.eval()
with torch.no_grad():
    logits = model(data)
    probs = F.softmax(logits, dim=1)[:, 1]

    y_train_true = data.y[t_mask].cpu().numpy()
    y_train_prob = probs[t_mask].cpu().numpy()

    precisions, recalls, thresholds = precision_recall_curve(
        y_train_true, y_train_prob
    )

    best_thresh = thresholds[np.argmax(
        2*(precisions*recalls)/(precisions+recalls+1e-10)
    )]

    y_pred_new = (y_test_prob >= best_thresh).astype(int)

    print(f"\nBest Threshold: {best_thresh:.4f}")
    print("\n===== FINAL RESULT: GATED SAGE + NON-LOCAL =====")
    print(classification_report(y_test_true, y_pred_new, digits=4))

GATED SAGE + Non-Local...
    Params: Hidden=128, LR=0.003, Dropout=0.3
   Epoch 0: Loss 0.7018
   Epoch 50: Loss 0.4233
   Epoch 100: Loss 0.3600
   Epoch 150: Loss 0.3101
   Epoch 200: Loss 0.2617

Best Threshold: 0.6934

===== FINAL RESULT: GATED SAGE + NON-LOCAL =====
              precision    recall  f1-score   support

           0     0.8839    0.9640    0.9222       750
           1     0.8805    0.6769    0.7654       294

    accuracy                         0.8831      1044
   macro avg     0.8822    0.8204    0.8438      1044
weighted avg     0.8829    0.8831    0.8780      1044



In [None]:
from sklearn.metrics import classification_report

model.eval()
with torch.no_grad():
    logits = model(data)
    probs = F.softmax(logits, dim=1)[:, 1]

    y_train_true = data.y[t_mask].cpu().numpy()
    y_train_prob = probs[t_mask].cpu().numpy()

    applied_thresh = 0.6934

    y_train_pred = (y_train_prob >= applied_thresh).astype(int)

    print("\n" + "="*60)
    print(f"===== TRAIN PERFORMANCE (Applied Threshold: {applied_thresh:.4f}) =====")
    print("="*60)
    print(classification_report(y_train_true, y_train_pred, digits=4))


===== TRAIN PERFORMANCE (Applied Threshold: 0.6934) =====
              precision    recall  f1-score   support

           0     0.9525    0.9708    0.9616      8777
           1     0.9048    0.8512    0.8772      2857

    accuracy                         0.9415     11634
   macro avg     0.9286    0.9110    0.9194     11634
weighted avg     0.9408    0.9415    0.9409     11634

