<a href="https://colab.research.google.com/github/MaryamKazemit/OEBGNN/blob/main/csicc2025_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **General**

In [None]:
!pip install torch torch-geometric scikit-learn imbalanced-learn

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
# Mount Google Drive (Colab only); the dataset paths used further down live
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import zipfile
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch_geometric.data import Data
from sklearn.neighbors import NearestNeighbors
from torch_geometric.nn import GCNConv, GATConv, SAGEConv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Enable detailed CUDA debugging: CUDA_LAUNCH_BLOCKING=1 requests synchronous
# kernel launches so that a CUDA error is reported at the call that caused it.
# NOTE(review): this is a debugging aid and usually slows GPU runs — consider
# removing it once the notebook is stable.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ***Thesis structure***

In [None]:
def create_graph_structure(X, y, similarity_threshold=2.0):
    """Build a PyG ``Data`` graph that links samples lying close together.

    A directed edge ``i -> j`` is created for every ordered pair of distinct
    samples whose Euclidean distance is strictly below
    ``similarity_threshold``. Self-loops are never created.

    Args:
        X: 2-D array-like of node features, one row per sample.
        y: 1-D array-like of integer class labels aligned with the rows of X.
        similarity_threshold: Exclusive distance cut-off for adding an edge.

    Returns:
        A ``Data`` object with ``x`` (float), ``y`` (long) and ``edge_index``
        (long). ``edge_index`` always has shape ``(2, E)``, including
        ``(2, 0)`` when no pair of samples is close enough.
    """
    distances = euclidean_distances(X)

    # Boolean adjacency mask; the diagonal is cleared so i == j is excluded.
    adjacency = distances < similarity_threshold
    np.fill_diagonal(adjacency, False)

    # np.nonzero walks the mask in row-major order, i.e. the same (i, j)
    # ordering the nested Python loops would produce, without the O(n^2)
    # interpreter overhead.
    sources, targets = np.nonzero(adjacency)
    edge_index = torch.as_tensor(np.stack([sources, targets]), dtype=torch.long)

    x_tensor = torch.tensor(X, dtype=torch.float)
    y_tensor = torch.tensor(y, dtype=torch.long)
    return Data(x=x_tensor, edge_index=edge_index, y=y_tensor)

In [None]:
def apply_graphsmote(data, minority_class=1, synthetic_ratio=1.5):
    """Oversample the minority class by appending interpolated synthetic nodes.

    For every minority node, ``int(synthetic_ratio)`` synthetic nodes are
    created as the midpoint between that node and a randomly chosen entry of
    its nearest-neighbour list (computed among minority nodes only). Each
    synthetic node is then connected, in both directions, to one randomly
    chosen original minority node.

    The graph is modified IN PLACE (``data.x``, ``data.y`` and
    ``data.edge_index`` are extended) and the same object is returned.

    Args:
        data: Graph with ``x``, ``y`` and ``edge_index`` attributes.
        minority_class: Label value of the class to oversample.
        synthetic_ratio: Number of synthetic nodes per minority node. The value
            is truncated with ``int()``, so the default 1.5 yields one
            synthetic node per minority node.

    Returns:
        The input ``data`` object, extended with the synthetic nodes and edges.
        It is returned unchanged when there are no minority nodes or when
        ``int(synthetic_ratio)`` is not positive.
    """
    minority_indices = torch.where(data.y == minority_class)[0]
    per_node = int(synthetic_ratio)
    if minority_indices.numel() == 0 or per_node <= 0:
        # Nothing to synthesise.
        return data

    minority_index_array = minority_indices.cpu().numpy()
    minority_features = data.x[minority_indices].detach().cpu().numpy()

    # Never request more neighbours than there are minority samples.
    n_neighbors = min(5, len(minority_features))
    neighbor_model = NearestNeighbors(n_neighbors=n_neighbors).fit(minority_features)
    # NOTE(review): the model is queried with the points it was fitted on, so a
    # node presumably appears in its own neighbour list and a synthetic node
    # can be an exact copy of its parent — confirm whether that is intended.
    neighbors = neighbor_model.kneighbors(minority_features, return_distance=False)

    synthetic_rows = []
    for i in range(len(minority_features)):
        for _ in range(per_node):
            partner = minority_features[np.random.choice(neighbors[i])]
            synthetic_rows.append((minority_features[i] + partner) / 2)

    synthetic_features = torch.as_tensor(
        np.stack(synthetic_rows), dtype=torch.float, device=data.x.device
    )
    num_synthetic = synthetic_features.size(0)
    synthetic_labels = torch.full(
        (num_synthetic,), minority_class, dtype=torch.long, device=data.y.device
    )

    num_original_nodes = data.x.size(0)
    data.x = torch.cat([data.x, synthetic_features], dim=0)
    data.y = torch.cat([data.y, synthetic_labels], dim=0)

    # Attach every synthetic node to a random original minority node with an
    # edge in each direction (not necessarily the node it was derived from).
    synthetic_edges = []
    for offset in range(num_synthetic):
        synthetic_index = num_original_nodes + offset
        original_node = np.random.choice(minority_index_array)
        synthetic_edges.extend([[synthetic_index, original_node], [original_node, synthetic_index]])
    synthetic_edges = torch.tensor(
        synthetic_edges, dtype=torch.long, device=data.edge_index.device
    ).t()
    data.edge_index = torch.cat([data.edge_index, synthetic_edges], dim=1)
    return data

In [None]:
def _extract_node_range(data, start, stop):
    """Return the subgraph induced by nodes ``start .. stop - 1``, rebased to 0."""
    src, dst = data.edge_index[0], data.edge_index[1]
    inside = (src >= start) & (src < stop) & (dst >= start) & (dst < stop)
    return Data(
        x=data.x[start:stop],
        edge_index=data.edge_index[:, inside] - start,
        y=data.y[start:stop],
    )


def chunk_graph_with_diversity(data, chunk_size):
    """Split a graph into contiguous node chunks that each contain >= 2 classes.

    Nodes are scanned in windows of ``chunk_size``. A window whose labels all
    belong to one class is held back and merged with the following windows
    until a window with at least two classes is reached; the merged node range
    then becomes one chunk. Single-class windows left over at the end of the
    graph are appended to the last emitted chunk so that no node is dropped.

    Every chunk is the subgraph induced by its node range: only edges with both
    endpoints inside the range are kept, and node indices are rebased so the
    first node of the chunk is 0. Computing the edges on the merged range keeps
    the indices consistent with the merged feature matrix.

    Args:
        data: Graph with ``x``, ``y``, ``edge_index`` and ``num_nodes``.
        chunk_size: Number of nodes per scanning window.

    Returns:
        List of ``Data`` chunks. The list is empty when no window contains at
        least two classes.
    """
    num_nodes = data.num_nodes
    spans = []            # (start, stop) node ranges of the chunks to emit
    pending_start = None  # first node of the held-back single-class run

    for start in range(0, num_nodes, chunk_size):
        stop = min(start + chunk_size, num_nodes)
        if pending_start is None:
            pending_start = start
        if torch.unique(data.y[start:stop]).numel() >= 2:
            spans.append((pending_start, stop))
            pending_start = None

    # Trailing single-class windows: fold them into the last diverse chunk.
    if pending_start is not None and spans:
        spans[-1] = (spans[-1][0], num_nodes)

    return [_extract_node_range(data, start, stop) for start, stop in spans]

In [None]:
class GCNModel(nn.Module):
    """Two-layer GCN: GCNConv -> ReLU -> dropout(0.5) -> GCNConv."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, data):
        """Return the second convolution's output for graph ``data`` (no final activation)."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        hidden = self.dropout(hidden)
        return self.conv2(hidden, data.edge_index)

In [None]:
class SAGEModel(nn.Module):
    """Two-layer GraphSAGE: SAGEConv -> ReLU -> dropout(0.5) -> SAGEConv."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, data):
        """Return the second convolution's output for graph ``data`` (no final activation)."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        hidden = self.dropout(hidden)
        return self.conv2(hidden, data.edge_index)

In [None]:
class GATModel(nn.Module):
    """Two-layer GAT: 4 concatenated heads, ReLU, dropout(0.6), then a single head."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        num_heads = 4
        # The first layer concatenates its heads, so the second layer receives
        # hidden_channels * num_heads input features.
        self.conv1 = GATConv(in_channels, hidden_channels, heads=num_heads, concat=True, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * num_heads, out_channels, heads=1, concat=False, dropout=0.6)
        self.dropout = nn.Dropout(p=0.6)

    def forward(self, data):
        """Return the second attention layer's output for graph ``data`` (no final activation)."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        hidden = self.dropout(hidden)
        return self.conv2(hidden, data.edge_index)

In [None]:
class EnsembleGNN(nn.Module):
    """Ensemble that returns the element-wise mean of its members' outputs."""

    def __init__(self, models):
        super().__init__()
        # ModuleList registers the members so their parameters are tracked.
        self.models = nn.ModuleList(models)

    def forward(self, data):
        """Run every member on ``data`` and average the stacked results."""
        stacked = torch.stack([member(data) for member in self.models])
        return stacked.mean(dim=0)

In [None]:
class WeightedEnsembleGNN(nn.Module):
    """Ensemble that scales each member's output by a fixed weight.

    The result is ``sum_i(w_i * output_i) / len(models)``. It is divided by the
    number of models, not by the sum of the weights, so weights that do not
    sum to ``len(models)`` rescale the output.

    Args:
        models: Iterable of ``nn.Module`` members; each is called as ``model(data)``.
        weights: One scalar weight per model.

    Raises:
        ValueError: If ``weights`` is not a 1-D sequence with exactly one entry
            per model.
    """

    def __init__(self, models, weights):
        super().__init__()
        self.models = nn.ModuleList(models)
        weight_tensor = torch.tensor(weights, dtype=torch.float)
        if weight_tensor.dim() != 1 or weight_tensor.numel() != len(self.models):
            raise ValueError(
                f"Expected {len(self.models)} weights (one per model), "
                f"got shape {tuple(weight_tensor.shape)}."
            )
        # A buffer follows the module through .to(device)/.cuda(); it is kept
        # non-persistent so the state_dict layout is not changed.
        self.register_buffer("weights", weight_tensor, persistent=False)

    def forward(self, data):
        """Run every member on ``data`` and return the mean of the weighted outputs."""
        outputs = [model(data) for model in self.models]
        weighted_outputs = [w * o for w, o in zip(self.weights, outputs)]
        return torch.stack(weighted_outputs).mean(dim=0)

In [None]:
def train(model, optimizer, criterion, data, epochs=30, device="cpu"):
    """Train ``model`` for ``epochs`` passes over the graph chunks in ``data``.

    One optimiser step is taken per chunk. After each epoch the loss of the
    last processed chunk is printed (not an average over the epoch).

    Args:
        model: Module called as ``model(chunk)``.
        optimizer: Optimiser holding the model's parameters.
        criterion: Loss called as ``criterion(outputs, chunk.y)``.
        data: Iterable of chunks; each must provide ``.to(device)`` and ``.y``.
        epochs: Number of passes over ``data``.
        device: Device every chunk is moved to before the forward pass.
    """
    model.train()

    # Materialise once so one-shot iterables can be replayed every epoch.
    chunk_list = list(data)
    if not chunk_list:
        print("No training chunks provided; skipping training.")
        return

    for epoch in range(epochs):
        for chunk in chunk_list:
            chunk = chunk.to(device)
            optimizer.zero_grad()
            outputs = model(chunk)
            loss = criterion(outputs, chunk.y)
            loss.backward()
            optimizer.step()
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

In [None]:
def evaluate_model(model, chunks, device="cpu"):
    """Evaluate a binary node classifier chunk by chunk.

    For every chunk that contains both classes the function computes the
    G-Mean (``sqrt(sensitivity * specificity)``) from the argmax predictions
    and the ROC-AUC from the softmax probability of class 1. Chunks with a
    single class are skipped with a message.

    Args:
        model: Module called as ``model(chunk)``; expected to return one row of
            class scores (logits) per node with class 1 in column 1.
        chunks: Iterable of chunks providing ``.to(device)`` and ``.y``.
        device: Device every chunk is moved to before the forward pass.

    Returns:
        ``(avg_g_mean, avg_roc_auc)`` averaged over the evaluated chunks, or
        ``(nan, nan)`` when no chunk could be evaluated.
    """
    model.eval()
    g_means, roc_aucs = [], []
    for chunk in chunks:
        chunk = chunk.to(device)
        labels = chunk.y.cpu().numpy()
        if len(np.unique(labels)) < 2:
            print("Skipping chunk with insufficient class diversity.")
            continue

        with torch.no_grad():
            outputs = model(chunk)
        preds = outputs.argmax(dim=1).cpu().numpy()
        # Rank nodes by P(class 1). The model emits raw scores, so softmax is
        # needed to take the class-0 score into account (consistent with the
        # argmax predictions) and to keep the values bounded.
        positive_scores = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()

        tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        g_mean = np.sqrt(sensitivity * specificity)
        roc_auc = roc_auc_score(labels, positive_scores)
        g_means.append(g_mean)
        roc_aucs.append(roc_auc)
        print(f"G-Mean: {g_mean:.4f}, ROC-AUC: {roc_auc:.4f}")

    # Avoid numpy's "mean of empty slice" warning when every chunk was skipped.
    avg_g_mean = np.nanmean(g_means) if g_means else float("nan")
    avg_roc_auc = np.nanmean(roc_aucs) if roc_aucs else float("nan")
    print(f"Avg G-Mean: {avg_g_mean:.4f}, Avg ROC-AUC: {avg_roc_auc:.4f}")
    return avg_g_mean, avg_roc_auc

In [None]:
def evaluate_ensemble(models, chunks, device="cpu"):
    """Evaluate the unweighted average of several binary node classifiers.

    The members' outputs are averaged element-wise. For every chunk that
    contains both classes the function computes the G-Mean
    (``sqrt(sensitivity * specificity)``) from the argmax of the averaged
    scores and the ROC-AUC from the softmax probability of class 1. Chunks
    with a single class are skipped with a message.

    Args:
        models: Sequence of modules, each called as ``model(chunk)`` and
            expected to return one row of class scores (logits) per node.
        chunks: Iterable of chunks providing ``.to(device)`` and ``.y``.
        device: Device every chunk is moved to before the forward pass.

    Returns:
        ``(avg_g_mean, avg_roc_auc)`` averaged over the evaluated chunks; an
        average is ``nan`` when no value was collected for it.
    """
    # Switch every member to inference mode so dropout is disabled.
    for model in models:
        model.eval()

    g_means, roc_aucs = [], []
    for chunk in chunks:
        chunk = chunk.to(device)
        labels = chunk.y.cpu().numpy()
        if len(np.unique(labels)) < 2:
            print("Skipping chunk with insufficient class diversity.")
            continue

        with torch.no_grad():
            outputs = torch.mean(torch.stack([model(chunk) for model in models]), dim=0)
        preds = outputs.argmax(dim=1).cpu().numpy()
        # Rank nodes by P(class 1). The averaged values are raw scores, so
        # softmax is needed to take the class-0 score into account.
        positive_scores = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()

        tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        g_mean = np.sqrt(sensitivity * specificity)
        g_means.append(g_mean)
        try:
            roc_auc = roc_auc_score(labels, positive_scores)
        except ValueError:
            print("Skipping ROC-AUC calculation for this chunk.")
            continue
        roc_aucs.append(roc_auc)
        print(f"G-Mean: {g_mean:.4f}, ROC-AUC: {roc_auc:.4f}")

    # Avoid numpy's "mean of empty slice" warning when nothing was collected.
    avg_g_mean = np.nanmean(g_means) if g_means else float("nan")
    avg_roc_auc = np.nanmean(roc_aucs) if roc_aucs else float("nan")
    print(f"Ensemble Avg G-Mean: {avg_g_mean:.4f}, Avg ROC-AUC: {avg_roc_auc:.4f}")
    return avg_g_mean, avg_roc_auc

## [Pima Indians Diabetes Database dataset evaluation](https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database)

In [None]:
# Load the Pima CSV from the mounted Drive.
# NOTE(review): `df` is not read by the cells visible below
# (preprocess_dataset re-reads the file itself) — confirm it is still needed.
file_path = '/content/drive/My Drive/diabetes.csv'
df = pd.read_csv(file_path)

In [None]:
def preprocess_dataset(file_path):
    """Read a CSV with an 'Outcome' column and return standardised features and labels.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        ``(X, y)`` where ``X`` holds every column except 'Outcome' after
        ``StandardScaler`` fitting/transforming, and ``y`` holds the raw
        'Outcome' values.
    """
    frame = pd.read_csv(file_path)
    labels = frame['Outcome'].values
    raw_features = frame.drop(columns=['Outcome']).values
    scaled_features = StandardScaler().fit_transform(raw_features)
    return scaled_features, labels

In [None]:
# Main experiment: Pima Indians Diabetes.
# Seed NumPy and PyTorch first so the random oversampling and the weight
# initialisation repeat across runs (GPU kernels may still add small
# nondeterminism).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

file_path = '/content/drive/My Drive/diabetes.csv'
X, y = preprocess_dataset(file_path)
graph_data = create_graph_structure(X, y)
# apply_graphsmote extends the graph it is given and returns that same object.
balanced_graph_data = apply_graphsmote(graph_data)

chunks = chunk_graph_with_diversity(balanced_graph_data, 100)

gcn_model = GCNModel(in_channels=X.shape[1], hidden_channels=64, out_channels=2).to(device)
sage_model = SAGEModel(in_channels=X.shape[1], hidden_channels=64, out_channels=2).to(device)
gat_model = GATModel(in_channels=X.shape[1], hidden_channels=64, out_channels=2).to(device)
# Per-model weights in the order [GCN, GraphSAGE, GAT]; they do not sum to 1.
model_weights = [0.5, 0.2, 0.5]
ensemble_model = WeightedEnsembleGNN(models=[gcn_model, sage_model, gat_model], weights=model_weights).to(device)
# Alternative: unweighted ensemble.
# ensemble_model = EnsembleGNN(models=[gcn_model, sage_model, gat_model]).to(device)

# Class weights are derived from the original labels `y` (before oversampling).
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float).to(device))
optimizer = torch.optim.Adam(ensemble_model.parameters(), lr=0.005, weight_decay=5e-4)

train(ensemble_model, optimizer, criterion, chunks, epochs=30, device=device)

# NOTE(review): the models are evaluated on the same chunks they were trained
# on, so the metrics below are not held-out estimates.
# Each model separately
print("\nEvaluating GCN...")
gcn_g_mean, gcn_roc_auc = evaluate_model(gcn_model, chunks, device=device)
print("\nEvaluating GraphSAGE...")
sage_g_mean, sage_roc_auc = evaluate_model(sage_model, chunks, device=device)
print("\nEvaluating GAT...")
gat_g_mean, gat_roc_auc = evaluate_model(gat_model, chunks, device=device)
# Ensemble (unweighted mean of the three models' outputs)
print("\nEvaluating Ensemble...")
ensemble_g_mean, ensemble_roc_auc = evaluate_ensemble([gcn_model, sage_model, gat_model], chunks, device=device)

Epoch [1/30], Loss: 0.5785
Epoch [2/30], Loss: 0.5064
Epoch [3/30], Loss: 0.4759
Epoch [4/30], Loss: 0.4948
Epoch [5/30], Loss: 0.4732
Epoch [6/30], Loss: 0.4523
Epoch [7/30], Loss: 0.5137
Epoch [8/30], Loss: 0.4252
Epoch [9/30], Loss: 0.4537
Epoch [10/30], Loss: 0.4560
Epoch [11/30], Loss: 0.4069
Epoch [12/30], Loss: 0.4331
Epoch [13/30], Loss: 0.4215
Epoch [14/30], Loss: 0.4178
Epoch [15/30], Loss: 0.4419
Epoch [16/30], Loss: 0.4387
Epoch [17/30], Loss: 0.4145
Epoch [18/30], Loss: 0.4099
Epoch [19/30], Loss: 0.4170
Epoch [20/30], Loss: 0.4094
Epoch [21/30], Loss: 0.4055
Epoch [22/30], Loss: 0.4074
Epoch [23/30], Loss: 0.4166
Epoch [24/30], Loss: 0.3765
Epoch [25/30], Loss: 0.4023
Epoch [26/30], Loss: 0.4205
Epoch [27/30], Loss: 0.3789
Epoch [28/30], Loss: 0.4098
Epoch [29/30], Loss: 0.3762
Epoch [30/30], Loss: 0.3940

Evaluating GCN...
G-Mean: 0.6731, ROC-AUC: 0.7917
G-Mean: 0.6901, ROC-AUC: 0.8251
G-Mean: 0.6558, ROC-AUC: 0.7577
G-Mean: 0.7313, ROC-AUC: 0.8062
G-Mean: 0.6633, ROC-AU

## [Haberman's Survival dataset evaluation](https://archive.ics.uci.edu/dataset/43/haberman+s+survival)

In [None]:
# Unpack the Haberman archive from the mounted Drive into a local working
# directory; the data file is read from `extract_path` further down.
zip_path = '/content/drive/My Drive/Haberman\'s Survival.zip'
extract_path = '/content/haberman_data'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [None]:
def preprocess_dataset_haberman(file_path):
    """Read the header-less Haberman data file and return standardised features and labels.

    The four columns are named 'Age', 'Operation_Year', 'Axillary_Nodes' and
    'Survival_Status'. The status codes are remapped 1 -> 0 (survived) and
    2 -> 1 (died). The first rows of the frame are printed as a sanity check.

    Args:
        file_path: Path of the comma-separated data file (no header row).

    Returns:
        ``(X, y)`` where ``X`` holds the three feature columns after
        ``StandardScaler`` fitting/transforming and ``y`` holds the 0/1 labels.

    Raises:
        ValueError: If 'Survival_Status' contains a value other than 1 or 2.
    """
    df = pd.read_csv(file_path, header=None)
    df.columns = ['Age', 'Operation_Year', 'Axillary_Nodes', 'Survival_Status']

    # Map class labels to 0 and 1 (0: survived, 1: died). Unknown codes would
    # silently become NaN, so they are rejected explicitly.
    mapped_status = df['Survival_Status'].map({1: 0, 2: 1})
    unmapped = mapped_status.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, 'Survival_Status'].unique().tolist()
        raise ValueError(f"Unexpected Survival_Status values: {unexpected}")
    df['Survival_Status'] = mapped_status.astype('int64')

    print(df.head())
    X = df.drop(columns=['Survival_Status']).values
    y = df['Survival_Status'].values
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y

In [None]:
# Main experiment: Haberman's Survival.
# Seed NumPy and PyTorch first so the random oversampling and the weight
# initialisation repeat across runs (GPU kernels may still add small
# nondeterminism).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

file_path = '/content/haberman_data/haberman.data'
X, y = preprocess_dataset_haberman(file_path)
graph_data = create_graph_structure(X, y)
# apply_graphsmote extends the graph it is given and returns that same object.
balanced_graph_data = apply_graphsmote(graph_data)

chunks = chunk_graph_with_diversity(balanced_graph_data, 100)

gcn_model = GCNModel(in_channels=X.shape[1], hidden_channels=64, out_channels=2).to(device)
sage_model = SAGEModel(in_channels=X.shape[1], hidden_channels=64, out_channels=2).to(device)
gat_model = GATModel(in_channels=X.shape[1], hidden_channels=64, out_channels=2).to(device)
# Per-model weights in the order [GCN, GraphSAGE, GAT]; they do not sum to 1.
model_weights = [0.5, 0.2, 0.5]
ensemble_model = WeightedEnsembleGNN(models=[gcn_model, sage_model, gat_model], weights=model_weights).to(device)
# Alternative: unweighted ensemble.
# ensemble_model = EnsembleGNN(models=[gcn_model, sage_model, gat_model]).to(device)

# Class weights are derived from the original labels `y` (before oversampling).
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float).to(device))
optimizer = torch.optim.Adam(ensemble_model.parameters(), lr=0.005, weight_decay=5e-4)

train(ensemble_model, optimizer, criterion, chunks, epochs=30, device=device)

# NOTE(review): the models are evaluated on the same chunks they were trained
# on, so the metrics below are not held-out estimates.
# Each model separately
print("\nEvaluating GCN...")
gcn_g_mean, gcn_roc_auc = evaluate_model(gcn_model, chunks, device=device)
print("\nEvaluating GraphSAGE...")
sage_g_mean, sage_roc_auc = evaluate_model(sage_model, chunks, device=device)
print("\nEvaluating GAT...")
gat_g_mean, gat_roc_auc = evaluate_model(gat_model, chunks, device=device)
# Ensemble (unweighted mean of the three models' outputs)
print("\nEvaluating Ensemble...")
ensemble_g_mean, ensemble_roc_auc = evaluate_ensemble([gcn_model, sage_model, gat_model], chunks, device=device)

## [Stanford Twitter Stream dataset evaluation](https://jmgomezhidalgo.blogspot.com/2013/01/a-list-of-datasets-for-opinion-mining.html)