<a href="https://colab.research.google.com/github/MaryamKazemit/OEBGNN/blob/main/successful_ver_of_Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

# Show which PyTorch build is installed and which CUDA toolkit it reports.
for label, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA version:", torch.version.cuda),
):
    print(label, value)


PyTorch version: 2.5.1+cu121
CUDA version: 12.1


In [None]:
!pip install torch torchvision torchaudio
# !pip install torch-geometric
# !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv
# !pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 torchaudio==2.0.2+cu117 -f https://download.pytorch.org/whl/torch_stable.html
!pip install torch-geometric gensim

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
import torch.nn.functional as F
import torch.nn as nn
from torch.optim import Adam
import numpy as np
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gc

In [None]:
from google.colab import drive

# Make Google Drive visible under /content/drive so the dataset archive can be read.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Location of the compressed DBLP citation archive inside the mounted Drive.
file_path = (
    '/content/drive/My Drive/DBLP-citation-Jan8-dataset.tar.bz2'
)

In [None]:
import tarfile
import os

extract_path = '/content/dblp_dataset'
with tarfile.open(file_path, 'r:bz2') as tar:
    tar.extractall(path=extract_path)

# Verify extracted files
!ls /content/dblp_dataset

DBLP-citation-Jan8.txt


In [None]:
# Parse the DBLP citation dump into per-paper metadata and a raw citation edge list.
#
# A record starts with a "#index<id>" line; each following line carries one
# field, identified by a two-character marker prefix:
#   #*  title        #@  comma-separated authors   #t  year
#   #c  conference   #%  id of a cited paper       #!  abstract
# Field lines that appear before the first "#index" line are ignored.
papers = {}      # paper id (str) -> metadata dict
edge_index = []  # [source_id, target_id] pairs, ids converted to int

# Replace with the actual extracted file name
txt_file_path = '/content/dblp_dataset/DBLP-citation-Jan8.txt'

# Markers whose value is stored verbatim under the given metadata key.
_SCALAR_FIELDS = {"#*": "title", "#t": "year", "#c": "conference", "#!": "abstract"}


def _field_value(line, marker):
    """Return the text that follows the leading `marker`, whitespace-stripped.

    Only the prefix is removed. Using ``str.replace`` here would also delete
    every later occurrence of the marker inside the field text itself
    (e.g. a "#c" inside a conference name or a "#t" inside an abstract).
    """
    return line[len(marker):].strip()


with open(txt_file_path, 'r', encoding='utf-8', errors='ignore') as file:
    current_paper = None
    for line in file:
        line = line.strip()

        if line.startswith("#index"):
            # Start of a new record; a repeated id overwrites the earlier entry.
            current_paper = _field_value(line, "#index")
            papers[current_paper] = {
                "title": None,
                "authors": [],
                "year": None,
                "conference": None,
                "abstract": None,
                "citations": []
            }
            continue

        if not current_paper:
            # No record open yet (or the record has an empty id): skip the field.
            continue

        marker = line[:2]
        record = papers[current_paper]

        if marker in _SCALAR_FIELDS:
            record[_SCALAR_FIELDS[marker]] = _field_value(line, marker)
        elif marker == "#@":
            names = _field_value(line, marker).split(",")
            # Drop empty entries so an empty author line yields [] rather than [""].
            record["authors"] = [name.strip() for name in names if name.strip()]
        elif marker == "#%":
            cited_paper = _field_value(line, marker)
            if not cited_paper:
                # A bare "#%" line names no paper; do not record an empty citation.
                continue
            record["citations"].append(cited_paper)
            try:
                source = int(current_paper)
                target = int(cited_paper)
            except ValueError:
                # Non-numeric ids stay in the metadata but cannot become graph edges.
                continue
            # NOTE(review): ids are mapped back to node indices later via str(int_id);
            # an id with leading zeros would not round-trip — confirm ids are canonical.
            edge_index.append([source, target])

In [None]:
# Convert the raw [source, target] pair list into a (2, num_edges) long tensor.
if isinstance(edge_index, torch.Tensor):
    # The cell was re-run after the conversion already happened. Leave the
    # tensor untouched: truth-testing a multi-element tensor would raise.
    pass
elif len(edge_index) > 0:
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
else:
    print("No edges found. Check the file format.")
    # Still produce a tensor with two rows so that later cells, which index
    # edge_index[0] and edge_index[1], do not fail on a plain empty list.
    edge_index = torch.empty((2, 0), dtype=torch.long)

In [None]:
# Build node features: one 128-dimensional Doc2Vec embedding per paper abstract.
paper_ids = list(papers.keys())
# Position of each paper in `paper_ids` doubles as its node index in the graph.
id_to_idx = {paper_id: idx for idx, paper_id in enumerate(paper_ids)}
num_nodes = len(paper_ids)

# Replace None abstracts with empty strings
abstracts = [papers[paper_id].get("abstract", "") or "" for paper_id in paper_ids]
# Each abstract is tagged with its node index (as a string) so its vector can be looked up afterwards.
tagged_abstracts = [TaggedDocument(words=abstract.split(), tags=[str(i)]) for i, abstract in enumerate(abstracts)]
doc2vec_model = Doc2Vec(tagged_abstracts, vector_size=128, window=5, min_count=1, workers=4)
doc2vec_features = [doc2vec_model.dv[str(i)] for i in range(len(abstracts))]
# Merge the per-document vectors into one ndarray before building the tensor;
# torch.tensor() on a list of ndarrays converts element by element and is very slow.
# The tensor is intentionally left on the CPU: `device` is only defined in a later
# cell, so calling .to(device) here raised NameError on a fresh kernel. The graph
# construction cell moves the features to the device.
x = torch.tensor(np.asarray(doc2vec_features), dtype=torch.float)

In [None]:
# Binary node labels: 1 (positive) for papers whose conference string contains
# 'Computer Vision', 0 (negative) for all others, including papers with no conference.
y = torch.zeros(num_nodes, dtype=torch.long)

# Collect the node indices of every positive paper, then flip them in one assignment.
positive_rows = [
    id_to_idx[pid]
    for pid, meta in papers.items()
    if meta.get("conference") and "Computer Vision" in meta.get("conference")
]
y[torch.tensor(positive_rows, dtype=torch.long)] = 1

In [None]:
# Translate the raw paper ids in the edge list into contiguous node indices,
# dropping every edge whose source or target is not a paper in `id_to_idx`.
# NOTE: this cell overwrites `edge_index`; re-running it without re-running the
# cell that builds the raw edge tensor would remap already-remapped indices.
raw_edges = torch.as_tensor(edge_index, dtype=torch.long)
# One bulk .tolist() instead of a Python-level .item() call per edge endpoint.
raw_pairs = raw_edges.t().tolist() if raw_edges.numel() > 0 else []

new_edge_index = []
for source_id, target_id in raw_pairs:
    source = id_to_idx.get(str(source_id))
    target = id_to_idx.get(str(target_id))
    if source is not None and target is not None:
        new_edge_index.append([source, target])

# Convert to tensor. reshape(-1, 2) is a no-op for a non-empty pair list and
# keeps the result at shape (2, 0) instead of (0,) when no edge survives.
edge_index = torch.tensor(new_edge_index, dtype=torch.long).reshape(-1, 2).t().contiguous()

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Bundle node features, connectivity and labels into one graph object on `device`.
graph_data = Data(
    x=x.to(device),
    edge_index=edge_index.to(device),
    y=y.to(device),
)

In [None]:
# 'balanced' per-class weights computed by scikit-learn from the node labels,
# stored as a float tensor on `device` for use in the loss.
labels = graph_data.y.cpu().numpy()
observed_classes = np.unique(labels)
balanced_weights = compute_class_weight(class_weight='balanced', classes=observed_classes, y=labels)
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)

In [None]:
class ImprovedGATClassifier(nn.Module):
    """Three-layer graph attention network that outputs per-node log-probabilities.

    Two 8-head GAT layers with concatenated heads (each followed by ELU and
    dropout) feed a single-head GAT output layer.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Per-head width of the two hidden GAT layers.
        out_channels: Number of output classes.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        num_heads = 8
        # Concatenating the heads multiplies the hidden width by the head count.
        concat_width = hidden_channels * num_heads
        self.conv1 = GATConv(in_channels, hidden_channels, heads=num_heads, concat=True)
        self.conv2 = GATConv(concat_width, hidden_channels, heads=num_heads, concat=True)
        self.conv3 = GATConv(concat_width, out_channels, heads=1, concat=False)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, data):
        """Return log-softmax class scores for every node in `data`.

        `data` must expose `x` (node features) and `edge_index` (connectivity).
        """
        hidden = data.x
        for conv in (self.conv1, self.conv2):
            hidden = self.dropout(F.elu(conv(hidden, data.edge_index)))
        logits = self.conv3(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

# Input width follows the feature matrix; the classifier emits two classes.
gnn = ImprovedGATClassifier(
    in_channels=x.size(1),
    hidden_channels=128,
    out_channels=2,
)
gnn = gnn.to(device)


In [None]:
class WeightedEnsembleClassifierWithFocalLoss(nn.Module):
    """Wrap a classifier and provide a class-weighted focal loss for training it.

    Args:
        gnn: Model invoked as ``gnn(data)``; its output is returned by ``forward``.
        gamma: Focusing exponent of the focal loss; 0 reduces it to
            (weighted) cross-entropy.
        class_weights: Optional 1-D tensor with one weight per class. ``None``
            disables class weighting.
    """

    def __init__(self, gnn, gamma=2.0, class_weights=None):
        super(WeightedEnsembleClassifierWithFocalLoss, self).__init__()
        self.gnn = gnn
        self.gamma = gamma
        self.class_weights = class_weights

    def forward(self, data):
        """Delegate to the wrapped model and return its output unchanged."""
        return self.gnn(data)

    def compute_loss(self, outputs, targets):
        """Return the mean focal loss ``alpha_t * (1 - p_t)**gamma * -log(p_t)``.

        Args:
            outputs: ``(N, C)`` class scores. ``F.cross_entropy`` applies
                log-softmax itself, so logits and log-probabilities both work
                (log-softmax of log-probabilities leaves them unchanged).
            targets: ``(N,)`` integer class indices.

        Returns:
            Scalar tensor: the focal loss averaged over all N samples.
        """
        # p_t must come from the *unweighted* cross-entropy. Passing the class
        # weights into cross_entropy and then taking exp(-loss) yields
        # p_t ** weight instead of p_t, which distorts the (1 - p_t)**gamma
        # focusing term for every class whose weight is not 1.
        ce_loss = F.cross_entropy(outputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss
        if self.class_weights is not None:
            # Apply the class weight as the focal-loss alpha_t factor.
            alpha = self.class_weights.to(device=outputs.device, dtype=outputs.dtype)[targets]
            focal_loss = alpha * focal_loss
        return focal_loss.mean()

# Wrap the GAT with the focal-loss helper, using gamma=2.0 and the class weights.
ensemble = WeightedEnsembleClassifierWithFocalLoss(
    gnn,
    gamma=2.0,
    class_weights=class_weights,
)
ensemble = ensemble.to(device)


In [None]:
def evaluate_model(model, data, verbose=True):
    """Compute G-Mean and ROC-AUC of `model` over every node in `data`.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` again.

    Args:
        model: Callable as ``model(data)`` returning ``(N, C)`` log-probabilities.
        data: Graph object exposing the labels as ``data.y``.
        verbose: When True (default), print the two metrics.

    Returns:
        ``(g_mean, roc_auc)``. If the labels contain fewer than two classes the
        metrics are undefined and ``(0.0, 0.5)`` is returned.
    """
    model.eval()
    with torch.no_grad():
        output = model(data)
    preds = output.argmax(dim=1).cpu().numpy()
    labels = data.y.cpu().numpy()

    if len(np.unique(labels)) < 2:
        g_mean = 0.0
        roc_auc = 0.5
    else:
        # Only the labels need both classes. When the predictions collapse to a
        # single class the confusion matrix is still 2x2 (G-Mean comes out as 0),
        # and ROC-AUC is computed from probabilities, so it stays meaningful
        # instead of being reported as a hard-coded 0.5.
        tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        g_mean = np.sqrt(sensitivity * specificity)
        # The model emits log-probabilities; exp() recovers P(class 1).
        probs = torch.exp(output)[:, 1].cpu().numpy()
        roc_auc = roc_auc_score(labels, probs)

    if verbose:
        print(f'G-Mean: {g_mean:.4f}, ROC-AUC: {roc_auc:.4f}')
    return g_mean, roc_auc

In [None]:
def train_with_early_stopping(model, data, optimizer, patience=5, epochs=100, restore_best=True):
    """Train `model` full-batch on `data`, stopping early when G-Mean stalls.

    Each epoch performs one optimisation step on the whole graph and then
    scores the model with ``evaluate_model`` on the same `data`.

    Args:
        model: Module providing ``model(data)`` and ``model.compute_loss(output, y)``.
        data: Graph object; ``data.y`` holds the training targets.
        optimizer: Optimizer over the model's parameters.
        patience: Number of consecutive epochs without a G-Mean improvement
            after which training stops.
        epochs: Maximum number of epochs.
        restore_best: When True (default), reload the weights from the epoch
            with the best G-Mean once training ends. Without this the model is
            left in the state reached `patience` epochs after its best one.
    """
    best_g_mean = 0
    best_state = None
    patience_counter = 0
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(data)
        loss = model.compute_loss(output, data.y)
        loss.backward()
        optimizer.step()

        g_mean, roc_auc = evaluate_model(model, data)
        print(f'Epoch {epoch + 1}, Loss: {loss.item():.4f}, G-Mean: {g_mean:.4f}, ROC-AUC: {roc_auc:.4f}')

        if g_mean > best_g_mean:
            best_g_mean = g_mean
            patience_counter = 0
            # Snapshot the weights; clone so later optimizer steps cannot alter the copy.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break
        # Release cached GPU memory and unreferenced Python objects between epochs.
        torch.cuda.empty_cache()
        gc.collect()

    if restore_best and best_state is not None:
        model.load_state_dict(best_state)

In [None]:
# Adam over all parameters of the wrapped model; train for at most 200 epochs
# with an early-stopping patience of 10 epochs.
optimizer = Adam(ensemble.parameters(), lr=0.005)
train_with_early_stopping(
    model=ensemble,
    data=graph_data,
    optimizer=optimizer,
    patience=10,
    epochs=200,
)

G-Mean: 0.0000, ROC-AUC: 0.5000
Epoch 1, Loss: 0.6936, G-Mean: 0.0000, ROC-AUC: 0.5000
G-Mean: 0.0008, ROC-AUC: 0.6713
Epoch 2, Loss: 0.6823, G-Mean: 0.0008, ROC-AUC: 0.6713
G-Mean: 0.6186, ROC-AUC: 0.6770
Epoch 3, Loss: 0.6687, G-Mean: 0.6186, ROC-AUC: 0.6770
G-Mean: 0.6194, ROC-AUC: 0.6839
Epoch 4, Loss: 0.6593, G-Mean: 0.6194, ROC-AUC: 0.6839
G-Mean: 0.6274, ROC-AUC: 0.6886
Epoch 5, Loss: 0.6550, G-Mean: 0.6274, ROC-AUC: 0.6886
G-Mean: 0.6384, ROC-AUC: 0.6956
Epoch 6, Loss: 0.6499, G-Mean: 0.6384, ROC-AUC: 0.6956
G-Mean: 0.6468, ROC-AUC: 0.7014
Epoch 7, Loss: 0.6416, G-Mean: 0.6468, ROC-AUC: 0.7014
G-Mean: 0.6559, ROC-AUC: 0.7081
Epoch 8, Loss: 0.6328, G-Mean: 0.6559, ROC-AUC: 0.7081
G-Mean: 0.6716, ROC-AUC: 0.7180
Epoch 9, Loss: 0.6233, G-Mean: 0.6716, ROC-AUC: 0.7180
G-Mean: 0.6890, ROC-AUC: 0.7367
Epoch 10, Loss: 0.6130, G-Mean: 0.6890, ROC-AUC: 0.7367
G-Mean: 0.6934, ROC-AUC: 0.7489
Epoch 11, Loss: 0.6015, G-Mean: 0.6934, ROC-AUC: 0.7489
G-Mean: 0.6952, ROC-AUC: 0.7521
Epoch 12,

In [None]:
# Score the trained model once more and report the headline metrics.
final_metrics = evaluate_model(ensemble, graph_data)
g_mean, roc_auc = final_metrics
print(f'Final G-Mean: {g_mean:.4f}, Final ROC-AUC: {roc_auc:.4f}')

G-Mean: 0.7564, ROC-AUC: 0.8428
Final G-Mean: 0.7564, Final ROC-AUC: 0.8428
