In [1]:
import graphIO
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import gcn_model
import torch
from torch.utils.data import random_split
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import DataLoader

In [2]:
DATASET_DIR = "data/GNN_Benchmark/"
MAT_DIR = DATASET_DIR + "matrices2326/"
DATASET = DATASET_DIR + "data/Thickness_Swapped_wID.xlsx"
MAPPINGS_DIR = DATASET_DIR + "data/region_name_mapping.json"
ID_COL = 'ID'
LABEL_COL = 'DX'
RANDOM_STATE = 33

In [3]:
matrices = graphIO.read_adj_matrices_from_directory(MAT_DIR)
mappings = graphIO.read_mappings_from_json(MAPPINGS_DIR)
ids = list(matrices.keys())

In [4]:
columns = [ID_COL] + \
    [f'Node {node_idx}' for node_idx in range(1, 161)] + [LABEL_COL]

In [5]:
nodes = graphIO.read_nodes_from_excel(
    DATASET, columns=columns, ids_col=ID_COL, ids=ids)

In [6]:
nodes = nodes.loc[nodes[LABEL_COL].isin(['AD', 'CN'])]

In [7]:
nodes[LABEL_COL] = nodes[LABEL_COL].map({'AD': 1, 'CN': 0})

In [8]:
nodes[LABEL_COL].value_counts()

DX
0    844
1    240
Name: count, dtype: int64

In [9]:
# Initialize the RandomOverSampler
oversampler = RandomOverSampler(random_state=RANDOM_STATE)

# Perform oversampling
X, labels = oversampler.fit_resample(
    nodes.iloc[:, :-1], nodes.iloc[:, -1])

# Print the class distribution after oversampling (optional)
print("Class distribution after oversampling:")
print(pd.Series(labels).value_counts())

Class distribution after oversampling:
DX
0    844
1    844
Name: count, dtype: int64


In [10]:
# Extract node features
graph_ids = X[ID_COL].values
node_features = X.drop(columns=[ID_COL]).values


In [11]:
print(
    f'IDs shape: {graph_ids.shape}, Node features shape: {node_features.shape}, Labels shape: {labels.shape}')

IDs shape: (1688,), Node features shape: (1688, 160), Labels shape: (1688,)


In [12]:
# # Split data into training, validation, and test sets
# X_train, X_temp, y_train, y_temp = train_test_split(
#     node_features, y, test_size=0.2, random_state=RANDOM_STATE)
# X_val, X_test, y_val, y_test = train_test_split(
#     X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE)

In [13]:
# Define your GCN model
# Example dimensions, replace with your actual values
input_dim = len(node_features[1])
hidden_dim = 64
output_dim = 2
model = gcn_model.GCN(input_dim, hidden_dim, output_dim)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [14]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
from torch_geometric.data import Data
def create_pyg_data(node_features, adj_matrix, labels):
    # Convert node features, adjacency matrix, and labels to PyTorch tensors
    x = torch.tensor(node_features, dtype=torch.float)
    # Assuming your adjacency matrix is in COO format
    edge_index = torch.tensor(adj_matrix, dtype=torch.long)
    y = torch.tensor(labels, dtype=torch.long)
    # Create a PyTorch Geometric data object
    data = Data(x=x, edge_index=edge_index, y=y)
    return data

In [16]:
import networkx as nx
pyg_data_list = []
for i in range(len(node_features)):
    graph_data = create_pyg_data(
        node_features[i], nx.adjacency_matrix(matrices[graph_ids[i]]).getnnz(), labels[i])
    pyg_data_list.append(graph_data)

KeyError: '941_S_6094_I921886'

In [None]:
# Split dataset into training and validation sets
train_dataset, val_dataset = random_split(pyg_data_list, [int(
    0.8 * len(pyg_data_list)), len(pyg_data_list) - int(0.8 * len(pyg_data_list))])

In [None]:
# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

In [None]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for data in val_loader:
            data = data.to(device)
            out = model(data)
            val_loss += criterion(out, data.y).item()
            _, predicted = torch.max(out, 1)
            total += data.y.size(0)
            correct += (predicted == data.y).sum().item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {(correct/total)*100:.2f}%")