In [1]:
## Augmentation
import os 
import numpy as np
import torch
from imblearn.over_sampling import SMOTE
import pickle
from sklearn.model_selection import train_test_split

# load data
DATA_PATH = os.path.abspath("../dataset/weifeng/all_graphs_2.pkl")
data = np.load(DATA_PATH, allow_pickle=True)


# data_x 
x = [item['adj'] for item in data]
x = np.array(x)
x_shape = x.shape
x = x.reshape(x.shape[0], -1)
y = [item['y'] for item in data]
y = np.array(y)
y = (y[:, 2] + y[:, 3] + y[:, 4] >= 1).astype(int)

# split data
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)

# SMOTE
sm = SMOTE(random_state=42)
train_x_smote, train_y_smote = sm.fit_resample(train_x, train_y)

# save data
data = []
for i in range(train_x.shape[0]):
    data.append({'adj': train_x[i].reshape(x_shape[1], x_shape[2]), 'y': train_y[i]})
with open("../dataset/weifeng/all_graphs_2_external_train.pkl", 'wb') as f:
    pickle.dump(data, f)

data = []
for i in range(train_x_smote.shape[0]):
    data.append({'adj': train_x_smote[i].reshape(x_shape[1], x_shape[2]), 'y': train_y_smote[i]})
with open("../dataset/weifeng/all_graphs_2_external_train_smote.pkl", 'wb') as f:
    pickle.dump(data, f)

data = []
for i in range(test_x.shape[0]):
    data.append({'adj': test_x[i].reshape(x_shape[1], x_shape[2]), 'y': test_y[i]})
with open("../dataset/weifeng/all_graphs_2_external_test.pkl", 'wb') as f:
    pickle.dump(data, f)

## Source: Dataset

In [1]:
import os
import torch
from torch.utils.data import Dataset
from torch_geometric.data import InMemoryDataset, Data
# global mean pooling and global max pooling
from torch_geometric.nn import global_mean_pool, global_max_pool
from torch_sparse import SparseTensor
import os
import pickle
import sys
import dgl
import numpy as np
import pdb
from tqdm import tqdm
from torch_geometric.data import DataLoader
import torch.nn as nn
from torch_geometric.nn import GCNConv
from sklearn.metrics import roc_auc_score, accuracy_score


# from utils.utils import get_spectral_embedding
from dgl.data import DGLDataset

from typing import Any, Dict, List, Tuple

# set the path to the root directory of the project
sys.path.append(os.path.abspath("../"))

DATA_PATH = os.path.abspath("../dataset/weifeng/all_graphs_2.pkl")
# DATA_PATH = os.path.abspath("../dataset/processed/data_dict_site16_binary.pkl")


def pre_transform(data: Dict[str, Any]) -> Data:
    """Transform the data into torch Data type"""
    if "all_graphs" in  DATA_PATH :
        x = torch.tensor(data['adj'])
        adj = torch.tensor(data["adj"], dtype=torch.float32)
        adj = (adj - adj.min()) / (adj.max() - adj.min())
        edge_index_ = (adj >= -1).nonzero().t().contiguous()
        edge_index_ = edge_index_[:, edge_index_[0] != edge_index_[1]]
        edge_weight = adj[edge_index_[0], edge_index_[1]]
        try:
            label = torch.tensor(data['y']).unsqueeze(0)
        except:
            label = torch.tensor(data['y'])
        return Data(
            x=x,
            x_SC=None,
            edge_index=edge_index_,
            edge_weight=edge_weight,
            edge_index_SC=None,
            edge_weight_SC=None,
            y=label
        )
    else:
        x = torch.tensor(data["FC"], dtype=torch.float32)
        x_SC = torch.tensor(data["SC"], dtype=torch.float32)
        x_SC = (x_SC.max() - x_SC) / (x_SC.max() - x_SC.min())

        edge_index_FC = (x >= 0.8).nonzero().t().contiguous()
        edge_index_FC = edge_index_FC[:, edge_index_FC[0] != edge_index_FC[1]]
        row, col = edge_index_FC
        edge_weight_FC = x[row, col]

        edge_index_SC =  (x_SC > x_SC.mean()).nonzero().t().contiguous()
        edge_index_SC = edge_index_SC[:, edge_index_SC[0] != edge_index_SC[1]]
        row, col = edge_index_SC
        edge_weight_SC = torch.tensor(x_SC[row, col], dtype=torch.float32)

        feature = torch.tensor(data['feature'], dtype=torch.float32).unsqueeze(0) if 'feature' in data.keys() else None
        label_tensor = torch.tensor(data['label'], dtype=torch.float32).unsqueeze(0)


        return Data(
            x=x,
            x_SC=x_SC,
            edge_index=edge_index_FC,
            edge_weight=edge_weight_FC,
            edge_index_SC=edge_index_SC,
            edge_weight_SC=edge_weight_SC,
            y=label_tensor,
            feature=feature
        )


class Brain(InMemoryDataset):
    def __init__(
        self,
        task,
        x_attributes=None,
        processed_path="../data/processed",
        rawdata_path=DATA_PATH,
        suffix=None,
        args=None,
    ):
        
        if suffix is None:
            suffix = ""
        self.processed_path = os.path.join(processed_path, f"{task}_data{suffix}.pt")

        self.task = task
        self.x_attributes = x_attributes
        self.rawdata_path = rawdata_path
        if "train" in suffix or "test" in suffix:
            self.rawdata_path = rawdata_path.replace(".pkl", suffix + ".pkl")
        self.suffix = suffix
        self.pre_transform = pre_transform

        super().__init__(pre_transform=self.pre_transform)

        self.data, self.slices = torch.load(self.processed_path)

        """modify"""
        # task = args.task_idx
        # task = [1]
        # self.data.y = (self.data.y)[:, task]
        """modify end"""


    def processed_file_names(self):
        return os.path.basename(self.processed_path)

    def process(self) -> None:
        with open(self.rawdata_path, "rb") as f:
            data = pickle.load(f)

        data_list = []
        for i in tqdm(range(len(data))):
            if self.pre_transform is not None:
                data_list.append(self.pre_transform(data[i]))
        
        self.data, self.slices = self.collate(data_list)
        print("Saving...")
        torch.save((self.data, self.slices), self.processed_path)

    @property
    def processed_dir(self):
        return os.path.dirname(self.processed_path)

    def process_data(self, data):
        data_list = []
        for i in range(len(data)):
            try:
                data_list.append(self.pre_transform(data[i]))
            except:
                pdb.set_trace()

        data, slices = self.collate(data_list)
        print("Saving...")
        torch.save((data, slices), self.processed_path)
        

train_dataset_smote = Brain(task='classification', x_attributes=['adj'], suffix="_external_train_smote")
test_dataset = Brain(task='classification', x_attributes=['adj'], suffix="_external_test")
train_dataset = Brain(task='classification', x_attributes=['adj'], suffix="_external_train")

  from .autonotebook import tqdm as notebook_tqdm


## Model

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GINConv, global_mean_pool, global_max_pool, GCNConv, SAGEConv, GATConv, GatedGraphConv, SGConv, ResGatedGraphConv 

import pdb
from torch_geometric.nn import TransformerConv


class GIN_pyg(nn.Module):
    def __init__(self, net_params, args):
        super(GIN_pyg, self).__init__()
        in_channels = net_params["in_channels"]
        hidden_channels = net_params["hidden_channels"]
        out_channels = net_params["out_channels"]
        num_layers = net_params["num_layers"]
        dropout = net_params["dropout"]

        self.readout_type = net_params["readout"]
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.num_layers = num_layers
        self.dropout = dropout

        self.conv1 = GCNConv(self.in_channels, self.hidden_channels)
        self.conv2 = GCNConv(self.hidden_channels, self.hidden_channels)
        self.fc = nn.Linear(self.hidden_channels, 1)
        
        

    def forward(self, data):
        x, edge_index, edge_weight, batch = data.x, data.edge_index, data.edge_weight.unsqueeze(-1), data.batch
        x = F.relu(self.conv1(x, edge_index, edge_weight)) 
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.relu(self.conv2(x, edge_index, edge_weight))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = global_mean_pool(x, batch)
        predict = self.fc(x)

        return predict


## Main

In [4]:
from torch_geometric.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import wandb

wandb.init(project="brain", name="Train")


device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
net_params = {
    "in_channels": 379,
    "hidden_channels": 64,
    "out_channels": 1,
    "num_layers": 2,
    "dropout": 0.2,
    "readout": "mean"
}
model = GIN_pyg(net_params, None).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.BCEWithLogitsLoss()

# split data
# train_ratio = 0.7
# rand_idx = torch.randperm(len(dataset))
# train_idx = rand_idx[:int(train_ratio * len(dataset))]
# test_idx = rand_idx[int(train_ratio * len(dataset)):]
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

def test(model, data_loader):
    y_true = []
    y_pred = []
    for batch in data_loader:
        data = batch.to(device)
        model.eval()
        with torch.no_grad():
            output = model(data)
            y_true.append(data.y.cpu().numpy())
            y_pred.append(output.cpu().numpy())
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    auc = roc_auc_score(y_true, y_pred)
    wandb.log({"test_AUC": auc})
    print(f"test_AUC: {auc}")

def train(model, optimizer, criterion, train_loader, test_loader):
    for epoch in range(100):
        model.train()
        loss_all = 0
        y_true = []
        y_pred = []
        for batch in train_loader:
            data = batch.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output.squeeze(), data.y.float())
            loss.backward()
            optimizer.step()
            loss_all += loss.item()
            y_true.append(data.y.cpu().numpy())
            y_pred.append(output.detach().cpu().numpy())
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
        auc = roc_auc_score(y_true, y_pred)
        wandb.log({"train_loss": loss_all, "train_AUC": auc})
        print(f"Epoch: {epoch}, Loss: {loss_all}, AUC: {auc}")
        test(model, test_loader)
        
        
    
train(model, optimizer, criterion, train_loader, test_loader)



0,1
test_AUC,█▄▅▅▅▆▅▄▃▂▂▂▁
train_AUC,▃▄▁▄▆▅▅▆▆▇▇██
train_loss,█▇▆▄▃▂▂▂▁▂▁▁▁

0,1
test_AUC,0.49538
train_AUC,0.62274
train_loss,2.30055




Epoch: 0, Loss: 3.09226593375206, AUC: 0.5353618421052632
test_AUC: 0.5343589743589744
Epoch: 1, Loss: 2.534818321466446, AUC: 0.5563322368421053
test_AUC: 0.4902564102564103
Epoch: 2, Loss: 2.3919230699539185, AUC: 0.6195175438596492
test_AUC: 0.46461538461538465
Epoch: 3, Loss: 2.390120804309845, AUC: 0.594640899122807
test_AUC: 0.45743589743589746
Epoch: 4, Loss: 2.3118708729743958, AUC: 0.6787280701754387
test_AUC: 0.4502564102564103
Epoch: 5, Loss: 2.2106895446777344, AUC: 0.7206688596491228
test_AUC: 0.4461538461538462
Epoch: 6, Loss: 2.2817825078964233, AUC: 0.7361567982456141
test_AUC: 0.4574358974358974
Epoch: 7, Loss: 2.180802345275879, AUC: 0.7804961622807017
test_AUC: 0.4461538461538461
Epoch: 8, Loss: 2.1407677084207535, AUC: 0.7540433114035088
test_AUC: 0.4471794871794872
Epoch: 9, Loss: 1.969567984342575, AUC: 0.828673245614035
test_AUC: 0.45743589743589746
Epoch: 10, Loss: 1.882045492529869, AUC: 0.8886376096491228
test_AUC: 0.4533333333333333
Epoch: 11, Loss: 1.7047531

## Conclusion

1. use SMOTE to augment train dataset will only worsen the results

## DANN Model

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GINConv, global_mean_pool, global_max_pool, GCNConv, SAGEConv, GATConv, GatedGraphConv, SGConv, ResGatedGraphConv 

import pdb
from torch_geometric.nn import TransformerConv


class GIN_pyg_DANN(nn.Module):
    def __init__(self, net_params, args):
        super().__init__()
        in_channels = net_params["in_channels"]
        hidden_channels = net_params["hidden_channels"]
        out_channels = net_params["out_channels"]
        num_layers = net_params["num_layers"]
        dropout = net_params["dropout"]

        self.readout_type = net_params["readout"]
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.num_layers = num_layers
        self.dropout = dropout

        self.conv1 = GCNConv(self.in_channels, self.hidden_channels)
        self.conv2 = GCNConv(self.hidden_channels, self.hidden_channels)
        self.fc_class = nn.Linear(self.hidden_channels, 1)
        self.fc_domain = nn.Linear(self.hidden_channels, 1)
        
        

    def forward(self, data):
        x, edge_index, edge_weight, batch = data.x, data.edge_index, data.edge_weight.unsqueeze(-1), data.batch
        x = F.relu(self.conv1(x, edge_index, edge_weight)) 
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.relu(self.conv2(x, edge_index, edge_weight))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = global_mean_pool(x, batch)
        predict = self.fc_class(x)
        domain = self.fc_domain(x)

        return predict, domain


## DANN

In [4]:
from torch_geometric.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import wandb

wandb.init(project="brain", name="DANN")


device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
net_params = {
    "in_channels": 379,
    "hidden_channels": 64,
    "out_channels": 1,
    "num_layers": 2,
    "dropout": 0.2,
    "readout": "mean"
}
model = GIN_pyg_DANN(net_params, None).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion_class = nn.BCEWithLogitsLoss()
criterion_domain = nn.BCEWithLogitsLoss()

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

def test(model, data_loader):
    y_true = []
    y_pred = []
    for batch in data_loader:
        data = batch.to(device)
        model.eval()
        with torch.no_grad():
            output, _ = model(data)
            y_true.append(data.y.cpu().numpy())
            y_pred.append(output.cpu().numpy())
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    auc = roc_auc_score(y_true, y_pred)
    wandb.log({"test_AUC": auc})
    print(f"test_AUC: {auc}")

def train(model, optimizer, criterion_class, criterion_domain, train_loader, test_loader):
    for epoch in range(100):
        model.train()
        loss_all = 0
        y_true = []
        y_pred = []
        for batch in train_loader:
            data = batch.to(device)
            # split src and tgt
            src_idx = torch.randperm(len(data), device=device)[:int(3 * len(data) // 4)]
            tgt_idx = torch.tensor(list(set(torch.arange(len(batch))) - set(src_idx))).to(device)

            optimizer.zero_grad()
            class_output, domain_output = model(data)
            

            ## src
            class_output_src = class_output[src_idx]
            domain_output_src = domain_output[src_idx]
            domain_label_src = torch.zeros(len(src_idx))
            domain_label_src = domain_label_src.to(device)
            class_loss = criterion_class(class_output_src.squeeze(-1), batch.y.float()[:len(src_idx)])
            domain_loss_src = criterion_domain(domain_output_src.squeeze(-1), domain_label_src)

            ## tgt
            class_output_tgt = class_output[tgt_idx]
            domain_output_tgt = domain_output[tgt_idx]
            domain_label_tgt = torch.ones(len(tgt_idx))
            domain_label_tgt = domain_label_tgt.to(device)

            domain_loss_tgt = criterion_domain(domain_output_tgt.squeeze(-1), domain_label_tgt)

            loss = class_loss + domain_loss_src + domain_loss_tgt

            loss.backward()
            optimizer.step()
            loss_all += loss.item()
            y_true.append(data.y.cpu().numpy())
            y_pred.append(class_output.detach().cpu().numpy())
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
        auc = roc_auc_score(y_true, y_pred)
        wandb.log({"train_loss": loss_all, "train_AUC": auc}, commit=False)
        print(f"Epoch: {epoch}, Loss: {loss_all}, AUC: {auc}")
        test(model, test_loader)
        
        

train(model, optimizer, criterion_class, criterion_domain, train_loader, test_loader)



0,1
train_AUC,▁
train_loss,▁

0,1
train_AUC,0.54016
train_loss,12.23437




Epoch: 0, Loss: 12.298264741897583, AUC: 0.5041803728070176
test_AUC: 0.5897435897435898
Epoch: 1, Loss: 11.973011493682861, AUC: 0.5061677631578947
test_AUC: 0.5794871794871794
Epoch: 2, Loss: 11.563602447509766, AUC: 0.5213130482456141
test_AUC: 0.5764102564102564
Epoch: 3, Loss: 11.146029949188232, AUC: 0.5134320175438597
test_AUC: 0.5692307692307693
Epoch: 4, Loss: 10.94868290424347, AUC: 0.5535910087719298
test_AUC: 0.5671794871794872
Epoch: 5, Loss: 10.711503863334656, AUC: 0.5563322368421053
test_AUC: 0.5671794871794872
Epoch: 6, Loss: 10.8728609085083, AUC: 0.5703125
test_AUC: 0.5671794871794872
Epoch: 7, Loss: 10.722944140434265, AUC: 0.5722313596491228
test_AUC: 0.5641025641025641
Epoch: 8, Loss: 10.820069432258606, AUC: 0.5782620614035089
test_AUC: 0.5620512820512821
Epoch: 9, Loss: 10.917051553726196, AUC: 0.5656524122807017
test_AUC: 0.5610256410256411
Epoch: 10, Loss: 10.53225064277649, AUC: 0.5790844298245613
test_AUC: 0.557948717948718
Epoch: 11, Loss: 10.60682809352874