In [23]:
import os
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [24]:
global HIDDEN_DIM, OUTPUT_DIM
HIDDEN_DIM = 64
OUTPUT_DIM = 32
BATCH_SIZE = 256  # Adjust based on GPU memory
EPOCHS = 50
LEARNING_RATE = 0.003
SAVE_INTERVAL = 10  # Save model every 5 epochs
TEST_NUM_FILE=5000
ISA="x86"

In [25]:
class GraphPairDataset(torch.utils.data.Dataset):
    def __init__(self,pos_dir,neg_dir):
        self.pos_dir=pos_dir
        self.neg_dir=neg_dir
        self.file_pairs=self._get_file_pairs()

    def _get_file_pairs(self):
        pos_pairs = self._get_pairs_from_dir(self.pos_dir, is_positive=True)
        neg_pairs = self._get_pairs_from_dir(self.neg_dir, is_positive=False)
        return pos_pairs + neg_pairs
    
    def _get_pairs_from_dir(self,directory,is_positive):
        all_files=os.listdir(directory)[:TEST_NUM_FILE]
        file_pairs=[]
        for file in all_files:
            if file.endswith('.csv'):
                pair_id=file.split('$')[0]
                file_b=file.replace('$a$','$b$')
                if file_b in all_files:
                    file_pairs.append((os.path.join(directory, file), 
                                       os.path.join(directory, file_b), 
                                       int(is_positive)))
        return file_pairs
    

    def __len__(self):
        return len(self.file_pairs)
    
    def __getitem__(self, idx):
        file_a, file_b, label = self.file_pairs[idx]
        graph_a = self._create_graph(file_a)
        graph_b = self._create_graph(file_b)
        return graph_a, graph_b, label
    
    def _create_graph(self, file_path):
        adj_matrix = pd.read_csv(file_path, header=None).values
        edge_index = (adj_matrix > 0).nonzero()
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        
        num_nodes = adj_matrix.shape[0]
        x = torch.eye(num_nodes)  # Use one-hot encoding for node features
        
        return Data(x=x, edge_index=edge_index)

In [26]:
class BinGNNModel(torch.nn.Module):
    def __init__(self,input_dim,hidden_dim,output_dim):
        super(BinGNNModel,self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = self.conv3(x, edge_index)
        
        x = global_mean_pool(x, data.batch)
        return x    

In [27]:
class SiameseGNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SiameseGNN, self).__init__()
        self.gnn = BinGNNModel(input_dim, hidden_dim, output_dim)
        self.fc = torch.nn.Linear(output_dim * 2, 1)

    def forward(self, graph1, graph2):
        out1 = self.gnn(graph1)
        out2 = self.gnn(graph2)
        out = torch.cat([out1, out2], dim=1)
        return torch.sigmoid(self.fc(out))

In [28]:
def train_model(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    for graph1, graph2, labels in train_loader:
        graph1, graph2, labels = graph1.to(device), graph2.to(device), labels.float().to(device)
        optimizer.zero_grad()
        output = model(graph1, graph2).squeeze()
        loss = F.binary_cross_entropy(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

In [29]:
def evaluate_model(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for graph1, graph2, labels in loader:
            graph1, graph2, labels = graph1.to(device), graph2.to(device), labels.to(device)
            output = model(graph1, graph2).squeeze()
            predicted = (output > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

In [30]:
def save_model(model, optimizer, epoch, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)

In [31]:
def load_model(model, optimizer, path):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    return model, optimizer, epoch

In [32]:
def test_saved_model(model_path, test_loader, device):
    # Initialize model and optimizer (with dummy parameters)
    input_dim = next(iter(test_loader))[0].num_node_features
    model = SiameseGNN(input_dim, HIDDEN_DIM, OUTPUT_DIM).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    # Load the saved model
    model, _, _ = load_model(model, optimizer, model_path)

    # Evaluate the model
    test_acc = evaluate_model(model, test_loader, device)
    print(f'Loaded Model Test Accuracy: {test_acc:.4f}')

In [33]:
def main():
    # Hyperparameters


    # Set up device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and prepare data
    pos_dir = f"dataset/similar_func_pairs/{ISA}/pos"
    neg_dir = f"dataset/similar_func_pairs/{ISA}/neg"
    dataset = GraphPairDataset(pos_dir, neg_dir)
    train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Initialize model
    input_dim = dataset[0][0].num_node_features
    model = SiameseGNN(input_dim, HIDDEN_DIM, OUTPUT_DIM).to(device)

    # Set up optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Training loop
    for epoch in range(EPOCHS):
        train_loss = train_model(model, train_loader, optimizer, device)
        train_acc = evaluate_model(model, train_loader, device)
        test_acc = evaluate_model(model, test_loader, device)
        print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

        # Save model periodically
        if (epoch + 1) % SAVE_INTERVAL == 0:
            save_path = f'model_checkpoint_epoch_{epoch+1}.pkl'
            save_model(model, optimizer, epoch, save_path)
            print(f'Model saved to {save_path}')

    # Final evaluation
    final_test_acc = evaluate_model(model, test_loader, device)
    print(f'Final Test Accuracy: {final_test_acc:.4f}')

    # Save final model
    final_save_path = 'model_final.pkl'
    save_model(model, optimizer, EPOCHS, final_save_path)
    print(f'Final model saved to {final_save_path}')

    # Test the saved model
    test_saved_model(final_save_path, test_loader, device)

if __name__ == '__main__':
    main()

  edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()


RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 1 but got size 16 for tensor number 1 in the list.