In [None]:
import torch
import torch.nn.functional as F
from torch import tensor
from torch.optim import Adam
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, f1_score, roc_auc_score, precision_score
from torch_geometric.data import DataLoader
from torch_geometric.nn import global_mean_pool, MessagePassing
from torch_geometric.nn import GATConv, GCNConv, GINConv, SAGEConv, RGCNConv
from torch_geometric.utils import to_dense_adj, add_self_loops, degree

In [None]:
class GraphSAGEModel(torch.nn.Module):
    """Graph-level binary classifier built from a stack of ``SAGEConv`` layers.

    Pipeline: node features are either projected by a linear layer (followed
    by the activation) or looked up in an embedding table, passed through
    ``num_layers`` ``SAGEConv`` layers (each followed by the activation),
    mean-pooled per graph, regularised with dropout and mapped to two class
    log-probabilities.

    Args:
        dataset: Object exposing ``num_features``; used only to size the
            input linear layer.
        num_layers: Number of ``SAGEConv(hidden, hidden)`` layers.
        hidden: Width of every hidden representation.
        feed_forward_embedding: If truthy, node features are cast to ``long``
            and fed to an embedding table instead of the input linear layer.
        activation_function: One of ``'relu'``, ``'leaky_relu'``,
            ``'sigmoid'``, ``'tanh'``, ``'elu'``.
        dropout_p: Dropout probability applied after pooling.

    Raises:
        ValueError: If ``activation_function`` is not a supported name.
    """

    # Supported activation names mapped to zero-argument module factories.
    _ACTIVATIONS = {
        'relu': torch.nn.ReLU,
        'leaky_relu': lambda: torch.nn.LeakyReLU(negative_slope=0.01),
        'sigmoid': torch.nn.Sigmoid,
        'tanh': torch.nn.Tanh,
        'elu': lambda: torch.nn.ELU(alpha=1.0),
    }

    def __init__(self, dataset, num_layers, hidden, feed_forward_embedding, activation_function, dropout_p):
        super(GraphSAGEModel, self).__init__()

        # Fail fast: previously an unknown name left ``self.ac`` undefined and
        # the error only surfaced later as an AttributeError inside forward().
        if activation_function not in self._ACTIVATIONS:
            raise ValueError(
                "Unsupported activation_function {!r}; expected one of {}".format(
                    activation_function, sorted(self._ACTIVATIONS)))

        num_features = dataset.num_features
        num_classes = 2

        self.input_layer = torch.nn.Linear(num_features, hidden)
        self.sage_layers = torch.nn.ModuleList(
            [SAGEConv(hidden, hidden) for _ in range(num_layers)]
        )
        self.output_layer = torch.nn.Linear(hidden, num_classes)

        self.feed_forward_embedding = feed_forward_embedding
        if feed_forward_embedding:
            # NOTE(review): the table size 53 is hard-coded; presumably the
            # number of distinct node tokens in the data set - confirm.
            self.embedding = torch.nn.Embedding(53, hidden)

        self.ac = self._ACTIVATIONS[activation_function]()
        self.dropout_p = dropout_p

    def forward(self, data):
        """Return per-graph log-probabilities over the two classes.

        Args:
            data: Batch object exposing ``x``, ``edge_index`` and ``batch``.

        Returns:
            Tensor of ``log_softmax`` outputs, one row per graph in the batch.
        """
        edge_index, batch = data.edge_index, data.batch

        if self.feed_forward_embedding:
            # Embedding path: no activation is applied after the lookup.
            x = self.embedding(data.x.to(torch.long))
        else:
            x = self.ac(self.input_layer(data.x.to(torch.float32)))

        for sage_layer in self.sage_layers:
            x = self.ac(sage_layer(x, edge_index))

        x = global_mean_pool(x, batch)
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = self.output_layer(x)

        return F.log_softmax(x, dim=-1)

In [None]:
from torch_geometric.utils import from_networkx
from torch_geometric.data import Dataset
import time
import os
import csv
import math
import time

def get_dataset(graphs):
    """Build a ``CustomGraphDataset`` from a mapping of NetworkX graphs.

    Each value of ``graphs`` is converted with ``from_networkx``; its label is
    read from ``graph.graph['label']`` and its node features from the
    ``graph.node_features`` attribute.

    Args:
        graphs: Mapping whose values are the graphs to convert (keys are
            ignored).

    Returns:
        A ``CustomGraphDataset`` holding one data object per graph, in the
        mapping's iteration order.
    """
    def _convert(graph):
        # One graph -> one PyG data object carrying label ``y`` and features ``x``.
        sample = from_networkx(graph)
        sample.y = torch.tensor([graph.graph['label']], dtype=torch.long)
        sample.x = torch.tensor(graph.node_features)
        return sample

    return CustomGraphDataset([_convert(graph) for graph in graphs.values()])

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def cross_validation_with_val_set(dataset, model, folds, epochs, batch_size,
                                  lr, lr_decay_factor, lr_decay_step_size,
                                  weight_decay, logger=None, info=None, save_model=False):
    """Train and evaluate ``model`` over stratified folds and track the best metrics.

    For every fold the model is trained with ``train`` and scored with
    ``eval_metrics`` after each epoch. Per-epoch loss/metric histories are
    written as JSON under the results directory after every epoch.

    Args:
        dataset: Indexable graph data set accepted by ``k_fold``.
        model: The network to train; moved to the module-level ``device``.
        folds: Number of stratified folds.
        epochs: Accepted for interface compatibility. NOTE(review): currently
            ignored - the schedule below is hard-coded to 200 epochs for the
            first fold and 100 for every later fold.
        batch_size: Mini-batch size for both loaders.
        lr: Initial Adam learning rate.
        lr_decay_factor: Multiplier applied to the learning rate on decay.
        lr_decay_step_size: Decay is applied whenever ``epoch`` is a multiple
            of this value.
        weight_decay: Adam weight decay.
        logger: Optional callable invoked with the per-epoch info dict.
        info: Optional dict with keys ``name``, ``layer``, ``hidden``, ``ac``
            and ``dp`` describing the run. Missing keys fall back to the
            model's class name (``name``) or ``None``.
        save_model: If true, the state dict is saved after every fold.

    Returns:
        ``(train_best_result, test_best_result)``: two lists
        ``[accuracy, precision, recall, f1, auc]`` holding the element-wise
        maxima observed over all folds and epochs.

    NOTE(review): the same model and optimizer are reused across folds without
    re-initialisation, so later folds start from weights that were trained on
    data that is now in their test split, and the learning rate keeps decaying
    from fold to fold. Confirm this warm-start behaviour is intended.
    """
    # Imported locally because the notebook only imports json in a later cell.
    import json

    info = info or {}
    model_name = info.get('name', type(model).__name__)
    layer = info.get('layer')
    hidden = info.get('hidden')
    ac = info.get('ac')
    dp = info.get('dp')
    print(model_name, layer, hidden, ac, dp)

    # Running maxima, ordered as: accuracy, precision, recall, f1, auc.
    train_best_result = [0, 0, 0, 0, 0]
    test_best_result = [0, 0, 0, 0, 0]

    train_results = {"loss": [], 'std': []}
    test_results = {"loss": [], "accuracy": [], "recall": [], "precision": [], "f1": [], "auc": [], 'std': []}

    # Directory and file name for the per-epoch JSON histories.
    results_dir = os.path.expanduser('/content/drive/MyDrive/malware_project/results')
    filename = f"{model_name}_dropout_{dp}_results.json"
    os.makedirs(results_dir, exist_ok=True)

    model.to(device)
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    for fold, (train_idx, test_idx) in enumerate(zip(*k_fold(dataset, folds))):
        train_loader = DataLoader(dataset[train_idx], batch_size, shuffle=True)
        test_loader = DataLoader(dataset[test_idx], batch_size, shuffle=False)

        # Hard-coded schedule (see the ``epochs`` note in the docstring). A
        # separate local keeps the caller's argument from being clobbered.
        fold_epochs = 200 if fold == 0 else 100

        for epoch in range(1, fold_epochs + 1):
            start_time = time.time()

            # Both helpers return:
            #   loss, accuracy, precision, recall, f1, auc, loss_std
            (train_loss, train_acc, train_precision, train_recall,
             train_f1, train_auc, train_std) = train(model, optimizer, train_loader)
            (test_loss, test_accuracy, test_precision, test_recall,
             test_f1, test_auc, test_std) = eval_metrics(model, test_loader)

            # Update the running maxima; they are deliberately NOT reset per
            # epoch, otherwise "best" would just mirror the last epoch.
            train_metrics = (train_acc, train_precision, train_recall, train_f1, train_auc)
            test_metrics = (test_accuracy, test_precision, test_recall, test_f1, test_auc)
            train_best_result = [max(best, current)
                                 for best, current in zip(train_best_result, train_metrics)]
            test_best_result = [max(best, current)
                                for best, current in zip(test_best_result, test_metrics)]

            # Record metrics and persist the full history after every epoch.
            train_results["loss"].append(train_loss)
            train_results["std"].append(train_std)

            test_results["loss"].append(test_loss)
            test_results["accuracy"].append(test_accuracy)
            test_results["recall"].append(test_recall)
            test_results["precision"].append(test_precision)
            test_results["f1"].append(test_f1)
            test_results["auc"].append(test_auc)
            test_results["std"].append(test_std)
            with open(os.path.join(results_dir, f"train_{filename}"), 'w') as f:
                json.dump(train_results, f)
            with open(os.path.join(results_dir, f"test_{filename}"), 'w') as f:
                json.dump(test_results, f)

            eval_info = {
                'fold': fold,
                'epoch': epoch,
                'train_loss': train_loss,
                'val_loss': test_loss,
                'train_acc': train_acc,
                'train_recall': train_recall,
                'train_f1': train_f1,
                'train_auc': train_auc,
                'test_acc': test_accuracy,
                'test_recall': test_recall,
                'test_f1': test_f1,
                'test_auc': test_auc,
                'train_std': train_std,
                'test_std': test_std
            }

            if logger is not None:
                logger(eval_info)

            if epoch % lr_decay_step_size == 0:
                for param_group in optimizer.param_groups:
                    new_lr = lr_decay_factor * param_group['lr']
                    print(new_lr)
                    param_group['lr'] = new_lr

            # The timing is added after the logger call, so it only shows up
            # in the printed dict below.
            eval_info['time'] = time.time() - start_time
            print(eval_info)

        print("Best train results: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-score: {:.4f}, AUC: {:.4f}".format(
            train_best_result[0], train_best_result[1], train_best_result[2], train_best_result[3], train_best_result[4]))

        print("Best test results: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-score: {:.4f}, AUC: {:.4f}".format(
            test_best_result[0], test_best_result[1], test_best_result[2], test_best_result[3], test_best_result[4]))

        if save_model:
            model_save_path = f"/content/drive/MyDrive/malware_project/model/model_{model_name}_fold_{fold+1}.pt"
            # torch.save does not create missing parent directories.
            os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
            torch.save(model.state_dict(), model_save_path)
            print("Model saved at", model_save_path)

    return train_best_result, test_best_result

def k_fold(dataset, folds):
    """Produce stratified train/test index tensors for ``folds`` splits.

    Stratification uses the labels in ``dataset.data.y``; shuffling is
    seeded with a fixed ``random_state`` so the splits are repeatable.

    Args:
        dataset: Object supporting ``len()`` and exposing ``data.y``.
        folds: Number of splits.

    Returns:
        ``(train_indices, test_indices)``: two lists with one index tensor
        per fold each.
    """
    splitter = StratifiedKFold(folds, shuffle=True, random_state=12345)
    # Only the length of the first argument matters for the split.
    placeholder = torch.zeros(len(dataset))
    pairs = [
        (torch.from_numpy(train_part), torch.from_numpy(test_part))
        for train_part, test_part in splitter.split(placeholder, dataset.data.y)
    ]
    train_indices = [train_part for train_part, _ in pairs]
    test_indices = [test_part for _, test_part in pairs]
    return train_indices, test_indices


def num_graphs(data):
    """Return how many graphs ``data`` holds.

    When ``data.batch`` is set, ``data.num_graphs`` is returned; otherwise the
    size of the first dimension of ``data.x`` is used.
    """
    return data.num_graphs if data.batch is not None else data.x.size(0)


def train(model, optimizer, loader):
    """Run one training epoch and return loss statistics and classification metrics.

    The model output is passed to ``F.nll_loss``, so it is expected to be
    log-probabilities. Metrics are computed on the predictions made during
    the epoch (i.e. while the weights are still being updated).

    Args:
        model: Network to optimise; called as ``model(data)``.
        optimizer: Optimizer stepping the model's parameters.
        loader: Iterable of batches exposing ``y`` and a ``to(device)`` method.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1, auc_roc, loss_std)``
        where ``avg_loss`` is the mean loss per graph and ``loss_std`` is the
        (population) standard deviation of the per-batch mean losses.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.train()

    correct = 0
    num_samples = 0
    total_loss = 0.0
    batch_losses = []
    true_labels = []
    predicted_labels = []
    predicted_scores = []

    for data in loader:
        optimizer.zero_grad()
        data = data.to(device)
        out = model(data)
        targets = data.y.view(-1)
        loss = F.nll_loss(out, targets)
        loss.backward()
        optimizer.step()

        # nll_loss averages over the batch, so weight it by the batch size to
        # obtain a correct per-graph mean at the end of the epoch.
        batch_loss = loss.item()
        batch_graphs = num_graphs(data)
        total_loss += batch_loss * batch_graphs
        num_samples += batch_graphs
        batch_losses.append(batch_loss)

        out = out.detach().cpu()
        targets = targets.cpu()
        pred = out.max(1)[1]
        # exp() of a log-probability is the class-1 probability. The previous
        # sigmoid() was only rank-equivalent, so the AUC is unchanged.
        pred_prob = out.exp()[:, 1].numpy()

        correct += pred.eq(targets).sum().item()
        true_labels.extend(targets.numpy())
        predicted_labels.extend(pred.numpy())
        predicted_scores.extend(pred_prob)

    # Normalise by the number of graphs (not batches), matching eval_loss.
    avg_loss = total_loss / num_samples
    # Two-pass variance cannot go negative, unlike E[X^2] - E[X]^2.
    mean_batch_loss = sum(batch_losses) / len(batch_losses)
    loss_variance = sum((value - mean_batch_loss) ** 2 for value in batch_losses) / len(batch_losses)
    loss_std = math.sqrt(loss_variance)

    accuracy = correct / num_samples
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    auc_roc = roc_auc_score(true_labels, predicted_scores)

    return avg_loss, accuracy, precision, recall, f1, auc_roc, loss_std


def eval_acc(model, loader):
    """Return the fraction of graphs in ``loader`` that ``model`` classifies correctly.

    The model is switched to evaluation mode and run without gradient
    tracking; the hit count is divided by ``len(loader.dataset)``.
    """
    model.eval()

    hits = 0
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            predictions = model(batch).max(1)[1]
            hits += predictions.eq(batch.y.view(-1)).sum().item()
    return hits / len(loader.dataset)


def eval_loss(model, loader):
    """Return the mean negative log-likelihood per graph over ``loader``.

    Losses are summed per batch (``reduction='sum'``) and the total is
    divided by ``len(loader.dataset)``. The model runs in evaluation mode
    without gradient tracking.
    """
    model.eval()

    with torch.no_grad():
        summed = sum(
            F.nll_loss(model(batch), batch.y.view(-1), reduction='sum').item()
            for batch in (raw.to(device) for raw in loader)
        )
    return summed / len(loader.dataset)

def eval_metrics(model, loader):
    """Evaluate ``model`` on ``loader`` and return loss statistics and metrics.

    The model output is passed to ``F.nll_loss``, so it is expected to be
    log-probabilities. Batches are moved to the module-level ``device`` (the
    same one the training helpers use) instead of a locally recomputed one,
    so data and model cannot end up on different devices.

    Args:
        model: Network to evaluate; called as ``model(data)``.
        loader: Iterable of batches exposing ``y`` and a ``to(device)`` method.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1, auc_roc, loss_std)``
        where ``avg_loss`` is the mean loss per graph and ``loss_std`` is the
        (population) standard deviation of the per-batch mean losses.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()

    correct = 0
    num_samples = 0
    total_loss = 0.0
    batch_losses = []
    true_labels = []
    predicted_labels = []
    predicted_scores = []

    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data)
            targets = data.y.view(-1)
            pred = output.max(1)[1]
            # exp() of a log-probability is the class-1 probability. The
            # previous sigmoid() was only rank-equivalent, so the AUC is unchanged.
            pred_prob = output.exp()[:, 1].cpu().numpy()

            # nll_loss averages over the batch, so weight it by the batch size
            # to obtain a correct per-graph mean at the end.
            batch_loss = F.nll_loss(output, targets).item()
            batch_graphs = num_graphs(data)
            total_loss += batch_loss * batch_graphs
            num_samples += batch_graphs
            batch_losses.append(batch_loss)

            correct += pred.eq(targets).sum().item()
            true_labels.extend(targets.cpu().numpy())
            predicted_labels.extend(pred.cpu().numpy())
            predicted_scores.extend(pred_prob)

    # Normalise by the number of graphs (not batches), matching eval_loss.
    avg_loss = total_loss / num_samples
    # Two-pass variance cannot go negative, unlike E[X^2] - E[X]^2.
    mean_batch_loss = sum(batch_losses) / len(batch_losses)
    loss_variance = sum((value - mean_batch_loss) ** 2 for value in batch_losses) / len(batch_losses)
    loss_std = math.sqrt(loss_variance)

    accuracy = correct / num_samples
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    auc_roc = roc_auc_score(true_labels, predicted_scores)

    return avg_loss, accuracy, precision, recall, f1, auc_roc, loss_std

class _GraphLabelView:
    """Lightweight stand-in exposing the labels of a list of data objects as ``y``."""

    def __init__(self, data_list):
        self.data_list = data_list

    @property
    def y(self):
        # Rebuilt on every access from each item's scalar ``y``.
        labels = [item.y.item() for item in self.data_list]
        return torch.tensor(labels, dtype=torch.long)


class CustomGraphDataset(Dataset):
    """In-memory data set wrapping a plain Python list of graph data objects.

    Args:
        data_list: The data objects to serve, in order.
        transform: Forwarded to the ``Dataset`` base class.
        pre_transform: Forwarded to the ``Dataset`` base class.
    """

    def __init__(self, data_list, transform=None, pre_transform=None):
        # No root directory: nothing is read from or written to disk here.
        super().__init__(None, transform, pre_transform)
        self.data_list = data_list

    def len(self):
        """Number of stored data objects."""
        return len(self.data_list)

    def get(self, idx):
        """Return the data object stored at position ``idx``."""
        return self.data_list[idx]

    @property
    def data(self):
        """Object whose ``y`` attribute is a long tensor of all stored labels."""
        return _GraphLabelView(self.data_list)

def save_experiment_results(directory, word_embedding, layers, hidden, model, train_metrics, test_metrics):
    """Append one experiment row to ``<directory>/experiment_results.csv``.

    The directory is created if needed. The header row is written when the
    CSV file does not exist yet or exists but is empty, so the file always
    starts with a header.

    Args:
        directory: Folder that holds (or will hold) ``experiment_results.csv``.
        word_embedding: Value for the 'Word Embedding' column.
        layers: Value for the 'Layers' column.
        hidden: Value for the 'Hidden' column.
        model: Value for the 'Model' column.
        train_metrics: Sequence of exactly five values: accuracy, precision,
            recall, F1-score, AUC-ROC.
        test_metrics: Same layout as ``train_metrics`` for the test split.

    Raises:
        ValueError: If either metrics sequence does not have exactly five items.
    """
    os.makedirs(directory, exist_ok=True)
    csv_file_path = os.path.join(directory, 'experiment_results.csv')

    # Unpacking enforces the expected five-metric layout.
    train_accuracy, train_precision, train_recall, train_f1_score, train_auc_roc = train_metrics
    test_accuracy, test_precision, test_recall, test_f1_score, test_auc_roc = test_metrics

    header = ['Word Embedding', 'Layers', 'Hidden', 'Model', 'Train_Accuracy', 'Train_Precision', 'Train_Recall', 'Train_F1-score', 'Train_AUC-ROC', 'Test_Accuracy', 'Test_Precision', 'Test_Recall', 'Test_F1-score', 'Test_AUC-ROC']
    row_data = [word_embedding, layers, hidden, model,
                train_accuracy, train_precision, train_recall, train_f1_score, train_auc_roc,
                test_accuracy, test_precision, test_recall, test_f1_score, test_auc_roc]

    # A missing OR empty file still needs its header row.
    needs_header = not os.path.exists(csv_file_path) or os.path.getsize(csv_file_path) == 0

    with open(csv_file_path, 'a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(header)
        writer.writerow(row_data)

In [None]:
from itertools import product
import json
import os

def logger(info):
    """Print a one-line summary of an epoch from the ``info`` dict.

    ``info['fold']`` is zero-based and is shown one-based. Losses are printed
    with four decimals, all other values with three.
    """
    # The runs of spaces after the commas reproduce the established output format.
    template = (
        '{:02d}/{:03d}: Train_loss: {:.4f}, Val Loss: {:.4f}, Train Accuracy: {:.3f}, Test Accuracy: {:.3f},'
        '    Train Recall: {:.3f}, Test Recall: {:.3f}, Train F1-Score: {:.3f}, Test F1-Score: {:.3f},'
        '    Train Auc_Roc: {:.3f}, Test Auc_Roc: {:.3f}, train_std: {:.3f}, test_std: {:.3f}'
    )
    # Keys in the order the template consumes them (after fold and epoch).
    ordered_keys = (
        'train_loss', 'val_loss',
        'train_acc', 'test_acc',
        'train_recall', 'test_recall',
        'train_f1', 'test_f1',
        'train_auc', 'test_auc',
        'train_std', 'test_std',
    )
    values = [info[key] for key in ordered_keys]
    print(template.format(info['fold'] + 1, info['epoch'], *values))

# --- Experiment configuration -------------------------------------------

# Cross-validation and loader settings.
folds, epochs, batch_size = 5, 200, 64

# Optimisation: initial learning rate and its step-decay schedule.
lr, lr_decay_factor, lr_decay_step_size = 0.001, 0.2, 50

# Architecture grids. NOTE(review): the run loop visible in this notebook
# uses its own `layer` / `hidden` values rather than these two lists.
layers, hiddens = [6], [64]

# Sweep axes; full alternatives that have been used:
#   nets: ['GIN', 'SAGE', 'GCN']
#   activation_functions: ['relu', 'leaky_relu', 'sigmoid', 'tanh', 'elu']
nets = ['SAGE']
activation_functions = ['leaky_relu']
dropout_p = [0.2]

# Despite the name this is a directory: it is handed to
# save_experiment_results as the folder that receives the CSV file.
csv_file_path = '/content/drive/MyDrive/malware_project/'

def release_model(model):
    """Move ``model`` to the CPU and hand cached GPU memory back to the driver.

    ``Module.to`` moves the parameters in place, so the caller's object ends
    up on the CPU as well. This function cannot delete the caller's
    reference (a ``del`` here would only drop the local name); callers that
    want the model itself freed must drop their own reference.

    Args:
        model: The ``torch.nn.Module`` to move off the GPU.
    """
    import gc  # local import: only needed for this clean-up helper

    # Move the model to CPU so its parameters no longer occupy GPU memory.
    model.to('cpu')

    # Collect unreachable objects first so their CUDA blocks become
    # reclaimable, then release the cached blocks.
    gc.collect()
    torch.cuda.empty_cache()
# Experiment driver: build one model per (net, activation, dropout)
# combination, cross-validate it and append its best metrics to the CSV.
word_embedding = 'one_hot'
dataset = datasets_one_hot
layer = 12
hidden = 128
for net, ac, dp in product(nets, activation_functions, dropout_p):
    if net == 'GCN':
        model = GCNModel(dataset, layer, hidden, False, ac, dp)
    elif net == 'GIN':
        # Previously tested 'GCN' a second time, so GINModel was unreachable
        # and 'GIN' silently trained a GraphSAGE model.
        model = GINModel(dataset, layer, hidden, False, ac, dp)
    elif net == 'SAGE':
        model = GraphSAGEModel(dataset, layer, hidden, False, ac, dp)
    else:
        raise ValueError("Unknown network type {!r}; expected 'GCN', 'GIN' or 'SAGE'".format(net))

    info = {'name': net, 'layer': layer, 'hidden': hidden, 'word_embedding': word_embedding, 'ac': ac, 'dp': dp}
    print(info)
    train_best_result, test_best_result = cross_validation_with_val_set(
        dataset,
        model,
        folds=folds,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        lr_decay_factor=lr_decay_factor,
        lr_decay_step_size=lr_decay_step_size,
        weight_decay=0,
        logger=logger,
        info=info
    )
    release_model(model)

    save_experiment_results(csv_file_path, word_embedding, layer, hidden, net, train_best_result, test_best_result)