# Train and Validation

In [None]:
!pip install networkx==2.6

In [None]:
from pandas.core.base import value_counts
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np
import time
from sklearn.svm import OneClassSVM as OCSVM

def define_gammas():
    """Return the ``gamma`` settings explored in the One-Class SVM grid."""
    return ['scale', 'auto']

def define_nus():
    """Return the ``nu`` values explored in the One-Class SVM grid.

    The list holds 0.05, 0.10, ..., 0.85 followed by 0.005, 0.010, ..., 0.085.
    """
    steps = range(5, 90, 5)
    hundredths = [step / 100 for step in steps]
    thousandths = [step / 1000 for step in steps]
    return hundredths + thousandths

def define_kernels():
    """Return the kernel names explored in the One-Class SVM grid, in order."""
    kernels = ['rbf', 'sigmoid', 'linear', 'poly']
    return kernels

def evaluation_one_class(preds_interest, preds_outliers):
    """Score one-class predictions against the labels implied by their origin.

    Every prediction in ``preds_interest`` is compared with the true label 1
    and every prediction in ``preds_outliers`` with the true label -1.

    Returns:
        The dictionary produced by ``classification_report`` with
        ``output_dict=True``.
    """
    n_interest, n_outliers = len(preds_interest), len(preds_outliers)

    expected = [1] * n_interest
    expected.extend([-1] * n_outliers)

    predicted = [*preds_interest, *preds_outliers]

    return classification_report(expected, predicted, output_dict=True)

def evaluate_model(X_train, X_test, X_outlier, model):
    """Fit ``model`` on ``X_train`` and score it on interest and outlier samples.

    Args:
        X_train: Samples the model is fitted on.
        X_test: Samples scored against the true label 1.
        X_outlier: Samples scored against the true label -1.
        model: Estimator whose ``fit(X)`` returns an object exposing
            ``predict(X)``.

    Returns:
        The report dictionary returned by ``evaluation_one_class``.
    """
    one_class_classifier = model.fit(X_train)

    Y_pred_interest = one_class_classifier.predict(X_test)
    Y_pred_ruido = one_class_classifier.predict(X_outlier)

    # The true labels are rebuilt inside evaluation_one_class from the lengths
    # of the two prediction arrays, so nothing else needs to be computed here.
    return evaluation_one_class(Y_pred_interest, Y_pred_ruido)

def init_metrics():
    """Create an empty accumulator for per-run evaluation results.

    Returns:
        A dict with one entry per report section ('1', '-1', 'macro avg',
        'weighted avg'), each mapping 'precision', 'recall' and 'f1-score' to
        its own empty list, plus empty lists under 'accuracy' and 'time'.
    """
    sections = ('1', '-1', 'macro avg', 'weighted avg')
    scores = ('precision', 'recall', 'f1-score')

    # Every list is created inside the comprehension so no two entries share
    # the same list object.
    metrics = {section: {score: [] for score in scores} for section in sections}
    metrics['accuracy'] = []
    metrics['time'] = []
    return metrics


def save_values(metrics, values):
    """Append one run's results from ``values`` onto the lists in ``metrics``.

    'accuracy' and 'time' are read as top-level entries of ``values``; every
    other key of ``metrics`` is treated as a nested dict whose sub-keys are
    looked up under the same key in ``values``. ``metrics`` is modified in
    place and nothing is returned.
    """
    scalar_keys = ('accuracy', 'time')

    for name, slot in metrics.items():
        if name in scalar_keys:
            slot.append(values[name])
            continue

        for score, history in slot.items():
            history.append(values[name][score])


def extract_emb_from_graph(graph, representation_name):
    """Split the node representations of ``graph`` into the five data subsets.

    Each node's attribute dict is inspected in this order: a node with
    ``train == 1`` goes to the training list; otherwise a node with
    ``val == 1`` goes to the interest (``label == 1``) or non-interest
    (``label == 0``) validation list; otherwise a node with ``test == 1`` goes
    to the matching test list. Nodes matching none of these are skipped.

    Returns:
        ``(x_train, x_int_val, x_nint_val, x_int_test, x_nint_test)``, each a
        list of the values stored under ``representation_name``.
    """
    train, val_interest, val_other, test_interest, test_other = [], [], [], [], []

    for node in graph.nodes():
        attrs = graph.nodes[node]

        if attrs['train'] == 1:
            bucket = train
        elif attrs['val'] == 1 and attrs['label'] == 1:
            bucket = val_interest
        elif attrs['val'] == 1 and attrs['label'] == 0:
            bucket = val_other
        elif attrs['test'] == 1 and attrs['label'] == 1:
            bucket = test_interest
        elif attrs['test'] == 1 and attrs['label'] == 0:
            bucket = test_other
        else:
            continue

        # The representation is read only for nodes that landed in a bucket.
        bucket.append(attrs[representation_name])

    return train, val_interest, val_other, test_interest, test_other

def evaluate_models(l_graphs, representation_name, path, fn):
    """Grid-search One-Class SVM hyper-parameters on the validation split.

    For every kernel/gamma/nu combination the classifier is evaluated once per
    graph in ``l_graphs`` (fit on the training nodes, scored on the interest
    and non-interest validation nodes), and one summary row is written through
    ``write_results``.

    Args:
        l_graphs: Iterable of graphs accepted by ``extract_emb_from_graph``.
        representation_name: Node attribute holding the representation to use.
        path: Output location prefix; concatenated directly with the file name.
        fn: File-name prefix placed before ``representation_name``.
    """
    file_name = fn + representation_name + '_OCSVM.csv'

    # The splits depend only on the graph and the representation, never on the
    # hyper-parameters, so they are extracted once instead of once per grid
    # point.
    splits = [extract_emb_from_graph(graph, representation_name) for graph in l_graphs]

    for kernel in define_kernels():
        for gamma in define_gammas():
            for nu in define_nus():
                ocsvm = OCSVM(kernel=kernel, nu=nu, gamma=gamma)
                line_parameters = 'kernel:' + kernel + '_gamma:' + gamma + '_nu:' + str(nu)
                metrics = init_metrics()

                # The test subsets are ignored during validation.
                for x_train, x_int_val, x_nint_val, _, _ in splits:
                    # Only fitting and prediction are timed.
                    start = time.time()
                    values = evaluate_model(x_train, x_int_val, x_nint_val, ocsvm)
                    end = time.time()
                    values['time'] = end - start
                    save_values(metrics, values)

                write_results(metrics, file_name, line_parameters, path)


def write_results(metrics, file_name, line_parameters, path):
    """Append one ';'-separated summary row for ``metrics`` to a results file.

    The target is ``path + file_name`` (plain string concatenation). If that
    file does not exist yet it is created with a header row first. Each metric
    contributes two columns: the mean and the standard deviation of its list.

    Args:
        metrics: Dict in the layout built by ``init_metrics``: 'accuracy' and
            'time' map to lists, every other key maps to a dict of lists.
        file_name: Name of the results file.
        line_parameters: Text written in the first ('Parameters') column.
        path: Prefix prepended to ``file_name``.
    """
    target = path + file_name

    # Flatten the metrics once into (column label, samples) pairs so the header
    # and the data row are guaranteed to list the columns in the same order.
    columns = []
    for key, entry in metrics.items():
        if key == 'accuracy' or key == 'time':
            columns.append((key, entry))
        else:
            for key2, samples in entry.items():
                columns.append((key + '_' + key2, samples))

    if not Path(target).is_file():
        header = ['Parameters']
        for label, _ in columns:
            header.append(label + '-mean')
            header.append(label + '-std')
        # The context manager closes the file even if the write fails.
        with open(target, 'w') as file_:
            file_.write(';'.join(header) + '\n')

    row = [line_parameters]
    for _, samples in columns:
        row.append(str(np.mean(samples)))
        row.append(str(np.std(samples)))

    with open(target, 'a') as file_:
        file_.write(';'.join(row) + '\n')


In [None]:
import networkx as nx
from pathlib import Path
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Root folder of the pickled graphs; files are looked up below as
# <pt><dataset>/<k>/<dataset>_<k>_fold=<fold>.gpickle
pt = '/content/drive/MyDrive/USP/Doctorate/Research/Articles/OLGA: One cLass Graph Autoencoder/datasets/graphs/'

basepath = Path(pt)

# Prefix (with trailing slash) under which the validation CSV files are written.
path_results = '/content/drive/MyDrive/USP/Doctorate/Research/Articles/OLGA: One cLass Graph Autoencoder/results/'

datasets = basepath.iterdir()

for dataset in ['fakenews', 'terrorism', 'relevant_reviews', 'food', 'strawberry', 'pneumonia', 'musk', 'TUANDROMD']:
    print('no dataset: ' + dataset)
    for k in ['k=1', 'k=2', 'k=3']:
        print('no k: ' + k)

        # Load the ten folds of this dataset/k pair once; they are reused for
        # every representation below.
        l_graphs = []
        for fold in range(10):
            path = pt + dataset + '/' + k + '/' + dataset + '_' + k + '_fold=' + str(fold) + '.gpickle'

            # NOTE(review): read_gpickle unpickles the file, so only load
            # graphs from a trusted location.
            graph = nx.read_gpickle(path)
            l_graphs.append(graph)

        for rep_initial in ['features_node2vec', 'features_deepwalk', 'features_gae','features_node2vec_3', 'features_deepwalk_3', 'features_gae_3', 'features_gae_2', 'features_deepwalk_2', 'features_node2vec_2']:

            print('com rep: ' + rep_initial)
            evaluate_models(l_graphs, rep_initial, path_results, dataset + '_' + k + '_')

# Test

In [None]:
# Prefix under which the validation CSV files were written.
path_results = '/content/drive/MyDrive/USP/Doctorate/Research/Articles/OLGA: One cLass Graph Autoencoder/results/'

# Prefix under which the test CSV files are written.
path_results_test = '/content/drive/MyDrive/USP/Doctorate/Research/Articles/OLGA: One cLass Graph Autoencoder/results_test/'

# Root folder of the pickled graphs.
pt = '/content/drive/MyDrive/USP/Doctorate/Research/Articles/OLGA: One cLass Graph Autoencoder/datasets/graphs/'

basepath = Path(path_results)
datasets = basepath.iterdir()

for dataset in datasets:
    # The validation step writes its CSV files directly into path_results, so
    # plain files sit next to the dataset folders here; only folders can be
    # descended into.
    if not dataset.is_dir():
        continue

    dataset = dataset.name
    print('Dataset: ' + dataset)
    basepath2 = Path(path_results + dataset)
    ks = basepath2.iterdir()

    for k in ks:
        # Same guard one level down: ignore anything that is not a folder.
        if not k.is_dir():
            continue

        k = k.name
        print('K: ' + k)
        basepath3 = Path(path_results + dataset + '/' + k)
        methods = basepath3.iterdir()

        # Load the ten folds of this dataset/k pair once for all methods.
        l_graphs = []
        for fold in range(10):
            path = pt + dataset + '/' + k + '/' + dataset + '_' + k + '_fold=' + str(fold) + '.gpickle'

            graph = nx.read_gpickle(path)
            l_graphs.append(graph)

        pr = path_results_test + dataset + '_' + k + '_'

        for method in methods:
            # Skip files whose name is 'OC' up to the first '-'.
            if method.is_file() and method.name.split('-')[0] != 'OC':

                method = method.name
                df = pd.read_csv(path_results + dataset + '_' + k + '_' + method, sep=';')

                # Pick the first parameter row reaching the best validation
                # macro-averaged F1.
                best_f1 = max(df['macro avg_f1-score-mean'])

                parameters = df[df['macro avg_f1-score-mean'] == best_f1]['Parameters'].iloc[0]

                # Rows are labelled 'kernel:<k>_gamma:<g>_nu:<n>'.
                parts = parameters.split('_')

                kernel = parts[0].split(':')[1]

                gamma = parts[1].split(':')[1]

                nu = float(parts[2].split(':')[1])

                ocsvm = OCSVM(kernel=kernel,nu=nu,gamma=gamma)
                line_parameters =  'kernel:' + kernel + '_gamma:' + gamma + '_nu:' + str(nu)
                metrics = init_metrics()

                for graph in l_graphs:

                    x_train, x_int_val, _, x_int_test, x_nint_test = extract_emb_from_graph(graph, method.replace('_OCSVM.csv', ''))

                    # For the test run the classifier is fitted on the training
                    # samples plus the interest-class validation samples.
                    x_train = np.concatenate([x_train,x_int_val])

                    start = time.time()
                    values = evaluate_model(x_train, x_int_test, x_nint_test, ocsvm)
                    end = time.time()
                    time_ = end - start
                    values['time'] = time_
                    save_values(metrics, values)

                write_results(metrics, method, line_parameters, pr)