In [None]:
from google.colab import drive

# Mount Google Drive under /content/drive so the graph and result folders
# referenced by the paths further down in this notebook are reachable.
drive.mount("/content/drive", force_remount = True)

Mounted at /content/drive


In [None]:
# Root folder of the pickled graphs; the cells below read
# <root>/<dataset>/<GNN>/<file> from it.
PATH_TO_GAE_GRAPHS = "/content/drive/MyDrive/USP/Doctorate/Research/Articles/Early Fusion For One-class Learning On Heterogeneous Graphs/grafos_gae"

In [61]:
from sklearn.model_selection import ParameterGrid

# Search-space entries that every convolution type has in common.
_SHARED_GRID = {
    'hidden': [[32], [64], [32, 32], [64, 64]],
    'lr': [1e-2, 1e-3, 1e-4],
    'patience': [50, 100],
}

# One grid per convolution type; GRAPHSAGE and GAT add one extra key each.
HYPERPARAMETER_GRID = [
    {'conv_type': ["GCN"], **_SHARED_GRID},
    {'conv_type': ["GRAPHSAGE"], 'aggr': ['mean', 'max'], **_SHARED_GRID},
    {'conv_type': ["GAT"], 'heads': [4, 8], **_SHARED_GRID},
]

# Convolution type -> fully expanded list of hyper-parameter combinations.
hyperparameter_list_gae = {
    grid['conv_type'][0]: list(ParameterGrid(grid))
    for grid in HYPERPARAMETER_GRID
}

In [62]:
import os

# Entries found directly under the graph root; the bare name on the last line
# displays them as the cell output.
graphs = os.listdir(PATH_TO_GAE_GRAPHS)
graphs

['rec_sys_1',
 'rec_sys_3',
 'rec_sys_2',
 'rec_sys_4',
 'event',
 'rec_sys_5',
 'music',
 'fakenews']

In [60]:
from pandas.core.base import value_counts
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np
import time
from pathlib import Path
from sklearn.svm import OneClassSVM as OCSVM
from sklearn.model_selection import KFold

def define_gammas():
  """Return the `gamma` settings tried for the one-class SVM."""
  return ['scale', 'auto']

def define_nus():
  """Return the `nu` values tried for the one-class SVM.

  The list holds 0.05 .. 0.85 in steps of 0.05 followed by
  0.005 .. 0.085 in steps of 0.005.
  """
  steps = range(5, 90, 5)
  coarse = [step / 100 for step in steps]
  fine = [step / 1000 for step in steps]
  return coarse + fine

def define_kernels():
  """Return the kernel names tried for the one-class SVM."""
  kernels = ['rbf', 'sigmoid', 'linear', 'poly']
  return kernels

def evaluation_one_class(preds_interest, preds_outliers):
    """Build a classification report for one-class predictions.

    Every entry of `preds_interest` is scored against the true label 1 and
    every entry of `preds_outliers` against the true label -1.

    Returns the `classification_report` result as a dict.
    """
    expected = [1] * len(preds_interest) + [-1] * len(preds_outliers)
    predicted = [*preds_interest, *preds_outliers]
    return classification_report(expected, predicted, output_dict=True)

def evaluate_model(X_train, X_test, X_outlier, model):
    """Fit `model` on `X_train` and score it on interest and outlier samples.

    Args:
        X_train: samples used to fit the model.
        X_test: held-out samples of the interest class (true label 1).
        X_outlier: samples outside the interest class (true label -1).
        model: estimator exposing `fit(X)` (returning the fitted estimator)
            and `predict(X)`.

    Returns:
        The report dict produced by `evaluation_one_class`.
    """
    one_class_classifier = model.fit(X_train)

    Y_pred_interest = one_class_classifier.predict(X_test)
    Y_pred_ruido = one_class_classifier.predict(X_outlier)

    # The true labels are rebuilt inside evaluation_one_class, so the unused
    # local copy that used to be computed here was dropped.
    return evaluation_one_class(Y_pred_interest, Y_pred_ruido)

def init_metrics():
    """Return an empty accumulator for the results of repeated runs.

    The four report sections ('1', '-1', 'macro avg', 'weighted avg') each
    map to a dict of empty lists for precision, recall and f1-score;
    'accuracy' and 'time' map directly to empty lists.
    """
    sections = ('1', '-1', 'macro avg', 'weighted avg')
    stats = ('precision', 'recall', 'f1-score')

    # A fresh list per (section, stat) so no two entries share storage.
    metrics = {section: {stat: [] for stat in stats} for section in sections}
    metrics['accuracy'] = []
    metrics['time'] = []
    return metrics


def save_values(metrics, values):
    """Append one run's results from `values` onto the lists in `metrics`.

    'accuracy' and 'time' are read directly from `values`; every other key of
    `metrics` is a dict whose entries are read from `values[key][sub_key]`.
    `metrics` is modified in place.
    """
    scalar_keys = ('accuracy', 'time')
    for name, store in metrics.items():
        if name in scalar_keys:
            store.append(values[name])
            continue
        for stat, history in store.items():
            history.append(values[name][stat])


def _fuse_representations(vectors, operator):
  """Combine a list of vectors (target first, neighbours after) with `operator`.

  Returns None for an operator name that is not handled.
  """
  if operator == 'concatenate':
    return np.concatenate(vectors)
  if operator == 'sum':
    return np.sum(vectors, axis=0)
  if operator == 'avg':
    return np.mean(vectors, axis=0)
  if operator == 'min':
    return np.min(vectors, axis=0)
  if operator == 'max':
    return np.max(vectors, axis=0)
  if operator == 'sub':
    # Left-to-right: target minus each neighbour vector in turn.
    result = vectors[0]
    for vector in vectors[1:]:
      result = np.subtract(result, vector)
    return result
  if operator == 'multiply':
    # Left-to-right element-wise product.
    result = vectors[0]
    for vector in vectors[1:]:
      result = np.multiply(result, vector)
    return result
  return None


def operation(graph, list_nodes, representation_name, operador, dataset):
  """Fuse each node's representation with those of its neighbours.

  Args:
    graph: object exposing `nodes[node][representation_name]` and
      `neighbors(node)`.
    list_nodes: nodes to build a fused representation for.
    representation_name: node attribute holding the representation.
    operador: fusion operator name (Portuguese spelling kept for the existing
      interface): 'concatenate', 'sum', 'sub', 'avg', 'min', 'max' or
      'multiply'.
    dataset: dataset name; 'event' selects per-neighbour-type averaging.

  Returns:
    A list with one fused representation per node in `list_nodes`
    (None for an unhandled operator name).
  """
  # Bug fix: the body used to read a global named `operator` instead of the
  # `operador` argument, so it only worked when a notebook cell had leaked a
  # matching global and raised NameError otherwise.
  operator = operador

  # Neighbour types averaged separately for the 'event' dataset; a neighbour
  # belongs to a type when the type name is a substring of its node name.
  event_neighbor_types = ('what', 'when', 'where', 'who', 'how', 'cluster_code', 'iptc_code')

  x = []
  for node in list_nodes:
    rep = graph.nodes[node][representation_name]

    if dataset == 'event':
      grouped = {key: [] for key in event_neighbor_types}
      for n in graph.neighbors(node):
        for key in grouped:
          if key in n:
            grouped[key].append(graph.nodes[n][representation_name])
      neighbor_reps = [np.mean(grouped[key], axis=0) for key in event_neighbor_types]
    else:
      # All neighbours are averaged into a single vector.
      reps = [graph.nodes[n][representation_name] for n in graph.neighbors(node)]
      neighbor_reps = [np.mean(reps, axis=0)]

    x.append(_fuse_representations([rep] + neighbor_reps, operator))

  return x

def extract_emb_from_graph(graph, representation_name, interest_class, nodes_train, nodes_test, nodes_out, operator, principal_node, dataset):
  """Collect the feature lists for the train, interest-test and outlier nodes.

  With operator 'without' each node contributes its own
  `representation_name` attribute; any other operator delegates to
  `operation`, which fuses the node with its neighbours.

  Returns:
    (x_train, x_int_test, x_nint_test), one list per node group.
  """
  node_groups = (nodes_train, nodes_test, nodes_out)

  if operator != 'without':
    fused = [operation(graph, group, representation_name, operator, dataset) for group in node_groups]
    return fused[0], fused[1], fused[2]

  plain = [[graph.nodes[node][representation_name] for node in group] for group in node_groups]
  return plain[0], plain[1], plain[2]

def evaluate_models(G, representation_name, path, file_name, hyperparams_gae, interest_class, l_int, l_nint, operator, principal_node, dataset):
    """Grid-search a one-class SVM with 5-fold cross-validation and log results.

    For every (kernel, gamma, nu) combination the model is fitted on each
    training fold of `l_int`, evaluated on the matching test fold plus all
    nodes in `l_nint`, and the per-fold reports are passed to `write_results`.

    Args:
        G: graph holding the node representations.
        representation_name: node attribute used as features.
        path, file_name: concatenated by `write_results` to form the output file.
        hyperparams_gae: label written at the start of each result line.
        interest_class: forwarded to `extract_emb_from_graph`.
        l_int: nodes of the interest class (split into folds).
        l_nint: nodes outside the interest class (always used for testing).
        operator: fusion operator forwarded to `extract_emb_from_graph`.
        principal_node, dataset: forwarded to `extract_emb_from_graph`.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=81)
    interest_nodes = np.array(l_int)

    # The features depend only on the fold, not on the SVM settings, and the
    # splitter is seeded, so they are built once here instead of once per
    # (kernel, gamma, nu) combination.
    fold_features = [
        extract_emb_from_graph(G, representation_name, interest_class,
                               interest_nodes[train_index], interest_nodes[test_index],
                               l_nint, operator, principal_node, dataset)
        for train_index, test_index in kf.split(l_int)
    ]

    for kernel in define_kernels():
        for gamma in define_gammas():
            for nu in define_nus():
                ocsvm = OCSVM(kernel=kernel, nu=nu, gamma=gamma)
                line_parameters = str(hyperparams_gae) + '_kernel:' + kernel + '_gamma:' + gamma + '_nu:' + str(nu)
                metrics = init_metrics()

                for x_train, x_int_test, x_nint_test in fold_features:
                    # Only fitting and prediction are timed.
                    start = time.time()
                    values = evaluate_model(x_train, x_int_test, x_nint_test, ocsvm)
                    values['time'] = time.time() - start
                    save_values(metrics, values)

                write_results(metrics, file_name, line_parameters, path)


def write_results(metrics, file_name, line_parameters, path):
    """Append the mean and standard deviation of every metric to a ';'-separated file.

    The target is `path + file_name` (plain string concatenation). When the
    file does not exist yet, a header line is written first.

    Args:
        metrics: accumulator as built by `init_metrics`; 'accuracy' and 'time'
            are lists, every other key maps to a dict of lists.
        file_name: name of the results file.
        line_parameters: text placed in the first column of the data line.
        path: prefix prepended to `file_name`.
    """
    results_file = path + file_name
    scalar_keys = ('accuracy', 'time')

    if not Path(results_file).is_file():
        header = ['Parameters']
        for key in metrics.keys():
            if key in scalar_keys:
                header += [key + '-mean', key + '-std']
            else:
                for key2 in metrics[key].keys():
                    header += [key + '_' + key2 + '-mean', key + '_' + key2 + '-std']
        # Context managers close the handle even when the write fails.
        with open(results_file, 'w') as file_:
            file_.write(';'.join(header) + '\n')

    row = [line_parameters]
    for key in metrics.keys():
        if key in scalar_keys:
            row += [str(np.mean(metrics[key])), str(np.std(metrics[key]))]
        else:
            for key2 in metrics[key].keys():
                row += [str(np.mean(metrics[key][key2])), str(np.std(metrics[key][key2]))]

    with open(results_file, 'a') as file_:
        file_.write(';'.join(row) + '\n')


# Music

In [None]:
import pickle as pkl
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

# Encoder whose graph folder and results folder this section uses.
GNN = 'GRAPHSAGE'

# Folder the result CSVs are written to (one sub-folder per encoder).
path_results = f'/content/drive/MyDrive/USP/Doctorate/Research/Articles/Early Fusion For One-class Learning On Heterogeneous Graphs/resultados/{GNN}/'

dataset = 'music'

# Node label value treated as the class of interest.
interest_class = 'hit'

# Substring of the node name that marks the nodes to classify.
principal_node = 'song'

## REG

In [None]:
# Baseline ("reg") run: one-class evaluation on the `f_features` attribute,
# once per fusion operator.
graph_dir = f"{PATH_TO_GAE_GRAPHS}/{dataset}/{GNN}"
graph_files = os.listdir(graph_dir)
if not graph_files:
  raise FileNotFoundError(f"No graph pickles found in {graph_dir}")

# Fix: the file name was listed from <dataset>/<GNN> but opened from
# <dataset>/; it is now opened from the folder it was listed in. As before,
# only the first listed file is used. The old zip() over the dict keys of
# hyperparameter_list_gae plus `break` is gone, and the graph is loaded once
# instead of once per operator.
# NOTE: unpickling executes code stored in the file; only load trusted pickles.
with open(f"{graph_dir}/{graph_files[0]}", "rb") as file:
  G = pkl.load(file)

# Split the principal nodes into interest / non-interest lists by label.
l_int, l_nint = [], []
for node in G.nodes():
  if principal_node not in node:
    continue
  if G.nodes[node]['label'] == interest_class:
    l_int.append(node)
  else:
    l_nint.append(node)

for operator in ['without', 'sum', 'sub', 'avg', 'min', 'max', 'multiply', 'concatenate']:
  print(f"Dataset: {dataset} - baseline ('reg') features - operator: {operator}")
  evaluate_models(G, 'f_features', path_results, dataset + '_' + operator + '.csv', 'reg', interest_class, l_int, l_nint, operator, principal_node, dataset)

## GAE

In [None]:
# GAE run: one-class evaluation on the `gae_features` attribute for every
# fusion operator and every pickled graph of the selected encoder.
# NOTE(review): hyper-parameter dicts are paired with files purely by position
# and os.listdir returns entries in arbitrary order - confirm the label written
# to the results really belongs to the file that was loaded.
gae_dir = f"{PATH_TO_GAE_GRAPHS}/{dataset}/{GNN}/"

for operator in ['without', 'sum', 'sub', 'avg', 'min', 'max', 'multiply', 'concatenate']:
  print(operator)
  for hyperparams_gae, pickle in zip(hyperparameter_list_gae[GNN], os.listdir(gae_dir)):
    print(f"Dataset: {dataset} - GAE Hyperparameters: {hyperparams_gae}")
    with open(f"{gae_dir}{pickle}", "rb") as file:
      G = pkl.load(file)

      # Split the principal nodes into interest / non-interest lists by label.
      l_int, l_nint = [], []
      for node in G.nodes():
        if principal_node not in node:
          continue
        bucket = l_int if G.nodes[node]['label'] == interest_class else l_nint
        bucket.append(node)

      evaluate_models(G, 'gae_features', path_results, dataset + '-gae_' + operator + '.csv', hyperparams_gae, interest_class, l_int, l_nint, operator, principal_node, dataset)

# Fake News


In [None]:
import pickle as pkl
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

# Encoder whose graph folder and results folder this section uses.
GNN = 'GRAPHSAGE'

# Folder the result CSVs are written to (one sub-folder per encoder).
path_results = f'/content/drive/MyDrive/USP/Doctorate/Research/Articles/Early Fusion For One-class Learning On Heterogeneous Graphs/resultados/{GNN}/'

dataset = 'fakenews'

# Node label value treated as the class of interest.
interest_class = 'fake'

## REG

In [None]:
# Baseline ("reg") run: one-class evaluation on the `f_features` attribute,
# once per fusion operator.
graph_dir = f"{PATH_TO_GAE_GRAPHS}/{dataset}/{GNN}"
graph_files = os.listdir(graph_dir)
if not graph_files:
  raise FileNotFoundError(f"No graph pickles found in {graph_dir}")

# Fix: the file name was listed from <dataset>/<GNN> but opened from
# <dataset>/; it is now opened from the folder it was listed in. As before,
# only the first listed file is used. The old zip() over the dict keys of
# hyperparameter_list_gae plus `break` is gone, and the graph is loaded once
# instead of once per operator.
# NOTE: unpickling executes code stored in the file; only load trusted pickles.
with open(f"{graph_dir}/{graph_files[0]}", "rb") as file:
  G = pkl.load(file)

# In this dataset the nodes to classify are the ones whose identifier is an int.
l_int, l_nint = [], []
for node in G.nodes():
  if type(node) != int:
    continue
  if G.nodes[node]['label'] == interest_class:
    l_int.append(node)
  else:
    l_nint.append(node)

for operator in ['concatenate', 'without', 'sum', 'sub', 'avg', 'min', 'max', 'multiply']:
  print(f"Dataset: {dataset} - baseline ('reg') features - operator: {operator}")
  evaluate_models(G, 'f_features', path_results, dataset + '_' + operator + '.csv', 'reg', interest_class, l_int, l_nint, operator, '', dataset)

## GAE

In [None]:
# GAE run: one-class evaluation on the `gae_features` attribute for every
# fusion operator and every pickled graph of the selected encoder.
# NOTE(review): hyper-parameter dicts are paired with files purely by position
# and os.listdir returns entries in arbitrary order - confirm the label written
# to the results really belongs to the file that was loaded.
gae_dir = f"{PATH_TO_GAE_GRAPHS}/{dataset}/{GNN}/"

for operator in ['without', 'sum', 'sub', 'avg', 'min', 'max', 'multiply', 'concatenate']:
  print(operator)
  for hyperparams_gae, pickle in zip(hyperparameter_list_gae[GNN], os.listdir(gae_dir)):
    print(f"Dataset: {dataset} - GAE Hyperparameters: {hyperparams_gae}")
    with open(f"{gae_dir}{pickle}", "rb") as file:
      G = pkl.load(file)

      # In this dataset the nodes to classify are the ones whose identifier is an int.
      l_int, l_nint = [], []
      for node in G.nodes():
        if type(node) != int:
          continue
        bucket = l_int if G.nodes[node]['label'] == interest_class else l_nint
        bucket.append(node)

      evaluate_models(G, 'gae_features', path_results, dataset + '-gae_' + operator + '.csv', hyperparams_gae, interest_class, l_int, l_nint, operator, '', dataset)

# Events

In [63]:
import pickle as pkl
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

# Encoder whose graph folder and results folder this section uses.
GNN = 'GAT'

# Folder the result CSVs are written to (one sub-folder per encoder).
path_results = f'/content/drive/MyDrive/USP/Doctorate/Research/Articles/Early Fusion For One-class Learning On Heterogeneous Graphs/resultados/{GNN}/'

dataset = 'event'

# Node label value treated as the class of interest.
interest_class = 'f1'

# Substring of the node name that marks the nodes to classify.
principal_node = 'event'

## REG

In [None]:
# Baseline ("reg") run: one-class evaluation on the `f_features` attribute,
# once per fusion operator.
graph_dir = f"{PATH_TO_GAE_GRAPHS}/{dataset}/{GNN}"
graph_files = os.listdir(graph_dir)
if not graph_files:
  raise FileNotFoundError(f"No graph pickles found in {graph_dir}")

# Fix: the file name was listed from <dataset>/<GNN> but opened from
# <dataset>/; it is now opened from the folder it was listed in. As before,
# only the first listed file is used. The old zip() over the dict keys of
# hyperparameter_list_gae plus `break` is gone, and the graph is loaded once
# instead of once per operator.
# NOTE: unpickling executes code stored in the file; only load trusted pickles.
with open(f"{graph_dir}/{graph_files[0]}", "rb") as file:
  G = pkl.load(file)

# Split the principal nodes into interest / non-interest lists by label.
l_int, l_nint = [], []
for node in G.nodes():
  if principal_node not in node:
    continue
  if G.nodes[node]['label'] == interest_class:
    l_int.append(node)
  else:
    l_nint.append(node)

for operator in ['concatenate', 'min', 'without', 'sum', 'sub', 'avg', 'max', 'multiply']:
  print(f"Dataset: {dataset} - baseline ('reg') features - operator: {operator}")
  evaluate_models(G, 'f_features', path_results, dataset + '_' + operator + '.csv', 'reg', interest_class, l_int, l_nint, operator, principal_node, dataset)

## GAE

In [None]:
# GAE run: one-class evaluation on the `gae_features` attribute for every
# fusion operator and every pickled graph of the selected encoder.
# NOTE(review): hyper-parameter dicts are paired with files purely by position
# and os.listdir returns entries in arbitrary order - confirm the label written
# to the results really belongs to the file that was loaded.
gae_dir = f"{PATH_TO_GAE_GRAPHS}/{dataset}/{GNN}/"

for operator in ['without', 'sum', 'sub', 'avg', 'min', 'max', 'multiply', 'concatenate']:
  print(operator)
  for hyperparams_gae, pickle in zip(hyperparameter_list_gae[GNN], os.listdir(gae_dir)):
    print(f"Dataset: {dataset} - GAE Hyperparameters: {hyperparams_gae}")
    with open(f"{gae_dir}{pickle}", "rb") as file:
      G = pkl.load(file)

      # Split the principal nodes into interest / non-interest lists by label.
      l_int, l_nint = [], []
      for node in G.nodes():
        if principal_node not in node:
          continue
        bucket = l_int if G.nodes[node]['label'] == interest_class else l_nint
        bucket.append(node)

      evaluate_models(G, 'gae_features', path_results, dataset + '-gae_' + operator + '.csv', hyperparams_gae, interest_class, l_int, l_nint, operator, principal_node, dataset)

# Rec Sys

In [None]:
# Download the two pickled DataFrames read in the next cell; per the output
# below they are saved as df_interest.pkl and df_outlier.pkl.
!gdown 1XFiH0-J1r9DepyhfAzQUCe8pIClddNHw
!gdown 1U-qJ0Aayp2srzlIxiztEpTjIolp9fWya

Downloading...
From: https://drive.google.com/uc?id=1XFiH0-J1r9DepyhfAzQUCe8pIClddNHw
To: /content/df_interest.pkl
100% 624k/624k [00:00<00:00, 24.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1U-qJ0Aayp2srzlIxiztEpTjIolp9fWya
To: /content/df_outlier.pkl
100% 171k/171k [00:00<00:00, 80.7MB/s]


In [None]:
import pandas as pd

# Interest-class and outlier tables; the code below reads their 'user' and
# 'item' columns.
# NOTE: read_pickle unpickles the file - only load files from a trusted source.
df_int = pd.read_pickle('df_interest.pkl')
df_out = pd.read_pickle('df_outlier.pkl')

In [None]:
def train_test_split_ocl_recommendation(kf, df_int):
    """Materialise the folds of `kf` as (train_frame, test_frame) pairs.

    Args:
        kf: splitter whose `split(df_int)` yields (train_index, test_index)
            positional index pairs.
        df_int: DataFrame to split.

    Returns:
        A list with one `(df_int.iloc[train], df_int.iloc[test])` tuple per fold.
    """
    return [
        (df_int.iloc[train_positions], df_int.iloc[test_positions])
        for train_positions, test_positions in kf.split(df_int)
    ]

def foldValidation(folds):
    """Return a shuffled `KFold` splitter with `folds` splits and a fixed seed (42)."""
    return KFold(n_splits=folds, shuffle=True, random_state=42)

def operation(graph, representation_name, operator, dataset, df_train, df):
  """Fuse item, user and item-neighbour representations for each row of `df`.

  NOTE: this definition replaces the earlier node-level `operation` defined
  higher up in the notebook; cells written for that version stop working once
  this cell has run.

  A row is skipped when its user or item node is missing from `graph`, or when
  the item has no neighbour of one of the types 'review', 'genre', 'keyword'
  (a neighbour belongs to a type when the type name is a substring of its
  node name).

  Args:
    graph: object exposing `nodes()`, `nodes[node][representation_name]` and
      `neighbors(node)`.
    representation_name: node attribute holding the representation.
    operator: 'concatenate', 'sum', 'sub', 'avg', 'min', 'max' or 'multiply'.
    dataset: accepted for interface compatibility; not read.
    df_train: accepted for interface compatibility; not read (the unused
      `users` / `items` lookups on it were removed).
    df: DataFrame with 'user' and 'item' columns.

  Returns:
    A list with one fused representation per retained row (None for an
    unhandled operator name).
  """
  neighbor_types = ('review', 'genre', 'keyword')
  x = []

  for _, row in df.iterrows():
    user = str(row['user']) + ':user'
    item = str(row['item']) + ':item'

    if user not in graph.nodes() or item not in graph.nodes():
      continue

    rep_i = graph.nodes[item][representation_name]
    rep_u = graph.nodes[user][representation_name]

    grouped = {kind: [] for kind in neighbor_types}
    for n in graph.neighbors(item):
      for kind in grouped:
        if kind in n:
          grouped[kind].append(graph.nodes[n][representation_name])

    # Every neighbour type must be present, otherwise the row is dropped.
    if any(len(reps) == 0 for reps in grouped.values()):
      continue

    vectors = [rep_i, rep_u] + [np.mean(grouped[kind], axis=0) for kind in neighbor_types]

    new_rep = None
    if operator == 'concatenate':
      new_rep = np.concatenate(vectors)
    elif operator == 'sum':
      new_rep = np.sum(vectors, axis=0)
    elif operator == 'avg':
      new_rep = np.mean(vectors, axis=0)
    elif operator == 'min':
      new_rep = np.min(vectors, axis=0)
    elif operator == 'max':
      new_rep = np.max(vectors, axis=0)
    elif operator == 'sub':
      # Left-to-right: item - user - review - genre - keyword.
      new_rep = np.subtract(vectors[0], vectors[1])
      for vector in vectors[2:]:
        new_rep = np.subtract(new_rep, vector)
    elif operator == 'multiply':
      new_rep = np.multiply(vectors[0], vectors[1])
      for vector in vectors[2:]:
        new_rep = np.multiply(new_rep, vector)

    x.append(new_rep)

  return x

def _user_item_embeddings(graph, representation_name, df):
  """Return concat(user, item) representations for the rows of `df` whose user and item nodes are both in `graph`."""
  pairs = []
  for _, row in df.iterrows():
    user = str(row['user']) + ':user'
    item = str(row['item']) + ':item'
    if user in graph.nodes() and item in graph.nodes():
      pairs.append(np.concatenate([graph.nodes[user][representation_name], graph.nodes[item][representation_name]]))
  return pairs


def extract_emb_from_graph(graph, representation_name, operator, df_train, df_test, df_out, dataset=None):
  """Build the feature lists for the train, interest-test and outlier interactions.

  NOTE: this definition replaces the earlier node-level
  `extract_emb_from_graph` defined higher up in the notebook.

  With operator 'without' every row contributes the concatenation of its user
  and item representations; any other operator delegates to `operation`.

  Args:
    graph: object exposing `nodes()`, `nodes[node][representation_name]` and
      (for fused operators) `neighbors(node)`.
    representation_name: node attribute holding the representation.
    operator: 'without' or a fusion operator understood by `operation`.
    df_train, df_test, df_out: DataFrames with 'user' and 'item' columns.
    dataset: optional value forwarded to `operation`. The function used to
      read a global named `dataset` here and raised NameError when that global
      was not defined; the rec-sys `operation` in this notebook does not read
      the value.

  Returns:
    (x_train, x_int_test, x_nint_test), one list per input frame.
  """
  frames = (df_train, df_test, df_out)

  if operator == 'without':
    x_train, x_int_test, x_nint_test = [
      _user_item_embeddings(graph, representation_name, frame) for frame in frames
    ]
  else:
    x_train, x_int_test, x_nint_test = [
      operation(graph, representation_name, operator, dataset, df_train, frame) for frame in frames
    ]

  return x_train, x_int_test, x_nint_test

def evaluate_rec_sys(representation_name, path, file_name, hyperparams_gae, operator, dataset, train_test, df_out, pickle):
  """Grid-search a one-class SVM over the recommendation folds and log results.

  For each fold index `i` the graph
  `<PATH_TO_GAE_GRAPHS>/<dataset>_<i>/<GNN>/<pickle>` is loaded and paired
  with `train_test[i]`. Every non-'rbf' kernel is then evaluated for each
  (gamma, nu) combination and the per-fold reports are passed to
  `write_results`.

  Reads the module-level names `PATH_TO_GAE_GRAPHS` and `GNN`.

  Args:
    representation_name: node attribute used as features.
    path, file_name: concatenated by `write_results` to form the output file.
    hyperparams_gae: label written at the start of each result line.
    operator: 'without' or a fusion operator.
    dataset: prefix of the per-fold graph folders.
    train_test: sequence of (train_frame, test_frame) pairs indexed by fold.
    df_out: DataFrame of outlier interactions.
    pickle: file name of the graph inside each fold folder.
  """
  # The graphs and the features built from them depend only on the fold, so
  # they are loaded once here instead of once per (kernel, gamma, nu)
  # combination.
  # NOTE(review): range(1, 5) uses train_test[1..4] with graphs <dataset>_1..4;
  # train_test[0] and any <dataset>_5 graph are never evaluated - confirm the
  # fold/graph numbering is intended.
  # NOTE: unpickling executes code stored in the file; only load trusted pickles.
  fold_features = []
  for i in range(1, 5):
    df_train, df_test = train_test[i]
    with open(f"{PATH_TO_GAE_GRAPHS}/{dataset}_{i}/{GNN}/{pickle}", "rb") as file:
      G = pkl.load(file)
    fold_features.append(extract_emb_from_graph(G, representation_name, operator, df_train, df_test, df_out))

  for kernel in define_kernels():
    # 'rbf' is skipped, exactly as before.
    if kernel == 'rbf':
      continue
    for gamma in define_gammas():
      for nu in define_nus():
        ocsvm = OCSVM(kernel=kernel, nu=nu, gamma=gamma)
        metrics = init_metrics()
        line_parameters = str(hyperparams_gae) + '_kernel:' + kernel + '_gamma:' + gamma + '_nu:' + str(nu)

        for x_train, x_int_test, x_nint_test in fold_features:
          # Only fitting and prediction are timed.
          start = time.time()
          values = evaluate_model(x_train, x_int_test, x_nint_test, ocsvm)
          values['time'] = time.time() - start
          save_values(metrics, values)

        write_results(metrics, file_name, line_parameters, path)

In [None]:
import pickle as pkl
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

# Encoder whose graph folders and results folder this section uses.
GNN = 'GRAPHSAGE'

# Folder the result CSVs are written to (one sub-folder per encoder).
path_results = f'/content/drive/MyDrive/USP/Doctorate/Research/Articles/Early Fusion For One-class Learning On Heterogeneous Graphs/resultados/{GNN}/'

# Prefix of the per-fold graph folders (<dataset>_<i>).
dataset = 'rec_sys'

principal_node = ''

# Split the interest interactions into (train, test) frame pairs.
folds = 5
kf = foldValidation(folds)
train_test = train_test_split_ocl_recommendation(kf, df_int)

## REG

In [None]:
# Baseline ("reg") run on `f_features`, using the first file listed in the
# fold-1 graph folder as the file name for every fold.
for operator in ['without', 'sum', 'sub', 'avg', 'min', 'max', 'multiply', 'concatenate']:
  results_file = dataset + '_' + operator + '.csv'
  baseline_pickle = os.listdir(f"{PATH_TO_GAE_GRAPHS}/{dataset}_1/{GNN}/")[0]
  evaluate_rec_sys('f_features', path_results, results_file, 'reg', operator, dataset, train_test, df_out, baseline_pickle)

## GAE

In [None]:
# GAE run on `gae_features` for every fusion operator and every graph file
# listed in the fold-1 folder (the same file name is used for every fold).
# Fix: results now go to '<dataset>-gae_<operator>.csv', like the GAE cells of
# the other datasets; previously they were appended to the same CSV as the
# baseline ("reg") run.
# NOTE(review): hyper-parameter dicts are paired with files purely by position
# and os.listdir returns entries in arbitrary order - confirm the label written
# to the results really belongs to the file that was loaded.
for operator in ['without', 'sum', 'sub', 'avg', 'min', 'max', 'multiply', 'concatenate']:
  for hyperparams_gae, pickle in zip(hyperparameter_list_gae[GNN], os.listdir(f"{PATH_TO_GAE_GRAPHS}/{dataset}_1/{GNN}")):
    evaluate_rec_sys('gae_features', path_results, dataset + '-gae_' + operator + '.csv', hyperparams_gae, operator, dataset, train_test, df_out, pickle)