In [1]:
# Download the preprocessed-datasets archive from Google Drive (by file id) and
# unpack it; the archive expands into a `preprocessed/` directory of .pkl files.
!gdown 1cP2VcLXJnj1ntyg26UNPYD9NzYb__iRS
!unzip preprocessed.zip

from pathlib import Path
import pandas as pd
basepath = Path('preprocessed/')
if not basepath.is_dir():
  # Fail loudly instead of silently producing an empty `datasets` dict.
  raise FileNotFoundError('dataset directory not found: ' + str(basepath))

# Only pickle files are datasets; sorting makes the processing order
# independent of the filesystem's directory-listing order.
files_in_basepath = sorted(basepath.glob('*.pkl'))

# Maps dataset name (file name without the .pkl suffix) -> unpickled object
# (used as a pandas DataFrame by the cells further down).
# NOTE(security): unpickling can execute arbitrary code -- only load archives
# that come from a trusted source.
datasets = {}
for item in files_in_basepath:
  if item.is_file():
    datasets[item.stem] = pd.read_pickle(item)

Downloading...
From: https://drive.google.com/uc?id=1cP2VcLXJnj1ntyg26UNPYD9NzYb__iRS
To: /content/preprocessed.zip
100% 57.7M/57.7M [00:02<00:00, 26.0MB/s]
Archive:  preprocessed.zip
   creating: preprocessed/
  inflating: preprocessed/fakenews.pkl  
  inflating: preprocessed/strawberry.pkl  
  inflating: preprocessed/relevant_reviews.pkl  
  inflating: preprocessed/terrorism.pkl  
  inflating: preprocessed/food.pkl   
  inflating: preprocessed/musk.pkl   
  inflating: preprocessed/pneumonia.pkl  
  inflating: preprocessed/TUANDROMD.pkl  


In [None]:
from scipy.sparse.lil import lil_matrix
from sklearn.neighbors import kneighbors_graph
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

def train_test_split_OCL(df, seed=81, folds=10):
  """Build per-fold index splits for a one-class learning setup.

  Rows with ``is_interest == 1`` are partitioned with a shuffled K-fold; the
  non-held-out part of every fold is split again 90/10 into train and
  validation. Rows with ``is_interest == 0`` never enter training: for each
  fold they are split 50/50 into validation and test, using the fold number
  (not ``seed``) as the random state.

  :param df: DataFrame with an ``is_interest`` column
  :param seed: random state for the K-fold shuffle and the train/validation split
  :param folds: number of folds
  :return: ``(l_index_int, l_index_nint)``; ``l_index_int[f]`` is
      ``[train_index, val_index, test_index]`` and ``l_index_nint[f]`` is
      ``[val_index, test_index]``, each entry being a pandas index of ``df``
  """
  interest = df[df.is_interest == 1]
  non_interest = df[df.is_interest == 0]

  splitter = KFold(n_splits=folds, shuffle=True, random_state=seed)

  l_index_int = []
  for fit_pos, held_out_pos in splitter.split(interest):
    fit_part, val_part = train_test_split(interest.iloc[fit_pos], test_size=0.1, random_state=seed)
    held_out = interest.iloc[held_out_pos]
    l_index_int.append([fit_part.index, val_part.index, held_out.index])

  # One independent 50/50 split per fold, seeded by the fold number.
  l_index_nint = [
    [part.index for part in train_test_split(non_interest, test_size=0.5, random_state=fold)]
    for fold in range(folds)
  ]

  return l_index_int, l_index_nint

def generate_graph(df,k,metric, folds=10):
  """Build an undirected k-nearest-neighbour graph over the rows of ``df``.

  Each row becomes one node, identified by its row position (0..n-1), and is
  annotated with two attributes: ``features`` (the row's ``features`` value)
  and ``label`` (the row's ``is_interest`` value).

  :param df: DataFrame with a ``features`` column (one feature vector per row)
      and an ``is_interest`` column
  :param k: number of neighbours per node passed to ``kneighbors_graph``
  :param metric: distance metric passed to ``kneighbors_graph``
  :param folds: unused; kept so existing callers keep working
  :return: ``networkx.Graph`` with the attributes described above
  """
  feature_rows = df['features'].to_list()

  G = kneighbors_graph(feature_rows, k, mode='connectivity', include_self=False, metric=metric)

  graph_networkx = nx.Graph(G)

  # The adjacency matrix is positional, so graph nodes are 0..n-1. Attach the
  # attributes by row position rather than by DataFrame index label, which
  # would fail or mislabel nodes whenever df does not have a default index.
  for position, (features, label) in enumerate(zip(feature_rows, df['is_interest'])):
    graph_networkx.nodes[position]['features'] = features
    graph_networkx.nodes[position]['label'] = label

  return graph_networkx


In [None]:
import networkx as nx
import random
import numpy as np
from typing import List
from tqdm import tqdm
from gensim.models.word2vec import Word2Vec
import argparse
import os.path as osp


class DeepWalk:
    """DeepWalk-style node embeddings: random walks over a graph fed to Word2Vec."""

    def __init__(self, window_size: int, embedding_size: int, walk_length: int, walks_per_node: int):
        """
        :param window_size: window size for the Word2Vec model
        :param embedding_size: size of the final embedding
        :param walk_length: length of the walk
        :param walks_per_node: number of walks per node
        """
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.walk_length = walk_length
        self.walk_per_node = walks_per_node

    def random_walk(self, g: nx.Graph, start: str, use_probabilities: bool = False) -> List[str]:
        """
        Generate a random walk starting on start.

        Draws from the global ``random`` / ``np.random`` state and deliberately
        does not reseed it: reseeding here would make every walk from the same
        start node identical. Seed once before generating walks (``get_walks``
        does so) for reproducible results.

        :param g: Graph
        :param start: starting node for the random walk
        :param use_probabilities: if True take into account the weights assigned to each edge to select the next candidate
        :return: visited nodes as strings, start node first; at most
            ``walk_length + 1`` entries, fewer if a node without neighbours is reached
        """
        walk = [start]
        for _ in range(self.walk_length):
            current = walk[-1]
            neighs = list(g.neighbors(current))
            if not neighs:
                # Dead end (isolated node): stop instead of crashing on an empty choice.
                break
            if use_probabilities:
                weights = [g.get_edge_data(current, neig)["weight"] for neig in neighs]
                total_weight = sum(weights)
                probabilities = [w / total_weight for w in weights]
                p = np.random.choice(neighs, p=probabilities)
            else:
                p = random.choice(neighs)
            walk.append(p)

        return [str(w) for w in walk]

    def get_walks(self, g: nx.Graph, use_probabilities: bool = False) -> List[List[str]]:
        """
        Generate all the random walks: ``walks_per_node`` passes over the graph,
        each pass starting one walk from every node in shuffled order.

        Seeds ``random`` and ``np.random`` with 81 once, so the full set of
        walks is reproducible.

        :param g: Graph
        :param use_probabilities: forwarded to ``random_walk``
        :return: list of walks, each a list of node ids as strings
        """
        np.random.seed(81)
        random.seed(81)
        random_walks = []
        for _ in range(self.walk_per_node):
            random_nodes = list(g.nodes)
            random.shuffle(random_nodes)
            for node in random_nodes:
                random_walks.append(self.random_walk(g=g, start=node, use_probabilities=use_probabilities))
        return random_walks

    def compute_embeddings(self, walks: List[List[str]]):
        """
        Compute the node embeddings for the generated walks
        :param walks: List of walks
        :return: the trained model's keyed vectors (``model.wv``); look up a
            node's embedding by its string id
        """
        import inspect  # local import: only needed for the version check below

        # gensim >= 4.0 renamed Word2Vec's `size` argument to `vector_size`;
        # pick whichever name the installed version accepts.
        init_params = inspect.signature(Word2Vec.__init__).parameters
        size_kwarg = "vector_size" if "vector_size" in init_params else "size"
        model = Word2Vec(sentences=walks, window=self.window_size, **{size_kwarg: self.embedding_size})
        return model.wv

In [None]:
# Install PyTorch Geometric and its companion extension packages. The -f index
# below serves wheels built for torch 1.12.0 + CUDA 11.3 (per its URL).
!pip install torch torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cu113.html

In [None]:
import torch
import torch_geometric.utils as utils
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GAE, GCNConv

class GCNEncoder(torch.nn.Module):
    """Two-layer GCN encoder: in_channels -> 2 * out_channels -> out_channels."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1, a ReLU, then conv2 and return the result."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


def train(model, optimizer, dataset):
    """Run one optimisation step on the whole graph.

    Encodes ``dataset.features`` (cast to float) over ``dataset.edge_index``,
    computes the model's reconstruction loss on the same edges, backpropagates
    and applies one optimizer update.

    :param model: object exposing ``train()``, ``encode(x, edge_index)`` and
        ``recon_loss(z, edge_index)``
    :param optimizer: optimizer over the model's parameters
    :param dataset: object exposing ``features`` and ``edge_index`` tensors
    :return: the loss of this step as a Python float
    """
    model.train()
    optimizer.zero_grad()

    node_features = dataset.features.float()
    edges = dataset.edge_index

    embeddings = model.encode(node_features, edges)
    reconstruction_loss = model.recon_loss(embeddings, edges)

    reconstruction_loss.backward()
    optimizer.step()
    return float(reconstruction_loss)

def gae_train(g):
  """Train a graph auto-encoder on ``g`` and return its node embeddings.

  The graph is converted with ``from_networkx``; the node attribute
  ``features`` supplies the input features. A GAE wrapping ``GCNEncoder`` with
  2 output channels is trained for 100 steps with Adam (lr=0.01), on CUDA when
  available. All RNGs are seeded with 81 before the model is created.

  :param g: networkx graph whose nodes carry a ``features`` attribute
  :return: numpy array of the encoder output for every node (2 values per node)
  """
  seed = 81
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  dataset = utils.from_networkx(g)

  in_channels = len(dataset.features[0])
  out_channels = 2

  model = GAE(GCNEncoder(in_channels, out_channels)).to(device).float()
  dataset = dataset.to(device)

  optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

  for _ in range(100):
    train(model, optimizer, dataset)

  # Inference only: no gradients needed for the final encoding pass.
  with torch.no_grad():
    node_embeddings = model.encode(dataset.features.float(), dataset.edge_index)
  return node_embeddings.cpu().numpy()

In [None]:
# Install the node2vec package (provides the Node2Vec class imported in the next cell).
!pip install node2vec

In [None]:
from node2vec import Node2Vec

def node_to_vec(g):
  """Attach 2-dimensional node2vec embeddings to every node of ``g``.

  Seeds ``np.random`` and ``random`` with 81, fits Node2Vec (walk_length=10,
  num_walks=15, p=4, q=1) and stores each node's vector in the node attribute
  ``features_node2vec_2``. Modifies ``g`` in place and returns nothing.

  :param g: networkx graph
  """
  np.random.seed(81)
  random.seed(81)
  model = Node2Vec(g, dimensions=2, walk_length=10, num_walks=15, q=1, p=4)

  model = model.fit(window=10, min_count=1, batch_words=4)

  # fit() returns a gensim Word2Vec model. Vectors must be read from `.wv`:
  # indexing the model object itself was deprecated in gensim 3 and removed in
  # gensim 4. Nodes are keyed by their string id.
  vectors = model.wv

  for node in g.nodes():
    g.nodes[node]['features_node2vec_2'] = vectors[str(node)]

def deepwalk(g):
  """Attach 2-dimensional DeepWalk embeddings to every node of ``g``.

  Seeds ``random`` and ``np.random`` with 81, generates 15 walks of length 10
  per node, trains Word2Vec (window 10) on them and stores each node's vector
  in the node attribute ``features_deepwalk_2``. Modifies ``g`` in place and
  returns nothing.

  :param g: networkx graph
  """
  random.seed(81)
  np.random.seed(81)
  dw = DeepWalk(window_size = 10, embedding_size = 2, walk_length = 10, walks_per_node = 15)

  walks = dw.get_walks(g)

  keyed_vectors = dw.compute_embeddings(walks)

  # Look vectors up by key. Rows of the raw `.vectors` matrix follow the
  # Word2Vec vocabulary order, not node ids, so indexing that matrix with the
  # node id would hand a node some other node's embedding.
  for node in g.nodes():
    g.nodes[node]['features_deepwalk_2'] = keyed_vectors[str(node)]

def gae(g):
  """Attach graph auto-encoder embeddings to every node of ``g``.

  Trains the auto-encoder via ``gae_train`` and stores each node's embedding
  in the node attribute ``features_gae_2``. Modifies ``g`` in place and returns
  nothing.

  :param g: networkx graph whose nodes carry a ``features`` attribute
  """
  embs = gae_train(g)

  # Index the embedding matrix by row position, not by node label, so the
  # assignment stays correct when labels are not exactly 0..n-1.
  # NOTE(review): relies on from_networkx keeping g.nodes() iteration order -- confirm.
  for row, node in enumerate(g.nodes()):
    g.nodes[node]['features_gae_2'] = embs[row]

In [None]:
import pickle

# Root folder for the generated graphs.
# NOTE(review): presumes Google Drive is already mounted at /content/drive -- confirm.
GRAPHS_ROOT = Path('/content/drive/MyDrive/USP/Doctorate/Research/Articles/Auto One-Class Graph Neural Network/datasets/graphs/')
N_FOLDS = 10

# For every dataset and every k: build the kNN graph, attach the three
# structural embeddings, then write one copy of the graph per fold with
# train/val/test flags set on the nodes.
for dataset in datasets.keys():
  print(dataset)
  df = datasets[dataset]

  for k in [1,2,3]:
    print(k)

    g = generate_graph(df,k,'euclidean')

    deepwalk(g)
    node_to_vec(g)
    gae(g)

    l_index_int, l_index_nint = train_test_split_OCL(df, folds=N_FOLDS)

    # The target folder is not guaranteed to exist; create it once per (dataset, k).
    out_dir = GRAPHS_ROOT / dataset / ('k=' + str(k))
    out_dir.mkdir(parents=True, exist_ok=True)

    for f in range(N_FOLDS):
      int_train, int_val, int_test = l_index_int[f]
      nint_val, nint_test = l_index_nint[f]

      # NOTE(review): membership compares graph node ids with df index labels,
      # which presumes df has a default 0..n-1 index -- confirm.
      for i in g.nodes():
        if i in int_train:
          flags = (1, 0, 0)
        elif i in int_val or i in nint_val:
          flags = (0, 1, 0)
        elif i in int_test or i in nint_test:
          flags = (0, 0, 1)
        else:
          continue
        g.nodes[i]['train'], g.nodes[i]['val'], g.nodes[i]['test'] = flags

      name = dataset + '_k=' + str(k) + '_fold=' + str(f) + '.gpickle'
      # nx.write_gpickle was removed in NetworkX 3.0; it was a thin wrapper
      # around pickle.dump, so the file format is unchanged.
      with open(out_dir / name, 'wb') as fh:
        pickle.dump(g, fh, pickle.HIGHEST_PROTOCOL)