In [2]:
#!pip install dgl

Collecting dgl
  Downloading https://files.pythonhosted.org/packages/33/83/44f1e8dfb27c352f65b8d8e5e24997dae5eaeae1b41c59f871e8623ea0db/dgl-0.4.1-cp37-cp37m-win_amd64.whl (2.5MB)
Installing collected packages: dgl
Successfully installed dgl-0.4.1


In [5]:
#!pip install torch==1.3.1+cpu torchvision==0.4.2+cpu -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.3.1+cpu
  Downloading https://download.pytorch.org/whl/cpu/torch-1.3.1%2Bcpu-cp37-cp37m-win_amd64.whl (71.3MB)
Collecting torchvision==0.4.2+cpu
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.4.2%2Bcpu-cp37-cp37m-win_amd64.whl (750kB)
Installing collected packages: torch, torchvision
Successfully installed torch-1.3.1+cpu torchvision-0.4.2+cpu


In [1]:
from dgl.data import citation_graph as citegrh
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
import numpy as np

In [2]:
#Slightly adjusted code from https://github.com/phanein/deepwalk
"""Graph utilities."""

import logging
import sys
from io import open
from os import path
from time import time
from glob import glob
from six.moves import range, zip, zip_longest
from six import iterkeys
from collections import defaultdict, Iterable
import random
from random import shuffle
from itertools import product,permutations
from scipy.io import loadmat
from scipy.sparse import issparse

logger = logging.getLogger("deepwalk")


__author__ = "Bryan Perozzi"
__email__ = "bperozzi@cs.stonybrook.edu"

LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"

class Graph(defaultdict):
  """Efficient basic implementation of nx `Graph' â€“ Undirected graphs with self loops"""  
  def __init__(self):
    super(Graph, self).__init__(list)

  def nodes(self):
    return self.keys()

  def adjacency_iter(self):
    return self.iteritems()

  def subgraph(self, nodes={}):
    subgraph = Graph()
    
    for n in nodes:
      if n in self:
        subgraph[n] = [x for x in self[n] if x in nodes]
        
    return subgraph

  def make_undirected(self):
  
    t0 = time()

    for v in self.keys():
      for other in self[v]:
        if v != other:
          self[other].append(v)
    
    t1 = time()
    logger.info('make_directed: added missing edges {}s'.format(t1-t0))

    self.make_consistent()
    return self

  def make_consistent(self):
    t0 = time()
    for k in iterkeys(self):
      self[k] = list(sorted(set(self[k])))
    
    t1 = time()
    logger.info('make_consistent: made consistent in {}s'.format(t1-t0))

    self.remove_self_loops()

    return self

  def remove_self_loops(self):

    removed = 0
    t0 = time()

    for x in self:
      if x in self[x]: 
        self[x].remove(x)
        removed += 1
    
    t1 = time()

    logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1-t0)))
    return self

  def check_self_loops(self):
    for x in self:
      for y in self[x]:
        if x == y:
          return True
    
    return False

  def has_edge(self, v1, v2):
    if v2 in self[v1] or v1 in self[v2]:
      return True
    return False

  def degree(self, nodes=None):
    if isinstance(nodes, Iterable):
      return {v:len(self[v]) for v in nodes}
    else:
      return len(self[nodes])

  def order(self):
    "Returns the number of nodes in the graph"
    return len(self)    

  def number_of_edges(self):
    "Returns the number of nodes in the graph"
    return sum([self.degree(x) for x in self.keys()])/2

  def number_of_nodes(self):
    "Returns the number of nodes in the graph"
    return self.order()

  def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None):
    """ Returns a truncated random walk.
        path_length: Length of the random walk.
        alpha: probability of restarts.
        start: the start node of the random walk.
    """
    G = self
    if start:
      path = [start]
    else:
      # Sampling is uniform w.r.t V, and not w.r.t E
      path = [rand.choice(list(G.keys()))]

    while len(path) < path_length:
      cur = path[-1]
      if len(G[cur]) > 0:
        if rand.random() >= alpha:
          path.append(rand.choice(G[cur]))
        else:
          path.append(path[0])
      else:
        break
    return [str(node) for node in path]

# TODO add build_walks in here

def build_deepwalk_corpus(G, num_paths, path_length, alpha=0,
                      rand=random.Random(0)):
  walks = []

  nodes = list(G.nodes())
  
  for cnt in range(num_paths):
    rand.shuffle(nodes)
    for node in nodes:
      walks.append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node))
  
  return walks

def build_deepwalk_corpus_iter(G, num_paths, path_length, alpha=0,
                      rand=random.Random(0)):
  walks = []

  nodes = list(G.nodes())

  for cnt in range(num_paths):
    rand.shuffle(nodes)
    for node in nodes:
      yield G.random_walk(path_length, rand=rand, alpha=alpha, start=node)


def clique(size):
    return from_adjlist(permutations(range(1,size+1)))


# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def grouper(n, iterable, padvalue=None):
    "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
    return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue)

def parse_adjacencylist(f):
  adjlist = []
  for l in f:
    if l and l[0] != "#":
      introw = [int(x) for x in l.strip().split()]
      row = [introw[0]]
      row.extend(set(sorted(introw[1:])))
      adjlist.extend([row])
  
  return adjlist

def parse_adjacencylist_unchecked(f):
  adjlist = []
  for l in f:
    if l and l[0] != "#":
      adjlist.extend([[int(x) for x in l.strip().split()]])
  
  return adjlist

def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True):

  if unchecked:
    parse_func = parse_adjacencylist_unchecked
    convert_func = from_adjlist_unchecked
  else:
    parse_func = parse_adjacencylist
    convert_func = from_adjlist

  adjlist = []

  t0 = time()
  
  total = 0 
  with open(file_) as f:
    for idx, adj_chunk in enumerate(map(parse_func, grouper(int(chunksize), f))):
      adjlist.extend(adj_chunk)
      total += len(adj_chunk)
  
  t1 = time()

  logger.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1-t0))

  t0 = time()
  G = convert_func(adjlist)
  t1 = time()

  logger.info('Converted edges to graph in {}s'.format(t1-t0))

  if undirected:
    t0 = time()
    G = G.make_undirected()
    t1 = time()
    logger.info('Made graph undirected in {}s'.format(t1-t0))

  return G 


def load_edgelist(file_, undirected=True):
  G = Graph()
  with open(file_) as f:
    for l in f:
      x, y = l.strip().split()[:2]
      x = int(x)
      y = int(y)
      G[x].append(y)
      if undirected:
        G[y].append(x)
  
  G.make_consistent()
  return G


def load_matfile(file_, variable_name="network", undirected=True):
  mat_varables = loadmat(file_)
  mat_matrix = mat_varables[variable_name]

  return from_numpy(mat_matrix, undirected)


def from_networkx(G_input, undirected=True):
    G = Graph()

    for idx, x in enumerate(G_input.nodes()):
        for y in iterkeys(G_input[x]):
            G[x].append(y)

    if undirected:
        G.make_undirected()

    return G


def from_numpy(x, undirected=True):
    G = Graph()

    if issparse(x):
        cx = x.tocoo()
        for i,j,v in zip(cx.row, cx.col, cx.data):
            G[i].append(j)
    else:
      raise Exception("Dense matrices not yet supported.")

    if undirected:
        G.make_undirected()

    G.make_consistent()
    return G


def from_adjlist(adjlist):
    G = Graph()
    
    for row in adjlist:
        node = row[0]
        neighbors = row[1:]
        G[node] = list(sorted(set(neighbors)))

    return G


def from_adjlist_unchecked(adjlist):
    G = Graph()
    
    for row in adjlist:
        node = row[0]
        neighbors = row[1:]
        G[node] = neighbors

    return G


  if sys.path[0] == '':


In [3]:
import logging
from io import open
from os import path
from time import time
from multiprocessing import cpu_count
import random
from concurrent.futures import ProcessPoolExecutor
from collections import Counter

from six.moves import zip

logger = logging.getLogger("deepwalk")

__current_graph = None

# speed up the string encoding
__vertex2str = None

def count_words(file):
  """ Counts the word frequences in a list of sentences.
  Note:
    This is a helper function for parallel execution of `Vocabulary.from_text`
    method.
  """
  c = Counter()
  with open(file, 'r') as f:
    for l in f:
      words = l.strip().split()
      c.update(words)
  return c


def count_textfiles(files, workers=1):
  c = Counter()
  with ProcessPoolExecutor(max_workers=workers) as executor:
    for c_ in executor.map(count_words, files):
      c.update(c_)
  return c


def count_lines(f):
  if path.isfile(f):
    num_lines = sum(1 for line in open(f))
    return num_lines
  else:
    return 0

def _write_walks_to_disk(args):
  num_paths, path_length, alpha, rand, f = args
  G = __current_graph
  t_0 = time()
  with open(f, 'w') as fout:
    for walk in build_deepwalk_corpus_iter(G=G, num_paths=num_paths, path_length=path_length,
                                                 alpha=alpha, rand=rand):
      fout.write(u"{}\n".format(u" ".join(v for v in walk)))
  logger.debug("Generated new file {}, it took {} seconds".format(f, time() - t_0))
  return f

def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=random.Random(0), num_workers=cpu_count(),
                        always_rebuild=True):
  global __current_graph
  __current_graph = G
  files_list = ["{}.{}".format(filebase, str(x)) for x in list(range(num_paths))]
  expected_size = len(G)
  args_list = []
  files = []

  if num_paths <= num_workers:
    paths_per_worker = [1 for x in range(num_paths)]
  else:
    paths_per_worker = [len(list(filter(lambda z: z!= None, [y for y in x])))
                        for x in grouper(int(num_paths / num_workers)+1, range(1, num_paths+1))]

  with ProcessPoolExecutor(max_workers=num_workers) as executor:
    for size, file_, ppw in zip(executor.map(count_lines, files_list), files_list, paths_per_worker):
      if always_rebuild or size != (ppw*expected_size):
        args_list.append((ppw, path_length, alpha, random.Random(rand.randint(0, 2**31)), file_))
      else:
        files.append(file_)

  with ProcessPoolExecutor(max_workers=num_workers) as executor:
    for file_ in executor.map(_write_walks_to_disk, args_list):
      files.append(file_)

  return files

class WalksCorpus(object):
  def __init__(self, file_list):
    self.file_list = file_list
  def __iter__(self):
    for file in self.file_list:
      with open(file, 'r') as f:
        for line in f:
          yield line.split()

def combine_files_iter(file_list):
  for file in file_list:
    with open(file, 'r') as f:
      for line in f:
        yield line.split()

In [4]:
from collections import Counter, Mapping
from concurrent.futures import ProcessPoolExecutor
import logging
from multiprocessing import cpu_count
from six import string_types

from gensim.models import Word2Vec
from gensim.models.word2vec import Vocab

logger = logging.getLogger("deepwalk")

class Skipgram(Word2Vec):
    """A subclass to allow more customization of the Word2Vec internals."""

    def __init__(self, vocabulary_counts=None, **kwargs):

        self.vocabulary_counts = None

        kwargs["min_count"] = kwargs.get("min_count", 0)
        kwargs["workers"] = kwargs.get("workers", cpu_count())
        kwargs["size"] = kwargs.get("size", 128)
        kwargs["sentences"] = kwargs.get("sentences", None)
        kwargs["window"] = kwargs.get("window", 10)
        kwargs["sg"] = 1
        kwargs["hs"] = 1

        if vocabulary_counts != None:
          self.vocabulary_counts = vocabulary_counts

        super(Skipgram, self).__init__(**kwargs)

  """Entry point for launching an IPython kernel.


In [5]:
import os
import sys
import random
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging

from gensim.models import Word2Vec

from six import text_type as unicode
from six import iteritems
from six.moves import range

import psutil
from multiprocessing import cpu_count

p = psutil.Process(os.getpid())
try:
    p.set_cpu_affinity(list(range(cpu_count())))
except AttributeError:
    try:
        p.cpu_affinity(list(range(cpu_count())))
    except AttributeError:
        pass

logger = logging.getLogger(__name__)
LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"


def debug(type_, value, tb):
  if hasattr(sys, 'ps1') or not sys.stderr.isatty():
    sys.__excepthook__(type_, value, tb)
  else:
    import traceback
    import pdb
    traceback.print_exception(type_, value, tb)
    print(u"\n")
    pdb.pm()


def process(args):

  if args.format == "adjlist":
    G = load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    print("Walking...")
    walks = build_deepwalk_corpus(G, num_paths=args.number_walks,
                                        path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    walks_filebase = args.output + ".walks"
    walk_files = write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                         num_workers=args.workers)

    print("Counting vertex frequency...")
    if not args.vertex_freq_degree:
      vertex_counts = count_textfiles(walk_files, args.workers)
    else:
      # use degree distribution for frequency in tree
      vertex_counts = G.degree(nodes=G.iterkeys())

    print("Training...")
    walks_corpus = WalksCorpus(walk_files)
    model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                     size=args.representation_size,
                     window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

  model.wv.save_word2vec_format(args.output)


def start_deepwalk(cmdargs):
  parser = ArgumentParser("deepwalk",
                          formatter_class=ArgumentDefaultsHelpFormatter,
                          conflict_handler='resolve')

  parser.add_argument("--debug", dest="debug", action='store_true', default=False,
                      help="drop a debugger if an exception is raised.")

  parser.add_argument('--format', default='adjlist',
                      help='File format of input file')

  parser.add_argument('--input', nargs='?', required=True,
                      help='Input graph file')

  parser.add_argument("-l", "--log", dest="log", default="INFO",
                      help="log verbosity level")

  parser.add_argument('--matfile-variable-name', default='network',
                      help='variable name of adjacency matrix inside a .mat file.')

  parser.add_argument('--max-memory-data-size', default=1000000000, type=int,
                      help='Size to start dumping walks to disk, instead of keeping them in memory.')

  parser.add_argument('--number-walks', default=10, type=int,
                      help='Number of random walks to start at each node')

  parser.add_argument('--output', required=True,
                      help='Output representation file')

  parser.add_argument('--representation-size', default=64, type=int,
                      help='Number of latent dimensions to learn for each node.')

  parser.add_argument('--seed', default=0, type=int,
                      help='Seed for random walk generator.')

  parser.add_argument('--undirected', default=True, type=bool,
                      help='Treat graph as undirected.')

  parser.add_argument('--vertex-freq-degree', default=False, action='store_true',
                      help='Use vertex degree to estimate the frequency of nodes '
                           'in the random walks. This option is faster than '
                           'calculating the vocabulary.')

  parser.add_argument('--walk-length', default=40, type=int,
                      help='Length of the random walk started at each node')

  parser.add_argument('--window-size', default=5, type=int,
                      help='Window size of skipgram model.')

  parser.add_argument('--workers', default=1, type=int,
                      help='Number of parallel processes.')

  args = parser.parse_args(cmdargs.split())

  numeric_level = getattr(logging, args.log.upper(), None)
  logging.basicConfig(format=LOGFORMAT)
  logger.setLevel(numeric_level)

  if args.debug:
   sys.excepthook = debug

  process(args)


In [10]:
"""scoring.py: Script that demonstrates the multi-label classification used."""

__author__      = "Bryan Perozzi"

import numpy
import sys

from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from collections import defaultdict
from gensim.models import Word2Vec, KeyedVectors
from six import iteritems
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from scipy.io import loadmat
from sklearn.utils import shuffle as skshuffle
from sklearn.preprocessing import MultiLabelBinarizer
from numpy import genfromtxt
from scipy.sparse import csr_matrix

class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        assert X.shape[0] == len(top_k_list)
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()
            all_labels.append(labels)
        return all_labels

def sparse2graph(x):
    G = defaultdict(lambda: set())
    cx = x.tocoo()
    for i,j,v in zip(cx.row, cx.col, cx.data):
        G[i].add(j)
    return {str(k): [str(x) for x in v] for k,v in iteritems(G)}

def scoring(cmdargs):
  parser = ArgumentParser("scoring",
                          formatter_class=ArgumentDefaultsHelpFormatter,
                          conflict_handler='resolve')
  parser.add_argument("--emb", required=True, help='Embeddings file')
  parser.add_argument("--network",
                      help='A .mat file containing the adjacency matrix and node labels of the input network.')
  parser.add_argument("--adj-matrix-name", default='network',
                      help='Variable name of the adjacency matrix inside the .mat file.')
  parser.add_argument("--label-matrix-name", default='group',
                      help='Variable name of the labels matrix inside the .mat file.')
  parser.add_argument("--labels",
                      help='A CSV file of graph labels. The line number represents the node ID. Each class is a column.')
  parser.add_argument("--labels-sep",
                      help='Separator in labels CSV file. Default: "%(default)s"', type=str, default=',')
  parser.add_argument("--num-shuffles", default=2, type=int, help='Number of shuffles.')
  parser.add_argument("--all", default=False, action='store_true',
                      help='The embeddings are evaluated on all training percents from 10 to 90 when this flag is set to true. '
                      'By default, only training percents of 10, 50 and 90 are used.')

  args = parser.parse_args(cmdargs.split())
  # 0. Files
  embeddings_file = args.emb
  matfile = args.network
  
  # 1. Load Embeddings
  model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)
  
  # 2. Load labels
  if args.network: # Matlab file is given
    mat = loadmat(matfile)
    labels_matrix = mat[args.label_matrix_name]
  elif args.labels: # CSV file is given
    # The format should be: Node ID == line number. Each class is a column, values either 0 or 1
    labels_matrix = csr_matrix(genfromtxt(args.labels, delimiter=args.labels_sep))
  else:
    raise ArgumentError("Either --network or --labels must be given.")

  num_nodes = labels_matrix.shape[0]
  labels_count = labels_matrix.shape[1]
  mlb = MultiLabelBinarizer(range(labels_count))
  
  # Map nodes to their features (note:  assumes nodes are labeled as integers 1:N)
  features_matrix = numpy.asarray([model[str(node)] for node in range(num_nodes)])
  
  # 2. Shuffle, to create train/test groups
  shuffles = []
  for x in range(args.num_shuffles):
    shuffles.append(skshuffle(features_matrix, labels_matrix))
  
  # 3. to score each train/test group
  all_results = defaultdict(list)
  
  if args.all:
    training_percents = numpy.asarray(range(1, 10)) * .1
  else:
    training_percents = [0.02, 0.05, 0.1]
  for train_percent in training_percents:
    for shuf in shuffles:
  
      X, y = shuf
  
      training_size = int(train_percent * X.shape[0])
  
      X_train = X[:training_size, :]
      y_train_ = y[:training_size]
  
      y_train = [[] for x in range(y_train_.shape[0])]
  
  
      cy =  y_train_.tocoo()
      for i, j in zip(cy.row, cy.col):
          y_train[i].append(j)
  
      assert sum(len(l) for l in y_train) == y_train_.nnz
  
      X_test = X[training_size:, :]
      y_test_ = y[training_size:]
  
      y_test = [[] for _ in range(y_test_.shape[0])]
  
      cy =  y_test_.tocoo()
      for i, j in zip(cy.row, cy.col):
          y_test[i].append(j)
  
      clf = TopKRanker(LogisticRegression())
      clf.fit(X_train, y_train_)
  
      # find out how many labels should be predicted
      top_k_list = [len(l) for l in y_test]
      preds = clf.predict(X_test, top_k_list)
  
      results = {}
      averages = ["micro", "macro"]
      for average in averages:
          results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)
  
      all_results[train_percent].append(results)
  
  print ('Results, using embeddings of dimensionality', X.shape[1])
  print ('-------------------')
  for train_percent in sorted(all_results.keys()):
    print ('Train percent:', train_percent)
    for index, result in enumerate(all_results[train_percent]):
      print ('Shuffle #%d:   ' % (index + 1), result)
    avg_score = defaultdict(float)
    for score_dict in all_results[train_percent]:
      for metric, score in iteritems(score_dict):
        avg_score[metric] += score
    for metric in avg_score:
      avg_score[metric] /= len(all_results[train_percent])
    print ('Average score:', dict(avg_score))
    print ('-------------------')

In [6]:
#Loading CORA Dataset from DGL
data = citegrh.load_cora()
features = th.FloatTensor(data.features)
labels = th.LongTensor(data.labels)
mask = th.ByteTensor(data.train_mask)
g = data.graph
#g2 = DGLGraph(g)
#Add self loop
#g2.add_edges(g.nodes(), g.nodes())

In [8]:
#Save edges per node internally
edges_per_node = {}
for x in g.adj.items():
  z = [] 
  for i in x[1]:
    z.append(i)
  edges_per_node[x[0]] = z

In [10]:
#Create and save adjlist format for deepwalk
f = open("data/cora.adjlist", "w")
for i in range(len(edges_per_node)):
    f.write(str(i) + " " + str(edges_per_node[i]).strip('[]').replace(",","") + "\n")

f.close()
#Check if neccesary to remove last empty line

In [11]:
#Create and save labels format for deepwalk
f = open("data/cora_labels.csv", "w")
for x in labels:
    if (int(x)==0):
        f.write("1,0,0,0,0,0,0" + "\n")
    if (int(x)==1):
        f.write("0,1,0,0,0,0,0" + "\n")
    if (int(x)==2):
        f.write("0,0,1,0,0,0,0" + "\n")
    if (int(x)==3):
        f.write("0,0,0,1,0,0,0" + "\n")
    if (int(x)==4):
        f.write("0,0,0,0,1,0,0" + "\n")
    if (int(x)==5):
        f.write("0,0,0,0,0,1,0" + "\n")
    if (int(x)==6):
        f.write("0,0,0,0,0,0,1" + "\n")  
f.close()
#Check if neccesary to remove last empty line

In [12]:
#Start DeepWalk on Cora
#F1 score 60 
start_deepwalk("--format adjlist --input ../data/cora.adjlist --number-walks 10 --representation-size 64 --walk-length 6 --window-size 6 --workers 1 --output ../data/cora.embeddings")
#Get F1 scores for  permutations
scoring("--emb ../data/cora.embeddings --labels ../data/cora_labels.csv --num-shuffle 10")

2020-01-08 12:42:51 INFO <ipython-input-2-5bd29a7870c9>: 220 Parsed 2708 edges with 0 chunks in 0.03299760818481445s
2020-01-08 12:42:51 INFO <ipython-input-2-5bd29a7870c9>: 226 Converted edges to graph in 0.001005411148071289s
2020-01-08 12:42:51 INFO <ipython-input-2-5bd29a7870c9>: 57 make_directed: added missing edges 0.0020101070404052734s
2020-01-08 12:42:51 INFO <ipython-input-2-5bd29a7870c9>: 68 make_consistent: made consistent in 0.0039997100830078125s
2020-01-08 12:42:51 INFO <ipython-input-2-5bd29a7870c9>: 86 remove_self_loops: removed 0 loops in 0.0009987354278564453s
2020-01-08 12:42:51 INFO <ipython-input-2-5bd29a7870c9>: 232 Made graph undirected in 0.008999824523925781s


Number of nodes: 2708
Number of walks: 27080
Data size (walks*length): 162480
Walking...
Training...














Results, using embeddings of dimensionality 64
-------------------
Train percent: 0.02
Shuffle #1:    {'micro': 0.5674453654860587, 'macro': 0.5276020868975161}
Shuffle #2:    {'micro': 0.5859080633006782, 'macro': 0.5108139206679072}
Shuffle #3:    {'micro': 0.5180859080633007, 'macro': 0.5103781756599577}
Shuffle #4:    {'micro': 0.48907309721175585, 'macro': 0.4316608961060943}
Shuffle #5:    {'micro': 0.6149208741522231, 'macro': 0.594582175036426}
Shuffle #6:    {'micro': 0.5730972117558403, 'macro': 0.5573646748016523}
Shuffle #7:    {'micro': 0.6017332328560663, 'macro': 0.5587587850225393}
Shuffle #8:    {'micro': 0.5727204220045214, 'macro': 0.5274573686481564}
Shuffle #9:    {'micro': 0.4996232102486812, 'macro': 0.4009785524550694}
Shuffle #10:    {'micro': 0.6239638281838734, 'macro': 0.5812908698972509}
Average score: {'micro': 0.5646571213262999, 'macro': 0.5200887505192571}
-------------------
Train percent: 0.05
Shuffle #1:    {'micro': 0.6401088223863195, 'macro': 0.61

In [17]:
#Higher settings, takes up to 1 min with workers/cores set to 8
#F1 score 65
start_deepwalk("--format adjlist --input data/cora.adjlist --number-walks 60 --representation-size 128 --walk-length 20 --window-size 15 --workers 8 --output data/cora.embeddings")
#Get F1 scores for  permutations
scoring("--emb data/cora.embeddings --labels data/cora_labels.csv --num-shuffle 10")

2019-12-04 14:33:06 INFO <ipython-input-2-5bd29a7870c9>: 220 Parsed 2708 edges with 0 chunks in 0.08099770545959473s
2019-12-04 14:33:06 INFO <ipython-input-2-5bd29a7870c9>: 226 Converted edges to graph in 0.0010004043579101562s
2019-12-04 14:33:06 INFO <ipython-input-2-5bd29a7870c9>: 57 make_directed: added missing edges 0.005999565124511719s
2019-12-04 14:33:06 INFO <ipython-input-2-5bd29a7870c9>: 68 make_consistent: made consistent in 0.004999637603759766s
2019-12-04 14:33:06 INFO <ipython-input-2-5bd29a7870c9>: 86 remove_self_loops: removed 0 loops in 0.0010001659393310547s
2019-12-04 14:33:06 INFO <ipython-input-2-5bd29a7870c9>: 232 Made graph undirected in 0.014999628067016602s


Number of nodes: 2708
Number of walks: 162480
Data size (walks*length): 3249600
Walking...
Training...




  str(classes[c]))
  'precision', 'predicted', average, warn_for)










Results, using embeddings of dimensionality 128
-------------------
Train percent: 0.02
Shuffle #1:    {'micro': 0.680482290881688, 'macro': 0.6562143366490787}
Shuffle #2:    {'micro': 0.6107761868877166, 'macro': 0.5904230792729148}
Shuffle #3:    {'micro': 0.6831198191409193, 'macro': 0.655644271956677}
Shuffle #4:    {'micro': 0.6529766390354182, 'macro': 0.6319600082963264}
Shuffle #5:    {'micro': 0.6186887716654107, 'macro': 0.5548279888699779}
Shuffle #6:    {'micro': 0.6495855312735493, 'macro': 0.5815924695603616}
Shuffle #7:    {'micro': 0.6695553880934438, 'macro': 0.6381792791209596}
Shuffle #8:    {'micro': 0.6311228334589299, 'macro': 0.5506262844482733}
Shuffle #9:    {'micro': 0.6748304446119066, 'macro': 0.6214758182306978}
Shuffle #10:    {'micro': 0.6525998492840994, 'macro': 0.6032369746341116}
Average score: {'micro': 0.6523737754333082, 'macro': 0.6084180511039378}
-------------------
Train percent: 0.05
Shuffle #1:    {'micro': 0.7481539059463661, 'macro': 0.738

In [None]:
#DeepWalk for other datasets

In [27]:
start_deepwalk("--input karate.adjlist --output karate2.emb")

2019-12-04 11:30:37 INFO <ipython-input-22-d6ca7d315758>: 219 Parsed 34 edges with 0 chunks in 0.04687213897705078s
2019-12-04 11:30:37 INFO <ipython-input-22-d6ca7d315758>: 225 Converted edges to graph in 0.0s
2019-12-04 11:30:37 INFO <ipython-input-22-d6ca7d315758>: 56 make_directed: added missing edges 0.0s
2019-12-04 11:30:37 INFO <ipython-input-22-d6ca7d315758>: 67 make_consistent: made consistent in 0.0s
2019-12-04 11:30:37 INFO <ipython-input-22-d6ca7d315758>: 85 remove_self_loops: removed 0 loops in 0.0s
2019-12-04 11:30:37 INFO <ipython-input-22-d6ca7d315758>: 231 Made graph undirected in 0.0s


Number of nodes: 34
Number of walks: 340
Data size (walks*length): 13600
Walking...
Training...


In [None]:
start_deepwalk("--format mat --input blogcatalog.mat --number-walks 5 --representation-size 128 --walk-length 5 --window-size 5 --workers 1 --output blogcatalog.embeddings")

In [55]:
start_deepwalk("--format mat --input blogcatalog.mat --number-walks 30 --representation-size 64 --walk-length 15 --window-size 10 --workers 8 --output blogcatalog.embeddings")

2019-12-04 12:43:26 INFO <ipython-input-22-d6ca7d315758>: 56 make_directed: added missing edges 0.21200299263000488s
2019-12-04 12:43:26 INFO <ipython-input-22-d6ca7d315758>: 67 make_consistent: made consistent in 0.19399380683898926s
2019-12-04 12:43:26 INFO <ipython-input-22-d6ca7d315758>: 85 remove_self_loops: removed 0 loops in 0.03301191329956055s
2019-12-04 12:43:27 INFO <ipython-input-22-d6ca7d315758>: 67 make_consistent: made consistent in 0.14600372314453125s
2019-12-04 12:43:27 INFO <ipython-input-22-d6ca7d315758>: 85 remove_self_loops: removed 0 loops in 0.03200483322143555s


Number of nodes: 10312
Number of walks: 309360
Data size (walks*length): 4640400
Walking...
Training...


In [131]:
scoring("--emb blogcatalog.embeddings --labels blogcatalog_labels.csv --num-shuffle 4")

  str(classes[c]))
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)


  'recall', 'true', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Results, using embeddings of dimensionality 64
-------------------
Train percent: 0.1
Shuffle #1:    {'micro': 0.3481663342028541, 'macro': 0.1857935969716386}
Shuffle #2:    {'micro': 0.3473861720067454, 'macro': 0.1882592439623669}
Shuffle #3:    {'micro': 0.3490515321403886, 'macro': 0.1979648053052886}
Shuffle #4:    {'micro': 0.3460329628210042, 'macro': 0.19694310429037268}
Average score: {'micro': 0.3476592502927481, 'macro': 0.19224018763241668}
-------------------
Train percent: 0.5
Shuffle #1:    {'micro': 0.3922575274039128, 'macro': 0.24184408696342424}
Shuffle #2:    {'micro': 0.3927777777777777, 'macro': 0.23305086041588588}
Shuffle #3:    {'micro': 0.3903640843142622, 'macro': 0.23621637961202518}
Shuffle #4:    {'micro': 0.38935845919357076, 'macro': 0.24036801150310091}
Average score: {'micro': 0.3911894621723809, 'macro': 0.23786983462360908}
-------------------
Train percent: 0.9
Shuffle #1:    {'micro': 0.4041666666666667, 'macro': 0.25591910413140256}
Shuffle #2:  

In [76]:
start_deepwalk("--format adjlist --input karate.adjlist --number-walks 10 --representation-size 64 --walk-length 6 --window-size 6 --workers 1 --output karate3.embeddings")

2019-12-04 13:08:44 INFO <ipython-input-22-d6ca7d315758>: 219 Parsed 34 edges with 0 chunks in 0.029984235763549805s
2019-12-04 13:08:44 INFO <ipython-input-22-d6ca7d315758>: 225 Converted edges to graph in 0.0s
2019-12-04 13:08:44 INFO <ipython-input-22-d6ca7d315758>: 56 make_directed: added missing edges 0.0s
2019-12-04 13:08:44 INFO <ipython-input-22-d6ca7d315758>: 67 make_consistent: made consistent in 0.0s
2019-12-04 13:08:44 INFO <ipython-input-22-d6ca7d315758>: 85 remove_self_loops: removed 0 loops in 0.0s
2019-12-04 13:08:44 INFO <ipython-input-22-d6ca7d315758>: 231 Made graph undirected in 0.0020003318786621094s


Number of nodes: 34
Number of walks: 340
Data size (walks*length): 2040
Walking...
Training...




In [77]:
scoring("--emb karate3.embeddings --labels karate_labels.csv --num-shuffle 4 --all")

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Results, using embeddings of dimensionality 64
-------------------
Train percent: 0.1
Shuffle #1:    {'micro': 0.4838709677419355, 'macro': 0.32608695652173914}
Shuffle #2:    {'micro': 0.4838709677419355, 'macro': 0.32608695652173914}
Shuffle #3:    {'micro': 0.4838709677419355, 'macro': 0.32608695652173914}
Shuffle #4:    {'micro': 0.4838709677419355, 'macro': 0.32608695652173914}
Average score: {'micro': 0.4838709677419355, 'macro': 0.32608695652173914}
-------------------
Train percent: 0.2
Shuffle #1:    {'micro': 0.42857142857142855, 'macro': 0.3}
Shuffle #2:    {'micro': 0.4642857142857143, 'macro': 0.42857142857142855}
Shuffle #3:    {'micro': 0.5, 'macro': 0.4759358288770053}
Shuffle #4:    {'micro': 0.4642857142857143, 'macro': 0.3170731707317073}
Average score: {'micro': 0.4642857142857143, 'macro': 0.3803951070450353}
-------------------
Train percent: 0.30000000000000004
Shuffle #1:    {'micro': 0.4583333333333333, 'macro': 0.3142857142857143}
Shuffle #2:    {'micro': 0.45