In [1]:
import importlib as imp
import argparse
import numpy as np
import networkx as nx
import n2v
from gensim.models import Word2Vec

In [2]:
def parse_args():
    '''
    Parses the node2vec arguments.

    Options are read from ``sys.argv`` by ``argparse``.

    Returns:
        argparse.Namespace with the attributes ``input``, ``output``,
        ``dimensions``, ``walk_length``, ``num_walks``, ``window_size``,
        ``iter``, ``workers``, ``p``, ``q``, ``weighted`` and ``directed``.

    The ``--weighted/--unweighted`` and ``--directed/--undirected`` pairs each
    control a single boolean (``weighted`` / ``directed``); when both flags of
    a pair are given, the last one on the command line wins.
    '''
    parser = argparse.ArgumentParser(description="Run node2vec.")

    parser.add_argument('--input', nargs='?', default='graph/karate.edgelist',
                        help='Input graph path')

    parser.add_argument('--output', nargs='?', default='emb/karate.emb',
                        help='Embeddings path')

    parser.add_argument('--dimensions', type=int, default=128,
                        help='Number of dimensions. Default is 128.')

    parser.add_argument('--walk-length', type=int, default=80,
                        help='Length of walk per source. Default is 80.')

    parser.add_argument('--num-walks', type=int, default=10,
                        help='Number of walks per source. Default is 10.')

    parser.add_argument('--window-size', type=int, default=10,
                        help='Context size for optimization. Default is 10.')

    parser.add_argument('--iter', default=1, type=int,
                        help='Number of epochs in SGD')

    parser.add_argument('--workers', type=int, default=8,
                        help='Number of parallel workers. Default is 8.')

    parser.add_argument('--p', type=float, default=1,
                        help='Return hyperparameter. Default is 1.')

    parser.add_argument('--q', type=float, default=1,
                        help='Inout hyperparameter. Default is 1.')

    parser.add_argument('--weighted', dest='weighted', action='store_true',
                        help='Boolean specifying (un)weighted. Default is unweighted.')
    # The negative flag must share the positive flag's dest; with its own dest
    # it would only set an unrelated attribute and never clear ``weighted``.
    parser.add_argument('--unweighted', dest='weighted', action='store_false')
    parser.set_defaults(weighted=False)

    parser.add_argument('--directed', dest='directed', action='store_true',
                        help='Graph is (un)directed. Default is undirected.')
    # Same reasoning as for --unweighted: write into ``directed``.
    parser.add_argument('--undirected', dest='directed', action='store_false')
    parser.set_defaults(directed=False)

    return parser.parse_args()

def read_graph(input, is_weighted, is_directed, nodetype=int, is_using_adjlist=False):
    '''
    Reads the input network in networkx.

    Args:
        input: Path of the graph file.
        is_weighted: For edge lists, read a float ``weight`` column from the
            file when True; otherwise every edge gets weight 1. Not consulted
            for adjacency lists, whose edges always get weight 1.
        is_directed: Keep the graph directed when True; otherwise the result
            of ``to_undirected()`` is returned.
        nodetype: Callable used by networkx to convert node labels.
        is_using_adjlist: Read ``input`` as a space-delimited adjacency list
            instead of an edge list.

    Returns:
        The networkx graph, with a ``weight`` attribute on every edge.
    '''
    if is_using_adjlist:
        # Read into a DiGraph like the edge-list branches do. Without
        # create_using, networkx builds an undirected graph and
        # is_directed=True would be silently ignored for adjacency lists.
        G = nx.read_adjlist(input, comments='#', delimiter=" ", nodetype=nodetype,
                            create_using=nx.DiGraph())
        needs_unit_weights = True
    elif is_weighted:
        G = nx.read_edgelist(input, nodetype=nodetype, data=(('weight', float),),
                             create_using=nx.DiGraph())
        needs_unit_weights = False
    else:
        G = nx.read_edgelist(input, nodetype=nodetype, create_using=nx.DiGraph())
        needs_unit_weights = True

    if needs_unit_weights:
        # The file carried no weights: give every edge the same weight.
        for u, v in G.edges():
            G[u][v]['weight'] = 1

    if not is_directed:
        G = G.to_undirected()
    return G

def learn_embeddings(walks, dimensions, window_size, workers, iter, output):
    '''
    Train skip-gram Word2Vec embeddings on the walks and save them.

    Args:
        walks: Iterable of walks, each an iterable of node ids.
        dimensions: Embedding size, passed to Word2Vec as ``size``.
        window_size: Context window, passed to Word2Vec as ``window``.
        workers: Number of worker threads passed to Word2Vec.
        iter: Number of training epochs, passed to Word2Vec as ``iter``.
        output: Path the vectors are written to in word2vec text format.

    Returns:
        None.
    '''
    # Word2Vec works on string tokens, so every node id is stringified.
    sentences = [[str(node) for node in walk] for walk in walks]
    w2v_options = dict(
        size=dimensions,
        window=window_size,
        min_count=0,
        sg=1,
        workers=workers,
        iter=iter,
    )
    embedding_model = Word2Vec(sentences, **w2v_options)
    embedding_model.wv.save_word2vec_format(output)

In [21]:
# Simple demo
# Reload the n2v module so edits made to it on disk take effect in this
# kernel without a restart.
n2v = imp.reload(n2v)

In [22]:
# Build: karate graph -> walks -> embeddings saved to FN_EMBEDDINGS.
FN_EDGELIST = "../data/karate.edgelist"
FN_EMBEDDINGS = "../output/karate.emb"
# Read the edge list as an unweighted, undirected graph (every edge gets weight 1).
nx_G = read_graph(input=FN_EDGELIST, is_weighted=False, is_directed=False)
G = n2v.Graph(nx_G, is_directed=False, is_weighted=False, p=1, q=1)
# Must run before simulate_walks in this notebook's flow; presumably it
# precomputes the walk transition probabilities - confirm in n2v.
G.preprocess_transition_probs()
walks = G.simulate_walks(num_walks=10, walk_length=80)
# Trains Word2Vec on the walks and writes the vectors to FN_EMBEDDINGS.
learn_embeddings(walks, dimensions=128, window_size=10, workers=8, iter=1, output=FN_EMBEDDINGS)

Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


In [5]:
# Search: load the saved vectors and list the nodes closest to node "1".
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format(FN_EMBEDDINGS, binary=False)
# topn belongs to most_similar(); it used to be passed to str.format(),
# where it was silently ignored.
print("most_similar: {}".format(model.most_similar("1", topn=10)))

most_similar: [('4', 0.9985132217407227), ('8', 0.9975525140762329), ('13', 0.9954157471656799), ('2', 0.9953797459602356), ('14', 0.9941872358322144), ('20', 0.9923473596572876), ('18', 0.9899183511734009), ('17', 0.9883556962013245), ('7', 0.9867613315582275), ('12', 0.984230637550354)]


In [6]:
# Getting our hands dirty: apply the same pipeline to a user-item dataset

In [7]:
# Comma-separated records with three fields; the parsers in this notebook
# treat them as user id, item id and label, and keep rows whose label is "1".
FN_USER_ITEM_LABEL = "../data/user_item_label.txt"
# One item title per line; line k is keyed as "I<k>" by load_item_titles.
FN_ITEM_TITLE = "../data/item_title.index"
# Adjacency-list file written by generate_adjlist_file and read by read_graph.
FN_UI_ADJ = "../data/user_items.adjlist"

In [8]:
# load item titles
def load_item_titles(fn):
    '''
    Map item keys to titles read from a text file.

    Line k of the file (1-based), with trailing whitespace removed, becomes
    the value stored under the key "I<k>".

    Args:
        fn: Path of the title file, one title per line.

    Returns:
        dict mapping "I1", "I2", ... to the corresponding title string.
    '''
    with open(fn) as title_file:
        return {
            "I{}".format(position): title.rstrip()
            for position, title in enumerate(title_file, start=1)
        }
# Load the title lookup once; later cells index it by "I<k>" keys.
item_title_dict = load_item_titles(FN_ITEM_TITLE)

In [9]:
# Spot-check: the title stored for the first line of the title file.
print("I1: {}".format(item_title_dict["I1"]))

I1: 动画片 | 海绵宝宝：剧情幽默而充满想象力


In [10]:
def tmp_inspect(fn):
    '''
    Collect the users and items seen in a user,item,label file.

    Lines that do not split into exactly three comma-separated fields are
    skipped. Every remaining row adds its raw user and item ids to the "all"
    sets; rows whose label is "1" also add "U<user>" / "I<item>" to the
    "valid" sets.

    Args:
        fn: Path of the comma-separated input file.

    Returns:
        Tuple (valid_users, all_users, valid_items, all_items) of sets. Note
        that the "valid" sets hold prefixed ids while the "all" sets hold the
        raw ids.
    '''
    users_seen, items_seen = set(), set()
    users_positive, items_positive = set(), set()
    with open(fn) as handle:
        for raw in handle:
            fields = raw.rstrip().split(",")
            if len(fields) == 3:
                user, item, label = fields
                users_seen.add(user)
                items_seen.add(item)
                if label == "1":
                    users_positive.add("U{}".format(user))
                    items_positive.add("I{}".format(item))
    return users_positive, users_seen, items_positive, items_seen
# Scan the label file once; the four sets are only used for the size summary below.
valid_users, all_users, valid_items, all_items = tmp_inspect(FN_USER_ITEM_LABEL)

In [11]:
# Sizes in order: users with a "1" label, all users, items with a "1" label, all items.
print(len(valid_users), len(all_users), len(valid_items), len(all_items))

83719 100000 299 571


In [12]:
# print(sorted(list(valid_items), key=lambda x: int(x[1:]), reverse=False))

In [13]:
from collections import defaultdict
def generate_adjlist_file(fn_in, fn_out):
    '''
    Write a user -> items adjacency-list file from a user,item,label file.

    Lines of ``fn_in`` that do not split into exactly three comma-separated
    fields are skipped. For every row whose label is "1", the item is recorded
    under its user, using "U<user>" / "I<item>" as node names. ``fn_out`` then
    gets one line per user: the user node followed by its item nodes,
    separated by single spaces.

    Args:
        fn_in: Path of the comma-separated input file.
        fn_out: Path of the adjacency-list file to write (overwritten).

    Returns:
        defaultdict(set) mapping each user node to the set of its item nodes.
    '''
    user_items_dict = defaultdict(set)
    with open(fn_in) as fd:
        for line in fd:
            arr = line.rstrip().split(",")
            if len(arr) != 3:
                continue
            user, item, label = arr
            if label == "1":
                user_items_dict["U{}".format(user)].add("I{}".format(item))
    with open(fn_out, "w") as fd:
        for user, item_set in user_items_dict.items():
            # Sort the items: iteration order of a set of strings can change
            # between interpreter runs, which made the output file differ
            # from run to run for identical input.
            fd.write("{} {}\n".format(user, " ".join(sorted(item_set))))
    return user_items_dict
# Writes FN_UI_ADJ as a side effect and keeps the user -> items mapping for inspection.
user_items_dict = generate_adjlist_file(FN_USER_ITEM_LABEL, FN_UI_ADJ)

In [14]:
# Spot-check one user's item set. user_items_dict is a defaultdict, so looking
# up a user that is absent would insert an empty set for it.
print(user_items_dict["U2"])

{'I147', 'I12', 'I46', 'I146', 'I91', 'I44', 'I41'}


In [16]:
# Build the user-item graph from the adjacency-list file.
# Reload n2v first so edits to the module on disk are picked up.
n2v = imp.reload(n2v)
print("Read graph ..")
# Node labels are strings such as "U2" / "I24", hence nodetype=str.
nx_G = read_graph(FN_UI_ADJ, is_weighted=False, is_directed=False, nodetype=str, is_using_adjlist=True)
print("Init graph ..")
G = n2v.Graph(nx_G, is_directed=False, is_weighted=False, p=1, q=1)

Read graph ..
Init graph ..


In [None]:
# Walks and embeddings for the user-item graph held in G.
print("Preprocessing probs ..")
G.preprocess_transition_probs()
print("Generate walks ..")
walks = G.simulate_walks(num_walks=10, walk_length=80)
print("Generate Embeddings ..")
# NOTE(review): FN_EMBEDDINGS is still the karate demo path
# ("../output/karate.emb"), so this call overwrites the demo's embeddings
# with the user-item ones - consider a dedicated output path.
learn_embeddings(walks, dimensions=128, window_size=10, workers=8, iter=1, output=FN_EMBEDDINGS)

In [None]:
# Search: load the saved vectors and list the nearest neighbours of a node.
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format(FN_EMBEDDINGS, binary=False)
# NOTE(review): "1" is a node id from the karate demo; the user-item graph
# names its nodes "U<id>" / "I<id>", so this key is probably missing from
# embeddings trained on that graph - confirm and pick a real node id.
# topn belongs to most_similar(); it used to be passed to str.format(),
# where it was silently ignored.
print("most_similar: {}".format(model.most_similar("1", topn=10)))

In [20]:
# Quick sanity checks on the user-item graph.
# NOTE(review): only the last expression of a cell is displayed, so the
# degree computed on the next line is never shown - print it or move it to
# its own cell if the value is wanted.
nx_G.degree("I24")
nx_G.is_directed()

False