In [1]:
import numpy as np
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid

from gensim.models import Word2Vec

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression

import seaborn as sns
import matplotlib.pyplot as plt

import os
import pandas as pd
from tqdm import *
from scipy import stats

from pathlib import Path

NathanPath="d:\Documents\Info\INF554\INF554_Kaggle_Project"
NathanPath="/users/eleves-a/2019/nathan.peluso/INF554/INF554_Kaggle_Project"

In [2]:
project_path = str(Path(os.getcwd()).parent.absolute())
print("Current directory : " + os.getcwd() + ", Project directory : " + project_path)

Current directory : /Users/maximebonnin/Notebooks/3A_Notebook/INF554/INF554_Kaggle_Project/DeepWalk, Project directory : /Users/maximebonnin/Notebooks/3A_Notebook/INF554/INF554_Kaggle_Project


In [4]:
os.chdir(project_path)
#os.chdir(NathanPath)

In [5]:

# Read training data
df_train = pd.read_csv('data/train.csv', dtype={'author': np.int64, 'hindex': np.float32})
n_train = df_train.shape[0]

# Read test data
df_test = pd.read_csv('data/test.csv', dtype={'author': np.int64})
n_test = df_test.shape[0]

df_dummy_sub = pd.read_csv('submissions/dummy_submission.csv', dtype={'author': np.int64, 'hindex': np.float32})

In [6]:
G = nx.read_edgelist('data/coauthorship.edgelist', delimiter=' ', nodetype=int)
n=G.number_of_nodes()
print("Number of nodes : " + str(n))

abs_nodeID_Graph=dict(enumerate(G.nodes))
nodeID_abs_Graph=dict([(b,a) for a,b in enumerate(G.nodes)])

Number of nodes : 217801


In [10]:
print(list(G.nodes)[:10])
print(list(abs_nodeID_Graph)[:10])

[2002218453, 1999212242, 2032640503, 2475931411, 2477743428, 2504846374, 2597456557, 2598017501, 2134271654, 2138551865]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [11]:
abs_nodeID_Train=dict(df_train["author"])
abs_hindex_Train=dict(df_train["hindex"])
nodeID_abs_Train=dict([(b,a) for a,b in abs_nodeID_Train.items()])

abs_nodeID_Test=dict(df_test["author"])
abs_hindex_Test=dict(df_test["hindex"])
nodeID_abs_Test=dict([(b,a) for a,b in abs_nodeID_Test.items()])

In [12]:
"""train_dict=[(a,b) for a,b in np.array(df_train, dtype=np.int64)]
id_train=[i for i,_ in train_dict]
id_test=[x for x,_ in np.array(df_test, dtype=np.int64)]

idx = np.random.permutation(len(id_train))

idx_train_indexes=idx[:int(0.8*len(id_train))]
idx_train = [id_train[i] for i in idx_train_indexes]
idx_val_indexes=idx[int(0.8*len(id_train)):]
idx_val = [id_train[i] for i in idx_val_indexes]



G_train=G.subgraph(id_train)
h_index_train=[x for _,x in train_dict]

print(len(idx_train))
print(len(idx_val))

h_index_trainx=[h_index_train[x] for x in idx[:int(0.8*len(id_train))]]
h_index_valx=[h_index_train[x] for x in idx[int(0.8*len(id_train)):]]"""

'train_dict=[(a,b) for a,b in np.array(df_train, dtype=np.int64)]\nid_train=[i for i,_ in train_dict]\nid_test=[x for x,_ in np.array(df_test, dtype=np.int64)]\n\nidx = np.random.permutation(len(id_train))\n\nidx_train_indexes=idx[:int(0.8*len(id_train))]\nidx_train = [id_train[i] for i in idx_train_indexes]\nidx_val_indexes=idx[int(0.8*len(id_train)):]\nidx_val = [id_train[i] for i in idx_val_indexes]\n\n\n\nG_train=G.subgraph(id_train)\nh_index_train=[x for _,x in train_dict]\n\nprint(len(idx_train))\nprint(len(idx_val))\n\nh_index_trainx=[h_index_train[x] for x in idx[:int(0.8*len(id_train))]]\nh_index_valx=[h_index_train[x] for x in idx[int(0.8*len(id_train)):]]'

In [13]:
def random_walk(G, node, walk_length):
    # Simulates a random walk of length "walk_length" starting from node "node"
    walk = [node]
    for i in range(walk_length):
        walk.append(np.random.choice(list(G.neighbors(walk[i]))))

    walk = [str(node) for node in walk]
    return walk


In [14]:
def generate_walks(G, num_walks, walk_length):
    # Runs "num_walks" random walks from each node
    walks = []
    for x in G.nodes():
        for _ in range(num_walks):
            walks.append(random_walk(G,x,walk_length))
    np.random.shuffle(walks)
    return walks

In [15]:
def deepwalk(G, num_walks, walk_length, n_dim):
    # Simulates walks and uses the Skipgram model to learn node representations
    print("Generating walks")
    walks = generate_walks(G, num_walks, walk_length)
    print("Training word2vec")
    model = Word2Vec(vector_size=n_dim, window=8, min_count=0, sg=1, workers=8, hs=1)
    model.build_vocab(walks)
    model.train(walks, total_examples=model.corpus_count, epochs=5)
    return model

n_dim = 128
n_walks = 10
walk_length = 10
#model = deepwalk(G, n_walks, walk_length, n_dim) 
#model.save("DeepWalk/Models/model_"+str(n_walks)+'_'+str(walk_length)+'_'+str(n_dim))

In [16]:
n_dim = 128
num_walks = 10
walk_length = 10
model = deepwalk(G, num_walks, walk_length, n_dim)
#model.save("DeepWalk/Models/model_"+str(n_walks)+'_'+str(walk_length)+'_'+str(n_dim))

Generating walks
Training word2vec


In [20]:
type(model)
model.save("model_"+str(n_walks)+'_'+str(walk_length)+'_'+str(n_dim))

In [None]:
#model=Word2Vec.load("DeepWalk/Models/model_"+str(n_walks)+'_'+str(walk_length)+'_'+str(n_dim))
model=Word2Vec.load("DeepWalk/Models/model_10_10_128")

In [25]:
model.save_embedding

AttributeError: 'Word2Vec' object has no attribute 'save_embedding'

In [21]:
print(len(model.wv), G.number_of_nodes())
list(model.wv)

217762 217801


KeyError: "Key '217762' not present"

In [34]:
print(model.wv.key_to_index)
#model.wv.index_to_key

{737083156: 0, 288512535: 1, 2017222965: 2, 2165267828: 3, 2035273997: 4, 717547165: 5, 514836396: 6, 2295682018: 7, 2122310831: 8, 2168417342: 9, 703301073: 10, 2127483189: 11, 2469462003: 12, 2293758362: 13, 2066282134: 14, 2138706875: 15, 1970640687: 16, 2070167002: 17, 1476557878: 18, 2238793978: 19, 1988113753: 20, 2435511731: 21, 2061074560: 22, 2118802686: 23, 2035111836: 24, 2130941901: 25, 563121846: 26, 2149153931: 27, 1941402522: 28, 2038216534: 29, 2046588481: 30, 2004763151: 31, 2145586806: 32, 1643248573: 33, 2044930014: 34, 229163800: 35, 2155833130: 36, 2345075623: 37, 2167405706: 38, 2122019980: 39, 2117427889: 40, 2011313876: 41, 1969876091: 42, 2111158966: 43, 698114488: 44, 2142246817: 45, 2155816217: 46, 2182667553: 47, 2086393978: 48, 2157372337: 49, 2252067276: 50, 2161153869: 51, 1975726436: 52, 2183882076: 53, 691071242: 54, 2125988131: 55, 2150384665: 56, 2196998088: 57, 70153750: 58, 2647056301: 59, 2121075939: 60, 2151341763: 61, 2242174962: 62, 269509060: 6

In [37]:
DeepWalk_embeddings = np.empty(shape=(n, n_dim))
i = 0
print("Filling embeddings")
for i, node in enumerate(G.nodes()):
    #try:
    #print(type(node))
    index = model.wv.key_to_index[node]
    DeepWalk_embeddings[i]=model.wv[index]
    # except:
    #     i+=1
    #     print(node)
    #     pass

print(i)


Filling embeddings
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<

KeyError: 2856462529

In [None]:
def RSE(X,Y):
    return (X-Y)@(X-Y) / len(X)

"""X_train_x = [DeepWalk_embeddings[i] for i in idx_train_indexes]
X_validate_x = [DeepWalk_embeddings[i] for i in idx_val_indexes]

lin_reg=LinearRegression()
lin_reg.fit(X_train_x,h_index_trainx)
_pred=lin_reg.predict(X_validate_x)
score=RSE(h_index_valx,_pred)


print("Accuracy using DeepWalk embeddings ", score)"""